From 95f295f9fe08166307d94b641fba0b0e924d5d31 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben.dooks@codethink.co.uk>
Date: Wed, 21 Nov 2018 16:13:23 +0000
Subject: dmaengine: tegra: add tracepoints to driver

Add some trace-points to the driver to allow for debuging via the
trace pipe.

Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/trace/events/tegra_apb_dma.h | 61 ++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 include/trace/events/tegra_apb_dma.h

(limited to 'include')

diff --git a/include/trace/events/tegra_apb_dma.h b/include/trace/events/tegra_apb_dma.h
new file mode 100644
index 000000000000..0818f6286110
--- /dev/null
+++ b/include/trace/events/tegra_apb_dma.h
@@ -0,0 +1,61 @@
+#if !defined(_TRACE_TEGRA_APB_DMA_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TEGRA_APM_DMA_H
+
+#include <linux/tracepoint.h>
+#include <linux/dmaengine.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM tegra_apb_dma
+
+TRACE_EVENT(tegra_dma_tx_status,
+	TP_PROTO(struct dma_chan *dc, dma_cookie_t cookie, struct dma_tx_state *state),
+	TP_ARGS(dc, cookie, state),
+	TP_STRUCT__entry(
+		__string(chan,	dev_name(&dc->dev->device))
+		__field(dma_cookie_t, cookie)
+		__field(__u32,	residue)
+	),
+	TP_fast_assign(
+		__assign_str(chan, dev_name(&dc->dev->device));
+		__entry->cookie = cookie;
+		__entry->residue = state ? state->residue : (u32)-1;
+	),
+	TP_printk("channel %s: dma cookie %d, residue %u",
+		  __get_str(chan), __entry->cookie, __entry->residue)
+);
+
+TRACE_EVENT(tegra_dma_complete_cb,
+	TP_PROTO(struct dma_chan *dc, int count, void *ptr),
+	TP_ARGS(dc, count, ptr),
+	TP_STRUCT__entry(
+		__string(chan,	dev_name(&dc->dev->device))
+		__field(int,	count)
+		__field(void *,	ptr)
+		),
+	TP_fast_assign(
+		__assign_str(chan, dev_name(&dc->dev->device));
+		__entry->count = count;
+		__entry->ptr = ptr;
+		),
+	TP_printk("channel %s: done %d, ptr %p",
+		  __get_str(chan), __entry->count, __entry->ptr)
+);
+
+TRACE_EVENT(tegra_dma_isr,
+	TP_PROTO(struct dma_chan *dc, int irq),
+	TP_ARGS(dc, irq),
+	TP_STRUCT__entry(
+		__string(chan,	dev_name(&dc->dev->device))
+		__field(int,	irq)
+	),
+	TP_fast_assign(
+		__assign_str(chan, dev_name(&dc->dev->device));
+		__entry->irq = irq;
+	),
+	TP_printk("%s: irq %d\n",  __get_str(chan), __entry->irq)
+);
+
+#endif /*  _TRACE_TEGRADMA_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 8fc5c73554db0ac18c0c6ac5b2099ab917f83bdf Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 9 Nov 2018 12:43:07 -0800
Subject: acpi/nfit, device-dax: Identify differentiated memory with a unique
 numa-node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Persistent memory, as described by the ACPI NFIT (NVDIMM Firmware
Interface Table), is the first known instance of a memory range
described by a unique "target" proximity domain. Where "initiator" and
"target" proximity domains is an approach that the ACPI HMAT
(Heterogeneous Memory Attributes Table) uses to described the unique
performance properties of a memory range relative to a given initiator
(e.g. CPU or DMA device).

Currently the numa-node for a /dev/pmemX block-device or /dev/daxX.Y
char-device follows the traditional notion of 'numa-node' where the
attribute conveys the closest online numa-node. That numa-node attribute
is useful for cpu-binding and memory-binding processes *near* the
device. However, when the memory range backing a 'pmem', or 'dax' device
is onlined (memory hot-add) the memory-only-numa-node representing that
address needs to be differentiated from the set of online nodes. In
other words, the numa-node association of the device depends on whether
you can bind processes *near* the cpu-numa-node in the offline
device-case, or bind process *on* the memory-range directly after the
backing address range is onlined.

Allow for the case that platform firmware describes persistent memory
with a unique proximity domain, i.e. when it is distinct from the
proximity of DRAM and CPUs that are on the same socket. Plumb the Linux
numa-node translation of that proximity through the libnvdimm region
device to namespaces that are in device-dax mode. With this in place the
proposed kmem driver [1] can optionally discover a unique numa-node
number for the address range as it transitions the memory from an
offline state managed by a device-driver to an online memory range
managed by the core-mm.

[1]: https://lore.kernel.org/lkml/20181022201317.8558C1D8@viggo.jf.intel.com

Reported-by: Fan Du <fan.du@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/powerpc/platforms/pseries/papr_scm.c | 1 +
 drivers/acpi/nfit/core.c                  | 8 ++++++--
 drivers/acpi/numa.c                       | 1 +
 drivers/dax/bus.c                         | 4 +++-
 drivers/dax/bus.h                         | 3 ++-
 drivers/dax/dax-private.h                 | 4 ++++
 drivers/dax/pmem/core.c                   | 4 +++-
 drivers/nvdimm/e820.c                     | 1 +
 drivers/nvdimm/nd.h                       | 2 +-
 drivers/nvdimm/of_pmem.c                  | 1 +
 drivers/nvdimm/region_devs.c              | 1 +
 include/linux/acpi.h                      | 5 +++++
 include/linux/libnvdimm.h                 | 1 +
 13 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 7d6457ab5d34..8806ac822627 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -236,6 +236,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	memset(&ndr_desc, 0, sizeof(ndr_desc));
 	ndr_desc.attr_groups = region_attr_groups;
 	ndr_desc.numa_node = dev_to_node(&p->pdev->dev);
+	ndr_desc.target_node = ndr_desc.numa_node;
 	ndr_desc.res = &p->res;
 	ndr_desc.of_node = p->dn;
 	ndr_desc.provider_data = p;
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 011d3db19c80..475899974c70 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2869,11 +2869,15 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 	ndr_desc->res = &res;
 	ndr_desc->provider_data = nfit_spa;
 	ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
-	if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
+	if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) {
 		ndr_desc->numa_node = acpi_map_pxm_to_online_node(
 						spa->proximity_domain);
-	else
+		ndr_desc->target_node = acpi_map_pxm_to_node(
+				spa->proximity_domain);
+	} else {
 		ndr_desc->numa_node = NUMA_NO_NODE;
+		ndr_desc->target_node = NUMA_NO_NODE;
+	}
 
 	/*
 	 * Persistence domain bits are hierarchical, if
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 274699463b4f..b9d86babb13a 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -84,6 +84,7 @@ int acpi_map_pxm_to_node(int pxm)
 
 	return node;
 }
+EXPORT_SYMBOL(acpi_map_pxm_to_node);
 
 /**
  * acpi_map_pxm_to_online_node - Map proximity ID to online node
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 568168500217..c620ad52d7e5 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -214,7 +214,7 @@ static void dax_region_unregister(void *region)
 }
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-		struct resource *res, unsigned int align,
+		struct resource *res, int target_node, unsigned int align,
 		unsigned long pfn_flags)
 {
 	struct dax_region *dax_region;
@@ -244,6 +244,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
 	dax_region->id = region_id;
 	dax_region->align = align;
 	dax_region->dev = parent;
+	dax_region->target_node = target_node;
 	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
 		kfree(dax_region);
 		return NULL;
@@ -348,6 +349,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
 
 	dev_dax->dax_dev = dax_dev;
 	dev_dax->region = dax_region;
+	dev_dax->target_node = dax_region->target_node;
 	kref_get(&dax_region->kref);
 
 	inode = dax_inode(dax_dev);
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index ce977552ffb5..8619e3299943 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -10,7 +10,8 @@ struct dax_device;
 struct dax_region;
 void dax_region_put(struct dax_region *dax_region);
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-		struct resource *res, unsigned int align, unsigned long flags);
+		struct resource *res, int target_node, unsigned int align,
+		unsigned long flags);
 
 enum dev_dax_subsys {
 	DEV_DAX_BUS,
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index a82ce48f5884..a45612148ca0 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -26,6 +26,7 @@ void dax_bus_exit(void);
 /**
  * struct dax_region - mapping infrastructure for dax devices
  * @id: kernel-wide unique region for a memory range
+ * @target_node: effective numa node if this memory range is onlined
  * @kref: to pin while other agents have a need to do lookups
  * @dev: parent device backing this region
  * @align: allocation and mapping alignment for child dax devices
@@ -34,6 +35,7 @@ void dax_bus_exit(void);
  */
 struct dax_region {
 	int id;
+	int target_node;
 	struct kref kref;
 	struct device *dev;
 	unsigned int align;
@@ -46,6 +48,7 @@ struct dax_region {
  * data while the device is activated in the driver.
  * @region - parent region
  * @dax_dev - core dax functionality
+ * @target_node: effective numa node if dev_dax memory range is onlined
  * @dev - device core
  * @pgmap - pgmap for memmap setup / lifetime (driver owned)
  * @ref: pgmap reference count (driver owned)
@@ -54,6 +57,7 @@ struct dax_region {
 struct dev_dax {
 	struct dax_region *region;
 	struct dax_device *dax_dev;
+	int target_node;
 	struct device dev;
 	struct dev_pagemap pgmap;
 	struct percpu_ref ref;
diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c
index bdcff1b14e95..f71019ce0647 100644
--- a/drivers/dax/pmem/core.c
+++ b/drivers/dax/pmem/core.c
@@ -20,6 +20,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
 	struct nd_namespace_common *ndns;
 	struct nd_dax *nd_dax = to_nd_dax(dev);
 	struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
+	struct nd_region *nd_region = to_nd_region(dev->parent);
 
 	ndns = nvdimm_namespace_common_probe(dev);
 	if (IS_ERR(ndns))
@@ -52,7 +53,8 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
 	memcpy(&res, &pgmap.res, sizeof(res));
 	res.start += offset;
 	dax_region = alloc_dax_region(dev, region_id, &res,
-			le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP);
+			nd_region->target_node, le32_to_cpu(pfn_sb->align),
+			PFN_DEV|PFN_MAP);
 	if (!dax_region)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index 521eaf53a52a..36be9b619187 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -47,6 +47,7 @@ static int e820_register_one(struct resource *res, void *data)
 	ndr_desc.res = res;
 	ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
 	ndr_desc.numa_node = e820_range_to_nid(res->start);
+	ndr_desc.target_node = ndr_desc.numa_node;
 	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
 	if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
 		return -ENXIO;
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index cfde992684e7..0b3d7595b3cb 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -153,7 +153,7 @@ struct nd_region {
 	u16 ndr_mappings;
 	u64 ndr_size;
 	u64 ndr_start;
-	int id, num_lanes, ro, numa_node;
+	int id, num_lanes, ro, numa_node, target_node;
 	void *provider_data;
 	struct kernfs_node *bb_state;
 	struct badblocks bb;
diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c
index 0a701837dfc0..ecaaa27438e2 100644
--- a/drivers/nvdimm/of_pmem.c
+++ b/drivers/nvdimm/of_pmem.c
@@ -68,6 +68,7 @@ static int of_pmem_region_probe(struct platform_device *pdev)
 		memset(&ndr_desc, 0, sizeof(ndr_desc));
 		ndr_desc.attr_groups = region_attr_groups;
 		ndr_desc.numa_node = dev_to_node(&pdev->dev);
+		ndr_desc.target_node = ndr_desc.numa_node;
 		ndr_desc.res = &pdev->resource[i];
 		ndr_desc.of_node = np;
 		set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index e2818f94f292..caf2f3129ccd 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1065,6 +1065,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_region->flags = ndr_desc->flags;
 	nd_region->ro = ro;
 	nd_region->numa_node = ndr_desc->numa_node;
+	nd_region->target_node = ndr_desc->target_node;
 	ida_init(&nd_region->ns_ida);
 	ida_init(&nd_region->btt_ida);
 	ida_init(&nd_region->pfn_ida);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 87715f20b69a..eddf2736e5a6 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -400,12 +400,17 @@ extern bool acpi_osi_is_win8(void);
 
 #ifdef CONFIG_ACPI_NUMA
 int acpi_map_pxm_to_online_node(int pxm);
+int acpi_map_pxm_to_node(int pxm);
 int acpi_get_node(acpi_handle handle);
 #else
 static inline int acpi_map_pxm_to_online_node(int pxm)
 {
 	return 0;
 }
+static inline int acpi_map_pxm_to_node(int pxm)
+{
+	return 0;
+}
 static inline int acpi_get_node(acpi_handle handle)
 {
 	return 0;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 5440f11b0907..56bc545ad3b2 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -128,6 +128,7 @@ struct nd_region_desc {
 	void *provider_data;
 	int num_lanes;
 	int numa_node;
+	int target_node;
 	unsigned long flags;
 	struct device_node *of_node;
 };
-- 
cgit v1.2.3


From ebc40be2b8eec093abbbd87658a6726ff84a61f5 Mon Sep 17 00:00:00 2001
From: Fabien Dessenne <fabien.dessenne@st.com>
Date: Wed, 7 Nov 2018 11:18:34 +0100
Subject: remoteproc: fix kernel-doc comment for parse_fw

Fix the kernel-doc comment for "parse_fw" and fix a typo.

Signed-off-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/remoteproc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 507a2b524208..68e72f33c705 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -345,9 +345,9 @@ struct firmware;
  * @stop:	power off the device
  * @kick:	kick a virtqueue (virtqueue id given as a parameter)
  * @da_to_va:	optional platform hook to perform address translations
- * @load_rsc_table:	load resource table from firmware image
+ * @parse_fw:	parse firmware to extract information (e.g. resource table)
  * @find_loaded_rsc_table: find the loaded resouce table
- * @load:		load firmeware to memory, where the remote processor
+ * @load:		load firmware to memory, where the remote processor
  *			expects to find it
  * @sanity_check:	sanity check the fw image
  * @get_boot_addr:	get boot address to entry point specified in firmware
-- 
cgit v1.2.3


From d7dba6be0f31ae61f5f3296aa130f45d57d30f74 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 7 Jan 2019 13:07:36 +0200
Subject: dmaengine: dw: Remove misleading is_private property

The commit a9ddb575d6d6

   ("dmaengine: dw_dmac: Enhance device tree support")

introduces is_private property in uncertain understanding what does it mean.

First of all, documentation defines DMA_PRIVATE capability as

Documentation/crypto/async-tx-api.txt:
  The DMA_PRIVATE capability flag is used to tag dma devices that should not be
  used by the general-purpose allocator. It can be set at initialization time
  if it is known that a channel will always be private. Alternatively,
  it is set when dma_request_channel() finds an unused "public" channel.

  A couple caveats to note when implementing a driver and consumer:
  1/ Once a channel has been privately allocated it will no longer be
     considered by the general-purpose allocator even after a call to
     dma_release_channel().
  2/ Since capabilities are specified at the device level a dma_device with
     multiple channels will either have all channels public, or all channels
     private.

Documentation/driver-api/dmaengine/provider.rst:
  - DMA_PRIVATE
    The devices only supports slave transfers, and as such isn't available
    for async transfers.

The capability had been introduced by the commit 59b5ec21446b

  ("dmaengine: introduce dma_request_channel and private channels")

and some code didn't changed from that times ever.

Taking into consideration above and the fact that on all known platforms
Synopsys DesignWare DMA engine is attached to serve slave transfers,
the DMA_PRIVATE capability must be enabled for this device unconditionally.
Otherwise, as rightfully noticed in drivers/dma/at_xdmac.c:
  /*
   * Without DMA_PRIVATE the driver is not able to allocate more than
   * one channel, second allocation fails in private_candidate.
   */
because of of a caveats mentioned in above documentation excerpts.

So, remove conditional around DMA_PRIVATE followed by removal leftovers.

If someone wonders, DMA_PRIVATE can be not used if and only if the all channels
of the DMA controller are supposed to serve memory-to-memory like operations.
For example, EP93xx has two controllers, one of which can only perform
memory-to-memory transfers

Note, this change doesn't affect dmatest to be able to test such controllers.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> (maintainer:SERIAL DRIVERS)
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 Documentation/devicetree/bindings/dma/snps-dma.txt | 2 --
 drivers/dma/dw/core.c                              | 4 +---
 drivers/dma/dw/pci.c                               | 1 -
 drivers/dma/dw/platform.c                          | 3 ---
 drivers/tty/serial/8250/8250_lpss.c                | 1 -
 include/linux/platform_data/dma-dw.h               | 3 ---
 6 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/dma/snps-dma.txt b/Documentation/devicetree/bindings/dma/snps-dma.txt
index db757df7057d..0bedceed1963 100644
--- a/Documentation/devicetree/bindings/dma/snps-dma.txt
+++ b/Documentation/devicetree/bindings/dma/snps-dma.txt
@@ -23,8 +23,6 @@ Deprecated properties:
 
 
 Optional properties:
-- is_private: The device channels should be marked as private and not for by the
-  general purpose DMA channel allocator. False if not passed.
 - multi-block: Multi block transfers supported by hardware. Array property with
   one cell per channel. 0: not supported, 1 (default): supported.
 - snps,dma-protection-control: AHB HPROT[3:1] protection setting.
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index dc053e62f894..e25503986680 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -1227,7 +1227,6 @@ int dw_dma_probe(struct dw_dma_chip *chip)
 		pdata->block_size = dma_readl(dw, MAX_BLK_SIZE);
 
 		/* Fill platform data with the default values */
-		pdata->is_private = true;
 		pdata->is_memcpy = true;
 		pdata->chan_allocation_order = CHAN_ALLOCATION_ASCENDING;
 		pdata->chan_priority = CHAN_PRIORITY_ASCENDING;
@@ -1340,8 +1339,7 @@ int dw_dma_probe(struct dw_dma_chip *chip)
 
 	/* Set capabilities */
 	dma_cap_set(DMA_SLAVE, dw->dma.cap_mask);
-	if (pdata->is_private)
-		dma_cap_set(DMA_PRIVATE, dw->dma.cap_mask);
+	dma_cap_set(DMA_PRIVATE, dw->dma.cap_mask);
 	if (pdata->is_memcpy)
 		dma_cap_set(DMA_MEMCPY, dw->dma.cap_mask);
 
diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c
index 313ba10c6224..570498faadc3 100644
--- a/drivers/dma/dw/pci.c
+++ b/drivers/dma/dw/pci.c
@@ -17,7 +17,6 @@
 
 static struct dw_dma_platform_data mrfld_pdata = {
 	.nr_channels = 8,
-	.is_private = true,
 	.is_memcpy = true,
 	.is_idma32 = true,
 	.chan_allocation_order = CHAN_ALLOCATION_ASCENDING,
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index 31ff8113c3de..6dd8cd1820c1 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -128,9 +128,6 @@ dw_dma_parse_dt(struct platform_device *pdev)
 	pdata->nr_masters = nr_masters;
 	pdata->nr_channels = nr_channels;
 
-	if (of_property_read_bool(np, "is_private"))
-		pdata->is_private = true;
-
 	/*
 	 * All known devices, which use DT for configuration, support
 	 * memory-to-memory transfers. So enable it by default.
diff --git a/drivers/tty/serial/8250/8250_lpss.c b/drivers/tty/serial/8250/8250_lpss.c
index 98dbc796353f..53ca9ba6ab4b 100644
--- a/drivers/tty/serial/8250/8250_lpss.c
+++ b/drivers/tty/serial/8250/8250_lpss.c
@@ -153,7 +153,6 @@ static int byt_serial_setup(struct lpss8250 *lpss, struct uart_port *port)
 #ifdef CONFIG_SERIAL_8250_DMA
 static const struct dw_dma_platform_data qrk_serial_dma_pdata = {
 	.nr_channels = 2,
-	.is_private = true,
 	.chan_allocation_order = CHAN_ALLOCATION_ASCENDING,
 	.chan_priority = CHAN_PRIORITY_ASCENDING,
 	.block_size = 4095,
diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
index 1a1d58ebffbf..d443025c5c72 100644
--- a/include/linux/platform_data/dma-dw.h
+++ b/include/linux/platform_data/dma-dw.h
@@ -38,8 +38,6 @@ struct dw_dma_slave {
 /**
  * struct dw_dma_platform_data - Controller configuration parameters
  * @nr_channels: Number of channels supported by hardware (max 8)
- * @is_private: The device channels should be marked as private and not for
- *	by the general purpose DMA channel allocator.
  * @is_memcpy: The device channels do support memory-to-memory transfers.
  * @is_idma32: The type of the DMA controller is iDMA32
  * @chan_allocation_order: Allocate channels starting from 0 or 7
@@ -53,7 +51,6 @@ struct dw_dma_slave {
  */
 struct dw_dma_platform_data {
 	unsigned int	nr_channels;
-	bool		is_private;
 	bool		is_memcpy;
 	bool		is_idma32;
 #define CHAN_ALLOCATION_ASCENDING	0	/* zero to seven */
-- 
cgit v1.2.3


From 078165779608873e7b6eae1316a39c73af9f3edc Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 7 Jan 2019 13:07:37 +0200
Subject: dmaengine: dw: Remove unused internal property

All known devices, which use DT for configuration, support
memory-to-memory transfers. So enable it by default.

The rest two cases, i.e. Intel Quark and PPC460ex, instantiate DMA driver and
use its channels exclusively for hardware, which means there is no available
channel for any other purposes anyway.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw/core.c                | 4 +---
 drivers/dma/dw/pci.c                 | 1 -
 drivers/dma/dw/platform.c            | 6 ------
 include/linux/platform_data/dma-dw.h | 2 --
 4 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index e25503986680..4982e443869c 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -1227,7 +1227,6 @@ int dw_dma_probe(struct dw_dma_chip *chip)
 		pdata->block_size = dma_readl(dw, MAX_BLK_SIZE);
 
 		/* Fill platform data with the default values */
-		pdata->is_memcpy = true;
 		pdata->chan_allocation_order = CHAN_ALLOCATION_ASCENDING;
 		pdata->chan_priority = CHAN_PRIORITY_ASCENDING;
 	} else if (chip->pdata->nr_channels > DW_DMA_MAX_NR_CHANNELS) {
@@ -1340,8 +1339,7 @@ int dw_dma_probe(struct dw_dma_chip *chip)
 	/* Set capabilities */
 	dma_cap_set(DMA_SLAVE, dw->dma.cap_mask);
 	dma_cap_set(DMA_PRIVATE, dw->dma.cap_mask);
-	if (pdata->is_memcpy)
-		dma_cap_set(DMA_MEMCPY, dw->dma.cap_mask);
+	dma_cap_set(DMA_MEMCPY, dw->dma.cap_mask);
 
 	dw->dma.dev = chip->dev;
 	dw->dma.device_alloc_chan_resources = dwc_alloc_chan_resources;
diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c
index 570498faadc3..66d98d7ccad0 100644
--- a/drivers/dma/dw/pci.c
+++ b/drivers/dma/dw/pci.c
@@ -17,7 +17,6 @@
 
 static struct dw_dma_platform_data mrfld_pdata = {
 	.nr_channels = 8,
-	.is_memcpy = true,
 	.is_idma32 = true,
 	.chan_allocation_order = CHAN_ALLOCATION_ASCENDING,
 	.chan_priority = CHAN_PRIORITY_ASCENDING,
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index 6dd8cd1820c1..58fc1ba02a1e 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -128,12 +128,6 @@ dw_dma_parse_dt(struct platform_device *pdev)
 	pdata->nr_masters = nr_masters;
 	pdata->nr_channels = nr_channels;
 
-	/*
-	 * All known devices, which use DT for configuration, support
-	 * memory-to-memory transfers. So enable it by default.
-	 */
-	pdata->is_memcpy = true;
-
 	if (!of_property_read_u32(np, "chan_allocation_order", &tmp))
 		pdata->chan_allocation_order = (unsigned char)tmp;
 
diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
index d443025c5c72..1c85eeee4171 100644
--- a/include/linux/platform_data/dma-dw.h
+++ b/include/linux/platform_data/dma-dw.h
@@ -38,7 +38,6 @@ struct dw_dma_slave {
 /**
  * struct dw_dma_platform_data - Controller configuration parameters
  * @nr_channels: Number of channels supported by hardware (max 8)
- * @is_memcpy: The device channels do support memory-to-memory transfers.
  * @is_idma32: The type of the DMA controller is iDMA32
  * @chan_allocation_order: Allocate channels starting from 0 or 7
  * @chan_priority: Set channel priority increasing from 0 to 7 or 7 to 0.
@@ -51,7 +50,6 @@ struct dw_dma_slave {
  */
 struct dw_dma_platform_data {
 	unsigned int	nr_channels;
-	bool		is_memcpy;
 	bool		is_idma32;
 #define CHAN_ALLOCATION_ASCENDING	0	/* zero to seven */
 #define CHAN_ALLOCATION_DESCENDING	1	/* seven to zero */
-- 
cgit v1.2.3


From 69da8be90d5e85e60b5377c47384154b9dabf592 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 7 Jan 2019 13:07:38 +0200
Subject: dmaengine: dw: Split DW and iDMA 32-bit operations

Here is a kinda big refactoring that should have been done
in the first place, when Intel iDMA 32-bit support appeared.

It splits operations which are different to Synopsys DesignWare and
Intel iDMA 32-bit controllers.

No functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw/Makefile              |   2 +-
 drivers/dma/dw/core.c                | 190 +++++++----------------------------
 drivers/dma/dw/dw.c                  | 112 +++++++++++++++++++++
 drivers/dma/dw/idma32.c              | 138 +++++++++++++++++++++++++
 drivers/dma/dw/internal.h            |  10 +-
 drivers/dma/dw/pci.c                 |  45 ++++++---
 drivers/dma/dw/platform.c            |   8 +-
 drivers/dma/dw/regs.h                |  13 +++
 include/linux/dma/dw.h               |   4 +
 include/linux/platform_data/dma-dw.h |   2 -
 10 files changed, 343 insertions(+), 181 deletions(-)
 create mode 100644 drivers/dma/dw/dw.c
 create mode 100644 drivers/dma/dw/idma32.c

(limited to 'include')

diff --git a/drivers/dma/dw/Makefile b/drivers/dma/dw/Makefile
index 2b949c2e4504..63ed895c09aa 100644
--- a/drivers/dma/dw/Makefile
+++ b/drivers/dma/dw/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_DW_DMAC_CORE)	+= dw_dmac_core.o
-dw_dmac_core-objs	:= core.o
+dw_dmac_core-objs	:= core.o dw.o idma32.o
 
 obj-$(CONFIG_DW_DMAC)		+= dw_dmac.o
 dw_dmac-objs		:= platform.o
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 4982e443869c..8a581d86ea8d 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -138,44 +138,6 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
 	dwc->descs_allocated--;
 }
 
-static void dwc_initialize_chan_idma32(struct dw_dma_chan *dwc)
-{
-	u32 cfghi = 0;
-	u32 cfglo = 0;
-
-	/* Set default burst alignment */
-	cfglo |= IDMA32C_CFGL_DST_BURST_ALIGN | IDMA32C_CFGL_SRC_BURST_ALIGN;
-
-	/* Low 4 bits of the request lines */
-	cfghi |= IDMA32C_CFGH_DST_PER(dwc->dws.dst_id & 0xf);
-	cfghi |= IDMA32C_CFGH_SRC_PER(dwc->dws.src_id & 0xf);
-
-	/* Request line extension (2 bits) */
-	cfghi |= IDMA32C_CFGH_DST_PER_EXT(dwc->dws.dst_id >> 4 & 0x3);
-	cfghi |= IDMA32C_CFGH_SRC_PER_EXT(dwc->dws.src_id >> 4 & 0x3);
-
-	channel_writel(dwc, CFG_LO, cfglo);
-	channel_writel(dwc, CFG_HI, cfghi);
-}
-
-static void dwc_initialize_chan_dw(struct dw_dma_chan *dwc)
-{
-	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
-	u32 cfghi = DWC_CFGH_FIFO_MODE;
-	u32 cfglo = DWC_CFGL_CH_PRIOR(dwc->priority);
-	bool hs_polarity = dwc->dws.hs_polarity;
-
-	cfghi |= DWC_CFGH_DST_PER(dwc->dws.dst_id);
-	cfghi |= DWC_CFGH_SRC_PER(dwc->dws.src_id);
-	cfghi |= DWC_CFGH_PROTCTL(dw->pdata->protctl);
-
-	/* Set polarity of handshake interface */
-	cfglo |= hs_polarity ? DWC_CFGL_HS_DST_POL | DWC_CFGL_HS_SRC_POL : 0;
-
-	channel_writel(dwc, CFG_LO, cfglo);
-	channel_writel(dwc, CFG_HI, cfghi);
-}
-
 static void dwc_initialize(struct dw_dma_chan *dwc)
 {
 	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
@@ -183,10 +145,7 @@ static void dwc_initialize(struct dw_dma_chan *dwc)
 	if (test_bit(DW_DMA_IS_INITIALIZED, &dwc->flags))
 		return;
 
-	if (dw->pdata->is_idma32)
-		dwc_initialize_chan_idma32(dwc);
-	else
-		dwc_initialize_chan_dw(dwc);
+	dw->initialize_chan(dwc);
 
 	/* Enable interrupts */
 	channel_set_bit(dw, MASK.XFER, dwc->mask);
@@ -215,37 +174,6 @@ static inline void dwc_chan_disable(struct dw_dma *dw, struct dw_dma_chan *dwc)
 		cpu_relax();
 }
 
-static u32 bytes2block(struct dw_dma_chan *dwc, size_t bytes,
-			  unsigned int width, size_t *len)
-{
-	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
-	u32 block;
-
-	/* Always in bytes for iDMA 32-bit */
-	if (dw->pdata->is_idma32)
-		width = 0;
-
-	if ((bytes >> width) > dwc->block_size) {
-		block = dwc->block_size;
-		*len = block << width;
-	} else {
-		block = bytes >> width;
-		*len = bytes;
-	}
-
-	return block;
-}
-
-static size_t block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width)
-{
-	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
-
-	if (dw->pdata->is_idma32)
-		return IDMA32C_CTLH_BLOCK_TS(block);
-
-	return DWC_CTLH_BLOCK_TS(block) << width;
-}
-
 /*----------------------------------------------------------------------*/
 
 /* Perform single block transfer */
@@ -391,10 +319,11 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc)
 /* Returns how many bytes were already received from source */
 static inline u32 dwc_get_sent(struct dw_dma_chan *dwc)
 {
+	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
 	u32 ctlhi = channel_readl(dwc, CTL_HI);
 	u32 ctllo = channel_readl(dwc, CTL_LO);
 
-	return block2bytes(dwc, ctlhi, ctllo >> 4 & 7);
+	return dw->block2bytes(dwc, ctlhi, ctllo >> 4 & 7);
 }
 
 static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
@@ -651,7 +580,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	unsigned int		src_width;
 	unsigned int		dst_width;
 	unsigned int		data_width = dw->pdata->data_width[m_master];
-	u32			ctllo;
+	u32			ctllo, ctlhi;
 	u8			lms = DWC_LLP_LMS(m_master);
 
 	dev_vdbg(chan2dev(chan),
@@ -680,10 +609,12 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 		if (!desc)
 			goto err_desc_get;
 
+		ctlhi = dw->bytes2block(dwc, len - offset, src_width, &xfer_count);
+
 		lli_write(desc, sar, src + offset);
 		lli_write(desc, dar, dest + offset);
 		lli_write(desc, ctllo, ctllo);
-		lli_write(desc, ctlhi, bytes2block(dwc, len - offset, src_width, &xfer_count));
+		lli_write(desc, ctlhi, ctlhi);
 		desc->len = xfer_count;
 
 		if (!first) {
@@ -721,7 +652,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	struct dma_slave_config	*sconfig = &dwc->dma_sconfig;
 	struct dw_desc		*prev;
 	struct dw_desc		*first;
-	u32			ctllo;
+	u32			ctllo, ctlhi;
 	u8			m_master = dwc->dws.m_master;
 	u8			lms = DWC_LLP_LMS(m_master);
 	dma_addr_t		reg;
@@ -768,9 +699,11 @@ slave_sg_todev_fill_desc:
 			if (!desc)
 				goto err_desc_get;
 
+			ctlhi = dw->bytes2block(dwc, len, mem_width, &dlen);
+
 			lli_write(desc, sar, mem);
 			lli_write(desc, dar, reg);
-			lli_write(desc, ctlhi, bytes2block(dwc, len, mem_width, &dlen));
+			lli_write(desc, ctlhi, ctlhi);
 			lli_write(desc, ctllo, ctllo | DWC_CTLL_SRC_WIDTH(mem_width));
 			desc->len = dlen;
 
@@ -814,9 +747,11 @@ slave_sg_fromdev_fill_desc:
 			if (!desc)
 				goto err_desc_get;
 
+			ctlhi = dw->bytes2block(dwc, len, reg_width, &dlen);
+
 			lli_write(desc, sar, reg);
 			lli_write(desc, dar, mem);
-			lli_write(desc, ctlhi, bytes2block(dwc, len, reg_width, &dlen));
+			lli_write(desc, ctlhi, ctlhi);
 			mem_width = __ffs(data_width | mem | dlen);
 			lli_write(desc, ctllo, ctllo | DWC_CTLL_DST_WIDTH(mem_width));
 			desc->len = dlen;
@@ -876,22 +811,12 @@ EXPORT_SYMBOL_GPL(dw_dma_filter);
 static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig)
 {
 	struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
-	struct dma_slave_config *sc = &dwc->dma_sconfig;
 	struct dw_dma *dw = to_dw_dma(chan->device);
-	/*
-	 * Fix sconfig's burst size according to dw_dmac. We need to convert
-	 * them as:
-	 * 1 -> 0, 4 -> 1, 8 -> 2, 16 -> 3.
-	 *
-	 * NOTE: burst size 2 is not supported by DesignWare controller.
-	 *       iDMA 32-bit supports it.
-	 */
-	u32 s = dw->pdata->is_idma32 ? 1 : 2;
 
 	memcpy(&dwc->dma_sconfig, sconfig, sizeof(*sconfig));
 
-	sc->src_maxburst = sc->src_maxburst > 1 ? fls(sc->src_maxburst) - s : 0;
-	sc->dst_maxburst = sc->dst_maxburst > 1 ? fls(sc->dst_maxburst) - s : 0;
+	dw->encode_maxburst(dwc, &dwc->dma_sconfig.src_maxburst);
+	dw->encode_maxburst(dwc, &dwc->dma_sconfig.dst_maxburst);
 
 	return 0;
 }
@@ -900,16 +825,9 @@ static void dwc_chan_pause(struct dw_dma_chan *dwc, bool drain)
 {
 	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
 	unsigned int		count = 20;	/* timeout iterations */
-	u32			cfglo;
 
-	cfglo = channel_readl(dwc, CFG_LO);
-	if (dw->pdata->is_idma32) {
-		if (drain)
-			cfglo |= IDMA32C_CFGL_CH_DRAIN;
-		else
-			cfglo &= ~IDMA32C_CFGL_CH_DRAIN;
-	}
-	channel_writel(dwc, CFG_LO, cfglo | DWC_CFGL_CH_SUSP);
+	dw->suspend_chan(dwc, drain);
+
 	while (!(channel_readl(dwc, CFG_LO) & DWC_CFGL_FIFO_EMPTY) && count--)
 		udelay(2);
 
@@ -1058,33 +976,7 @@ static void dwc_issue_pending(struct dma_chan *chan)
 
 /*----------------------------------------------------------------------*/
 
-/*
- * Program FIFO size of channels.
- *
- * By default full FIFO (512 bytes) is assigned to channel 0. Here we
- * slice FIFO on equal parts between channels.
- */
-static void idma32_fifo_partition(struct dw_dma *dw)
-{
-	u64 value = IDMA32C_FP_PSIZE_CH0(64) | IDMA32C_FP_PSIZE_CH1(64) |
-		    IDMA32C_FP_UPDATE;
-	u64 fifo_partition = 0;
-
-	if (!dw->pdata->is_idma32)
-		return;
-
-	/* Fill FIFO_PARTITION low bits (Channels 0..1, 4..5) */
-	fifo_partition |= value << 0;
-
-	/* Fill FIFO_PARTITION high bits (Channels 2..3, 6..7) */
-	fifo_partition |= value << 32;
-
-	/* Program FIFO Partition registers - 64 bytes per channel */
-	idma32_writeq(dw, FIFO_PARTITION1, fifo_partition);
-	idma32_writeq(dw, FIFO_PARTITION0, fifo_partition);
-}
-
-static void dw_dma_off(struct dw_dma *dw)
+void do_dw_dma_off(struct dw_dma *dw)
 {
 	unsigned int i;
 
@@ -1103,7 +995,7 @@ static void dw_dma_off(struct dw_dma *dw)
 		clear_bit(DW_DMA_IS_INITIALIZED, &dw->chan[i].flags);
 }
 
-static void dw_dma_on(struct dw_dma *dw)
+void do_dw_dma_on(struct dw_dma *dw)
 {
 	dma_writel(dw, CFG, DW_CFG_DMA_EN);
 }
@@ -1139,7 +1031,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 
 	/* Enable controller here if needed */
 	if (!dw->in_use)
-		dw_dma_on(dw);
+		do_dw_dma_on(dw);
 	dw->in_use |= dwc->mask;
 
 	return 0;
@@ -1177,30 +1069,25 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	/* Disable controller in case it was a last user */
 	dw->in_use &= ~dwc->mask;
 	if (!dw->in_use)
-		dw_dma_off(dw);
+		do_dw_dma_off(dw);
 
 	dev_vdbg(chan2dev(chan), "%s: done\n", __func__);
 }
 
-int dw_dma_probe(struct dw_dma_chip *chip)
+int do_dma_probe(struct dw_dma_chip *chip)
 {
+	struct dw_dma *dw = chip->dw;
 	struct dw_dma_platform_data *pdata;
-	struct dw_dma		*dw;
 	bool			autocfg = false;
 	unsigned int		dw_params;
 	unsigned int		i;
 	int			err;
 
-	dw = devm_kzalloc(chip->dev, sizeof(*dw), GFP_KERNEL);
-	if (!dw)
-		return -ENOMEM;
-
 	dw->pdata = devm_kzalloc(chip->dev, sizeof(*dw->pdata), GFP_KERNEL);
 	if (!dw->pdata)
 		return -ENOMEM;
 
 	dw->regs = chip->regs;
-	chip->dw = dw;
 
 	pm_runtime_get_sync(chip->dev);
 
@@ -1250,15 +1137,10 @@ int dw_dma_probe(struct dw_dma_chip *chip)
 	dw->all_chan_mask = (1 << pdata->nr_channels) - 1;
 
 	/* Force dma off, just in case */
-	dw_dma_off(dw);
-
-	idma32_fifo_partition(dw);
+	dw->disable(dw);
 
 	/* Device and instance ID for IRQ and DMA pool */
-	if (pdata->is_idma32)
-		snprintf(dw->name, sizeof(dw->name), "idma32:dmac%d", chip->id);
-	else
-		snprintf(dw->name, sizeof(dw->name), "dw:dmac%d", chip->id);
+	dw->set_device_name(dw, chip->id);
 
 	/* Create a pool of consistent memory blocks for hardware descriptors */
 	dw->desc_pool = dmam_pool_create(dw->name, chip->dev,
@@ -1380,16 +1262,15 @@ err_pdata:
 	pm_runtime_put_sync_suspend(chip->dev);
 	return err;
 }
-EXPORT_SYMBOL_GPL(dw_dma_probe);
 
-int dw_dma_remove(struct dw_dma_chip *chip)
+int do_dma_remove(struct dw_dma_chip *chip)
 {
 	struct dw_dma		*dw = chip->dw;
 	struct dw_dma_chan	*dwc, *_dwc;
 
 	pm_runtime_get_sync(chip->dev);
 
-	dw_dma_off(dw);
+	do_dw_dma_off(dw);
 	dma_async_device_unregister(&dw->dma);
 
 	free_irq(chip->irq, dw);
@@ -1404,27 +1285,24 @@ int dw_dma_remove(struct dw_dma_chip *chip)
 	pm_runtime_put_sync_suspend(chip->dev);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(dw_dma_remove);
 
-int dw_dma_disable(struct dw_dma_chip *chip)
+int do_dw_dma_disable(struct dw_dma_chip *chip)
 {
 	struct dw_dma *dw = chip->dw;
 
-	dw_dma_off(dw);
+	dw->disable(dw);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(dw_dma_disable);
+EXPORT_SYMBOL_GPL(do_dw_dma_disable);
 
-int dw_dma_enable(struct dw_dma_chip *chip)
+int do_dw_dma_enable(struct dw_dma_chip *chip)
 {
 	struct dw_dma *dw = chip->dw;
 
-	idma32_fifo_partition(dw);
-
-	dw_dma_on(dw);
+	dw->enable(dw);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(dw_dma_enable);
+EXPORT_SYMBOL_GPL(do_dw_dma_enable);
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Synopsys DesignWare DMA Controller core driver");
diff --git a/drivers/dma/dw/dw.c b/drivers/dma/dw/dw.c
new file mode 100644
index 000000000000..977aa28bf81d
--- /dev/null
+++ b/drivers/dma/dw/dw.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2007-2008 Atmel Corporation
+// Copyright (C) 2010-2011 ST Microelectronics
+// Copyright (C) 2013,2018 Intel Corporation
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "internal.h"
+
+static void dw_dma_initialize_chan(struct dw_dma_chan *dwc)
+{
+	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
+	u32 cfghi = DWC_CFGH_FIFO_MODE;
+	u32 cfglo = DWC_CFGL_CH_PRIOR(dwc->priority);
+	bool hs_polarity = dwc->dws.hs_polarity;
+
+	cfghi |= DWC_CFGH_DST_PER(dwc->dws.dst_id);
+	cfghi |= DWC_CFGH_SRC_PER(dwc->dws.src_id);
+	cfghi |= DWC_CFGH_PROTCTL(dw->pdata->protctl);
+
+	/* Set polarity of handshake interface */
+	cfglo |= hs_polarity ? DWC_CFGL_HS_DST_POL | DWC_CFGL_HS_SRC_POL : 0;
+
+	channel_writel(dwc, CFG_LO, cfglo);
+	channel_writel(dwc, CFG_HI, cfghi);
+}
+
+static void dw_dma_suspend_chan(struct dw_dma_chan *dwc, bool drain)
+{
+	u32 cfglo = channel_readl(dwc, CFG_LO);
+
+	channel_writel(dwc, CFG_LO, cfglo | DWC_CFGL_CH_SUSP);
+}
+
+static u32 dw_dma_bytes2block(struct dw_dma_chan *dwc,
+			      size_t bytes, unsigned int width, size_t *len)
+{
+	u32 block;
+
+	if ((bytes >> width) > dwc->block_size) {
+		block = dwc->block_size;
+		*len = dwc->block_size << width;
+	} else {
+		block = bytes >> width;
+		*len = bytes;
+	}
+
+	return block;
+}
+
+static size_t dw_dma_block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width)
+{
+	return DWC_CTLH_BLOCK_TS(block) << width;
+}
+
+static void dw_dma_encode_maxburst(struct dw_dma_chan *dwc, u32 *maxburst)
+{
+	/*
+	 * Fix burst size according to dw_dmac. We need to convert them as:
+	 * 1 -> 0, 4 -> 1, 8 -> 2, 16 -> 3.
+	 */
+	*maxburst = *maxburst > 1 ? fls(*maxburst) - 2 : 0;
+}
+
+static void dw_dma_set_device_name(struct dw_dma *dw, int id)
+{
+	snprintf(dw->name, sizeof(dw->name), "dw:dmac%d", id);
+}
+
+static void dw_dma_disable(struct dw_dma *dw)
+{
+	do_dw_dma_off(dw);
+}
+
+static void dw_dma_enable(struct dw_dma *dw)
+{
+	do_dw_dma_on(dw);
+}
+
+int dw_dma_probe(struct dw_dma_chip *chip)
+{
+	struct dw_dma *dw;
+
+	dw = devm_kzalloc(chip->dev, sizeof(*dw), GFP_KERNEL);
+	if (!dw)
+		return -ENOMEM;
+
+	/* Channel operations */
+	dw->initialize_chan = dw_dma_initialize_chan;
+	dw->suspend_chan = dw_dma_suspend_chan;
+	dw->encode_maxburst = dw_dma_encode_maxburst;
+	dw->bytes2block = dw_dma_bytes2block;
+	dw->block2bytes = dw_dma_block2bytes;
+
+	/* Device operations */
+	dw->set_device_name = dw_dma_set_device_name;
+	dw->disable = dw_dma_disable;
+	dw->enable = dw_dma_enable;
+
+	chip->dw = dw;
+	return do_dma_probe(chip);
+}
+EXPORT_SYMBOL_GPL(dw_dma_probe);
+
+int dw_dma_remove(struct dw_dma_chip *chip)
+{
+	return do_dma_remove(chip);
+}
+EXPORT_SYMBOL_GPL(dw_dma_remove);
diff --git a/drivers/dma/dw/idma32.c b/drivers/dma/dw/idma32.c
new file mode 100644
index 000000000000..8707830f39ad
--- /dev/null
+++ b/drivers/dma/dw/idma32.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2013,2018 Intel Corporation
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "internal.h"
+
+static void idma32_initialize_chan(struct dw_dma_chan *dwc)
+{
+	u32 cfghi = 0;
+	u32 cfglo = 0;
+
+	/* Set default burst alignment */
+	cfglo |= IDMA32C_CFGL_DST_BURST_ALIGN | IDMA32C_CFGL_SRC_BURST_ALIGN;
+
+	/* Low 4 bits of the request lines */
+	cfghi |= IDMA32C_CFGH_DST_PER(dwc->dws.dst_id & 0xf);
+	cfghi |= IDMA32C_CFGH_SRC_PER(dwc->dws.src_id & 0xf);
+
+	/* Request line extension (2 bits) */
+	cfghi |= IDMA32C_CFGH_DST_PER_EXT(dwc->dws.dst_id >> 4 & 0x3);
+	cfghi |= IDMA32C_CFGH_SRC_PER_EXT(dwc->dws.src_id >> 4 & 0x3);
+
+	channel_writel(dwc, CFG_LO, cfglo);
+	channel_writel(dwc, CFG_HI, cfghi);
+}
+
+static void idma32_suspend_chan(struct dw_dma_chan *dwc, bool drain)
+{
+	u32 cfglo = channel_readl(dwc, CFG_LO);
+
+	if (drain)
+		cfglo |= IDMA32C_CFGL_CH_DRAIN;
+	else
+		cfglo &= ~IDMA32C_CFGL_CH_DRAIN;
+
+	channel_writel(dwc, CFG_LO, cfglo | DWC_CFGL_CH_SUSP);
+}
+
+static u32 idma32_bytes2block(struct dw_dma_chan *dwc,
+			      size_t bytes, unsigned int width, size_t *len)
+{
+	u32 block;
+
+	if (bytes > dwc->block_size) {
+		block = dwc->block_size;
+		*len = dwc->block_size;
+	} else {
+		block = bytes;
+		*len = bytes;
+	}
+
+	return block;
+}
+
+static size_t idma32_block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width)
+{
+	return IDMA32C_CTLH_BLOCK_TS(block);
+}
+
+static void idma32_encode_maxburst(struct dw_dma_chan *dwc, u32 *maxburst)
+{
+	*maxburst = *maxburst > 1 ? fls(*maxburst) - 1 : 0;
+}
+
+static void idma32_set_device_name(struct dw_dma *dw, int id)
+{
+	snprintf(dw->name, sizeof(dw->name), "idma32:dmac%d", id);
+}
+
+/*
+ * Program FIFO size of channels.
+ *
+ * By default full FIFO (512 bytes) is assigned to channel 0. Here we
+ * slice FIFO on equal parts between channels.
+ */
+static void idma32_fifo_partition(struct dw_dma *dw)
+{
+	u64 value = IDMA32C_FP_PSIZE_CH0(64) | IDMA32C_FP_PSIZE_CH1(64) |
+		    IDMA32C_FP_UPDATE;
+	u64 fifo_partition = 0;
+
+	/* Fill FIFO_PARTITION low bits (Channels 0..1, 4..5) */
+	fifo_partition |= value << 0;
+
+	/* Fill FIFO_PARTITION high bits (Channels 2..3, 6..7) */
+	fifo_partition |= value << 32;
+
+	/* Program FIFO Partition registers - 64 bytes per channel */
+	idma32_writeq(dw, FIFO_PARTITION1, fifo_partition);
+	idma32_writeq(dw, FIFO_PARTITION0, fifo_partition);
+}
+
+static void idma32_disable(struct dw_dma *dw)
+{
+	do_dw_dma_off(dw);
+	idma32_fifo_partition(dw);
+}
+
+static void idma32_enable(struct dw_dma *dw)
+{
+	idma32_fifo_partition(dw);
+	do_dw_dma_on(dw);
+}
+
+int idma32_dma_probe(struct dw_dma_chip *chip)
+{
+	struct dw_dma *dw;
+
+	dw = devm_kzalloc(chip->dev, sizeof(*dw), GFP_KERNEL);
+	if (!dw)
+		return -ENOMEM;
+
+	/* Channel operations */
+	dw->initialize_chan = idma32_initialize_chan;
+	dw->suspend_chan = idma32_suspend_chan;
+	dw->encode_maxburst = idma32_encode_maxburst;
+	dw->bytes2block = idma32_bytes2block;
+	dw->block2bytes = idma32_block2bytes;
+
+	/* Device operations */
+	dw->set_device_name = idma32_set_device_name;
+	dw->disable = idma32_disable;
+	dw->enable = idma32_enable;
+
+	chip->dw = dw;
+	return do_dma_probe(chip);
+}
+EXPORT_SYMBOL_GPL(idma32_dma_probe);
+
+int idma32_dma_remove(struct dw_dma_chip *chip)
+{
+	return do_dma_remove(chip);
+}
+EXPORT_SYMBOL_GPL(idma32_dma_remove);
diff --git a/drivers/dma/dw/internal.h b/drivers/dma/dw/internal.h
index 41439732ff6b..fdcac21ea665 100644
--- a/drivers/dma/dw/internal.h
+++ b/drivers/dma/dw/internal.h
@@ -15,8 +15,14 @@
 
 #include "regs.h"
 
-int dw_dma_disable(struct dw_dma_chip *chip);
-int dw_dma_enable(struct dw_dma_chip *chip);
+int do_dma_probe(struct dw_dma_chip *chip);
+int do_dma_remove(struct dw_dma_chip *chip);
+
+void do_dw_dma_on(struct dw_dma *dw);
+void do_dw_dma_off(struct dw_dma *dw);
+
+int do_dw_dma_disable(struct dw_dma_chip *chip);
+int do_dw_dma_enable(struct dw_dma_chip *chip);
 
 extern bool dw_dma_filter(struct dma_chan *chan, void *param);
 
diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c
index 66d98d7ccad0..e9ba25b4f950 100644
--- a/drivers/dma/dw/pci.c
+++ b/drivers/dma/dw/pci.c
@@ -15,9 +15,17 @@
 
 #include "internal.h"
 
-static struct dw_dma_platform_data mrfld_pdata = {
+struct dw_dma_pci_data {
+	const struct dw_dma_platform_data *pdata;
+	int (*probe)(struct dw_dma_chip *chip);
+};
+
+static const struct dw_dma_pci_data dw_pci_data = {
+	.probe = dw_dma_probe,
+};
+
+static const struct dw_dma_platform_data idma32_pdata = {
 	.nr_channels = 8,
-	.is_idma32 = true,
 	.chan_allocation_order = CHAN_ALLOCATION_ASCENDING,
 	.chan_priority = CHAN_PRIORITY_ASCENDING,
 	.block_size = 131071,
@@ -26,9 +34,14 @@ static struct dw_dma_platform_data mrfld_pdata = {
 	.multi_block = {1, 1, 1, 1, 1, 1, 1, 1},
 };
 
+static const struct dw_dma_pci_data idma32_pci_data = {
+	.pdata = &idma32_pdata,
+	.probe = idma32_dma_probe,
+};
+
 static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid)
 {
-	const struct dw_dma_platform_data *pdata = (void *)pid->driver_data;
+	const struct dw_dma_pci_data *data = (void *)pid->driver_data;
 	struct dw_dma_chip *chip;
 	int ret;
 
@@ -61,9 +74,9 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid)
 	chip->id = pdev->devfn;
 	chip->regs = pcim_iomap_table(pdev)[0];
 	chip->irq = pdev->irq;
-	chip->pdata = pdata;
+	chip->pdata = data->pdata;
 
-	ret = dw_dma_probe(chip);
+	ret = data->probe(chip);
 	if (ret)
 		return ret;
 
@@ -89,7 +102,7 @@ static int dw_pci_suspend_late(struct device *dev)
 	struct pci_dev *pci = to_pci_dev(dev);
 	struct dw_dma_chip *chip = pci_get_drvdata(pci);
 
-	return dw_dma_disable(chip);
+	return do_dw_dma_disable(chip);
 };
 
 static int dw_pci_resume_early(struct device *dev)
@@ -97,7 +110,7 @@ static int dw_pci_resume_early(struct device *dev)
 	struct pci_dev *pci = to_pci_dev(dev);
 	struct dw_dma_chip *chip = pci_get_drvdata(pci);
 
-	return dw_dma_enable(chip);
+	return do_dw_dma_enable(chip);
 };
 
 #endif /* CONFIG_PM_SLEEP */
@@ -108,24 +121,24 @@ static const struct dev_pm_ops dw_pci_dev_pm_ops = {
 
 static const struct pci_device_id dw_pci_id_table[] = {
 	/* Medfield (GPDMA) */
-	{ PCI_VDEVICE(INTEL, 0x0827) },
+	{ PCI_VDEVICE(INTEL, 0x0827), (kernel_ulong_t)&dw_pci_data },
 
 	/* BayTrail */
-	{ PCI_VDEVICE(INTEL, 0x0f06) },
-	{ PCI_VDEVICE(INTEL, 0x0f40) },
+	{ PCI_VDEVICE(INTEL, 0x0f06), (kernel_ulong_t)&dw_pci_data },
+	{ PCI_VDEVICE(INTEL, 0x0f40), (kernel_ulong_t)&dw_pci_data },
 
-	/* Merrifield iDMA 32-bit (GPDMA) */
-	{ PCI_VDEVICE(INTEL, 0x11a2), (kernel_ulong_t)&mrfld_pdata },
+	/* Merrifield */
+	{ PCI_VDEVICE(INTEL, 0x11a2), (kernel_ulong_t)&idma32_pci_data },
 
 	/* Braswell */
-	{ PCI_VDEVICE(INTEL, 0x2286) },
-	{ PCI_VDEVICE(INTEL, 0x22c0) },
+	{ PCI_VDEVICE(INTEL, 0x2286), (kernel_ulong_t)&dw_pci_data },
+	{ PCI_VDEVICE(INTEL, 0x22c0), (kernel_ulong_t)&dw_pci_data },
 
 	/* Haswell */
-	{ PCI_VDEVICE(INTEL, 0x9c60) },
+	{ PCI_VDEVICE(INTEL, 0x9c60), (kernel_ulong_t)&dw_pci_data },
 
 	/* Broadwell */
-	{ PCI_VDEVICE(INTEL, 0x9ce0) },
+	{ PCI_VDEVICE(INTEL, 0x9ce0), (kernel_ulong_t)&dw_pci_data },
 
 	{ }
 };
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index 58fc1ba02a1e..d5196c97e4f4 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -255,7 +255,7 @@ static void dw_shutdown(struct platform_device *pdev)
 	struct dw_dma_chip *chip = platform_get_drvdata(pdev);
 
 	/*
-	 * We have to call dw_dma_disable() to stop any ongoing transfer. On
+	 * We have to call do_dw_dma_disable() to stop any ongoing transfer. On
 	 * some platforms we can't do that since DMA device is powered off.
 	 * Moreover we have no possibility to check if the platform is affected
 	 * or not. That's why we call pm_runtime_get_sync() / pm_runtime_put()
@@ -264,7 +264,7 @@ static void dw_shutdown(struct platform_device *pdev)
 	 * used by the driver.
 	 */
 	pm_runtime_get_sync(chip->dev);
-	dw_dma_disable(chip);
+	do_dw_dma_disable(chip);
 	pm_runtime_put_sync_suspend(chip->dev);
 
 	clk_disable_unprepare(chip->clk);
@@ -294,7 +294,7 @@ static int dw_suspend_late(struct device *dev)
 {
 	struct dw_dma_chip *chip = dev_get_drvdata(dev);
 
-	dw_dma_disable(chip);
+	do_dw_dma_disable(chip);
 	clk_disable_unprepare(chip->clk);
 
 	return 0;
@@ -309,7 +309,7 @@ static int dw_resume_early(struct device *dev)
 	if (ret)
 		return ret;
 
-	return dw_dma_enable(chip);
+	return do_dw_dma_enable(chip);
 }
 
 #endif /* CONFIG_PM_SLEEP */
diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h
index 646c9c960c07..66aa8b227248 100644
--- a/drivers/dma/dw/regs.h
+++ b/drivers/dma/dw/regs.h
@@ -312,6 +312,19 @@ struct dw_dma {
 	u8			all_chan_mask;
 	u8			in_use;
 
+	/* Channel operations */
+	void	(*initialize_chan)(struct dw_dma_chan *dwc);
+	void	(*suspend_chan)(struct dw_dma_chan *dwc, bool drain);
+	void	(*encode_maxburst)(struct dw_dma_chan *dwc, u32 *maxburst);
+	u32	(*bytes2block)(struct dw_dma_chan *dwc, size_t bytes,
+			       unsigned int width, size_t *len);
+	size_t	(*block2bytes)(struct dw_dma_chan *dwc, u32 block, u32 width);
+
+	/* Device operations */
+	void (*set_device_name)(struct dw_dma *dw, int id);
+	void (*disable)(struct dw_dma *dw);
+	void (*enable)(struct dw_dma *dw);
+
 	/* platform data */
 	struct dw_dma_platform_data	*pdata;
 };
diff --git a/include/linux/dma/dw.h b/include/linux/dma/dw.h
index e166cac8e870..d643d331c20e 100644
--- a/include/linux/dma/dw.h
+++ b/include/linux/dma/dw.h
@@ -45,9 +45,13 @@ struct dw_dma_chip {
 #if IS_ENABLED(CONFIG_DW_DMAC_CORE)
 int dw_dma_probe(struct dw_dma_chip *chip);
 int dw_dma_remove(struct dw_dma_chip *chip);
+int idma32_dma_probe(struct dw_dma_chip *chip);
+int idma32_dma_remove(struct dw_dma_chip *chip);
 #else
 static inline int dw_dma_probe(struct dw_dma_chip *chip) { return -ENODEV; }
 static inline int dw_dma_remove(struct dw_dma_chip *chip) { return 0; }
+static inline int idma32_dma_probe(struct dw_dma_chip *chip) { return -ENODEV; }
+static inline int idma32_dma_remove(struct dw_dma_chip *chip) { return 0; }
 #endif /* CONFIG_DW_DMAC_CORE */
 
 #endif /* _DMA_DW_H */
diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
index 1c85eeee4171..576048433809 100644
--- a/include/linux/platform_data/dma-dw.h
+++ b/include/linux/platform_data/dma-dw.h
@@ -38,7 +38,6 @@ struct dw_dma_slave {
 /**
  * struct dw_dma_platform_data - Controller configuration parameters
  * @nr_channels: Number of channels supported by hardware (max 8)
- * @is_idma32: The type of the DMA controller is iDMA32
  * @chan_allocation_order: Allocate channels starting from 0 or 7
  * @chan_priority: Set channel priority increasing from 0 to 7 or 7 to 0.
  * @block_size: Maximum block size supported by the controller
@@ -50,7 +49,6 @@ struct dw_dma_slave {
  */
 struct dw_dma_platform_data {
 	unsigned int	nr_channels;
-	bool		is_idma32;
 #define CHAN_ALLOCATION_ASCENDING	0	/* zero to seven */
 #define CHAN_ALLOCATION_DESCENDING	1	/* seven to zero */
 	unsigned char	chan_allocation_order;
-- 
cgit v1.2.3


From b466a37fbcc99ef79ea59e40ef6aa8391430b0d8 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 7 Jan 2019 13:07:41 +0200
Subject: dmaengine: dw: convert to SPDX identifiers

This patch updates license to use SPDX-License-Identifier
instead of verbose license text.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw/Kconfig               | 2 ++
 drivers/dma/dw/core.c                | 5 +----
 drivers/dma/dw/internal.h            | 5 +----
 drivers/dma/dw/pci.c                 | 5 +----
 drivers/dma/dw/platform.c            | 5 +----
 drivers/dma/dw/regs.h                | 5 +----
 include/linux/dma/dw.h               | 5 +----
 include/linux/platform_data/dma-dw.h | 5 +----
 8 files changed, 9 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/dw/Kconfig b/drivers/dma/dw/Kconfig
index 04b9728c1d26..e5162690de8f 100644
--- a/drivers/dma/dw/Kconfig
+++ b/drivers/dma/dw/Kconfig
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
 #
 # DMA engine configuration for dw
 #
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index b7e4dab28f8a..3a1ab52ffae0 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Core driver for the Synopsys DesignWare DMA Controller
  *
  * Copyright (C) 2007-2008 Atmel Corporation
  * Copyright (C) 2010-2011 ST Microelectronics
  * Copyright (C) 2013 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/bitops.h>
diff --git a/drivers/dma/dw/internal.h b/drivers/dma/dw/internal.h
index fdcac21ea665..1dd7a4e6dd23 100644
--- a/drivers/dma/dw/internal.h
+++ b/drivers/dma/dw/internal.h
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Driver for the Synopsys DesignWare DMA Controller
  *
  * Copyright (C) 2013 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef _DMA_DW_INTERNAL_H
diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c
index e9ba25b4f950..e79a75db0852 100644
--- a/drivers/dma/dw/pci.c
+++ b/drivers/dma/dw/pci.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * PCI driver for the Synopsys DesignWare DMA Controller
  *
  * Copyright (C) 2013 Intel Corporation
  * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/module.h>
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index d5196c97e4f4..382dfd9e9600 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Platform driver for the Synopsys DesignWare DMA Controller
  *
@@ -6,10 +7,6 @@
  * Copyright (C) 2013 Intel Corporation
  *
  * Some parts of this driver are derived from the original dw_dmac.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/module.h>
diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h
index 07f91325e559..3fce66ecee7a 100644
--- a/drivers/dma/dw/regs.h
+++ b/drivers/dma/dw/regs.h
@@ -1,13 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Driver for the Synopsys DesignWare AHB DMA Controller
  *
  * Copyright (C) 2005-2007 Atmel Corporation
  * Copyright (C) 2010-2011 ST Microelectronics
  * Copyright (C) 2016 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/bitops.h>
diff --git a/include/linux/dma/dw.h b/include/linux/dma/dw.h
index d643d331c20e..9752f3745f76 100644
--- a/include/linux/dma/dw.h
+++ b/include/linux/dma/dw.h
@@ -1,13 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Driver for the Synopsys DesignWare DMA Controller
  *
  * Copyright (C) 2007 Atmel Corporation
  * Copyright (C) 2010-2011 ST Microelectronics
  * Copyright (C) 2014 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #ifndef _DMA_DW_H
 #define _DMA_DW_H
diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
index 576048433809..f3eaf9ec00a1 100644
--- a/include/linux/platform_data/dma-dw.h
+++ b/include/linux/platform_data/dma-dw.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Driver for the Synopsys DesignWare DMA Controller
  *
  * Copyright (C) 2007 Atmel Corporation
  * Copyright (C) 2010-2011 ST Microelectronics
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #ifndef _PLATFORM_DATA_DMA_DW_H
 #define _PLATFORM_DATA_DMA_DW_H
-- 
cgit v1.2.3


From 83d0ea237bac5fb6a7509c69a8939e999693d311 Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Fri, 21 Dec 2018 17:02:35 +0100
Subject: dt-bindings: clk: meson: add ao slow clock path ids

Add the CLKIDs for the slow clock generation path

Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Acked-by: Neil Armstrong <narmstrong@baylibre.com>
Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Link: https://lkml.kernel.org/r/20181221160239.26265-2-jbrunet@baylibre.com
---
 include/dt-bindings/clock/axg-aoclkc.h  | 7 ++++++-
 include/dt-bindings/clock/gxbb-aoclkc.h | 7 +++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/axg-aoclkc.h b/include/dt-bindings/clock/axg-aoclkc.h
index 61955016a55b..8ec4a269c7a6 100644
--- a/include/dt-bindings/clock/axg-aoclkc.h
+++ b/include/dt-bindings/clock/axg-aoclkc.h
@@ -21,6 +21,11 @@
 #define CLKID_AO_SAR_ADC_SEL	8
 #define CLKID_AO_SAR_ADC_DIV	9
 #define CLKID_AO_SAR_ADC_CLK	10
-#define CLKID_AO_ALT_XTAL	11
+#define CLKID_AO_CTS_OSCIN	11
+#define CLKID_AO_32K_PRE	12
+#define CLKID_AO_32K_DIV	13
+#define CLKID_AO_32K_SEL	14
+#define CLKID_AO_32K		15
+#define CLKID_AO_CTS_RTC_OSCIN	16
 
 #endif
diff --git a/include/dt-bindings/clock/gxbb-aoclkc.h b/include/dt-bindings/clock/gxbb-aoclkc.h
index 9d15e2221fdb..ec3b26319fc4 100644
--- a/include/dt-bindings/clock/gxbb-aoclkc.h
+++ b/include/dt-bindings/clock/gxbb-aoclkc.h
@@ -63,5 +63,12 @@
 #define CLKID_AO_UART2		4
 #define CLKID_AO_IR_BLASTER	5
 #define CLKID_AO_CEC_32K	6
+#define CLKID_AO_CTS_OSCIN	7
+#define CLKID_AO_32K_PRE	8
+#define CLKID_AO_32K_DIV	9
+#define CLKID_AO_32K_SEL	10
+#define CLKID_AO_32K		11
+#define CLKID_AO_CTS_RTC_OSCIN	12
+#define CLKID_AO_CLK81		13
 
 #endif
-- 
cgit v1.2.3


From e2d8ffe2e76028457759988ba6216fd13eeea01b Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Tue, 13 Nov 2018 03:52:18 -0500
Subject: media: v4l2-mem2mem: add v4l2_m2m_buf_copy_data helper function

Memory-to-memory devices should copy various parts of
struct v4l2_buffer from the output buffer to the capture buffer.

Add a helper function that does that to simplify the driver code.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Reviewed-by: Alexandre Courbot <acourbot@chromium.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/v4l2-core/v4l2-mem2mem.c | 20 ++++++++++++++++++++
 include/media/v4l2-mem2mem.h           | 20 ++++++++++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index 5bbdec55b7d7..631f4e2aa942 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -975,6 +975,26 @@ void v4l2_m2m_buf_queue(struct v4l2_m2m_ctx *m2m_ctx,
 }
 EXPORT_SYMBOL_GPL(v4l2_m2m_buf_queue);
 
+void v4l2_m2m_buf_copy_data(const struct vb2_v4l2_buffer *out_vb,
+			    struct vb2_v4l2_buffer *cap_vb,
+			    bool copy_frame_flags)
+{
+	u32 mask = V4L2_BUF_FLAG_TIMECODE | V4L2_BUF_FLAG_TSTAMP_SRC_MASK;
+
+	if (copy_frame_flags)
+		mask |= V4L2_BUF_FLAG_KEYFRAME | V4L2_BUF_FLAG_PFRAME |
+			V4L2_BUF_FLAG_BFRAME;
+
+	cap_vb->vb2_buf.timestamp = out_vb->vb2_buf.timestamp;
+
+	if (out_vb->flags & V4L2_BUF_FLAG_TIMECODE)
+		cap_vb->timecode = out_vb->timecode;
+	cap_vb->field = out_vb->field;
+	cap_vb->flags &= ~mask;
+	cap_vb->flags |= out_vb->flags & mask;
+}
+EXPORT_SYMBOL_GPL(v4l2_m2m_buf_copy_data);
+
 void v4l2_m2m_request_queue(struct media_request *req)
 {
 	struct media_request_object *obj, *obj_safe;
diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h
index 5467264771ec..43e447dcf69d 100644
--- a/include/media/v4l2-mem2mem.h
+++ b/include/media/v4l2-mem2mem.h
@@ -622,6 +622,26 @@ v4l2_m2m_dst_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx)
 	return v4l2_m2m_buf_remove_by_idx(&m2m_ctx->cap_q_ctx, idx);
 }
 
+/**
+ * v4l2_m2m_buf_copy_data() - copy buffer data from the output buffer to the
+ * capture buffer
+ *
+ * @out_vb: the output buffer that is the source of the data.
+ * @cap_vb: the capture buffer that will receive the data.
+ * @copy_frame_flags: copy the KEY/B/PFRAME flags as well.
+ *
+ * This helper function copies the timestamp, timecode (if the TIMECODE
+ * buffer flag was set), field and the TIMECODE, KEYFRAME, BFRAME, PFRAME
+ * and TSTAMP_SRC_MASK flags from @out_vb to @cap_vb.
+ *
+ * If @copy_frame_flags is false, then the KEYFRAME, BFRAME and PFRAME
+ * flags are not copied. This is typically needed for encoders that
+ * set this bits explicitly.
+ */
+void v4l2_m2m_buf_copy_data(const struct vb2_v4l2_buffer *out_vb,
+			    struct vb2_v4l2_buffer *cap_vb,
+			    bool copy_frame_flags);
+
 /* v4l2 request helper */
 
 void v4l2_m2m_request_queue(struct media_request *req);
-- 
cgit v1.2.3


From c2eb8effb265ac5cdd960d8e61ecb931e9c767cd Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Wed, 24 Oct 2018 06:50:34 -0400
Subject: media: videodev2.h: add v4l2_timeval_to_ns inline function

We want to be able to uniquely identify buffers for stateless
codecs. The internal timestamp (a u64) as stored internally in the
kernel is a suitable candidate for that, but in struct v4l2_buffer
it is represented as a struct timeval.

Add a v4l2_timeval_to_ns() function that converts the struct timeval
into a u64 in the same way that the kernel does. This makes it possible
to use this u64 elsewhere as a unique identifier of the buffer.

Since timestamps are also copied from the output buffer to the
corresponding capture buffer(s) by M2M devices, the u64 can be
used to refer to both output and capture buffers.

The plan is that in the future we redesign struct v4l2_buffer and use
u64 for the timestamp instead of a struct timeval (which has lots of
problems with 32 vs 64 bit and y2038 layout changes), and then there
is no more need to use this function.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/videodev2.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index b5671ce2724f..d6eed479c3a6 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -973,6 +973,18 @@ struct v4l2_buffer {
 	};
 };
 
+/**
+ * v4l2_timeval_to_ns - Convert timeval to nanoseconds
+ * @ts:		pointer to the timeval variable to be converted
+ *
+ * Returns the scalar nanosecond representation of the timeval
+ * parameter.
+ */
+static inline __u64 v4l2_timeval_to_ns(const struct timeval *tv)
+{
+	return (__u64)tv->tv_sec * 1000000000ULL + tv->tv_usec * 1000;
+}
+
 /*  Flags for 'flags' field */
 /* Buffer is mapped (flag) */
 #define V4L2_BUF_FLAG_MAPPED			0x00000001
-- 
cgit v1.2.3


From 245ede423b43a6e081e94e0e5d4e895bd1f31228 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Wed, 24 Oct 2018 06:51:01 -0400
Subject: media: vb2: add vb2_find_timestamp()

Use v4l2_timeval_to_ns instead of timeval_to_ns to ensure that
both kernelspace and userspace will use the same conversion
function.

Next add a new vb2_find_timestamp() function to find buffers
with a specific timestamp.

This function will only look at DEQUEUED and DONE buffers, i.e.
buffers that are already processed.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/common/videobuf2/videobuf2-v4l2.c | 19 ++++++++++++++++++-
 include/media/videobuf2-v4l2.h                  | 17 +++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index 3a0ca2f9854f..75ea90e795d8 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -143,7 +143,7 @@ static void __copy_timestamp(struct vb2_buffer *vb, const void *pb)
 		 * and the timecode field and flag if needed.
 		 */
 		if (q->copy_timestamp)
-			vb->timestamp = timeval_to_ns(&b->timestamp);
+			vb->timestamp = v4l2_timeval_to_ns(&b->timestamp);
 		vbuf->flags |= b->flags & V4L2_BUF_FLAG_TIMECODE;
 		if (b->flags & V4L2_BUF_FLAG_TIMECODE)
 			vbuf->timecode = b->timecode;
@@ -589,6 +589,23 @@ static const struct vb2_buf_ops v4l2_buf_ops = {
 	.copy_timestamp		= __copy_timestamp,
 };
 
+int vb2_find_timestamp(const struct vb2_queue *q, u64 timestamp,
+		       unsigned int start_idx)
+{
+	unsigned int i;
+
+	for (i = start_idx; i < q->num_buffers; i++) {
+		struct vb2_buffer *vb = q->bufs[i];
+
+		if ((vb->state == VB2_BUF_STATE_DEQUEUED ||
+		     vb->state == VB2_BUF_STATE_DONE) &&
+		    vb->timestamp == timestamp)
+			return i;
+	}
+	return -1;
+}
+EXPORT_SYMBOL_GPL(vb2_find_timestamp);
+
 /*
  * vb2_querybuf() - query video buffer information
  * @q:		videobuf queue
diff --git a/include/media/videobuf2-v4l2.h b/include/media/videobuf2-v4l2.h
index 727855463838..a9961bc776dc 100644
--- a/include/media/videobuf2-v4l2.h
+++ b/include/media/videobuf2-v4l2.h
@@ -55,6 +55,23 @@ struct vb2_v4l2_buffer {
 #define to_vb2_v4l2_buffer(vb) \
 	container_of(vb, struct vb2_v4l2_buffer, vb2_buf)
 
+/**
+ * vb2_find_timestamp() - Find buffer with given timestamp in the queue
+ *
+ * @q:		pointer to &struct vb2_queue with videobuf2 queue.
+ * @timestamp:	the timestamp to find. Only buffers in state DEQUEUED or DONE
+ *		are considered.
+ * @start_idx:	the start index (usually 0) in the buffer array to start
+ *		searching from. Note that there may be multiple buffers
+ *		with the same timestamp value, so you can restart the search
+ *		by setting @start_idx to the previously found index + 1.
+ *
+ * Returns the buffer index of the buffer with the given @timestamp, or
+ * -1 if no buffer with @timestamp was found.
+ */
+int vb2_find_timestamp(const struct vb2_queue *q, u64 timestamp,
+		       unsigned int start_idx);
+
 int vb2_querybuf(struct vb2_queue *q, struct v4l2_buffer *b);
 
 /**
-- 
cgit v1.2.3


From d998e03e322fc497454f584adfc35743713a44c5 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Fri, 9 Nov 2018 04:16:21 -0500
Subject: media: cedrus: identify buffers by timestamp

Use the new v4l2_m2m_buf_copy_data helper function and use
timestamps to refer to reference frames instead of using
buffer indices.

Also remove the padding fields in the structs, that's a bad
idea. Just use the right types to keep everything aligned.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Tested-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ctrls.c              |  9 ---------
 drivers/staging/media/sunxi/cedrus/cedrus.h       |  9 ++++++---
 drivers/staging/media/sunxi/cedrus/cedrus_dec.c   |  2 ++
 drivers/staging/media/sunxi/cedrus/cedrus_mpeg2.c | 23 +++++++++++------------
 include/media/mpeg2-ctrls.h                       | 14 +++++---------
 5 files changed, 24 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 5e3806feb5d7..e3bd441fa29a 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -1661,15 +1661,6 @@ static int std_validate(const struct v4l2_ctrl *ctrl, u32 idx,
 			return -EINVAL;
 		}
 
-		if (p_mpeg2_slice_params->backward_ref_index >= VIDEO_MAX_FRAME ||
-		    p_mpeg2_slice_params->forward_ref_index >= VIDEO_MAX_FRAME)
-			return -EINVAL;
-
-		if (p_mpeg2_slice_params->pad ||
-		    p_mpeg2_slice_params->picture.pad ||
-		    p_mpeg2_slice_params->sequence.pad)
-			return -EINVAL;
-
 		return 0;
 
 	case V4L2_CTRL_TYPE_MPEG2_QUANTIZATION:
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus.h b/drivers/staging/media/sunxi/cedrus/cedrus.h
index 3acfdcf83691..4aedd24a9848 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus.h
+++ b/drivers/staging/media/sunxi/cedrus/cedrus.h
@@ -140,11 +140,14 @@ static inline dma_addr_t cedrus_buf_addr(struct vb2_buffer *buf,
 }
 
 static inline dma_addr_t cedrus_dst_buf_addr(struct cedrus_ctx *ctx,
-					     unsigned int index,
-					     unsigned int plane)
+					     int index, unsigned int plane)
 {
-	struct vb2_buffer *buf = ctx->dst_bufs[index];
+	struct vb2_buffer *buf;
 
+	if (index < 0)
+		return 0;
+
+	buf = ctx->dst_bufs[index];
 	return buf ? cedrus_buf_addr(buf, &ctx->dst_fmt, plane) : 0;
 }
 
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
index 591d191d4286..443fb037e1cf 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
@@ -50,6 +50,8 @@ void cedrus_device_run(void *priv)
 		break;
 	}
 
+	v4l2_m2m_buf_copy_data(run.src, run.dst, true);
+
 	dev->dec_ops[ctx->current_codec]->setup(ctx, &run);
 
 	/* Complete request(s) controls if needed. */
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_mpeg2.c b/drivers/staging/media/sunxi/cedrus/cedrus_mpeg2.c
index 9abd39cae38c..cb45fda9aaeb 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_mpeg2.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_mpeg2.c
@@ -82,7 +82,10 @@ static void cedrus_mpeg2_setup(struct cedrus_ctx *ctx, struct cedrus_run *run)
 	dma_addr_t fwd_luma_addr, fwd_chroma_addr;
 	dma_addr_t bwd_luma_addr, bwd_chroma_addr;
 	struct cedrus_dev *dev = ctx->dev;
+	struct vb2_queue *cap_q = &ctx->fh.m2m_ctx->cap_q_ctx.q;
 	const u8 *matrix;
+	int forward_idx;
+	int backward_idx;
 	unsigned int i;
 	u32 reg;
 
@@ -156,23 +159,19 @@ static void cedrus_mpeg2_setup(struct cedrus_ctx *ctx, struct cedrus_run *run)
 	cedrus_write(dev, VE_DEC_MPEG_PICBOUNDSIZE, reg);
 
 	/* Forward and backward prediction reference buffers. */
+	forward_idx = vb2_find_timestamp(cap_q,
+					 slice_params->forward_ref_ts, 0);
 
-	fwd_luma_addr = cedrus_dst_buf_addr(ctx,
-					    slice_params->forward_ref_index,
-					    0);
-	fwd_chroma_addr = cedrus_dst_buf_addr(ctx,
-					      slice_params->forward_ref_index,
-					      1);
+	fwd_luma_addr = cedrus_dst_buf_addr(ctx, forward_idx, 0);
+	fwd_chroma_addr = cedrus_dst_buf_addr(ctx, forward_idx, 1);
 
 	cedrus_write(dev, VE_DEC_MPEG_FWD_REF_LUMA_ADDR, fwd_luma_addr);
 	cedrus_write(dev, VE_DEC_MPEG_FWD_REF_CHROMA_ADDR, fwd_chroma_addr);
 
-	bwd_luma_addr = cedrus_dst_buf_addr(ctx,
-					    slice_params->backward_ref_index,
-					    0);
-	bwd_chroma_addr = cedrus_dst_buf_addr(ctx,
-					      slice_params->backward_ref_index,
-					      1);
+	backward_idx = vb2_find_timestamp(cap_q,
+					  slice_params->backward_ref_ts, 0);
+	bwd_luma_addr = cedrus_dst_buf_addr(ctx, backward_idx, 0);
+	bwd_chroma_addr = cedrus_dst_buf_addr(ctx, backward_idx, 1);
 
 	cedrus_write(dev, VE_DEC_MPEG_BWD_REF_LUMA_ADDR, bwd_luma_addr);
 	cedrus_write(dev, VE_DEC_MPEG_BWD_REF_CHROMA_ADDR, bwd_chroma_addr);
diff --git a/include/media/mpeg2-ctrls.h b/include/media/mpeg2-ctrls.h
index d21f40edc09e..6601455b3d5e 100644
--- a/include/media/mpeg2-ctrls.h
+++ b/include/media/mpeg2-ctrls.h
@@ -30,10 +30,9 @@ struct v4l2_mpeg2_sequence {
 	__u32	vbv_buffer_size;
 
 	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */
-	__u8	profile_and_level_indication;
+	__u16	profile_and_level_indication;
 	__u8	progressive_sequence;
 	__u8	chroma_format;
-	__u8	pad;
 };
 
 struct v4l2_mpeg2_picture {
@@ -51,23 +50,20 @@ struct v4l2_mpeg2_picture {
 	__u8	intra_vlc_format;
 	__u8	alternate_scan;
 	__u8	repeat_first_field;
-	__u8	progressive_frame;
-	__u8	pad;
+	__u16	progressive_frame;
 };
 
 struct v4l2_ctrl_mpeg2_slice_params {
 	__u32	bit_size;
 	__u32	data_bit_offset;
+	__u64	backward_ref_ts;
+	__u64	forward_ref_ts;
 
 	struct v4l2_mpeg2_sequence sequence;
 	struct v4l2_mpeg2_picture picture;
 
 	/* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */
-	__u8	quantiser_scale_code;
-
-	__u8	backward_ref_index;
-	__u8	forward_ref_index;
-	__u8	pad;
+	__u32	quantiser_scale_code;
 };
 
 struct v4l2_ctrl_mpeg2_quantization {
-- 
cgit v1.2.3


From d0dcde6426ce071ad447fb9d91c85ab649026114 Mon Sep 17 00:00:00 2001
From: Otto Sabart <ottosabart@seberm.com>
Date: Sun, 6 Jan 2019 00:29:15 +0100
Subject: doc: networking: convert offload files into RST and update references

This patch renames offload files. This is necessary for Sphinx.

Also update reference to checksum-offloads.rst file.

Whole kernel code was grepped for references using:
$ grep -r "\(segmentation\|checksum\)-offloads.txt" .

There should be no other references
to {segmentation,checksum}-offloads.txt files.

Signed-off-by: Otto Sabart <ottosabart@seberm.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/networking/checksum-offloads.rst     | 143 ++++++++++++++++
 Documentation/networking/checksum-offloads.txt     | 143 ----------------
 Documentation/networking/segmentation-offloads.rst | 184 +++++++++++++++++++++
 Documentation/networking/segmentation-offloads.txt | 184 ---------------------
 include/linux/skbuff.h                             |   2 +-
 5 files changed, 328 insertions(+), 328 deletions(-)
 create mode 100644 Documentation/networking/checksum-offloads.rst
 delete mode 100644 Documentation/networking/checksum-offloads.txt
 create mode 100644 Documentation/networking/segmentation-offloads.rst
 delete mode 100644 Documentation/networking/segmentation-offloads.txt

(limited to 'include')

diff --git a/Documentation/networking/checksum-offloads.rst b/Documentation/networking/checksum-offloads.rst
new file mode 100644
index 000000000000..1a1cd94a3f6d
--- /dev/null
+++ b/Documentation/networking/checksum-offloads.rst
@@ -0,0 +1,143 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============================================
+Checksum Offloads in the Linux Networking Stack
+===============================================
+
+
+Introduction
+============
+
+This document describes a set of techniques in the Linux networking stack to
+take advantage of checksum offload capabilities of various NICs.
+
+The following technologies are described:
+
+* TX Checksum Offload
+* LCO: Local Checksum Offload
+* RCO: Remote Checksum Offload
+
+Things that should be documented here but aren't yet:
+
+* RX Checksum Offload
+* CHECKSUM_UNNECESSARY conversion
+
+
+TX Checksum Offload
+===================
+
+The interface for offloading a transmit checksum to a device is explained in
+detail in comments near the top of include/linux/skbuff.h.
+
+In brief, it allows to request the device fill in a single ones-complement
+checksum defined by the sk_buff fields skb->csum_start and skb->csum_offset.
+The device should compute the 16-bit ones-complement checksum (i.e. the
+'IP-style' checksum) from csum_start to the end of the packet, and fill in the
+result at (csum_start + csum_offset).
+
+Because csum_offset cannot be negative, this ensures that the previous value of
+the checksum field is included in the checksum computation, thus it can be used
+to supply any needed corrections to the checksum (such as the sum of the
+pseudo-header for UDP or TCP).
+
+This interface only allows a single checksum to be offloaded.  Where
+encapsulation is used, the packet may have multiple checksum fields in
+different header layers, and the rest will have to be handled by another
+mechanism such as LCO or RCO.
+
+CRC32c can also be offloaded using this interface, by means of filling
+skb->csum_start and skb->csum_offset as described above, and setting
+skb->csum_not_inet: see skbuff.h comment (section 'D') for more details.
+
+No offloading of the IP header checksum is performed; it is always done in
+software.  This is OK because when we build the IP header, we obviously have it
+in cache, so summing it isn't expensive.  It's also rather short.
+
+The requirements for GSO are more complicated, because when segmenting an
+encapsulated packet both the inner and outer checksums may need to be edited or
+recomputed for each resulting segment.  See the skbuff.h comment (section 'E')
+for more details.
+
+A driver declares its offload capabilities in netdev->hw_features; see
+Documentation/networking/netdev-features.txt for more.  Note that a device
+which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start and
+csum_offset given in the SKB; if it tries to deduce these itself in hardware
+(as some NICs do) the driver should check that the values in the SKB match
+those which the hardware will deduce, and if not, fall back to checksumming in
+software instead (with skb_csum_hwoffload_help() or one of the
+skb_checksum_help() / skb_crc32c_csum_help functions, as mentioned in
+include/linux/skbuff.h).
+
+The stack should, for the most part, assume that checksum offload is supported
+by the underlying device.  The only place that should check is
+validate_xmit_skb(), and the functions it calls directly or indirectly.  That
+function compares the offload features requested by the SKB (which may include
+other offloads besides TX Checksum Offload) and, if they are not supported or
+enabled on the device (determined by netdev->features), performs the
+corresponding offload in software.  In the case of TX Checksum Offload, that
+means calling skb_csum_hwoffload_help(skb, features).
+
+
+LCO: Local Checksum Offload
+===========================
+
+LCO is a technique for efficiently computing the outer checksum of an
+encapsulated datagram when the inner checksum is due to be offloaded.
+
+The ones-complement sum of a correctly checksummed TCP or UDP packet is equal
+to the complement of the sum of the pseudo header, because everything else gets
+'cancelled out' by the checksum field.  This is because the sum was
+complemented before being written to the checksum field.
+
+More generally, this holds in any case where the 'IP-style' ones complement
+checksum is used, and thus any checksum that TX Checksum Offload supports.
+
+That is, if we have set up TX Checksum Offload with a start/offset pair, we
+know that after the device has filled in that checksum, the ones complement sum
+from csum_start to the end of the packet will be equal to the complement of
+whatever value we put in the checksum field beforehand.  This allows us to
+compute the outer checksum without looking at the payload: we simply stop
+summing when we get to csum_start, then add the complement of the 16-bit word
+at (csum_start + csum_offset).
+
+Then, when the true inner checksum is filled in (either by hardware or by
+skb_checksum_help()), the outer checksum will become correct by virtue of the
+arithmetic.
+
+LCO is performed by the stack when constructing an outer UDP header for an
+encapsulation such as VXLAN or GENEVE, in udp_set_csum().  Similarly for the
+IPv6 equivalents, in udp6_set_csum().
+
+It is also performed when constructing an IPv4 GRE header, in
+net/ipv4/ip_gre.c:build_header().  It is *not* currently performed when
+constructing an IPv6 GRE header; the GRE checksum is computed over the whole
+packet in net/ipv6/ip6_gre.c:ip6gre_xmit2(), but it should be possible to use
+LCO here as IPv6 GRE still uses an IP-style checksum.
+
+All of the LCO implementations use a helper function lco_csum(), in
+include/linux/skbuff.h.
+
+LCO can safely be used for nested encapsulations; in this case, the outer
+encapsulation layer will sum over both its own header and the 'middle' header.
+This does mean that the 'middle' header will get summed multiple times, but
+there doesn't seem to be a way to avoid that without incurring bigger costs
+(e.g. in SKB bloat).
+
+
+RCO: Remote Checksum Offload
+============================
+
+RCO is a technique for eliding the inner checksum of an encapsulated datagram,
+allowing the outer checksum to be offloaded.  It does, however, involve a
+change to the encapsulation protocols, which the receiver must also support.
+For this reason, it is disabled by default.
+
+RCO is detailed in the following Internet-Drafts:
+
+* https://tools.ietf.org/html/draft-herbert-remotecsumoffload-00
+* https://tools.ietf.org/html/draft-herbert-vxlan-rco-00
+
+In Linux, RCO is implemented individually in each encapsulation protocol, and
+most tunnel types have flags controlling its use.  For instance, VXLAN has the
+flag VXLAN_F_REMCSUM_TX (per struct vxlan_rdst) to indicate that RCO should be
+used when transmitting to a given remote destination.
diff --git a/Documentation/networking/checksum-offloads.txt b/Documentation/networking/checksum-offloads.txt
deleted file mode 100644
index 1a1cd94a3f6d..000000000000
--- a/Documentation/networking/checksum-offloads.txt
+++ /dev/null
@@ -1,143 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============================================
-Checksum Offloads in the Linux Networking Stack
-===============================================
-
-
-Introduction
-============
-
-This document describes a set of techniques in the Linux networking stack to
-take advantage of checksum offload capabilities of various NICs.
-
-The following technologies are described:
-
-* TX Checksum Offload
-* LCO: Local Checksum Offload
-* RCO: Remote Checksum Offload
-
-Things that should be documented here but aren't yet:
-
-* RX Checksum Offload
-* CHECKSUM_UNNECESSARY conversion
-
-
-TX Checksum Offload
-===================
-
-The interface for offloading a transmit checksum to a device is explained in
-detail in comments near the top of include/linux/skbuff.h.
-
-In brief, it allows to request the device fill in a single ones-complement
-checksum defined by the sk_buff fields skb->csum_start and skb->csum_offset.
-The device should compute the 16-bit ones-complement checksum (i.e. the
-'IP-style' checksum) from csum_start to the end of the packet, and fill in the
-result at (csum_start + csum_offset).
-
-Because csum_offset cannot be negative, this ensures that the previous value of
-the checksum field is included in the checksum computation, thus it can be used
-to supply any needed corrections to the checksum (such as the sum of the
-pseudo-header for UDP or TCP).
-
-This interface only allows a single checksum to be offloaded.  Where
-encapsulation is used, the packet may have multiple checksum fields in
-different header layers, and the rest will have to be handled by another
-mechanism such as LCO or RCO.
-
-CRC32c can also be offloaded using this interface, by means of filling
-skb->csum_start and skb->csum_offset as described above, and setting
-skb->csum_not_inet: see skbuff.h comment (section 'D') for more details.
-
-No offloading of the IP header checksum is performed; it is always done in
-software.  This is OK because when we build the IP header, we obviously have it
-in cache, so summing it isn't expensive.  It's also rather short.
-
-The requirements for GSO are more complicated, because when segmenting an
-encapsulated packet both the inner and outer checksums may need to be edited or
-recomputed for each resulting segment.  See the skbuff.h comment (section 'E')
-for more details.
-
-A driver declares its offload capabilities in netdev->hw_features; see
-Documentation/networking/netdev-features.txt for more.  Note that a device
-which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start and
-csum_offset given in the SKB; if it tries to deduce these itself in hardware
-(as some NICs do) the driver should check that the values in the SKB match
-those which the hardware will deduce, and if not, fall back to checksumming in
-software instead (with skb_csum_hwoffload_help() or one of the
-skb_checksum_help() / skb_crc32c_csum_help functions, as mentioned in
-include/linux/skbuff.h).
-
-The stack should, for the most part, assume that checksum offload is supported
-by the underlying device.  The only place that should check is
-validate_xmit_skb(), and the functions it calls directly or indirectly.  That
-function compares the offload features requested by the SKB (which may include
-other offloads besides TX Checksum Offload) and, if they are not supported or
-enabled on the device (determined by netdev->features), performs the
-corresponding offload in software.  In the case of TX Checksum Offload, that
-means calling skb_csum_hwoffload_help(skb, features).
-
-
-LCO: Local Checksum Offload
-===========================
-
-LCO is a technique for efficiently computing the outer checksum of an
-encapsulated datagram when the inner checksum is due to be offloaded.
-
-The ones-complement sum of a correctly checksummed TCP or UDP packet is equal
-to the complement of the sum of the pseudo header, because everything else gets
-'cancelled out' by the checksum field.  This is because the sum was
-complemented before being written to the checksum field.
-
-More generally, this holds in any case where the 'IP-style' ones complement
-checksum is used, and thus any checksum that TX Checksum Offload supports.
-
-That is, if we have set up TX Checksum Offload with a start/offset pair, we
-know that after the device has filled in that checksum, the ones complement sum
-from csum_start to the end of the packet will be equal to the complement of
-whatever value we put in the checksum field beforehand.  This allows us to
-compute the outer checksum without looking at the payload: we simply stop
-summing when we get to csum_start, then add the complement of the 16-bit word
-at (csum_start + csum_offset).
-
-Then, when the true inner checksum is filled in (either by hardware or by
-skb_checksum_help()), the outer checksum will become correct by virtue of the
-arithmetic.
-
-LCO is performed by the stack when constructing an outer UDP header for an
-encapsulation such as VXLAN or GENEVE, in udp_set_csum().  Similarly for the
-IPv6 equivalents, in udp6_set_csum().
-
-It is also performed when constructing an IPv4 GRE header, in
-net/ipv4/ip_gre.c:build_header().  It is *not* currently performed when
-constructing an IPv6 GRE header; the GRE checksum is computed over the whole
-packet in net/ipv6/ip6_gre.c:ip6gre_xmit2(), but it should be possible to use
-LCO here as IPv6 GRE still uses an IP-style checksum.
-
-All of the LCO implementations use a helper function lco_csum(), in
-include/linux/skbuff.h.
-
-LCO can safely be used for nested encapsulations; in this case, the outer
-encapsulation layer will sum over both its own header and the 'middle' header.
-This does mean that the 'middle' header will get summed multiple times, but
-there doesn't seem to be a way to avoid that without incurring bigger costs
-(e.g. in SKB bloat).
-
-
-RCO: Remote Checksum Offload
-============================
-
-RCO is a technique for eliding the inner checksum of an encapsulated datagram,
-allowing the outer checksum to be offloaded.  It does, however, involve a
-change to the encapsulation protocols, which the receiver must also support.
-For this reason, it is disabled by default.
-
-RCO is detailed in the following Internet-Drafts:
-
-* https://tools.ietf.org/html/draft-herbert-remotecsumoffload-00
-* https://tools.ietf.org/html/draft-herbert-vxlan-rco-00
-
-In Linux, RCO is implemented individually in each encapsulation protocol, and
-most tunnel types have flags controlling its use.  For instance, VXLAN has the
-flag VXLAN_F_REMCSUM_TX (per struct vxlan_rdst) to indicate that RCO should be
-used when transmitting to a given remote destination.
diff --git a/Documentation/networking/segmentation-offloads.rst b/Documentation/networking/segmentation-offloads.rst
new file mode 100644
index 000000000000..1794bfe98196
--- /dev/null
+++ b/Documentation/networking/segmentation-offloads.rst
@@ -0,0 +1,184 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================================
+Segmentation Offloads in the Linux Networking Stack
+===================================================
+
+
+Introduction
+============
+
+This document describes a set of techniques in the Linux networking stack
+to take advantage of segmentation offload capabilities of various NICs.
+
+The following technologies are described:
+ * TCP Segmentation Offload - TSO
+ * UDP Fragmentation Offload - UFO
+ * IPIP, SIT, GRE, and UDP Tunnel Offloads
+ * Generic Segmentation Offload - GSO
+ * Generic Receive Offload - GRO
+ * Partial Generic Segmentation Offload - GSO_PARTIAL
+ * SCTP accelleration with GSO - GSO_BY_FRAGS
+
+
+TCP Segmentation Offload
+========================
+
+TCP segmentation allows a device to segment a single frame into multiple
+frames with a data payload size specified in skb_shinfo()->gso_size.
+When TCP segmentation requested the bit for either SKB_GSO_TCPV4 or
+SKB_GSO_TCPV6 should be set in skb_shinfo()->gso_type and
+skb_shinfo()->gso_size should be set to a non-zero value.
+
+TCP segmentation is dependent on support for the use of partial checksum
+offload.  For this reason TSO is normally disabled if the Tx checksum
+offload for a given device is disabled.
+
+In order to support TCP segmentation offload it is necessary to populate
+the network and transport header offsets of the skbuff so that the device
+drivers will be able determine the offsets of the IP or IPv6 header and the
+TCP header.  In addition as CHECKSUM_PARTIAL is required csum_start should
+also point to the TCP header of the packet.
+
+For IPv4 segmentation we support one of two types in terms of the IP ID.
+The default behavior is to increment the IP ID with every segment.  If the
+GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP
+ID and all segments will use the same IP ID.  If a device has
+NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO
+and we will either increment the IP ID for all frames, or leave it at a
+static value based on driver preference.
+
+
+UDP Fragmentation Offload
+=========================
+
+UDP fragmentation offload allows a device to fragment an oversized UDP
+datagram into multiple IPv4 fragments.  Many of the requirements for UDP
+fragmentation offload are the same as TSO.  However the IPv4 ID for
+fragments should not increment as a single IPv4 datagram is fragmented.
+
+UFO is deprecated: modern kernels will no longer generate UFO skbs, but can
+still receive them from tuntap and similar devices. Offload of UDP-based
+tunnel protocols is still supported.
+
+
+IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads
+========================================================
+
+In addition to the offloads described above it is possible for a frame to
+contain additional headers such as an outer tunnel.  In order to account
+for such instances an additional set of segmentation offload types were
+introduced including SKB_GSO_IPXIP4, SKB_GSO_IPXIP6, SKB_GSO_GRE, and
+SKB_GSO_UDP_TUNNEL.  These extra segmentation types are used to identify
+cases where there are more than just 1 set of headers.  For example in the
+case of IPIP and SIT we should have the network and transport headers moved
+from the standard list of headers to "inner" header offsets.
+
+Currently only two levels of headers are supported.  The convention is to
+refer to the tunnel headers as the outer headers, while the encapsulated
+data is normally referred to as the inner headers.  Below is the list of
+calls to access the given headers:
+
+IPIP/SIT Tunnel::
+
+             Outer                  Inner
+  MAC        skb_mac_header
+  Network    skb_network_header     skb_inner_network_header
+  Transport  skb_transport_header
+
+UDP/GRE Tunnel::
+
+             Outer                  Inner
+  MAC        skb_mac_header         skb_inner_mac_header
+  Network    skb_network_header     skb_inner_network_header
+  Transport  skb_transport_header   skb_inner_transport_header
+
+In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and
+SKB_GSO_UDP_TUNNEL_CSUM.  These two additional tunnel types reflect the
+fact that the outer header also requests to have a non-zero checksum
+included in the outer header.
+
+Finally there is SKB_GSO_TUNNEL_REMCSUM which indicates that a given tunnel
+header has requested a remote checksum offload.  In this case the inner
+headers will be left with a partial checksum and only the outer header
+checksum will be computed.
+
+
+Generic Segmentation Offload
+============================
+
+Generic segmentation offload is a pure software offload that is meant to
+deal with cases where device drivers cannot perform the offloads described
+above.  What occurs in GSO is that a given skbuff will have its data broken
+out over multiple skbuffs that have been resized to match the MSS provided
+via skb_shinfo()->gso_size.
+
+Before enabling any hardware segmentation offload a corresponding software
+offload is required in GSO.  Otherwise it becomes possible for a frame to
+be re-routed between devices and end up being unable to be transmitted.
+
+
+Generic Receive Offload
+=======================
+
+Generic receive offload is the complement to GSO.  Ideally any frame
+assembled by GRO should be segmented to create an identical sequence of
+frames using GSO, and any sequence of frames segmented by GSO should be
+able to be reassembled back to the original by GRO.  The only exception to
+this is IPv4 ID in the case that the DF bit is set for a given IP header.
+If the value of the IPv4 ID is not sequentially incrementing it will be
+altered so that it is when a frame assembled via GRO is segmented via GSO.
+
+
+Partial Generic Segmentation Offload
+====================================
+
+Partial generic segmentation offload is a hybrid between TSO and GSO.  What
+it effectively does is take advantage of certain traits of TCP and tunnels
+so that instead of having to rewrite the packet headers for each segment
+only the inner-most transport header and possibly the outer-most network
+header need to be updated.  This allows devices that do not support tunnel
+offloads or tunnel offloads with checksum to still make use of segmentation.
+
+With the partial offload what occurs is that all headers excluding the
+inner transport header are updated such that they will contain the correct
+values for if the header was simply duplicated.  The one exception to this
+is the outer IPv4 ID field.  It is up to the device drivers to guarantee
+that the IPv4 ID field is incremented in the case that a given header does
+not have the DF bit set.
+
+
+SCTP accelleration with GSO
+===========================
+
+SCTP - despite the lack of hardware support - can still take advantage of
+GSO to pass one large packet through the network stack, rather than
+multiple small packets.
+
+This requires a different approach to other offloads, as SCTP packets
+cannot be just segmented to (P)MTU. Rather, the chunks must be contained in
+IP segments, padding respected. So unlike regular GSO, SCTP can't just
+generate a big skb, set gso_size to the fragmentation point and deliver it
+to IP layer.
+
+Instead, the SCTP protocol layer builds an skb with the segments correctly
+padded and stored as chained skbs, and skb_segment() splits based on those.
+To signal this, gso_size is set to the special value GSO_BY_FRAGS.
+
+Therefore, any code in the core networking stack must be aware of the
+possibility that gso_size will be GSO_BY_FRAGS and handle that case
+appropriately.
+
+There are some helpers to make this easier:
+
+- skb_is_gso(skb) && skb_is_gso_sctp(skb) is the best way to see if
+  an skb is an SCTP GSO skb.
+
+- For size checks, the skb_gso_validate_*_len family of helpers correctly
+  considers GSO_BY_FRAGS.
+
+- For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size
+  will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs.
+
+This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits
+set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE.
diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt
deleted file mode 100644
index 1794bfe98196..000000000000
--- a/Documentation/networking/segmentation-offloads.txt
+++ /dev/null
@@ -1,184 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===================================================
-Segmentation Offloads in the Linux Networking Stack
-===================================================
-
-
-Introduction
-============
-
-This document describes a set of techniques in the Linux networking stack
-to take advantage of segmentation offload capabilities of various NICs.
-
-The following technologies are described:
- * TCP Segmentation Offload - TSO
- * UDP Fragmentation Offload - UFO
- * IPIP, SIT, GRE, and UDP Tunnel Offloads
- * Generic Segmentation Offload - GSO
- * Generic Receive Offload - GRO
- * Partial Generic Segmentation Offload - GSO_PARTIAL
- * SCTP accelleration with GSO - GSO_BY_FRAGS
-
-
-TCP Segmentation Offload
-========================
-
-TCP segmentation allows a device to segment a single frame into multiple
-frames with a data payload size specified in skb_shinfo()->gso_size.
-When TCP segmentation requested the bit for either SKB_GSO_TCPV4 or
-SKB_GSO_TCPV6 should be set in skb_shinfo()->gso_type and
-skb_shinfo()->gso_size should be set to a non-zero value.
-
-TCP segmentation is dependent on support for the use of partial checksum
-offload.  For this reason TSO is normally disabled if the Tx checksum
-offload for a given device is disabled.
-
-In order to support TCP segmentation offload it is necessary to populate
-the network and transport header offsets of the skbuff so that the device
-drivers will be able determine the offsets of the IP or IPv6 header and the
-TCP header.  In addition as CHECKSUM_PARTIAL is required csum_start should
-also point to the TCP header of the packet.
-
-For IPv4 segmentation we support one of two types in terms of the IP ID.
-The default behavior is to increment the IP ID with every segment.  If the
-GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP
-ID and all segments will use the same IP ID.  If a device has
-NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO
-and we will either increment the IP ID for all frames, or leave it at a
-static value based on driver preference.
-
-
-UDP Fragmentation Offload
-=========================
-
-UDP fragmentation offload allows a device to fragment an oversized UDP
-datagram into multiple IPv4 fragments.  Many of the requirements for UDP
-fragmentation offload are the same as TSO.  However the IPv4 ID for
-fragments should not increment as a single IPv4 datagram is fragmented.
-
-UFO is deprecated: modern kernels will no longer generate UFO skbs, but can
-still receive them from tuntap and similar devices. Offload of UDP-based
-tunnel protocols is still supported.
-
-
-IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads
-========================================================
-
-In addition to the offloads described above it is possible for a frame to
-contain additional headers such as an outer tunnel.  In order to account
-for such instances an additional set of segmentation offload types were
-introduced including SKB_GSO_IPXIP4, SKB_GSO_IPXIP6, SKB_GSO_GRE, and
-SKB_GSO_UDP_TUNNEL.  These extra segmentation types are used to identify
-cases where there are more than just 1 set of headers.  For example in the
-case of IPIP and SIT we should have the network and transport headers moved
-from the standard list of headers to "inner" header offsets.
-
-Currently only two levels of headers are supported.  The convention is to
-refer to the tunnel headers as the outer headers, while the encapsulated
-data is normally referred to as the inner headers.  Below is the list of
-calls to access the given headers:
-
-IPIP/SIT Tunnel::
-
-             Outer                  Inner
-  MAC        skb_mac_header
-  Network    skb_network_header     skb_inner_network_header
-  Transport  skb_transport_header
-
-UDP/GRE Tunnel::
-
-             Outer                  Inner
-  MAC        skb_mac_header         skb_inner_mac_header
-  Network    skb_network_header     skb_inner_network_header
-  Transport  skb_transport_header   skb_inner_transport_header
-
-In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and
-SKB_GSO_UDP_TUNNEL_CSUM.  These two additional tunnel types reflect the
-fact that the outer header also requests to have a non-zero checksum
-included in the outer header.
-
-Finally there is SKB_GSO_TUNNEL_REMCSUM which indicates that a given tunnel
-header has requested a remote checksum offload.  In this case the inner
-headers will be left with a partial checksum and only the outer header
-checksum will be computed.
-
-
-Generic Segmentation Offload
-============================
-
-Generic segmentation offload is a pure software offload that is meant to
-deal with cases where device drivers cannot perform the offloads described
-above.  What occurs in GSO is that a given skbuff will have its data broken
-out over multiple skbuffs that have been resized to match the MSS provided
-via skb_shinfo()->gso_size.
-
-Before enabling any hardware segmentation offload a corresponding software
-offload is required in GSO.  Otherwise it becomes possible for a frame to
-be re-routed between devices and end up being unable to be transmitted.
-
-
-Generic Receive Offload
-=======================
-
-Generic receive offload is the complement to GSO.  Ideally any frame
-assembled by GRO should be segmented to create an identical sequence of
-frames using GSO, and any sequence of frames segmented by GSO should be
-able to be reassembled back to the original by GRO.  The only exception to
-this is IPv4 ID in the case that the DF bit is set for a given IP header.
-If the value of the IPv4 ID is not sequentially incrementing it will be
-altered so that it is when a frame assembled via GRO is segmented via GSO.
-
-
-Partial Generic Segmentation Offload
-====================================
-
-Partial generic segmentation offload is a hybrid between TSO and GSO.  What
-it effectively does is take advantage of certain traits of TCP and tunnels
-so that instead of having to rewrite the packet headers for each segment
-only the inner-most transport header and possibly the outer-most network
-header need to be updated.  This allows devices that do not support tunnel
-offloads or tunnel offloads with checksum to still make use of segmentation.
-
-With the partial offload what occurs is that all headers excluding the
-inner transport header are updated such that they will contain the correct
-values for if the header was simply duplicated.  The one exception to this
-is the outer IPv4 ID field.  It is up to the device drivers to guarantee
-that the IPv4 ID field is incremented in the case that a given header does
-not have the DF bit set.
-
-
-SCTP accelleration with GSO
-===========================
-
-SCTP - despite the lack of hardware support - can still take advantage of
-GSO to pass one large packet through the network stack, rather than
-multiple small packets.
-
-This requires a different approach to other offloads, as SCTP packets
-cannot be just segmented to (P)MTU. Rather, the chunks must be contained in
-IP segments, padding respected. So unlike regular GSO, SCTP can't just
-generate a big skb, set gso_size to the fragmentation point and deliver it
-to IP layer.
-
-Instead, the SCTP protocol layer builds an skb with the segments correctly
-padded and stored as chained skbs, and skb_segment() splits based on those.
-To signal this, gso_size is set to the special value GSO_BY_FRAGS.
-
-Therefore, any code in the core networking stack must be aware of the
-possibility that gso_size will be GSO_BY_FRAGS and handle that case
-appropriately.
-
-There are some helpers to make this easier:
-
-- skb_is_gso(skb) && skb_is_gso_sctp(skb) is the best way to see if
-  an skb is an SCTP GSO skb.
-
-- For size checks, the skb_gso_validate_*_len family of helpers correctly
-  considers GSO_BY_FRAGS.
-
-- For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size
-  will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs.
-
-This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits
-set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 93f56fddd92a..4e671b46e767 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4296,7 +4296,7 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb)
 /* Local Checksum Offload.
  * Compute outer checksum based on the assumption that the
  * inner checksum will be offloaded later.
- * See Documentation/networking/checksum-offloads.txt for
+ * See Documentation/networking/checksum-offloads.rst for
  * explanation of how this works.
  * Fill in outer checksum adjustment (e.g. with sum of outer
  * pseudo-header) before calling.
-- 
cgit v1.2.3


From 539cd352239f6acae4a7997bf31edca5c9a69c81 Mon Sep 17 00:00:00 2001
From: Zhikang Zhang <zhangzhikang1@huawei.com>
Date: Thu, 3 Jan 2019 20:06:38 +0800
Subject: f2fs: fix compile warnings: 'struct *' declared inside parameter list

We meet these compile warnings below, which caused by missing declare structs:
struct f2fs_io_info, struct extent, struct f2fs_sb_info.

warning: 'struct f2fs_io_info' declared inside parameter list
warning: 'struct extent_info' declared inside parameter list
warning: 'struct f2fs_sb_info' declared inside parameter list

Signed-off-by: Zhikang Zhang <zhangzhikang1@huawei.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/trace/events/f2fs.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 3ec73f17ee2a..8a28a2b1be74 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -149,6 +149,9 @@ TRACE_DEFINE_ENUM(CP_TRIMMED);
 		{ CP_SPEC_LOG_NUM,	"log type is 2" },		\
 		{ CP_RECOVER_DIR,	"dir needs recovery" })
 
+struct f2fs_sb_info;
+struct f2fs_io_info;
+struct extent_info;
 struct victim_sel_policy;
 struct f2fs_map_blocks;
 
-- 
cgit v1.2.3


From 9ac6cb5fbb1781d120ca0ad29d014d35c9c3f0c4 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Wed, 19 Dec 2018 17:48:17 +0100
Subject: i2c: add suspended flag and accessors for i2c adapters

A few drivers open code the handling of suspended adapters. It could be
handled by the core, though, to ensure generic handling. This patch adds
the flag and accessor functions. The usage of these helpers is optional,
though. See the kerneldoc in this patch. Using the new flag, we now
reject further transfers if the adapter is already marked suspended.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 Documentation/i2c/fault-codes |  4 ++++
 drivers/i2c/i2c-core-base.c   |  3 +++
 include/linux/i2c.h           | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/Documentation/i2c/fault-codes b/Documentation/i2c/fault-codes
index 47c25abb7d52..0cee0fc545b4 100644
--- a/Documentation/i2c/fault-codes
+++ b/Documentation/i2c/fault-codes
@@ -112,6 +112,10 @@ EPROTO
 	case is when the length of an SMBus block data response
 	(from the SMBus slave) is outside the range 1-32 bytes.
 
+ESHUTDOWN
+	Returned when a transfer was requested using an adapter
+	which is already suspended.
+
 ETIMEDOUT
 	This is returned by drivers when an operation took too much
 	time, and was aborted before it completed.
diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index 28460f6a60cc..926ca0a7477f 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -1232,6 +1232,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 	if (!adap->lock_ops)
 		adap->lock_ops = &i2c_adapter_lock_ops;
 
+	adap->locked_flags = 0;
 	rt_mutex_init(&adap->bus_lock);
 	rt_mutex_init(&adap->mux_lock);
 	mutex_init(&adap->userspace_clients_lock);
@@ -1865,6 +1866,8 @@ int __i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 
 	if (WARN_ON(!msgs || num < 1))
 		return -EINVAL;
+	if (WARN_ON(test_bit(I2C_ALF_IS_SUSPENDED, &adap->locked_flags)))
+		return -ESHUTDOWN;
 
 	if (adap->quirks && i2c_check_for_quirks(adap, msgs, num))
 		return -EOPNOTSUPP;
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 65b4eaed1d96..cba59d66c00d 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -680,6 +680,8 @@ struct i2c_adapter {
 	int timeout;			/* in jiffies */
 	int retries;
 	struct device dev;		/* the adapter device */
+	unsigned long locked_flags;	/* owned by the I2C core */
+#define I2C_ALF_IS_SUSPENDED	0
 
 	int nr;
 	char name[48];
@@ -762,6 +764,38 @@ i2c_unlock_bus(struct i2c_adapter *adapter, unsigned int flags)
 	adapter->lock_ops->unlock_bus(adapter, flags);
 }
 
+/**
+ * i2c_mark_adapter_suspended - Report suspended state of the adapter to the core
+ * @adap: Adapter to mark as suspended
+ *
+ * When using this helper to mark an adapter as suspended, the core will reject
+ * further transfers to this adapter. The usage of this helper is optional but
+ * recommended for devices having distinct handlers for system suspend and
+ * runtime suspend. More complex devices are free to implement custom solutions
+ * to reject transfers when suspended.
+ */
+static inline void i2c_mark_adapter_suspended(struct i2c_adapter *adap)
+{
+	i2c_lock_bus(adap, I2C_LOCK_ROOT_ADAPTER);
+	set_bit(I2C_ALF_IS_SUSPENDED, &adap->locked_flags);
+	i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER);
+}
+
+/**
+ * i2c_mark_adapter_resumed - Report resumed state of the adapter to the core
+ * @adap: Adapter to mark as resumed
+ *
+ * When using this helper to mark an adapter as resumed, the core will allow
+ * further transfers to this adapter. See also further notes to
+ * @i2c_mark_adapter_suspended().
+ */
+static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap)
+{
+	i2c_lock_bus(adap, I2C_LOCK_ROOT_ADAPTER);
+	clear_bit(I2C_ALF_IS_SUSPENDED, &adap->locked_flags);
+	i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER);
+}
+
 /*flags for the client struct: */
 #define I2C_CLIENT_PEC		0x04	/* Use Packet Error Checking */
 #define I2C_CLIENT_TEN		0x10	/* we have a ten bit chip address */
-- 
cgit v1.2.3


From 96f87ee1811306d0c8cf94b8c37b0e4f725b01d1 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 8 Jan 2019 16:07:23 +0200
Subject: RDMA: Clean structures from CONFIG_INFINIBAND_ON_DEMAND_PAGING

CONFIG_INFINIBAND_ON_DEMAND_PAGING is used in general structures to
micro-optimize the memory footprint. Remove it, so it will allow us to
simplify various ODP device flows.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 --
 include/rdma/ib_verbs.h              | 2 --
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b06d3b1efea8..93c288a4de2b 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -911,7 +911,6 @@ struct mlx5_ib_dev {
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
 	int				fill_delay;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct ib_odp_caps	odp_caps;
 	u64			odp_max_size;
 	struct mlx5_ib_pf_eq	odp_pf_eq;
@@ -923,7 +922,6 @@ struct mlx5_ib_dev {
 	struct srcu_struct      mr_srcu;
 	u32			null_mkey;
 	struct workqueue_struct *advise_mr_wq;
-#endif
 	struct mlx5_ib_flow_db	*flow_db;
 	/* protect resources needed as part of reset flow */
 	spinlock_t		reset_flow_resource_lock;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index a3ceed3a040a..3ddd199ba602 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1504,12 +1504,10 @@ struct ib_ucontext {
 
 	bool cleanup_retryable;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	void (*invalidate_range)(struct ib_umem_odp *umem_odp,
 				 unsigned long start, unsigned long end);
 	struct mutex per_mm_list_lock;
 	struct list_head per_mm_list;
-#endif
 
 	struct ib_rdmacg_object	cg_obj;
 	/*
-- 
cgit v1.2.3


From 13859d5df418ea535926e2b57c29d5161c522b9d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 8 Jan 2019 16:07:26 +0200
Subject: RDMA/mlx5: Embed into the code flow the ODP config option

Convert various places to more readable code, which embeds
CONFIG_INFINIBAND_ON_DEMAND_PAGING into the code flow.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_cmd.c |  3 ---
 drivers/infiniband/hw/mlx5/main.c    | 41 ++++++++++++++++++------------------
 drivers/infiniband/hw/mlx5/mem.c     |  5 +----
 drivers/infiniband/hw/mlx5/mr.c      | 28 +++++++++++-------------
 include/rdma/ib_umem_odp.h           | 26 +++++++++++------------
 5 files changed, 46 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 549d9eedf62e..d4f1a2ef5015 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -234,14 +234,11 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
 	ucontext->closing = false;
 	ucontext->cleanup_retryable = false;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	mutex_init(&ucontext->per_mm_list_lock);
 	INIT_LIST_HEAD(&ucontext->per_mm_list);
 	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
 		ucontext->invalidate_range = NULL;
 
-#endif
-
 	resp.num_comp_vectors = file->device->num_comp_vectors;
 
 	ret = get_unused_fd_flags(O_CLOEXEC);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 06ee1f0cb22d..11e9783cefcc 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1763,9 +1763,9 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (err)
 		goto out_sys_pages;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
-#endif
+	if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
+		context->ibucontext.invalidate_range =
+			&mlx5_ib_invalidate_range;
 
 	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
 		err = mlx5_ib_devx_create(dev, true);
@@ -1897,12 +1897,10 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
 	struct mlx5_bfreg_info *bfregi;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	/* All umem's must be destroyed before destroying the ucontext. */
 	mutex_lock(&ibcontext->per_mm_list_lock);
 	WARN_ON(!list_empty(&ibcontext->per_mm_list));
 	mutex_unlock(&ibcontext->per_mm_list_lock);
-#endif
 
 	bfregi = &context->bfregi;
 	mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
@@ -5722,11 +5720,11 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
 void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	cleanup_srcu_struct(&dev->mr_srcu);
-	drain_workqueue(dev->advise_mr_wq);
-	destroy_workqueue(dev->advise_mr_wq);
-#endif
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		cleanup_srcu_struct(&dev->mr_srcu);
+		drain_workqueue(dev->advise_mr_wq);
+		destroy_workqueue(dev->advise_mr_wq);
+	}
 	kfree(dev->port);
 }
 
@@ -5779,19 +5777,20 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 	spin_lock_init(&dev->memic.memic_lock);
 	dev->memic.dev = mdev;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
-	if (!dev->advise_mr_wq) {
-		err = -ENOMEM;
-		goto err_mp;
-	}
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		dev->advise_mr_wq =
+			alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
+		if (!dev->advise_mr_wq) {
+			err = -ENOMEM;
+			goto err_mp;
+		}
 
-	err = init_srcu_struct(&dev->mr_srcu);
-	if (err) {
-		destroy_workqueue(dev->advise_mr_wq);
-		goto err_mp;
+		err = init_srcu_struct(&dev->mr_srcu);
+		if (err) {
+			destroy_workqueue(dev->advise_mr_wq);
+			goto err_mp;
+		}
 	}
-#endif
 
 	return 0;
 err_mp:
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 549234988bb4..9f90be296ee0 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -111,7 +111,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 	*count = i;
 }
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
 {
 	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
@@ -123,7 +122,6 @@ static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
 
 	return mtt_entry;
 }
-#endif
 
 /*
  * Populate the given array with bus addresses from the umem.
@@ -151,7 +149,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	int len;
 	struct scatterlist *sg;
 	int entry;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+
 	if (umem->is_odp) {
 		WARN_ON(shift != 0);
 		WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
@@ -164,7 +162,6 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 		}
 		return;
 	}
-#endif
 
 	i = 0;
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index c389750f771e..494a90f4348c 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -71,10 +71,9 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	/* Wait until all page fault handlers using the mr complete. */
-	synchronize_srcu(&dev->mr_srcu);
-#endif
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		/* Wait until all page fault handlers using the mr complete. */
+		synchronize_srcu(&dev->mr_srcu);
 
 	return err;
 }
@@ -254,9 +253,8 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 	}
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	synchronize_srcu(&dev->mr_srcu);
-#endif
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		synchronize_srcu(&dev->mr_srcu);
 
 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 		list_del(&mr->list);
@@ -1329,8 +1327,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	if (!start && length == U64_MAX) {
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
+	    length == U64_MAX) {
 		if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
 		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 			return ERR_PTR(-EINVAL);
@@ -1340,7 +1338,6 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			return ERR_CAST(mr);
 		return &mr->ibmr;
 	}
-#endif
 
 	err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
 			   &page_shift, &ncont, &order);
@@ -1401,9 +1398,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		}
 	}
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	mr->live = 1;
-#endif
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		mr->live = 1;
+
 	return &mr->ibmr;
 error:
 	ib_umem_release(umem);
@@ -1518,9 +1515,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 		}
 
 		mr->allocated_from_cache = 0;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-		mr->live = 1;
-#endif
+		if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+			mr->live = 1;
 	} else {
 		/*
 		 * Send a UMR WQE
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 0b1446fe2fab..d3725cf13ecd 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -83,6 +83,19 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
 	return container_of(umem, struct ib_umem_odp, umem);
 }
 
+/*
+ * The lower 2 bits of the DMA address signal the R/W permissions for
+ * the entry. To upgrade the permissions, provide the appropriate
+ * bitmask to the map_dma_pages function.
+ *
+ * Be aware that upgrading a mapped address might result in change of
+ * the DMA address for the page.
+ */
+#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
+#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
+
+#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
+
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 
 struct ib_ucontext_per_mm {
@@ -107,19 +120,6 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
 				      unsigned long addr, size_t size);
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
 
-/*
- * The lower 2 bits of the DMA address signal the R/W permissions for
- * the entry. To upgrade the permissions, provide the appropriate
- * bitmask to the map_dma_pages function.
- *
- * Be aware that upgrading a mapped address might result in change of
- * the DMA address for the page.
- */
-#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
-#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
-
-#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
-
 int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
 			      u64 bcnt, u64 access_mask,
 			      unsigned long current_seq);
-- 
cgit v1.2.3


From 0ada768517dafa1504ef5986ba04f118b7436960 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 8 Jan 2019 16:07:27 +0200
Subject: RDMA/mlx5: Delete declaration of already removed function

The implementation of mlx5_core_page_fault_resume() was removed in commit
d5d284b829a6 ("{net,IB}/mlx5: Move Page fault EQ and ODP logic to
RDMA"). This patch removes declaration too.

Fixes: d5d284b829a6 ("{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA")
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/mlx5/driver.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 54299251d40d..b6f5839f129a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -939,10 +939,6 @@ int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
 			struct mlx5_odp_caps *odp_caps);
 int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev,
 			     u8 port_num, void *out, size_t sz);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
-				u32 wq_num, u8 type, int error);
-#endif
 
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From 6131dc81211cfcd8b7bd210665fbe0b6ba2e5fc5 Mon Sep 17 00:00:00 2001
From: Jeffrey Hugo <jhugo@codeaurora.org>
Date: Mon, 17 Dec 2018 19:15:36 -0700
Subject: clk: qcom: smd: Add support for MSM8998 rpm clocks

Add rpm smd clocks, PMIC and bus clocks which are required on MSM8998
for clients to vote on.

Signed-off-by: Jeffrey Hugo <jhugo@codeaurora.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../devicetree/bindings/clock/qcom,rpmcc.txt       |  1 +
 drivers/clk/qcom/clk-smd-rpm.c                     | 63 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,rpmcc.h             | 10 ++++
 3 files changed, 74 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt b/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt
index 87b4949e9bc8..944719bd586f 100644
--- a/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt
+++ b/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt
@@ -16,6 +16,7 @@ Required properties :
 			"qcom,rpmcc-msm8974", "qcom,rpmcc"
 			"qcom,rpmcc-apq8064", "qcom,rpmcc"
 			"qcom,rpmcc-msm8996", "qcom,rpmcc"
+			"qcom,rpmcc-msm8998", "qcom,rpmcc"
 			"qcom,rpmcc-qcs404", "qcom,rpmcc"
 
 - #clock-cells : shall contain 1
diff --git a/drivers/clk/qcom/clk-smd-rpm.c b/drivers/clk/qcom/clk-smd-rpm.c
index d3aadaeb2903..22dd42ad9223 100644
--- a/drivers/clk/qcom/clk-smd-rpm.c
+++ b/drivers/clk/qcom/clk-smd-rpm.c
@@ -655,10 +655,73 @@ static const struct rpm_smd_clk_desc rpm_clk_qcs404 = {
 	.num_clks = ARRAY_SIZE(qcs404_clks),
 };
 
+/* msm8998 */
+DEFINE_CLK_SMD_RPM(msm8998, snoc_clk, snoc_a_clk, QCOM_SMD_RPM_BUS_CLK, 1);
+DEFINE_CLK_SMD_RPM(msm8998, cnoc_clk, cnoc_a_clk, QCOM_SMD_RPM_BUS_CLK, 2);
+DEFINE_CLK_SMD_RPM(msm8998, ce1_clk, ce1_a_clk, QCOM_SMD_RPM_CE_CLK, 0);
+DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8998, div_clk1, div_clk1_a, 0xb);
+DEFINE_CLK_SMD_RPM(msm8998, ipa_clk, ipa_a_clk, QCOM_SMD_RPM_IPA_CLK, 0);
+DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8998, ln_bb_clk1, ln_bb_clk1_a, 1);
+DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8998, ln_bb_clk2, ln_bb_clk2_a, 2);
+DEFINE_CLK_SMD_RPM_XO_BUFFER_PINCTRL(msm8998, ln_bb_clk3_pin, ln_bb_clk3_a_pin,
+				     3);
+DEFINE_CLK_SMD_RPM(msm8998, mmssnoc_axi_rpm_clk, mmssnoc_axi_rpm_a_clk,
+		   QCOM_SMD_RPM_MMAXI_CLK, 0);
+DEFINE_CLK_SMD_RPM(msm8998, aggre1_noc_clk, aggre1_noc_a_clk,
+		   QCOM_SMD_RPM_AGGR_CLK, 1);
+DEFINE_CLK_SMD_RPM(msm8998, aggre2_noc_clk, aggre2_noc_a_clk,
+		   QCOM_SMD_RPM_AGGR_CLK, 2);
+DEFINE_CLK_SMD_RPM_QDSS(msm8998, qdss_clk, qdss_a_clk,
+			QCOM_SMD_RPM_MISC_CLK, 1);
+DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8998, rf_clk1, rf_clk1_a, 4);
+DEFINE_CLK_SMD_RPM_XO_BUFFER_PINCTRL(msm8998, rf_clk2_pin, rf_clk2_a_pin, 5);
+DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8998, rf_clk3, rf_clk3_a, 6);
+DEFINE_CLK_SMD_RPM_XO_BUFFER_PINCTRL(msm8998, rf_clk3_pin, rf_clk3_a_pin, 6);
+static struct clk_smd_rpm *msm8998_clks[] = {
+	[RPM_SMD_SNOC_CLK] = &msm8998_snoc_clk,
+	[RPM_SMD_SNOC_A_CLK] = &msm8998_snoc_a_clk,
+	[RPM_SMD_CNOC_CLK] = &msm8998_cnoc_clk,
+	[RPM_SMD_CNOC_A_CLK] = &msm8998_cnoc_a_clk,
+	[RPM_SMD_CE1_CLK] = &msm8998_ce1_clk,
+	[RPM_SMD_CE1_A_CLK] = &msm8998_ce1_a_clk,
+	[RPM_SMD_DIV_CLK1] = &msm8998_div_clk1,
+	[RPM_SMD_DIV_A_CLK1] = &msm8998_div_clk1_a,
+	[RPM_SMD_IPA_CLK] = &msm8998_ipa_clk,
+	[RPM_SMD_IPA_A_CLK] = &msm8998_ipa_a_clk,
+	[RPM_SMD_LN_BB_CLK1] = &msm8998_ln_bb_clk1,
+	[RPM_SMD_LN_BB_CLK1_A] = &msm8998_ln_bb_clk1_a,
+	[RPM_SMD_LN_BB_CLK2] = &msm8998_ln_bb_clk2,
+	[RPM_SMD_LN_BB_CLK2_A] = &msm8998_ln_bb_clk2_a,
+	[RPM_SMD_LN_BB_CLK3_PIN] = &msm8998_ln_bb_clk3_pin,
+	[RPM_SMD_LN_BB_CLK3_A_PIN] = &msm8998_ln_bb_clk3_a_pin,
+	[RPM_SMD_MMAXI_CLK] = &msm8998_mmssnoc_axi_rpm_clk,
+	[RPM_SMD_MMAXI_A_CLK] = &msm8998_mmssnoc_axi_rpm_a_clk,
+	[RPM_SMD_AGGR1_NOC_CLK] = &msm8998_aggre1_noc_clk,
+	[RPM_SMD_AGGR1_NOC_A_CLK] = &msm8998_aggre1_noc_a_clk,
+	[RPM_SMD_AGGR2_NOC_CLK] = &msm8998_aggre2_noc_clk,
+	[RPM_SMD_AGGR2_NOC_A_CLK] = &msm8998_aggre2_noc_a_clk,
+	[RPM_SMD_QDSS_CLK] = &msm8998_qdss_clk,
+	[RPM_SMD_QDSS_A_CLK] = &msm8998_qdss_a_clk,
+	[RPM_SMD_RF_CLK1] = &msm8998_rf_clk1,
+	[RPM_SMD_RF_CLK1_A] = &msm8998_rf_clk1_a,
+	[RPM_SMD_RF_CLK2_PIN] = &msm8998_rf_clk2_pin,
+	[RPM_SMD_RF_CLK2_A_PIN] = &msm8998_rf_clk2_a_pin,
+	[RPM_SMD_RF_CLK3] = &msm8998_rf_clk3,
+	[RPM_SMD_RF_CLK3_A] = &msm8998_rf_clk3_a,
+	[RPM_SMD_RF_CLK3_PIN] = &msm8998_rf_clk3_pin,
+	[RPM_SMD_RF_CLK3_A_PIN] = &msm8998_rf_clk3_a_pin,
+};
+
+static const struct rpm_smd_clk_desc rpm_clk_msm8998 = {
+	.clks = msm8998_clks,
+	.num_clks = ARRAY_SIZE(msm8998_clks),
+};
+
 static const struct of_device_id rpm_smd_clk_match_table[] = {
 	{ .compatible = "qcom,rpmcc-msm8916", .data = &rpm_clk_msm8916 },
 	{ .compatible = "qcom,rpmcc-msm8974", .data = &rpm_clk_msm8974 },
 	{ .compatible = "qcom,rpmcc-msm8996", .data = &rpm_clk_msm8996 },
+	{ .compatible = "qcom,rpmcc-msm8998", .data = &rpm_clk_msm8998 },
 	{ .compatible = "qcom,rpmcc-qcs404",  .data = &rpm_clk_qcs404  },
 	{ }
 };
diff --git a/include/dt-bindings/clock/qcom,rpmcc.h b/include/dt-bindings/clock/qcom,rpmcc.h
index 3658b0c14966..ede93a0ca156 100644
--- a/include/dt-bindings/clock/qcom,rpmcc.h
+++ b/include/dt-bindings/clock/qcom,rpmcc.h
@@ -127,5 +127,15 @@
 #define RPM_SMD_BIMC_GPU_A_CLK			77
 #define RPM_SMD_QPIC_CLK			78
 #define RPM_SMD_QPIC_CLK_A			79
+#define RPM_SMD_LN_BB_CLK1			80
+#define RPM_SMD_LN_BB_CLK1_A			81
+#define RPM_SMD_LN_BB_CLK2			82
+#define RPM_SMD_LN_BB_CLK2_A			83
+#define RPM_SMD_LN_BB_CLK3_PIN			84
+#define RPM_SMD_LN_BB_CLK3_A_PIN		85
+#define RPM_SMD_RF_CLK3				86
+#define RPM_SMD_RF_CLK3_A			87
+#define RPM_SMD_RF_CLK3_PIN			88
+#define RPM_SMD_RF_CLK3_A_PIN			89
 
 #endif
-- 
cgit v1.2.3


From cc2d22477779f189595db5c515bd5ef9c75a1f35 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 7 Jan 2019 20:49:39 +0100
Subject: pwm: Drop per-chip dbg_show callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This callback was introduced in commit 62099abf67a2 ("pwm: Add debugfs
interface") in 2012 and up to now there is not a single user. So drop
this unused code.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
[thierry.reding@gmail.com: remove kerneldoc for ->dbg_show()]
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 5 +----
 include/linux/pwm.h | 4 ----
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 253a459fe0d8..3149204567f3 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -1036,10 +1036,7 @@ static int pwm_seq_show(struct seq_file *s, void *v)
 		   dev_name(chip->dev), chip->npwm,
 		   (chip->npwm != 1) ? "s" : "");
 
-	if (chip->ops->dbg_show)
-		chip->ops->dbg_show(chip, s);
-	else
-		pwm_dbg_show(chip, s);
+	pwm_dbg_show(chip, s);
 
 	return 0;
 }
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index d5199b507d79..6a544cb89de4 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -254,7 +254,6 @@ pwm_set_relative_duty_cycle(struct pwm_state *state, unsigned int duty_cycle,
  * @get_state: get the current PWM state. This function is only
  *	       called once per PWM device when the PWM chip is
  *	       registered.
- * @dbg_show: optional routine to show contents in debugfs
  * @owner: helps prevent removal of modules exporting active PWMs
  */
 struct pwm_ops {
@@ -272,9 +271,6 @@ struct pwm_ops {
 		     struct pwm_state *state);
 	void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
 			  struct pwm_state *state);
-#ifdef CONFIG_DEBUG_FS
-	void (*dbg_show)(struct pwm_chip *chip, struct seq_file *s);
-#endif
 	struct module *owner;
 };
 
-- 
cgit v1.2.3


From 5d0a4c11896e8b83f816f135c24b184d4ba57741 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 7 Jan 2019 20:49:41 +0100
Subject: pwm: Rearrange structures to group members by purpose
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In pwm_ops there are a few callbacks that are not supposed to be used by
new drivers. Group them at the end of the structure and add a comment.

Similarily for struct pwm_chip group the members that drivers shouldn't
care about at the end and mark them as internal with another comment.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 6a544cb89de4..b628abfffacc 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -242,11 +242,7 @@ pwm_set_relative_duty_cycle(struct pwm_state *state, unsigned int duty_cycle,
  * struct pwm_ops - PWM controller operations
  * @request: optional hook for requesting a PWM
  * @free: optional hook for freeing a PWM
- * @config: configure duty cycles and period length for this PWM
- * @set_polarity: configure the polarity of this PWM
  * @capture: capture and report PWM signal
- * @enable: enable PWM output toggling
- * @disable: disable PWM output toggling
  * @apply: atomically apply a new PWM config. The state argument
  *	   should be adjusted with the real hardware config (if the
  *	   approximate the period or duty_cycle value, state should
@@ -255,48 +251,55 @@ pwm_set_relative_duty_cycle(struct pwm_state *state, unsigned int duty_cycle,
  *	       called once per PWM device when the PWM chip is
  *	       registered.
  * @owner: helps prevent removal of modules exporting active PWMs
+ * @config: configure duty cycles and period length for this PWM
+ * @set_polarity: configure the polarity of this PWM
+ * @enable: enable PWM output toggling
+ * @disable: disable PWM output toggling
  */
 struct pwm_ops {
 	int (*request)(struct pwm_chip *chip, struct pwm_device *pwm);
 	void (*free)(struct pwm_chip *chip, struct pwm_device *pwm);
-	int (*config)(struct pwm_chip *chip, struct pwm_device *pwm,
-		      int duty_ns, int period_ns);
-	int (*set_polarity)(struct pwm_chip *chip, struct pwm_device *pwm,
-			    enum pwm_polarity polarity);
 	int (*capture)(struct pwm_chip *chip, struct pwm_device *pwm,
 		       struct pwm_capture *result, unsigned long timeout);
-	int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
-	void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
 	int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm,
 		     struct pwm_state *state);
 	void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
 			  struct pwm_state *state);
 	struct module *owner;
+
+	/* Only used by legacy drivers */
+	int (*config)(struct pwm_chip *chip, struct pwm_device *pwm,
+		      int duty_ns, int period_ns);
+	int (*set_polarity)(struct pwm_chip *chip, struct pwm_device *pwm,
+			    enum pwm_polarity polarity);
+	int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
+	void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
 };
 
 /**
  * struct pwm_chip - abstract a PWM controller
  * @dev: device providing the PWMs
- * @list: list node for internal use
  * @ops: callbacks for this PWM controller
  * @base: number of first PWM controlled by this chip
  * @npwm: number of PWMs controlled by this chip
- * @pwms: array of PWM devices allocated by the framework
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
+ * @list: list node for internal use
+ * @pwms: array of PWM devices allocated by the framework
  */
 struct pwm_chip {
 	struct device *dev;
-	struct list_head list;
 	const struct pwm_ops *ops;
 	int base;
 	unsigned int npwm;
 
-	struct pwm_device *pwms;
-
 	struct pwm_device * (*of_xlate)(struct pwm_chip *pc,
 					const struct of_phandle_args *args);
 	unsigned int of_pwm_n_cells;
+
+	/* only used internally by the PWM framework */
+	struct list_head list;
+	struct pwm_device *pwms;
 };
 
 /**
-- 
cgit v1.2.3


From b0ea0fa5435f9df7213a9af098558f7dd584d8e8 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 9 Jan 2019 11:15:16 +0200
Subject: IB/{core,hw}: Have ib_umem_get extract the ib_ucontext from ib_udata

ib_umem_get() can only be called in a method callback, which always has a
udata parameter. This allows ib_umem_get() to derive the ucontext pointer
directly from the udata without requiring the drivers to find it in some
way or another.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
---
 drivers/infiniband/core/umem.c                |  9 ++++--
 drivers/infiniband/core/uverbs_main.c         | 24 ++++++++++++++++
 drivers/infiniband/hw/bnxt_re/ib_verbs.c      | 13 ++++-----
 drivers/infiniband/hw/cxgb3/iwch_provider.c   |  2 +-
 drivers/infiniband/hw/cxgb4/mem.c             |  2 +-
 drivers/infiniband/hw/hns/hns_roce_cq.c       |  9 +++---
 drivers/infiniband/hw/hns/hns_roce_db.c       |  6 ++--
 drivers/infiniband/hw/hns/hns_roce_device.h   |  3 +-
 drivers/infiniband/hw/hns/hns_roce_mr.c       |  7 ++---
 drivers/infiniband/hw/hns/hns_roce_qp.c       | 13 ++++-----
 drivers/infiniband/hw/hns/hns_roce_srq.c      |  7 ++---
 drivers/infiniband/hw/i40iw/i40iw_verbs.c     |  2 +-
 drivers/infiniband/hw/mlx4/cq.c               | 17 ++++++------
 drivers/infiniband/hw/mlx4/doorbell.c         |  6 ++--
 drivers/infiniband/hw/mlx4/mlx4_ib.h          |  3 +-
 drivers/infiniband/hw/mlx4/mr.c               | 11 ++++----
 drivers/infiniband/hw/mlx4/qp.c               | 14 ++++++----
 drivers/infiniband/hw/mlx4/srq.c              |  5 ++--
 drivers/infiniband/hw/mlx5/cq.c               | 11 ++++----
 drivers/infiniband/hw/mlx5/devx.c             |  2 +-
 drivers/infiniband/hw/mlx5/doorbell.c         |  6 ++--
 drivers/infiniband/hw/mlx5/mlx5_ib.h          |  4 ++-
 drivers/infiniband/hw/mlx5/mr.c               | 22 +++++++--------
 drivers/infiniband/hw/mlx5/odp.c              |  4 +--
 drivers/infiniband/hw/mlx5/qp.c               | 40 ++++++++++++---------------
 drivers/infiniband/hw/mlx5/srq.c              |  5 ++--
 drivers/infiniband/hw/mthca/mthca_provider.c  |  2 +-
 drivers/infiniband/hw/nes/nes_verbs.c         |  2 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c   |  2 +-
 drivers/infiniband/hw/qedr/verbs.c            | 33 +++++++++++-----------
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c  |  2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c  |  3 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c  |  6 ++--
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c |  4 +--
 drivers/infiniband/sw/rdmavt/mr.c             |  3 +-
 drivers/infiniband/sw/rxe/rxe_mr.c            |  2 +-
 include/rdma/ib_umem.h                        |  8 ++++--
 include/rdma/ib_verbs.h                       |  1 +
 38 files changed, 168 insertions(+), 147 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c6144df47ea4..1efe0a74e06b 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -72,15 +72,16 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
  * If access flags indicate ODP memory, avoid pinning. Instead, stores
  * the mm for future page fault handling in conjunction with MMU notifiers.
  *
- * @context: userspace context to pin memory for
+ * @udata: userspace context to pin memory for
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
  * @dmasync: flush in-flight DMA when the memory region is written
  */
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 			    size_t size, int access, int dmasync)
 {
+	struct ib_ucontext *context;
 	struct ib_umem *umem;
 	struct page **page_list;
 	struct vm_area_struct **vma_list;
@@ -95,6 +96,10 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	struct scatterlist *sg, *sg_list_start;
 	unsigned int gup_flags = FOLL_WRITE;
 
+	context = rdma_get_ucontext(udata);
+	if (IS_ERR(context))
+		return ERR_CAST(context);
+
 	if (dmasync)
 		dma_attrs |= DMA_ATTR_WRITE_BARRIER;
 
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index fb0007aa0c27..996f167d1436 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -101,6 +101,30 @@ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile)
 }
 EXPORT_SYMBOL(ib_uverbs_get_ucontext_file);
 
+/* rdma_get_ucontext - Return the ucontext from a udata
+ * @udata: The udata to get the context from
+ *
+ * This can only be called from within a uapi method that was passed ib_udata
+ * as a parameter. It returns the ucontext associated with the udata, or ERR_PTR
+ * if the udata is NULL or the ucontext has been disassociated.
+ */
+struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata)
+{
+	if (!udata)
+		return ERR_PTR(-EIO);
+
+	/*
+	 * FIXME: Really all cases that get here with a udata will have
+	 * already called ib_uverbs_get_ucontext_file, or located a uobject
+	 * that points to a ucontext. We could store that result in the udata
+	 * so this function can't fail.
+	 */
+	return ib_uverbs_get_ucontext_file(
+		container_of(udata, struct uverbs_attr_bundle, driver_udata)
+			->ufile);
+}
+EXPORT_SYMBOL(rdma_get_ucontext);
+
 int uverbs_dealloc_mw(struct ib_mw *mw)
 {
 	struct ib_pd *pd = mw->pd;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 1e2515e2eb62..9bc637e49faa 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -895,8 +895,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 	if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC)
 		bytes += (qplib_qp->sq.max_wqe * sizeof(struct sq_psn_search));
 	bytes = PAGE_ALIGN(bytes);
-	umem = ib_umem_get(context, ureq.qpsva, bytes,
-			   IB_ACCESS_LOCAL_WRITE, 1);
+	umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(umem))
 		return PTR_ERR(umem);
 
@@ -908,7 +907,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 	if (!qp->qplib_qp.srq) {
 		bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
 		bytes = PAGE_ALIGN(bytes);
-		umem = ib_umem_get(context, ureq.qprva, bytes,
+		umem = ib_umem_get(udata, ureq.qprva, bytes,
 				   IB_ACCESS_LOCAL_WRITE, 1);
 		if (IS_ERR(umem))
 			goto rqfail;
@@ -1370,8 +1369,7 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
 
 	bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
 	bytes = PAGE_ALIGN(bytes);
-	umem = ib_umem_get(context, ureq.srqva, bytes,
-			   IB_ACCESS_LOCAL_WRITE, 1);
+	umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(umem))
 		return PTR_ERR(umem);
 
@@ -2622,7 +2620,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
 			goto fail;
 		}
 
-		cq->umem = ib_umem_get(context, req.cq_va,
+		cq->umem = ib_umem_get(udata, req.cq_va,
 				       entries * sizeof(struct cq_base),
 				       IB_ACCESS_LOCAL_WRITE, 1);
 		if (IS_ERR(cq->umem)) {
@@ -3589,8 +3587,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 	/* The fixed portion of the rkey is the same as the lkey */
 	mr->ib_mr.rkey = mr->qplib_mr.rkey;
 
-	umem = ib_umem_get(ib_pd->uobject->context, start, length,
-			   mr_access_flags, 0);
+	umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
 	if (IS_ERR(umem)) {
 		dev_err(rdev_to_dev(rdev), "Failed to get umem");
 		rc = -EFAULT;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index b34b1a1bd94b..92ee6761a3bd 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -540,7 +540,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	mhp->rhp = rhp;
 
-	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	mhp->umem = ib_umem_get(udata, start, length, acc, 0);
 	if (IS_ERR(mhp->umem)) {
 		err = PTR_ERR(mhp->umem);
 		kfree(mhp);
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 7b76e6f81aeb..96760a36b9fc 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -537,7 +537,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	mhp->rhp = rhp;
 
-	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	mhp->umem = ib_umem_get(udata, start, length, acc, 0);
 	if (IS_ERR(mhp->umem))
 		goto err_free_skb;
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 3a485f50fede..1dfe5627006c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -215,7 +215,7 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 EXPORT_SYMBOL_GPL(hns_roce_free_cq);
 
 static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
-				   struct ib_ucontext *context,
+				   struct ib_udata *udata,
 				   struct hns_roce_cq_buf *buf,
 				   struct ib_umem **umem, u64 buf_addr, int cqe)
 {
@@ -223,7 +223,7 @@ static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
 	u32 page_shift;
 	u32 npages;
 
-	*umem = ib_umem_get(context, buf_addr, cqe * hr_dev->caps.cq_entry_sz,
+	*umem = ib_umem_get(udata, buf_addr, cqe * hr_dev->caps.cq_entry_sz,
 			    IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(*umem))
 		return PTR_ERR(*umem);
@@ -347,7 +347,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
 		}
 
 		/* Get user space address, write it into mtt table */
-		ret = hns_roce_ib_get_cq_umem(hr_dev, context, &hr_cq->hr_buf,
+		ret = hns_roce_ib_get_cq_umem(hr_dev, udata, &hr_cq->hr_buf,
 					      &hr_cq->umem, ucmd.buf_addr,
 					      cq_entries);
 		if (ret) {
@@ -358,7 +358,8 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
 		    (udata->outlen >= sizeof(resp))) {
 			ret = hns_roce_db_map_user(to_hr_ucontext(context),
-						   ucmd.db_addr, &hr_cq->db);
+						   udata, ucmd.db_addr,
+						   &hr_cq->db);
 			if (ret) {
 				dev_err(dev, "cq record doorbell map failed!\n");
 				goto err_mtt;
diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c
index e2f93c1ce86a..0c6c1fe87705 100644
--- a/drivers/infiniband/hw/hns/hns_roce_db.c
+++ b/drivers/infiniband/hw/hns/hns_roce_db.c
@@ -8,7 +8,8 @@
 #include <rdma/ib_umem.h>
 #include "hns_roce_device.h"
 
-int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
+int hns_roce_db_map_user(struct hns_roce_ucontext *context,
+			 struct ib_udata *udata, unsigned long virt,
 			 struct hns_roce_db *db)
 {
 	struct hns_roce_user_db_page *page;
@@ -28,8 +29,7 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
 
 	refcount_set(&page->refcount, 1);
 	page->user_virt = (virt & PAGE_MASK);
-	page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
-				 PAGE_SIZE, 0, 0);
+	page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
 	if (IS_ERR(page->umem)) {
 		ret = PTR_ERR(page->umem);
 		kfree(page);
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 509e467843f6..9a4be70936e0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1133,7 +1133,8 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
 int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq);
 void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
 
-int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
+int hns_roce_db_map_user(struct hns_roce_ucontext *context,
+			 struct ib_udata *udata, unsigned long virt,
 			 struct hns_roce_db *db);
 void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
 			    struct hns_roce_db *db);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index ee5991bd4171..da4fffedb879 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -1110,8 +1110,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(pd->uobject->context, start, length,
-			       access_flags, 0);
+	mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
 	if (IS_ERR(mr->umem)) {
 		ret = PTR_ERR(mr->umem);
 		goto err_free;
@@ -1220,8 +1219,8 @@ int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
 		}
 		ib_umem_release(mr->umem);
 
-		mr->umem = ib_umem_get(ibmr->uobject->context, start, length,
-				       mr_access_flags, 0);
+		mr->umem =
+			ib_umem_get(udata, start, length, mr_access_flags, 0);
 		if (IS_ERR(mr->umem)) {
 			ret = PTR_ERR(mr->umem);
 			mr->umem = NULL;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index b9ba140b50d4..accf9ce1507d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -613,9 +613,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 			goto err_rq_sge_list;
 		}
 
-		hr_qp->umem = ib_umem_get(ib_pd->uobject->context,
-					  ucmd.buf_addr, hr_qp->buff_size, 0,
-					  0);
+		hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr,
+					  hr_qp->buff_size, 0, 0);
 		if (IS_ERR(hr_qp->umem)) {
 			dev_err(dev, "ib_umem_get error for create qp\n");
 			ret = PTR_ERR(hr_qp->umem);
@@ -654,8 +653,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_sq(init_attr)) {
 			ret = hns_roce_db_map_user(
-					to_hr_ucontext(ib_pd->uobject->context),
-					ucmd.sdb_addr, &hr_qp->sdb);
+				to_hr_ucontext(ib_pd->uobject->context), udata,
+				ucmd.sdb_addr, &hr_qp->sdb);
 			if (ret) {
 				dev_err(dev, "sq record doorbell map failed!\n");
 				goto err_mtt;
@@ -670,8 +669,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_rq(init_attr)) {
 			ret = hns_roce_db_map_user(
-					to_hr_ucontext(ib_pd->uobject->context),
-					ucmd.db_addr, &hr_qp->rdb);
+				to_hr_ucontext(ib_pd->uobject->context), udata,
+				ucmd.db_addr, &hr_qp->rdb);
 			if (ret) {
 				dev_err(dev, "rq record doorbell map failed!\n");
 				goto err_sq_dbmap;
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 960b1946c365..8975f858b36f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -252,8 +252,8 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
 			goto err_srq;
 		}
 
-		srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
-					srq_buf_size, 0, 0);
+		srq->umem =
+			ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0, 0);
 		if (IS_ERR(srq->umem)) {
 			ret = PTR_ERR(srq->umem);
 			goto err_srq;
@@ -280,8 +280,7 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
 			goto err_srq_mtt;
 
 		/* config index queue BA */
-		srq->idx_que.umem = ib_umem_get(pd->uobject->context,
-						ucmd.que_addr,
+		srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
 						srq->idx_que.buf_size, 0, 0);
 		if (IS_ERR(srq->idx_que.umem)) {
 			dev_err(hr_dev->dev,
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 0b675b0742c2..80b66df95362 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -1852,7 +1852,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
 
 	if (length > I40IW_MAX_MR_SIZE)
 		return ERR_PTR(-EINVAL);
-	region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	region = ib_umem_get(udata, start, length, acc, 0);
 	if (IS_ERR(region))
 		return (struct ib_mr *)region;
 
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 43512347b4f0..db936c12b5bd 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -134,16 +134,16 @@ static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *
 	mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
 }
 
-static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
-			       struct mlx4_ib_cq_buf *buf, struct ib_umem **umem,
-			       u64 buf_addr, int cqe)
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata,
+			       struct mlx4_ib_cq_buf *buf,
+			       struct ib_umem **umem, u64 buf_addr, int cqe)
 {
 	int err;
 	int cqe_size = dev->dev->caps.cqe_size;
 	int shift;
 	int n;
 
-	*umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
+	*umem = ib_umem_get(udata, buf_addr, cqe * cqe_size,
 			    IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(*umem))
 		return PTR_ERR(*umem);
@@ -213,14 +213,13 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
 		}
 
 		buf_addr = (void *)(unsigned long)ucmd.buf_addr;
-
-		err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem,
+		err = mlx4_ib_get_cq_umem(dev, udata, &cq->buf, &cq->umem,
 					  ucmd.buf_addr, entries);
 		if (err)
 			goto err_cq;
 
-		err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
-					  &cq->db);
+		err = mlx4_ib_db_map_user(to_mucontext(context), udata,
+					  ucmd.db_addr, &cq->db);
 		if (err)
 			goto err_mtt;
 
@@ -336,7 +335,7 @@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq
 	if (!cq->resize_buf)
 		return -ENOMEM;
 
-	err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf,
+	err = mlx4_ib_get_cq_umem(dev, udata, &cq->resize_buf->buf,
 				  &cq->resize_umem, ucmd.buf_addr, entries);
 	if (err) {
 		kfree(cq->resize_buf);
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
index c51740986367..3aab71b29ce8 100644
--- a/drivers/infiniband/hw/mlx4/doorbell.c
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -41,7 +41,8 @@ struct mlx4_ib_user_db_page {
 	int			refcnt;
 };
 
-int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context,
+			struct ib_udata *udata, unsigned long virt,
 			struct mlx4_db *db)
 {
 	struct mlx4_ib_user_db_page *page;
@@ -61,8 +62,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
 
 	page->user_virt = (virt & PAGE_MASK);
 	page->refcnt    = 0;
-	page->umem      = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
-				      PAGE_SIZE, 0, 0);
+	page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
 	if (IS_ERR(page->umem)) {
 		err = PTR_ERR(page->umem);
 		kfree(page);
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index e491f3eda6e7..60dc1347c5ab 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -722,7 +722,8 @@ static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev)
 int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
 void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
 
-int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context,
+			struct ib_udata *udata, unsigned long virt,
 			struct mlx4_db *db);
 void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
 
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index c7c85c22e4e3..56639ecd53ad 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -367,7 +367,8 @@ end:
 	return block_shift;
 }
 
-static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start,
+static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context,
+					struct ib_udata *udata, u64 start,
 					u64 length, u64 virt_addr,
 					int access_flags)
 {
@@ -398,7 +399,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start,
 		up_read(&current->mm->mmap_sem);
 	}
 
-	return ib_umem_get(context, start, length, access_flags, 0);
+	return ib_umem_get(udata, start, length, access_flags, 0);
 }
 
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -415,7 +416,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length,
+	mr->umem = mlx4_get_umem_mr(pd->uobject->context, udata, start, length,
 				    virt_addr, access_flags);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
@@ -506,8 +507,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
 		ib_umem_release(mmr->umem);
 		mmr->umem =
-			mlx4_get_umem_mr(mr->uobject->context, start, length,
-					 virt_addr, mr_access_flags);
+			mlx4_get_umem_mr(mr->uobject->context, udata, start,
+					 length, virt_addr, mr_access_flags);
 		if (IS_ERR(mmr->umem)) {
 			err = PTR_ERR(mmr->umem);
 			/* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 971e9a9ebdaf..e38bab50cecf 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1015,9 +1015,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 				(qp->sq.wqe_cnt << qp->sq.wqe_shift);
 		}
 
-		qp->umem = ib_umem_get(pd->uobject->context,
-				(src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr :
-				ucmd.wq.buf_addr, qp->buf_size, 0, 0);
+		qp->umem =
+			ib_umem_get(udata,
+				    (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr :
+							      ucmd.wq.buf_addr,
+				    qp->buf_size, 0, 0);
 		if (IS_ERR(qp->umem)) {
 			err = PTR_ERR(qp->umem);
 			goto err;
@@ -1035,9 +1037,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			goto err_mtt;
 
 		if (qp_has_rq(init_attr)) {
-			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+			err = mlx4_ib_db_map_user(
+				to_mucontext(pd->uobject->context), udata,
 				(src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr :
-				ucmd.wq.db_addr, &qp->db);
+							  ucmd.wq.db_addr,
+				&qp->db);
 			if (err)
 				goto err_mtt;
 		}
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 4456f1b8921d..498588eac051 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -113,8 +113,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 			goto err_srq;
 		}
 
-		srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
-					buf_size, 0, 0);
+		srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
 		if (IS_ERR(srq->umem)) {
 			err = PTR_ERR(srq->umem);
 			goto err_srq;
@@ -130,7 +129,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 			goto err_mtt;
 
 		err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
-					  ucmd.db_addr, &srq->db);
+					  udata, ucmd.db_addr, &srq->db);
 		if (err)
 			goto err_mtt;
 	} else {
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 90f1b0bae5b5..c283c32f30fe 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -707,15 +707,15 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 
 	*cqe_size = ucmd.cqe_size;
 
-	cq->buf.umem = ib_umem_get(context, ucmd.buf_addr,
-				   entries * ucmd.cqe_size,
-				   IB_ACCESS_LOCAL_WRITE, 1);
+	cq->buf.umem =
+		ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size,
+			    IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(cq->buf.umem)) {
 		err = PTR_ERR(cq->buf.umem);
 		return err;
 	}
 
-	err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+	err = mlx5_ib_db_map_user(to_mucontext(context), udata, ucmd.db_addr,
 				  &cq->db);
 	if (err)
 		goto err_umem;
@@ -1111,7 +1111,6 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 	struct ib_umem *umem;
 	int err;
 	int npages;
-	struct ib_ucontext *context = cq->buf.umem->context;
 
 	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
 	if (err)
@@ -1124,7 +1123,7 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 	if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
 		return -EINVAL;
 
-	umem = ib_umem_get(context, ucmd.buf_addr,
+	umem = ib_umem_get(udata, ucmd.buf_addr,
 			   (size_t)ucmd.cqe_size * entries,
 			   IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(umem)) {
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 5a588f3cfb1b..8bb711da7ee1 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1195,7 +1195,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 	if (err)
 		return err;
 
-	obj->umem = ib_umem_get(ucontext, addr, size, access, 0);
+	obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access, 0);
 	if (IS_ERR(obj->umem))
 		return PTR_ERR(obj->umem);
 
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
index a0e4e6ddb71a..8f4e5f22b84c 100644
--- a/drivers/infiniband/hw/mlx5/doorbell.c
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -43,7 +43,8 @@ struct mlx5_ib_user_db_page {
 	int			refcnt;
 };
 
-int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
+			struct ib_udata *udata, unsigned long virt,
 			struct mlx5_db *db)
 {
 	struct mlx5_ib_user_db_page *page;
@@ -63,8 +64,7 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
 
 	page->user_virt = (virt & PAGE_MASK);
 	page->refcnt    = 0;
-	page->umem      = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
-				      PAGE_SIZE, 0, 0);
+	page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
 	if (IS_ERR(page->umem)) {
 		err = PTR_ERR(page->umem);
 		kfree(page);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 81efa5def8ad..b0a37ca2a714 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1037,7 +1037,8 @@ to_mflow_act(struct ib_flow_action *ibact)
 	return container_of(ibact, struct mlx5_ib_flow_action, ib_action);
 }
 
-int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
+			struct ib_udata *udata, unsigned long virt,
 			struct mlx5_db *db);
 void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db);
 void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
@@ -1103,6 +1104,7 @@ int mlx5_ib_dealloc_mw(struct ib_mw *mw);
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags);
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     struct ib_udata *udata,
 					     int access_flags);
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 494a90f4348c..6d763d6a189f 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -843,18 +843,17 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev)
 	return MLX5_MAX_UMR_SHIFT;
 }
 
-static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
-		       int access_flags, struct ib_umem **umem,
-		       int *npages, int *page_shift, int *ncont,
-		       int *order)
+static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
+		       u64 start, u64 length, int access_flags,
+		       struct ib_umem **umem, int *npages, int *page_shift,
+		       int *ncont, int *order)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct ib_umem *u;
 	int err;
 
 	*umem = NULL;
 
-	u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0);
+	u = ib_umem_get(udata, start, length, access_flags, 0);
 	err = PTR_ERR_OR_ZERO(u);
 	if (err) {
 		mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
@@ -1333,14 +1332,14 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 			return ERR_PTR(-EINVAL);
 
-		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
+		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
 		if (IS_ERR(mr))
 			return ERR_CAST(mr);
 		return &mr->ibmr;
 	}
 
-	err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
-			   &page_shift, &ncont, &order);
+	err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
+			  &npages, &page_shift, &ncont, &order);
 
 	if (err < 0)
 		return ERR_PTR(err);
@@ -1488,8 +1487,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 		flags |= IB_MR_REREG_TRANS;
 		ib_umem_release(mr->umem);
 		mr->umem = NULL;
-		err = mr_umem_get(pd, addr, len, access_flags, &mr->umem,
-				  &npages, &page_shift, &ncont, &order);
+		err = mr_umem_get(dev, udata, addr, len, access_flags,
+				  &mr->umem, &npages, &page_shift, &ncont,
+				  &order);
 		if (err)
 			goto err;
 	}
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 37d6653f9636..8d46b1dc5658 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -492,13 +492,13 @@ next_mr:
 }
 
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     struct ib_udata *udata,
 					     int access_flags)
 {
-	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
 	struct mlx5_ib_mr *imr;
 	struct ib_umem *umem;
 
-	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
+	umem = ib_umem_get(udata, 0, 0, IB_ACCESS_ON_DEMAND, 0);
 	if (IS_ERR(umem))
 		return ERR_CAST(umem);
 
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index dd2ae640bc84..529e76f67cb6 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -645,16 +645,14 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 	return bfregi->sys_pages[index_of_sys_page] + offset;
 }
 
-static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
-			    struct ib_pd *pd,
+static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 			    unsigned long addr, size_t size,
-			    struct ib_umem **umem,
-			    int *npages, int *page_shift, int *ncont,
-			    u32 *offset)
+			    struct ib_umem **umem, int *npages, int *page_shift,
+			    int *ncont, u32 *offset)
 {
 	int err;
 
-	*umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0);
+	*umem = ib_umem_get(udata, addr, size, 0, 0);
 	if (IS_ERR(*umem)) {
 		mlx5_ib_dbg(dev, "umem_get failed\n");
 		return PTR_ERR(*umem);
@@ -695,10 +693,9 @@ static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 }
 
 static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-			  struct mlx5_ib_rwq *rwq,
+			  struct ib_udata *udata, struct mlx5_ib_rwq *rwq,
 			  struct mlx5_ib_create_wq *ucmd)
 {
-	struct mlx5_ib_ucontext *context;
 	int page_shift = 0;
 	int npages;
 	u32 offset = 0;
@@ -708,9 +705,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	if (!ucmd->buf_addr)
 		return -EINVAL;
 
-	context = to_mucontext(pd->uobject->context);
-	rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
-			       rwq->buf_size, 0, 0);
+	rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0, 0);
 	if (IS_ERR(rwq->umem)) {
 		mlx5_ib_dbg(dev, "umem_get failed\n");
 		err = PTR_ERR(rwq->umem);
@@ -735,7 +730,8 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		    (unsigned long long)ucmd->buf_addr, rwq->buf_size,
 		    npages, page_shift, ncont, offset);
 
-	err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
+	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), udata,
+				  ucmd->db_addr, &rwq->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map failed\n");
 		goto err_umem;
@@ -819,10 +815,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 	if (ucmd.buf_addr && ubuffer->buf_size) {
 		ubuffer->buf_addr = ucmd.buf_addr;
-		err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr,
-				       ubuffer->buf_size,
-				       &ubuffer->umem, &npages, &page_shift,
-				       &ncont, &offset);
+		err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr,
+				       ubuffer->buf_size, &ubuffer->umem,
+				       &npages, &page_shift, &ncont, &offset);
 		if (err)
 			goto err_bfreg;
 	} else {
@@ -856,7 +851,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		resp->bfreg_index = MLX5_IB_INVALID_BFREG;
 	qp->bfregn = bfregn;
 
-	err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
+	err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &qp->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map failed\n");
 		goto err_free;
@@ -1119,6 +1114,7 @@ static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
 }
 
 static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
+				   struct ib_udata *udata,
 				   struct mlx5_ib_sq *sq, void *qpin,
 				   struct ib_pd *pd)
 {
@@ -1135,9 +1131,9 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
 	int ncont = 0;
 	u32 offset = 0;
 
-	err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size,
-			       &sq->ubuffer.umem, &npages, &page_shift,
-			       &ncont, &offset);
+	err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, ubuffer->buf_size,
+			       &sq->ubuffer.umem, &npages, &page_shift, &ncont,
+			       &offset);
 	if (err)
 		return err;
 
@@ -1374,7 +1370,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 		if (err)
 			return err;
 
-		err = create_raw_packet_qp_sq(dev, sq, in, pd);
+		err = create_raw_packet_qp_sq(dev, udata, sq, in, pd);
 		if (err)
 			goto err_destroy_tis;
 
@@ -5793,7 +5789,7 @@ static int prepare_user_rq(struct ib_pd *pd,
 		return err;
 	}
 
-	err = create_user_rq(dev, pd, rwq, &ucmd);
+	err = create_user_rq(dev, pd, udata, rwq, &ucmd);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
 		return err;
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 4e8d18009f58..22bd774e0b4e 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -79,8 +79,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 
 	srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
 
-	srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
-				0, 0);
+	srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
 	if (IS_ERR(srq->umem)) {
 		mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
 		err = PTR_ERR(srq->umem);
@@ -104,7 +103,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 
 	mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
 
-	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
+	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), udata,
 				  ucmd.db_addr, &srq->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map doorbell failed\n");
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 82cb6b71ac7c..53fff6aed896 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -931,7 +931,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(pd->uobject->context, start, length, acc,
+	mr->umem = ib_umem_get(udata, start, length, acc,
 			       ucmd.mr_attrs & MTHCA_MR_DMASYNC);
 
 	if (IS_ERR(mr->umem)) {
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 4e7f08ee1907..feb4d259aab9 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -2134,7 +2134,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	u8 stag_key;
 	int first_page = 1;
 
-	region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	region = ib_umem_get(udata, start, length, acc, 0);
 	if (IS_ERR(region)) {
 		return (struct ib_mr *)region;
 	}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index c46bed0c5513..a8b40cb7679e 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -916,7 +916,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(status);
-	mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0);
+	mr->umem = ib_umem_get(udata, start, len, acc, 0);
 	if (IS_ERR(mr->umem)) {
 		status = -EFAULT;
 		goto umem_err;
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index b342a70e2814..ba8cb6559122 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -736,11 +736,10 @@ static inline int qedr_align_cq_entries(int entries)
 	return aligned_size / QEDR_CQE_SIZE;
 }
 
-static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
+static inline int qedr_init_user_queue(struct ib_udata *udata,
 				       struct qedr_dev *dev,
-				       struct qedr_userq *q,
-				       u64 buf_addr, size_t buf_len,
-				       int access, int dmasync,
+				       struct qedr_userq *q, u64 buf_addr,
+				       size_t buf_len, int access, int dmasync,
 				       int alloc_and_init)
 {
 	u32 fw_pages;
@@ -748,7 +747,7 @@ static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
 
 	q->buf_addr = buf_addr;
 	q->buf_len = buf_len;
-	q->umem = ib_umem_get(ib_ctx, q->buf_addr, q->buf_len, access, dmasync);
+	q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access, dmasync);
 	if (IS_ERR(q->umem)) {
 		DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n",
 		       PTR_ERR(q->umem));
@@ -905,9 +904,9 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
 
 		cq->cq_type = QEDR_CQ_TYPE_USER;
 
-		rc = qedr_init_user_queue(ib_ctx, dev, &cq->q, ureq.addr,
-					  ureq.len, IB_ACCESS_LOCAL_WRITE,
-					  1, 1);
+		rc = qedr_init_user_queue(udata, dev, &cq->q, ureq.addr,
+					  ureq.len, IB_ACCESS_LOCAL_WRITE, 1,
+					  1);
 		if (rc)
 			goto err0;
 
@@ -1344,7 +1343,7 @@ static void qedr_free_srq_kernel_params(struct qedr_srq *srq)
 			  hw_srq->phy_prod_pair_addr);
 }
 
-static int qedr_init_srq_user_params(struct ib_ucontext *ib_ctx,
+static int qedr_init_srq_user_params(struct ib_udata *udata,
 				     struct qedr_srq *srq,
 				     struct qedr_create_srq_ureq *ureq,
 				     int access, int dmasync)
@@ -1352,14 +1351,14 @@ static int qedr_init_srq_user_params(struct ib_ucontext *ib_ctx,
 	struct scatterlist *sg;
 	int rc;
 
-	rc = qedr_init_user_queue(ib_ctx, srq->dev, &srq->usrq, ureq->srq_addr,
+	rc = qedr_init_user_queue(udata, srq->dev, &srq->usrq, ureq->srq_addr,
 				  ureq->srq_len, access, dmasync, 1);
 	if (rc)
 		return rc;
 
-	srq->prod_umem = ib_umem_get(ib_ctx, ureq->prod_pair_addr,
-				     sizeof(struct rdma_srq_producers),
-				     access, dmasync);
+	srq->prod_umem =
+		ib_umem_get(udata, ureq->prod_pair_addr,
+			    sizeof(struct rdma_srq_producers), access, dmasync);
 	if (IS_ERR(srq->prod_umem)) {
 		qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl);
 		ib_umem_release(srq->usrq.umem);
@@ -1468,7 +1467,7 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
 			goto err0;
 		}
 
-		rc = qedr_init_srq_user_params(ib_ctx, srq, &ureq, 0, 0);
+		rc = qedr_init_srq_user_params(udata, srq, &ureq, 0, 0);
 		if (rc)
 			goto err0;
 
@@ -1714,14 +1713,14 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
 	}
 
 	/* SQ - read access only (0), dma sync not required (0) */
-	rc = qedr_init_user_queue(ib_ctx, dev, &qp->usq, ureq.sq_addr,
+	rc = qedr_init_user_queue(udata, dev, &qp->usq, ureq.sq_addr,
 				  ureq.sq_len, 0, 0, alloc_and_init);
 	if (rc)
 		return rc;
 
 	if (!qp->srq) {
 		/* RQ - read access only (0), dma sync not required (0) */
-		rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr,
+		rc = qedr_init_user_queue(udata, dev, &qp->urq, ureq.rq_addr,
 					  ureq.rq_len, 0, 0, alloc_and_init);
 		if (rc)
 			return rc;
@@ -2719,7 +2718,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
 
 	mr->type = QEDR_MR_USER;
 
-	mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0);
+	mr->umem = ib_umem_get(udata, start, len, acc, 0);
 	if (IS_ERR(mr->umem)) {
 		rc = -EFAULT;
 		goto err0;
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
index 0f004c737620..104c7db4704f 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
@@ -141,7 +141,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
 			goto err_cq;
 		}
 
-		cq->umem = ib_umem_get(context, ucmd.buf_addr, ucmd.buf_size,
+		cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size,
 				       IB_ACCESS_LOCAL_WRITE, 1);
 		if (IS_ERR(cq->umem)) {
 			ret = PTR_ERR(cq->umem);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
index fa96fa4fb829..a85884e90e84 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -126,8 +126,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		return ERR_PTR(-EINVAL);
 	}
 
-	umem = ib_umem_get(pd->uobject->context, start,
-			   length, access_flags, 0);
+	umem = ib_umem_get(udata, start, length, access_flags, 0);
 	if (IS_ERR(umem)) {
 		dev_warn(&dev->pdev->dev,
 			 "could not get umem for mem region\n");
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index 3acf74cbe266..5fc444cef011 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -262,8 +262,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
 
 			if (!is_srq) {
 				/* set qp->sq.wqe_cnt, shift, buf_size.. */
-				qp->rumem = ib_umem_get(pd->uobject->context,
-							ucmd.rbuf_addr,
+				qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr,
 							ucmd.rbuf_size, 0, 0);
 				if (IS_ERR(qp->rumem)) {
 					ret = PTR_ERR(qp->rumem);
@@ -275,8 +274,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
 				qp->srq = to_vsrq(init_attr->srq);
 			}
 
-			qp->sumem = ib_umem_get(pd->uobject->context,
-						ucmd.sbuf_addr,
+			qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr,
 						ucmd.sbuf_size, 0, 0);
 			if (IS_ERR(qp->sumem)) {
 				if (!is_srq)
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
index 06ba7c7a2235..951d9d68107a 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
@@ -153,9 +153,7 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
 		goto err_srq;
 	}
 
-	srq->umem = ib_umem_get(pd->uobject->context,
-				ucmd.buf_addr,
-				ucmd.buf_size, 0, 0);
+	srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0, 0);
 	if (IS_ERR(srq->umem)) {
 		ret = PTR_ERR(srq->umem);
 		goto err_srq;
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 49c9541050d4..8b1c1e8dd7ef 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -388,8 +388,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (length == 0)
 		return ERR_PTR(-EINVAL);
 
-	umem = ib_umem_get(pd->uobject->context, start, length,
-			   mr_access_flags, 0);
+	umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
 	if (IS_ERR(umem))
 		return (void *)umem;
 
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 9d3916b93f23..2438093776a0 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -171,7 +171,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
 	void			*vaddr;
 	int err;
 
-	umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0);
+	umem = ib_umem_get(udata, start, length, access, 0);
 	if (IS_ERR(umem)) {
 		pr_warn("err %d from rxe_umem_get\n",
 			(int)PTR_ERR(umem));
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 5d3755ec5afa..73af05db04c7 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -36,6 +36,7 @@
 #include <linux/list.h>
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
 
 struct ib_ucontext;
 struct ib_umem_odp;
@@ -80,7 +81,7 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem)
 
 #ifdef CONFIG_INFINIBAND_USER_MEM
 
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 			    size_t size, int access, int dmasync);
 void ib_umem_release(struct ib_umem *umem);
 int ib_umem_page_count(struct ib_umem *umem);
@@ -91,9 +92,10 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 
 #include <linux/err.h>
 
-static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
+static inline struct ib_umem *ib_umem_get(struct ib_udata *udata,
 					  unsigned long addr, size_t size,
-					  int access, int dmasync) {
+					  int access, int dmasync)
+{
 	return ERR_PTR(-EINVAL);
 }
 static inline void ib_umem_release(struct ib_umem *umem) { }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 3ddd199ba602..aa1f126d3383 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4200,6 +4200,7 @@ void rdma_roce_rescan_device(struct ib_device *ibdev);
 
 struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile);
 
+struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata);
 
 int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs);
 
-- 
cgit v1.2.3


From 5b438f4ba315db4f8c1489d175656798d58c014f Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 11 Jan 2019 13:04:57 +0800
Subject: iommu/vt-d: Support page request in scalable mode

VT-d Rev3.0 has made a few changes to the page request interface,

1. widened PRQ descriptor from 128 bits to 256 bits;
2. removed streaming response type;
3. introduced private data that requires page response even the
   request is not last request in group (LPIG).

This is a supplement to commit 1c4f88b7f1f92 ("iommu/vt-d: Shared
virtual address in scalable mode") and makes the svm code compliant
with VT-d Rev3.0.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Liu Yi L <yi.l.liu@intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Fixes: 1c4f88b7f1f92 ("iommu/vt-d: Shared virtual address in scalable mode")
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-svm.c   | 77 ++++++++++++++++++++++++++-------------------
 include/linux/intel-iommu.h | 21 ++++++-------
 include/linux/intel-svm.h   |  2 +-
 3 files changed, 55 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index a2a2aa4439aa..79add5716552 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -470,20 +470,31 @@ EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid);
 
 /* Page request queue descriptor */
 struct page_req_dsc {
-	u64 srr:1;
-	u64 bof:1;
-	u64 pasid_present:1;
-	u64 lpig:1;
-	u64 pasid:20;
-	u64 bus:8;
-	u64 private:23;
-	u64 prg_index:9;
-	u64 rd_req:1;
-	u64 wr_req:1;
-	u64 exe_req:1;
-	u64 priv_req:1;
-	u64 devfn:8;
-	u64 addr:52;
+	union {
+		struct {
+			u64 type:8;
+			u64 pasid_present:1;
+			u64 priv_data_present:1;
+			u64 rsvd:6;
+			u64 rid:16;
+			u64 pasid:20;
+			u64 exe_req:1;
+			u64 pm_req:1;
+			u64 rsvd2:10;
+		};
+		u64 qw_0;
+	};
+	union {
+		struct {
+			u64 rd_req:1;
+			u64 wr_req:1;
+			u64 lpig:1;
+			u64 prg_index:9;
+			u64 addr:52;
+		};
+		u64 qw_1;
+	};
+	u64 priv_data[2];
 };
 
 #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)
@@ -596,7 +607,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 		/* Accounting for major/minor faults? */
 		rcu_read_lock();
 		list_for_each_entry_rcu(sdev, &svm->devs, list) {
-			if (sdev->sid == PCI_DEVID(req->bus, req->devfn))
+			if (sdev->sid == req->rid)
 				break;
 		}
 		/* Other devices can go away, but the drivers are not permitted
@@ -609,33 +620,35 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 
 		if (sdev && sdev->ops && sdev->ops->fault_cb) {
 			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
-				(req->exe_req << 1) | (req->priv_req);
-			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, req->private, rwxp, result);
+				(req->exe_req << 1) | (req->pm_req);
+			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
+					    req->priv_data, rwxp, result);
 		}
 		/* We get here in the error case where the PASID lookup failed,
 		   and these can be NULL. Do not use them below this point! */
 		sdev = NULL;
 		svm = NULL;
 	no_pasid:
-		if (req->lpig) {
-			/* Page Group Response */
+		if (req->lpig || req->priv_data_present) {
+			/*
+			 * Per VT-d spec. v3.0 ch7.7, system software must
+			 * respond with page group response if private data
+			 * is present (PDP) or last page in group (LPIG) bit
+			 * is set. This is an additional VT-d feature beyond
+			 * PCI ATS spec.
+			 */
 			resp.qw0 = QI_PGRP_PASID(req->pasid) |
-				QI_PGRP_DID((req->bus << 8) | req->devfn) |
+				QI_PGRP_DID(req->rid) |
 				QI_PGRP_PASID_P(req->pasid_present) |
+				QI_PGRP_PDP(req->pasid_present) |
+				QI_PGRP_RESP_CODE(result) |
 				QI_PGRP_RESP_TYPE;
 			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
-				QI_PGRP_PRIV(req->private) |
-				QI_PGRP_RESP_CODE(result);
-		} else if (req->srr) {
-			/* Page Stream Response */
-			resp.qw0 = QI_PSTRM_IDX(req->prg_index) |
-				QI_PSTRM_PRIV(req->private) |
-				QI_PSTRM_BUS(req->bus) |
-				QI_PSTRM_PASID(req->pasid) |
-				QI_PSTRM_RESP_TYPE;
-			resp.qw1 = QI_PSTRM_ADDR(address) |
-				QI_PSTRM_DEVFN(req->devfn) |
-				QI_PSTRM_RESP_CODE(result);
+				QI_PGRP_LPIG(req->lpig);
+
+			if (req->priv_data_present)
+				memcpy(&resp.qw2, req->priv_data,
+				       sizeof(req->priv_data));
 		}
 		resp.qw2 = 0;
 		resp.qw3 = 0;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 0605f3bf6e79..fa364de9db18 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -374,20 +374,17 @@ enum {
 #define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xfff) << 52))
 #define QI_DEV_EIOTLB_MAX_INVS	32
 
-#define QI_PGRP_IDX(idx)	(((u64)(idx)) << 55)
-#define QI_PGRP_PRIV(priv)	(((u64)(priv)) << 32)
-#define QI_PGRP_RESP_CODE(res)	((u64)(res))
-#define QI_PGRP_PASID(pasid)	(((u64)(pasid)) << 32)
-#define QI_PGRP_DID(did)	(((u64)(did)) << 16)
+/* Page group response descriptor QW0 */
 #define QI_PGRP_PASID_P(p)	(((u64)(p)) << 4)
+#define QI_PGRP_PDP(p)		(((u64)(p)) << 5)
+#define QI_PGRP_RESP_CODE(res)	(((u64)(res)) << 12)
+#define QI_PGRP_DID(rid)	(((u64)(rid)) << 16)
+#define QI_PGRP_PASID(pasid)	(((u64)(pasid)) << 32)
+
+/* Page group response descriptor QW1 */
+#define QI_PGRP_LPIG(x)		(((u64)(x)) << 2)
+#define QI_PGRP_IDX(idx)	(((u64)(idx)) << 3)
 
-#define QI_PSTRM_ADDR(addr)	(((u64)(addr)) & VTD_PAGE_MASK)
-#define QI_PSTRM_DEVFN(devfn)	(((u64)(devfn)) << 4)
-#define QI_PSTRM_RESP_CODE(res)	((u64)(res))
-#define QI_PSTRM_IDX(idx)	(((u64)(idx)) << 55)
-#define QI_PSTRM_PRIV(priv)	(((u64)(priv)) << 32)
-#define QI_PSTRM_BUS(bus)	(((u64)(bus)) << 24)
-#define QI_PSTRM_PASID(pasid)	(((u64)(pasid)) << 4)
 
 #define QI_RESP_SUCCESS		0x0
 #define QI_RESP_INVALID		0x1
diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 99bc5b3ae26e..e3f76315ca4d 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -20,7 +20,7 @@ struct device;
 
 struct svm_dev_ops {
 	void (*fault_cb)(struct device *dev, int pasid, u64 address,
-			 u32 private, int rwxp, int response);
+			 void *private, int rwxp, int response);
 };
 
 /* Values for rxwp in fault_cb callback */
-- 
cgit v1.2.3


From ff525b6e2d3011d5a1ef08d8b3f4f745c7245412 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Sat, 5 Jan 2019 00:01:26 +0800
Subject: scsi: libsas: Fix some indentation in libsas.h

Currently much indentation in this file is done with whitespaces instead of
tabs, which can make reading difficult, so fix this up.

Some other little minor tidy-up is done, but this file still has many other
checkpatch warnings (generally linelength > 80 or function arguments have
no identifier names).

All libsas code can be audited for checkpatch issues later.

Signed-off-by: John Garry <john.garry@huawei.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/libsas.h | 56 +++++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index 3de3b10da19a..857086cf7ebf 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -52,8 +52,8 @@ enum sas_phy_role {
 };
 
 enum sas_phy_type {
-        PHY_TYPE_PHYSICAL,
-        PHY_TYPE_VIRTUAL
+	PHY_TYPE_PHYSICAL,
+	PHY_TYPE_VIRTUAL
 };
 
 /* The events are mnemonically described in sas_dump.c
@@ -91,7 +91,7 @@ enum discover_event {
 
 #define to_dom_device(_obj) container_of(_obj, struct domain_device, dev_obj)
 #define to_dev_attr(_attr)  container_of(_attr, struct domain_dev_attribute,\
-                                         attr)
+					 attr)
 
 enum routing_attribute {
 	DIRECT_ROUTING,
@@ -184,37 +184,37 @@ struct domain_device {
 	spinlock_t done_lock;
 	enum sas_device_type dev_type;
 
-        enum sas_linkrate linkrate;
-        enum sas_linkrate min_linkrate;
-        enum sas_linkrate max_linkrate;
+	enum sas_linkrate linkrate;
+	enum sas_linkrate min_linkrate;
+	enum sas_linkrate max_linkrate;
 
-        int  pathways;
+	int  pathways;
 
-        struct domain_device *parent;
-        struct list_head siblings; /* devices on the same level */
-        struct asd_sas_port *port;        /* shortcut to root of the tree */
+	struct domain_device *parent;
+	struct list_head siblings; /* devices on the same level */
+	struct asd_sas_port *port;        /* shortcut to root of the tree */
 	struct sas_phy *phy;
 
-        struct list_head dev_list_node;
+	struct list_head dev_list_node;
 	struct list_head disco_list_node; /* awaiting probe or destruct */
 
-        enum sas_protocol    iproto;
-        enum sas_protocol    tproto;
+	enum sas_protocol    iproto;
+	enum sas_protocol    tproto;
 
-        struct sas_rphy *rphy;
+	struct sas_rphy *rphy;
 
-        u8  sas_addr[SAS_ADDR_SIZE];
-        u8  hashed_sas_addr[HASHED_SAS_ADDR_SIZE];
+	u8  sas_addr[SAS_ADDR_SIZE];
+	u8  hashed_sas_addr[HASHED_SAS_ADDR_SIZE];
 
-        u8  frame_rcvd[32];
+	u8  frame_rcvd[32];
 
-        union {
-                struct expander_device ex_dev;
-                struct sata_device     sata_dev; /* STP & directly attached */
+	union {
+		struct expander_device ex_dev;
+		struct sata_device     sata_dev; /* STP & directly attached */
 		struct ssp_device      ssp_dev;
-        };
+	};
 
-        void *lldd_dev;
+	void *lldd_dev;
 	unsigned long state;
 	struct kref kref;
 };
@@ -512,10 +512,10 @@ enum exec_status {
 
 /* When a task finishes with a response, the LLDD examines the
  * response:
- * 	- For an ATA task task_status_struct::stat is set to
+ *	- For an ATA task task_status_struct::stat is set to
  * SAS_PROTO_RESPONSE, and the task_status_struct::buf is set to the
  * contents of struct ata_task_resp.
- * 	- For SSP tasks, if no data is present or status/TMF response
+ *	- For SSP tasks, if no data is present or status/TMF response
  * is valid, task_status_struct::stat is set.  If data is present
  * (SENSE data), the LLDD copies up to SAS_STATUS_BUF_SIZE, sets
  * task_status_struct::buf_valid_size, and task_status_struct::stat is
@@ -671,15 +671,13 @@ extern void sas_prep_resume_ha(struct sas_ha_struct *sas_ha);
 extern void sas_resume_ha(struct sas_ha_struct *sas_ha);
 extern void sas_suspend_ha(struct sas_ha_struct *sas_ha);
 
-int sas_set_phy_speed(struct sas_phy *phy,
-		      struct sas_phy_linkrates *rates);
+int sas_set_phy_speed(struct sas_phy *phy, struct sas_phy_linkrates *rates);
 int sas_phy_reset(struct sas_phy *phy, int hard_reset);
-extern int sas_queuecommand(struct Scsi_Host * ,struct scsi_cmnd *);
+extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *);
 extern int sas_target_alloc(struct scsi_target *);
 extern int sas_slave_configure(struct scsi_device *);
 extern int sas_change_queue_depth(struct scsi_device *, int new_depth);
-extern int sas_bios_param(struct scsi_device *,
-			  struct block_device *,
+extern int sas_bios_param(struct scsi_device *, struct block_device *,
 			  sector_t capacity, int *hsc);
 extern struct scsi_transport_template *
 sas_domain_attach_transport(struct sas_domain_function_template *);
-- 
cgit v1.2.3


From ea4baf7f116a18382df331db2123d98bc1c3cd83 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Tue, 18 Dec 2018 14:28:30 +0200
Subject: RDMA: Rename port_callback to init_port

Most provider routines are callback routines which ib core invokes.
_callback suffix doesn't convey information about when such callback is
invoked. Therefore, rename port_callback to init_port.

Additionally, store the init_port function pointer in ib_device_ops, so
that it can be accessed in subsequent patches when binding rdma device to
net namespace.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/core_priv.h            |  4 +---
 drivers/infiniband/core/device.c               |  7 +++----
 drivers/infiniband/core/sysfs.c                | 16 ++++++----------
 drivers/infiniband/hw/bnxt_re/main.c           |  2 +-
 drivers/infiniband/hw/cxgb3/iwch_provider.c    |  2 +-
 drivers/infiniband/hw/cxgb4/provider.c         |  2 +-
 drivers/infiniband/hw/hfi1/verbs.c             |  2 +-
 drivers/infiniband/hw/hns/hns_roce_main.c      |  2 +-
 drivers/infiniband/hw/i40iw/i40iw_verbs.c      |  2 +-
 drivers/infiniband/hw/mlx4/main.c              |  2 +-
 drivers/infiniband/hw/mlx5/main.c              |  2 +-
 drivers/infiniband/hw/mthca/mthca_provider.c   |  2 +-
 drivers/infiniband/hw/nes/nes_verbs.c          |  2 +-
 drivers/infiniband/hw/ocrdma/ocrdma_main.c     |  2 +-
 drivers/infiniband/hw/qedr/main.c              |  2 +-
 drivers/infiniband/hw/qib/qib_verbs.c          |  2 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c    |  2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c |  2 +-
 drivers/infiniband/sw/rdmavt/vt.c              |  5 ++---
 drivers/infiniband/sw/rxe/rxe_verbs.c          |  2 +-
 include/rdma/ib_verbs.h                        | 10 +++++++---
 include/rdma/rdma_vt.h                         |  3 ---
 22 files changed, 35 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 3cd830d52967..aca75c74e451 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -54,9 +54,7 @@ struct pkey_index_qp_list {
 	struct list_head    qp_list;
 };
 
-int  ib_device_register_sysfs(struct ib_device *device,
-			      int (*port_callback)(struct ib_device *,
-						   u8, struct kobject *));
+int ib_device_register_sysfs(struct ib_device *device);
 void ib_device_unregister_sysfs(struct ib_device *device);
 int ib_device_rename(struct ib_device *ibdev, const char *name);
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8872453e26c0..66867e92ddea 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -574,9 +574,7 @@ port_cleanup:
  * callback for each device that is added. @device must be allocated
  * with ib_alloc_device().
  */
-int ib_register_device(struct ib_device *device, const char *name,
-		       int (*port_callback)(struct ib_device *, u8,
-					    struct kobject *))
+int ib_register_device(struct ib_device *device, const char *name)
 {
 	int ret;
 	struct ib_client *client;
@@ -613,7 +611,7 @@ int ib_register_device(struct ib_device *device, const char *name,
 		goto dev_cleanup;
 	}
 
-	ret = ib_device_register_sysfs(device, port_callback);
+	ret = ib_device_register_sysfs(device);
 	if (ret) {
 		dev_warn(&device->dev,
 			 "Couldn't register device with driver model\n");
@@ -1283,6 +1281,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, get_vector_affinity);
 	SET_DEVICE_OP(dev_ops, get_vf_config);
 	SET_DEVICE_OP(dev_ops, get_vf_stats);
+	SET_DEVICE_OP(dev_ops, init_port);
 	SET_DEVICE_OP(dev_ops, map_mr_sg);
 	SET_DEVICE_OP(dev_ops, map_phys_fmr);
 	SET_DEVICE_OP(dev_ops, mmap);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 80f68eb0ba5c..7a5679933df6 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -1015,9 +1015,7 @@ err_free_stats:
 	return;
 }
 
-static int add_port(struct ib_device *device, int port_num,
-		    int (*port_callback)(struct ib_device *,
-					 u8, struct kobject *))
+static int add_port(struct ib_device *device, int port_num)
 {
 	struct ib_port *p;
 	struct ib_port_attr attr;
@@ -1113,8 +1111,8 @@ static int add_port(struct ib_device *device, int port_num,
 	if (ret)
 		goto err_free_pkey;
 
-	if (port_callback) {
-		ret = port_callback(device, port_num, &p->kobj);
+	if (device->ops.init_port) {
+		ret = device->ops.init_port(device, port_num, &p->kobj);
 		if (ret)
 			goto err_remove_pkey;
 	}
@@ -1308,9 +1306,7 @@ static void free_port_list_attributes(struct ib_device *device)
 	kobject_put(device->ports_kobj);
 }
 
-int ib_device_register_sysfs(struct ib_device *device,
-			     int (*port_callback)(struct ib_device *,
-						  u8, struct kobject *))
+int ib_device_register_sysfs(struct ib_device *device)
 {
 	struct device *class_dev = &device->dev;
 	int ret;
@@ -1330,12 +1326,12 @@ int ib_device_register_sysfs(struct ib_device *device,
 	}
 
 	if (rdma_cap_ib_switch(device)) {
-		ret = add_port(device, 0, port_callback);
+		ret = add_port(device, 0);
 		if (ret)
 			goto err_put;
 	} else {
 		for (i = 1; i <= device->phys_port_cnt; ++i) {
-			ret = add_port(device, i, port_callback);
+			ret = add_port(device, i);
 			if (ret)
 				goto err_put;
 		}
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index e7a997f2a537..797a3e943366 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -662,7 +662,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 	rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group);
 	ibdev->driver_id = RDMA_DRIVER_BNXT_RE;
 	ib_set_device_ops(ibdev, &bnxt_re_dev_ops);
-	return ib_register_device(ibdev, "bnxt_re%d", NULL);
+	return ib_register_device(ibdev, "bnxt_re%d");
 }
 
 static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 92ee6761a3bd..ffdde3cca268 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1409,7 +1409,7 @@ int iwch_register_device(struct iwch_dev *dev)
 	dev->ibdev.driver_id = RDMA_DRIVER_CXGB3;
 	rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
 	ib_set_device_ops(&dev->ibdev, &iwch_dev_ops);
-	ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL);
+	ret = ib_register_device(&dev->ibdev, "cxgb3_%d");
 	if (ret)
 		kfree(dev->ibdev.iwcm);
 	return ret;
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 586b0c37481f..0a99894b0160 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -634,7 +634,7 @@ void c4iw_register_device(struct work_struct *work)
 	rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
 	dev->ibdev.driver_id = RDMA_DRIVER_CXGB4;
 	ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
-	ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL);
+	ret = ib_register_device(&dev->ibdev, "cxgb4_%d");
 	if (ret)
 		goto err_kfree_iwcm;
 	return;
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index ec582d86025f..8957adf58af7 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1622,6 +1622,7 @@ static const struct ib_device_ops hfi1_dev_ops = {
 	.alloc_rdma_netdev = hfi1_vnic_alloc_rn,
 	.get_dev_fw_str = hfi1_get_dev_fw_str,
 	.get_hw_stats = get_hw_stats,
+	.init_port = hfi1_create_port_files,
 	.modify_device = modify_device,
 	/* keep process mad in the driver */
 	.process_mad = hfi1_process_mad,
@@ -1679,7 +1680,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	/*
 	 * Fill in rvt info object.
 	 */
-	dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
 	dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
 	dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
 	dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 44a07fce0617..46ede58ef3b8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -564,7 +564,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 	ib_dev->driver_id = RDMA_DRIVER_HNS;
 	ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
 	ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
-	ret = ib_register_device(ib_dev, "hns_%d", NULL);
+	ret = ib_register_device(ib_dev, "hns_%d");
 	if (ret) {
 		dev_err(dev, "ib_register_device failed!\n");
 		return ret;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 80b66df95362..af66ab9d150b 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -2868,7 +2868,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
 	iwibdev = iwdev->iwibdev;
 	rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
 	iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW;
-	ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL);
+	ret = ib_register_device(&iwibdev->ibdev, "i40iw%d");
 	if (ret)
 		goto error;
 
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 1f15ec3e2b83..c3f950d82ed0 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2856,7 +2856,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 
 	rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group);
 	ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4;
-	if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL))
+	if (ib_register_device(&ibdev->ib_dev, "mlx4_%d"))
 		goto err_diag_counters;
 
 	if (mlx4_ib_mad_init(ibdev))
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 717e8535b41a..948617a60d44 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6233,7 +6233,7 @@ int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
 		name = "mlx5_%d";
 	else
 		name = "mlx5_bond_%d";
-	return ib_register_device(&dev->ib_dev, name, NULL);
+	return ib_register_device(&dev->ib_dev, name);
 }
 
 void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 0a939ea5497b..3473c6c51b92 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1338,7 +1338,7 @@ int mthca_register_device(struct mthca_dev *dev)
 
 	rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group);
 	dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA;
-	ret = ib_register_device(&dev->ib_dev, "mthca%d", NULL);
+	ret = ib_register_device(&dev->ib_dev, "mthca%d");
 	if (ret)
 		return ret;
 
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index feb4d259aab9..34601f0cbd74 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -3801,7 +3801,7 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev)
 
 	rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group);
 	nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES;
-	ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d", NULL);
+	ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d");
 	if (ret) {
 		return ret;
 	}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 1f393842453a..f45b996f617f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -243,7 +243,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 	}
 	rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group);
 	dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA;
-	return ib_register_device(&dev->ibdev, "ocrdma%d", NULL);
+	return ib_register_device(&dev->ibdev, "ocrdma%d");
 }
 
 static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 75940e2a8791..8e5c76d06855 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -290,7 +290,7 @@ static int qedr_register_device(struct qedr_dev *dev)
 	ib_set_device_ops(&dev->ibdev, &qedr_dev_ops);
 
 	dev->ibdev.driver_id = RDMA_DRIVER_QEDR;
-	return ib_register_device(&dev->ibdev, "qedr%d", NULL);
+	return ib_register_device(&dev->ibdev, "qedr%d");
 }
 
 /* This function allocates fast-path status block memory */
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 276304f611ab..ff8dab8e2344 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -1494,6 +1494,7 @@ static void qib_fill_device_attr(struct qib_devdata *dd)
 }
 
 static const struct ib_device_ops qib_dev_ops = {
+	.init_port = qib_create_port_files,
 	.modify_device = qib_modify_device,
 	.process_mad = qib_process_mad,
 };
@@ -1567,7 +1568,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
 	/*
 	 * Fill in rvt info object.
 	 */
-	dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files;
 	dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev;
 	dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah;
 	dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index b2323a52a0dd..3201dd1899c7 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -422,7 +422,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 	us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC;
 	rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group);
 
-	if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", NULL))
+	if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d"))
 		goto err_fwd_dealloc;
 
 	usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index eaa109dbc96a..1bc415483d9b 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -278,7 +278,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
 	spin_lock_init(&dev->srq_tbl_lock);
 	rdma_set_device_sysfs_group(&dev->ib_dev, &pvrdma_attr_group);
 
-	ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", NULL);
+	ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d");
 	if (ret)
 		goto err_srq_free;
 
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 8724a1817158..7de7389d0235 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -446,7 +446,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
 		 * These functions are not part of verbs specifically but are
 		 * required for rdmavt to function.
 		 */
-		if ((!rdi->driver_f.port_callback) ||
+		if ((!rdi->ibdev.ops.init_port) ||
 		    (!rdi->driver_f.get_pci_dev))
 			return -EINVAL;
 		break;
@@ -644,8 +644,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
 
 	rdi->ibdev.driver_id = driver_id;
 	/* We are now good to announce we exist */
-	ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev),
-				 rdi->driver_f.port_callback);
+	ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev));
 	if (ret) {
 		rvt_pr_err(rdi, "Failed to register driver with ib core.\n");
 		goto bail_wss;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index b20e6e0415f5..43171148e9c5 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1258,7 +1258,7 @@ int rxe_register_device(struct rxe_dev *rxe)
 
 	rdma_set_device_sysfs_group(dev, &rxe_attr_group);
 	dev->driver_id = RDMA_DRIVER_RXE;
-	err = ib_register_device(dev, "rxe%d", NULL);
+	err = ib_register_device(dev, "rxe%d");
 	if (err) {
 		pr_warn("%s failed with error %d\n", __func__, err);
 		goto err1;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index aa1f126d3383..1d1902fd9f87 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2504,6 +2504,12 @@ struct ib_device_ops {
 	 */
 	int (*get_hw_stats)(struct ib_device *device,
 			    struct rdma_hw_stats *stats, u8 port, int index);
+	/*
+	 * This function is called once for each port when a ib device is
+	 * registered.
+	 */
+	int (*init_port)(struct ib_device *device, u8 port_num,
+			 struct kobject *port_sysfs);
 };
 
 struct ib_device {
@@ -2620,9 +2626,7 @@ void ib_dealloc_device(struct ib_device *device);
 
 void ib_get_device_fw_str(struct ib_device *device, char *str);
 
-int ib_register_device(struct ib_device *device, const char *name,
-		       int (*port_callback)(struct ib_device *, u8,
-					    struct kobject *));
+int ib_register_device(struct ib_device *device, const char *name);
 void ib_unregister_device(struct ib_device *device);
 
 int ib_register_client   (struct ib_client *client);
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index dd0ed8048bb4..acb3bc96dfa7 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -250,9 +250,6 @@ struct rvt_driver_provided {
 	 */
 	void (*do_send)(struct rvt_qp *qp);
 
-	/* Passed to ib core registration. Callback to create syfs files */
-	int (*port_callback)(struct ib_device *, u8, struct kobject *);
-
 	/*
 	 * Returns a pointer to the undelying hardware's PCI device. This is
 	 * used to display information as to what hardware is being referenced
-- 
cgit v1.2.3


From 54747231150f0dddf68f2ee29ec2970fcc433909 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Tue, 18 Dec 2018 14:15:56 +0200
Subject: RDMA: Introduce and use rdma_device_to_ibdev()

Introduce and use rdma_device_to_ibdev() API for those drivers which are
registering one sysfs group and also use in ib_core.

In subsequent patch, device->provider_ibdev one-to-one mapping is no
longer holds true during accessing sysfs entries.
Therefore, introduce an API rdma_device_to_ibdev() that provides such
information.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c             |  2 --
 drivers/infiniband/core/sysfs.c              | 12 ++++++------
 drivers/infiniband/hw/bnxt_re/main.c         |  6 ++++--
 drivers/infiniband/hw/cxgb3/iwch_provider.c  | 14 ++++++++------
 drivers/infiniband/hw/cxgb4/provider.c       | 14 ++++++++------
 drivers/infiniband/hw/hfi1/sysfs.c           | 16 ++++++++--------
 drivers/infiniband/hw/i40iw/i40iw_verbs.c    |  5 ++---
 drivers/infiniband/hw/mlx4/main.c            |  7 ++++---
 drivers/infiniband/hw/mlx5/main.c            | 13 ++++++++-----
 drivers/infiniband/hw/mthca/mthca_provider.c |  9 ++++++---
 drivers/infiniband/hw/nes/nes_verbs.c        |  2 +-
 drivers/infiniband/hw/ocrdma/ocrdma_main.c   |  6 ++++--
 drivers/infiniband/hw/qedr/main.c            |  3 ++-
 drivers/infiniband/hw/qib/qib_sysfs.c        | 18 +++++++++---------
 drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 26 +++++++++++---------------
 drivers/infiniband/sw/rxe/rxe_verbs.c        |  4 ++--
 include/rdma/ib_verbs.h                      | 23 +++++++++++++++++++++++
 17 files changed, 106 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 66867e92ddea..f8180cf1a004 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -296,8 +296,6 @@ struct ib_device *ib_alloc_device(size_t size)
 	device->dev.class = &ib_class;
 	device_initialize(&device->dev);
 
-	dev_set_drvdata(&device->dev, device);
-
 	INIT_LIST_HEAD(&device->event_handler_list);
 	spin_lock_init(&device->event_handler_lock);
 	rwlock_init(&device->client_data_lock);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 7a5679933df6..c75692802da8 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -1187,7 +1187,7 @@ err_put:
 static ssize_t node_type_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	switch (dev->node_type) {
 	case RDMA_NODE_IB_CA:	  return sprintf(buf, "%d: CA\n", dev->node_type);
@@ -1204,7 +1204,7 @@ static DEVICE_ATTR_RO(node_type);
 static ssize_t sys_image_guid_show(struct device *device,
 				   struct device_attribute *dev_attr, char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
 		       be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]),
@@ -1217,7 +1217,7 @@ static DEVICE_ATTR_RO(sys_image_guid);
 static ssize_t node_guid_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
 		       be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
@@ -1230,7 +1230,7 @@ static DEVICE_ATTR_RO(node_guid);
 static ssize_t node_desc_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	return sprintf(buf, "%.64s\n", dev->node_desc);
 }
@@ -1239,7 +1239,7 @@ static ssize_t node_desc_store(struct device *device,
 			       struct device_attribute *attr,
 			       const char *buf, size_t count)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 	struct ib_device_modify desc = {};
 	int ret;
 
@@ -1258,7 +1258,7 @@ static DEVICE_ATTR_RW(node_desc);
 static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr,
 			   char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	ib_get_device_fw_str(dev, buf);
 	strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 797a3e943366..16eecfa5882c 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -538,7 +538,8 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
 static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
 			   char *buf)
 {
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
+	struct bnxt_re_dev *rdev =
+		rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev);
 
 	return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor);
 }
@@ -547,7 +548,8 @@ static DEVICE_ATTR_RO(hw_rev);
 static ssize_t hca_type_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
+	struct bnxt_re_dev *rdev =
+		rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev);
 
 	return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc);
 }
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index ffdde3cca268..07c20cd07f33 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1130,8 +1130,9 @@ static int iwch_query_port(struct ib_device *ibdev,
 static ssize_t hw_rev_show(struct device *dev,
 			   struct device_attribute *attr, char *buf)
 {
-	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
-						 ibdev.dev);
+	struct iwch_dev *iwch_dev =
+			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
+
 	pr_debug("%s dev 0x%p\n", __func__, dev);
 	return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
 }
@@ -1140,8 +1141,8 @@ static DEVICE_ATTR_RO(hw_rev);
 static ssize_t hca_type_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
-	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
-						 ibdev.dev);
+	struct iwch_dev *iwch_dev =
+			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
 	struct ethtool_drvinfo info;
 	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
 
@@ -1154,8 +1155,9 @@ static DEVICE_ATTR_RO(hca_type);
 static ssize_t board_id_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
-	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
-						 ibdev.dev);
+	struct iwch_dev *iwch_dev =
+			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
+
 	pr_debug("%s dev 0x%p\n", __func__, dev);
 	return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor,
 		       iwch_dev->rdev.rnic_info.pdev->device);
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 0a99894b0160..f977f8e7e162 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -376,8 +376,9 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port,
 static ssize_t hw_rev_show(struct device *dev,
 			   struct device_attribute *attr, char *buf)
 {
-	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
-						 ibdev.dev);
+	struct c4iw_dev *c4iw_dev =
+			rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
+
 	pr_debug("dev 0x%p\n", dev);
 	return sprintf(buf, "%d\n",
 		       CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
@@ -387,8 +388,8 @@ static DEVICE_ATTR_RO(hw_rev);
 static ssize_t hca_type_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
-	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
-						 ibdev.dev);
+	struct c4iw_dev *c4iw_dev =
+			rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
 	struct ethtool_drvinfo info;
 	struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
 
@@ -401,8 +402,9 @@ static DEVICE_ATTR_RO(hca_type);
 static ssize_t board_id_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
-	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
-						 ibdev.dev);
+	struct c4iw_dev *c4iw_dev =
+			rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
+
 	pr_debug("dev 0x%p\n", dev);
 	return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
 		       c4iw_dev->rdev.lldi.pdev->device);
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
index 2be513d4c9da..90f62c4bddba 100644
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -498,7 +498,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
 			   char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 
 	return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
 }
@@ -508,7 +508,7 @@ static ssize_t board_id_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 	int ret;
 
@@ -524,7 +524,7 @@ static ssize_t boardversion_show(struct device *device,
 				 struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	/* The string printed here is already newline-terminated. */
@@ -536,7 +536,7 @@ static ssize_t nctxts_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	/*
@@ -555,7 +555,7 @@ static ssize_t nfreectxts_show(struct device *device,
 			       struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	/* Return the number of free user ports (contexts) available. */
@@ -567,7 +567,7 @@ static ssize_t serial_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
@@ -579,7 +579,7 @@ static ssize_t chip_reset_store(struct device *device,
 				size_t count)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 	int ret;
 
@@ -609,7 +609,7 @@ static ssize_t tempsense_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 	struct hfi1_temp temp;
 	int ret;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index af66ab9d150b..12b31a8440be 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -2139,9 +2139,8 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr)
 static ssize_t hw_rev_show(struct device *dev,
 			   struct device_attribute *attr, char *buf)
 {
-	struct i40iw_ib_device *iwibdev = container_of(dev,
-						       struct i40iw_ib_device,
-						       ibdev.dev);
+	struct i40iw_ib_device *iwibdev =
+		rdma_device_to_drv_device(dev, struct i40iw_ib_device, ibdev);
 	u32 hw_rev = iwibdev->iwdev->sc_dev.hw_rev;
 
 	return sprintf(buf, "%x\n", hw_rev);
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index c3f950d82ed0..dc2ffd293a11 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2043,7 +2043,7 @@ static ssize_t hca_type_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
-		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
 	return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
 }
 static DEVICE_ATTR_RO(hca_type);
@@ -2052,7 +2052,7 @@ static ssize_t hw_rev_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
-		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
 	return sprintf(buf, "%x\n", dev->dev->rev_id);
 }
 static DEVICE_ATTR_RO(hw_rev);
@@ -2061,7 +2061,8 @@ static ssize_t board_id_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
-		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
+
 	return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN,
 		       dev->dev->board_id);
 }
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 948617a60d44..4b1b56d54301 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4104,7 +4104,7 @@ static ssize_t fw_pages_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
 
 	return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
 }
@@ -4114,7 +4114,7 @@ static ssize_t reg_pages_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
 
 	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
 }
@@ -4124,7 +4124,8 @@ static ssize_t hca_type_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
 	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
 }
 static DEVICE_ATTR_RO(hca_type);
@@ -4133,7 +4134,8 @@ static ssize_t hw_rev_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
 	return sprintf(buf, "%x\n", dev->mdev->rev_id);
 }
 static DEVICE_ATTR_RO(hw_rev);
@@ -4142,7 +4144,8 @@ static ssize_t board_id_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
 	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
 		       dev->mdev->board_id);
 }
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 3473c6c51b92..63003b4d2485 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1081,7 +1081,8 @@ static ssize_t hw_rev_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct mthca_dev *dev =
-		container_of(device, struct mthca_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
 	return sprintf(buf, "%x\n", dev->rev_id);
 }
 static DEVICE_ATTR_RO(hw_rev);
@@ -1090,7 +1091,8 @@ static ssize_t hca_type_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct mthca_dev *dev =
-		container_of(device, struct mthca_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
 	switch (dev->pdev->device) {
 	case PCI_DEVICE_ID_MELLANOX_TAVOR:
 		return sprintf(buf, "MT23108\n");
@@ -1111,7 +1113,8 @@ static ssize_t board_id_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct mthca_dev *dev =
-		container_of(device, struct mthca_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
 	return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id);
 }
 static DEVICE_ATTR_RO(board_id);
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 34601f0cbd74..034156f7e9ed 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -2560,7 +2560,7 @@ static ssize_t hw_rev_show(struct device *dev,
 			   struct device_attribute *attr, char *buf)
 {
 	struct nes_ib_device *nesibdev =
-			container_of(dev, struct nes_ib_device, ibdev.dev);
+		rdma_device_to_drv_device(dev, struct nes_ib_device, ibdev);
 	struct nes_vnic *nesvnic = nesibdev->nesvnic;
 
 	nes_debug(NES_DBG_INIT, "\n");
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index f45b996f617f..b0491b9ecfe4 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -118,7 +118,8 @@ static void get_dev_fw_str(struct ib_device *device, char *str)
 static ssize_t hw_rev_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
-	struct ocrdma_dev *dev = dev_get_drvdata(device);
+	struct ocrdma_dev *dev =
+		rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev);
 
 	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
 }
@@ -127,7 +128,8 @@ static DEVICE_ATTR_RO(hw_rev);
 static ssize_t hca_type_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
-	struct ocrdma_dev *dev = dev_get_drvdata(device);
+	struct ocrdma_dev *dev =
+		rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev);
 
 	return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]);
 }
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 8e5c76d06855..f85e72b65a10 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -137,7 +137,8 @@ static int qedr_iw_port_immutable(struct ib_device *ibdev, u8 port_num,
 static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
 			   char *buf)
 {
-	struct qedr_dev *dev = dev_get_drvdata(device);
+	struct qedr_dev *dev =
+		rdma_device_to_drv_device(device, struct qedr_dev, ibdev);
 
 	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor);
 }
diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c
index 1cf4ca3f23e3..905206a0c2d5 100644
--- a/drivers/infiniband/hw/qib/qib_sysfs.c
+++ b/drivers/infiniband/hw/qib/qib_sysfs.c
@@ -555,7 +555,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
 			   char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 
 	return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
 }
@@ -565,7 +565,7 @@ static ssize_t hca_type_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 	int ret;
 
@@ -590,7 +590,7 @@ static ssize_t boardversion_show(struct device *device,
 				 struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	/* The string printed here is already newline-terminated. */
@@ -602,7 +602,7 @@ static ssize_t localbus_info_show(struct device *device,
 				  struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	/* The string printed here is already newline-terminated. */
@@ -614,7 +614,7 @@ static ssize_t nctxts_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	/* Return the number of user ports (contexts) available. */
@@ -630,7 +630,7 @@ static ssize_t nfreectxts_show(struct device *device,
 			       struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	/* Return the number of free user ports (contexts) available. */
@@ -642,7 +642,7 @@ static ssize_t serial_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	buf[sizeof(dd->serial)] = '\0';
@@ -657,7 +657,7 @@ static ssize_t chip_reset_store(struct device *device,
 				size_t count)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 	int ret;
 
@@ -679,7 +679,7 @@ static ssize_t tempsense_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 	int ret;
 	int idx;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
index a7e4b2ccfaf8..c85d48ae7442 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -50,7 +50,7 @@ static ssize_t board_id_show(struct device *device,
 			     struct device_attribute *attr, char *buf)
 {
 	struct usnic_ib_dev *us_ibdev =
-		container_of(device, struct usnic_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 	unsigned short subsystem_device_id;
 
 	mutex_lock(&us_ibdev->usdev_lock);
@@ -67,14 +67,13 @@ static DEVICE_ATTR_RO(board_id);
 static ssize_t
 config_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 	char *ptr;
 	unsigned left;
 	unsigned n;
 	enum usnic_vnic_res_type res_type;
 
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
-
 	/* Buffer space limit is 1 page */
 	ptr = buf;
 	left = PAGE_SIZE;
@@ -130,9 +129,8 @@ static DEVICE_ATTR_RO(config);
 static ssize_t
 iface_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
-
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%s\n",
 			netdev_name(us_ibdev->netdev));
@@ -142,9 +140,8 @@ static DEVICE_ATTR_RO(iface);
 static ssize_t
 max_vf_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
-
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%u\n",
 			kref_read(&us_ibdev->vf_cnt));
@@ -154,10 +151,10 @@ static DEVICE_ATTR_RO(max_vf);
 static ssize_t
 qp_per_vf_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 	int qp_per_vf;
 
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
 	qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
 			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
 
@@ -169,9 +166,8 @@ static DEVICE_ATTR_RO(qp_per_vf);
 static ssize_t
 cq_per_vf_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
-
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%d\n",
 			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 43171148e9c5..3d01247a28db 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1129,8 +1129,8 @@ static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
 static ssize_t parent_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
-	struct rxe_dev *rxe = container_of(device, struct rxe_dev,
-					   ib_dev.dev);
+	struct rxe_dev *rxe =
+		rdma_device_to_drv_device(device, struct rxe_dev, ib_dev);
 
 	return snprintf(buf, 16, "%s\n", rxe_parent_name(rxe, 1));
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1d1902fd9f87..94b6e1dd4dab 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4241,4 +4241,27 @@ rdma_set_device_sysfs_group(struct ib_device *dev,
 	dev->groups[1] = group;
 }
 
+/**
+ * rdma_device_to_ibdev - Get ib_device pointer from device pointer
+ *
+ * @device:	device pointer for which ib_device pointer to retrieve
+ *
+ * rdma_device_to_ibdev() retrieves ib_device pointer from device.
+ *
+ */
+static inline struct ib_device *rdma_device_to_ibdev(struct device *device)
+{
+	return container_of(device, struct ib_device, dev);
+}
+
+/**
+ * rdma_device_to_drv_device - Helper macro to reach back to driver's
+ *			       ib_device holder structure from device pointer.
+ *
+ * NOTE: New drivers should not make use of this API; This API is only for
+ * existing drivers who have exposed sysfs entries using
+ * rdma_set_device_sysfs_group().
+ */
+#define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member)           \
+	container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member)
 #endif /* IB_VERBS_H */
-- 
cgit v1.2.3


From b33a02aadcc6330a61e511240b634dc11112e65e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 9 Jan 2019 17:24:55 +0200
Subject: i2c: acpi: Move I2C bits from acpi.h to i2c.h

As discussed previously the best location for certain bus related bits,
e.g. I2C, is its own realm of the headers.

In order to uncontaminate acpi.h move the I2C bits to i2c.h.

There is no functional change intended.

Link: https://lkml.org/lkml/2018/11/28/744
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/acpi.h | 11 -----------
 include/linux/i2c.h  | 10 ++++++++++
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 87715f20b69a..13f5cb2c4763 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1061,17 +1061,6 @@ static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
 }
 #endif
 
-#if defined(CONFIG_ACPI) && IS_ENABLED(CONFIG_I2C)
-bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
-			       struct acpi_resource_i2c_serialbus **i2c);
-#else
-static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
-					     struct acpi_resource_i2c_serialbus **i2c)
-{
-	return false;
-}
-#endif
-
 /* Device properties */
 
 #ifdef CONFIG_ACPI
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index cba59d66c00d..1f45331924d6 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -967,11 +967,21 @@ static inline int of_i2c_get_board_info(struct device *dev,
 
 #endif /* CONFIG_OF */
 
+struct acpi_resource;
+struct acpi_resource_i2c_serialbus;
+
 #if IS_ENABLED(CONFIG_ACPI)
+bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
+			       struct acpi_resource_i2c_serialbus **i2c);
 u32 i2c_acpi_find_bus_speed(struct device *dev);
 struct i2c_client *i2c_acpi_new_device(struct device *dev, int index,
 				       struct i2c_board_info *info);
 #else
+static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares,
+					     struct acpi_resource_i2c_serialbus **i2c)
+{
+	return false;
+}
 static inline u32 i2c_acpi_find_bus_speed(struct device *dev)
 {
 	return 0;
-- 
cgit v1.2.3


From 063755ab1d1c1127adc09703185967862584935b Mon Sep 17 00:00:00 2001
From: Philippe Schenker <philippe.schenker@toradex.com>
Date: Fri, 21 Dec 2018 14:46:31 +0100
Subject: mfd: stmpe: Move ADC related defines to MFD header

Move defines that are ADC related to the header of the overlying MFD,
so they can be used from multiple sub-devices.

Signed-off-by: Philippe Schenker <philippe.schenker@toradex.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/input/touchscreen/stmpe-ts.c | 34 +++++++++++++---------------------
 include/linux/mfd/stmpe.h            | 11 +++++++++++
 2 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/input/touchscreen/stmpe-ts.c b/drivers/input/touchscreen/stmpe-ts.c
index 2a78e27b4495..c5d9006588a2 100644
--- a/drivers/input/touchscreen/stmpe-ts.c
+++ b/drivers/input/touchscreen/stmpe-ts.c
@@ -49,17 +49,6 @@
 
 #define STMPE_IRQ_TOUCH_DET		0
 
-#define SAMPLE_TIME(x)			((x & 0xf) << 4)
-#define MOD_12B(x)			((x & 0x1) << 3)
-#define REF_SEL(x)			((x & 0x1) << 1)
-#define ADC_FREQ(x)			(x & 0x3)
-#define AVE_CTRL(x)			((x & 0x3) << 6)
-#define DET_DELAY(x)			((x & 0x7) << 3)
-#define SETTLING(x)			(x & 0x7)
-#define FRACTION_Z(x)			(x & 0x7)
-#define I_DRIVE(x)			(x & 0x1)
-#define OP_MODE(x)			((x & 0x7) << 1)
-
 #define STMPE_TS_NAME			"stmpe-ts"
 #define XY_MASK				0xfff
 
@@ -213,9 +202,10 @@ static int stmpe_init_hw(struct stmpe_touch *ts)
 		return ret;
 	}
 
-	adc_ctrl1 = SAMPLE_TIME(ts->sample_time) | MOD_12B(ts->mod_12b) |
-		REF_SEL(ts->ref_sel);
-	adc_ctrl1_mask = SAMPLE_TIME(0xff) | MOD_12B(0xff) | REF_SEL(0xff);
+	adc_ctrl1 = STMPE_SAMPLE_TIME(ts->sample_time) |
+		    STMPE_MOD_12B(ts->mod_12b) | STMPE_REF_SEL(ts->ref_sel);
+	adc_ctrl1_mask = STMPE_SAMPLE_TIME(0xff) | STMPE_MOD_12B(0xff) |
+			 STMPE_REF_SEL(0xff);
 
 	ret = stmpe_set_bits(stmpe, STMPE_REG_ADC_CTRL1,
 			adc_ctrl1_mask, adc_ctrl1);
@@ -225,15 +215,17 @@ static int stmpe_init_hw(struct stmpe_touch *ts)
 	}
 
 	ret = stmpe_set_bits(stmpe, STMPE_REG_ADC_CTRL2,
-			ADC_FREQ(0xff), ADC_FREQ(ts->adc_freq));
+			STMPE_ADC_FREQ(0xff), STMPE_ADC_FREQ(ts->adc_freq));
 	if (ret) {
 		dev_err(dev, "Could not setup ADC\n");
 		return ret;
 	}
 
-	tsc_cfg = AVE_CTRL(ts->ave_ctrl) | DET_DELAY(ts->touch_det_delay) |
-			SETTLING(ts->settling);
-	tsc_cfg_mask = AVE_CTRL(0xff) | DET_DELAY(0xff) | SETTLING(0xff);
+	tsc_cfg = STMPE_AVE_CTRL(ts->ave_ctrl) |
+		  STMPE_DET_DELAY(ts->touch_det_delay) |
+		  STMPE_SETTLING(ts->settling);
+	tsc_cfg_mask = STMPE_AVE_CTRL(0xff) | STMPE_DET_DELAY(0xff) |
+		       STMPE_SETTLING(0xff);
 
 	ret = stmpe_set_bits(stmpe, STMPE_REG_TSC_CFG, tsc_cfg_mask, tsc_cfg);
 	if (ret) {
@@ -242,14 +234,14 @@ static int stmpe_init_hw(struct stmpe_touch *ts)
 	}
 
 	ret = stmpe_set_bits(stmpe, STMPE_REG_TSC_FRACTION_Z,
-			FRACTION_Z(0xff), FRACTION_Z(ts->fraction_z));
+			STMPE_FRACTION_Z(0xff), STMPE_FRACTION_Z(ts->fraction_z));
 	if (ret) {
 		dev_err(dev, "Could not config touch\n");
 		return ret;
 	}
 
 	ret = stmpe_set_bits(stmpe, STMPE_REG_TSC_I_DRIVE,
-			I_DRIVE(0xff), I_DRIVE(ts->i_drive));
+			STMPE_I_DRIVE(0xff), STMPE_I_DRIVE(ts->i_drive));
 	if (ret) {
 		dev_err(dev, "Could not config touch\n");
 		return ret;
@@ -263,7 +255,7 @@ static int stmpe_init_hw(struct stmpe_touch *ts)
 	}
 
 	ret = stmpe_set_bits(stmpe, STMPE_REG_TSC_CTRL,
-			OP_MODE(0xff), OP_MODE(OP_MOD_XYZ));
+			STMPE_OP_MODE(0xff), STMPE_OP_MODE(OP_MOD_XYZ));
 	if (ret) {
 		dev_err(dev, "Could not set mode\n");
 		return ret;
diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h
index 4a827af17e59..c0353f6431f9 100644
--- a/include/linux/mfd/stmpe.h
+++ b/include/linux/mfd/stmpe.h
@@ -10,6 +10,17 @@
 
 #include <linux/mutex.h>
 
+#define STMPE_SAMPLE_TIME(x)	((x & 0xf) << 4)
+#define STMPE_MOD_12B(x)	((x & 0x1) << 3)
+#define STMPE_REF_SEL(x)	((x & 0x1) << 1)
+#define STMPE_ADC_FREQ(x)	(x & 0x3)
+#define STMPE_AVE_CTRL(x)	((x & 0x3) << 6)
+#define STMPE_DET_DELAY(x)	((x & 0x7) << 3)
+#define STMPE_SETTLING(x)	(x & 0x7)
+#define STMPE_FRACTION_Z(x)	(x & 0x7)
+#define STMPE_I_DRIVE(x)	(x & 0x1)
+#define STMPE_OP_MODE(x)	((x & 0x7) << 1)
+
 struct device;
 struct regulator;
 
-- 
cgit v1.2.3


From 6377cfa3b857ced301f2079ac97de6c19057ab65 Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Fri, 21 Dec 2018 14:46:32 +0100
Subject: mfd: stmpe: Preparations for STMPE ADC driver

This prepares the MFD for the STMPE ADC driver. This commit introduces
devicetree settings that are used by the ADC and adds an init function.
Common ADC settings that are shared with the touchscreen driver can now
reside in the overlying MFD.

Signed-off-by: Stefan Agner <stefan@agner.ch>
Signed-off-by: Max Krummenacher <max.krummenacher@toradex.com>
Signed-off-by: Philippe Schenker <philippe.schenker@toradex.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig       |  3 ++-
 drivers/mfd/stmpe.c       | 68 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/stmpe.h | 10 +++++++
 3 files changed, 80 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 8c5dfdce4326..bba159e8eaa4 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1204,7 +1204,7 @@ config MFD_STMPE
 
 	  Currently supported devices are:
 
-		STMPE811: GPIO, Touchscreen
+		STMPE811: GPIO, Touchscreen, ADC
 		STMPE1601: GPIO, Keypad
 		STMPE1801: GPIO, Keypad
 		STMPE2401: GPIO, Keypad
@@ -1217,6 +1217,7 @@ config MFD_STMPE
 		GPIO: stmpe-gpio
 		Keypad: stmpe-keypad
 		Touchscreen: stmpe-ts
+		ADC: stmpe-adc
 
 menu "STMicroelectronics STMPE Interface Drivers"
 depends on MFD_STMPE
diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c
index 566caca4efd8..f582531a8f3e 100644
--- a/drivers/mfd/stmpe.c
+++ b/drivers/mfd/stmpe.c
@@ -463,6 +463,28 @@ static const struct mfd_cell stmpe_ts_cell = {
 	.num_resources	= ARRAY_SIZE(stmpe_ts_resources),
 };
 
+/*
+ * ADC (STMPE811)
+ */
+
+static struct resource stmpe_adc_resources[] = {
+	{
+		.name	= "STMPE_TEMP_SENS",
+		.flags	= IORESOURCE_IRQ,
+	},
+	{
+		.name	= "STMPE_ADC",
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static const struct mfd_cell stmpe_adc_cell = {
+	.name		= "stmpe-adc",
+	.of_compatible	= "st,stmpe-adc",
+	.resources	= stmpe_adc_resources,
+	.num_resources	= ARRAY_SIZE(stmpe_adc_resources),
+};
+
 /*
  * STMPE811 or STMPE610
  */
@@ -497,6 +519,11 @@ static struct stmpe_variant_block stmpe811_blocks[] = {
 		.irq	= STMPE811_IRQ_TOUCH_DET,
 		.block	= STMPE_BLOCK_TOUCHSCREEN,
 	},
+	{
+		.cell	= &stmpe_adc_cell,
+		.irq	= STMPE811_IRQ_TEMP_SENS,
+		.block	= STMPE_BLOCK_ADC,
+	},
 };
 
 static int stmpe811_enable(struct stmpe *stmpe, unsigned int blocks,
@@ -517,6 +544,35 @@ static int stmpe811_enable(struct stmpe *stmpe, unsigned int blocks,
 				enable ? 0 : mask);
 }
 
+int stmpe811_adc_common_init(struct stmpe *stmpe)
+{
+	int ret;
+	u8 adc_ctrl1, adc_ctrl1_mask;
+
+	adc_ctrl1 = STMPE_SAMPLE_TIME(stmpe->sample_time) |
+		    STMPE_MOD_12B(stmpe->mod_12b) |
+		    STMPE_REF_SEL(stmpe->ref_sel);
+	adc_ctrl1_mask = STMPE_SAMPLE_TIME(0xff) | STMPE_MOD_12B(0xff) |
+			 STMPE_REF_SEL(0xff);
+
+	ret = stmpe_set_bits(stmpe, STMPE811_REG_ADC_CTRL1,
+			adc_ctrl1_mask, adc_ctrl1);
+	if (ret) {
+		dev_err(stmpe->dev, "Could not setup ADC\n");
+		return ret;
+	}
+
+	ret = stmpe_set_bits(stmpe, STMPE811_REG_ADC_CTRL2,
+			STMPE_ADC_FREQ(0xff), STMPE_ADC_FREQ(stmpe->adc_freq));
+	if (ret) {
+		dev_err(stmpe->dev, "Could not setup ADC\n");
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(stmpe811_adc_common_init);
+
 static int stmpe811_get_altfunc(struct stmpe *stmpe, enum stmpe_block block)
 {
 	/* 0 for touchscreen, 1 for GPIO */
@@ -1325,6 +1381,7 @@ int stmpe_probe(struct stmpe_client_info *ci, enum stmpe_partnum partnum)
 	struct device_node *np = ci->dev->of_node;
 	struct stmpe *stmpe;
 	int ret;
+	u32 val;
 
 	pdata = devm_kzalloc(ci->dev, sizeof(*pdata), GFP_KERNEL);
 	if (!pdata)
@@ -1342,6 +1399,15 @@ int stmpe_probe(struct stmpe_client_info *ci, enum stmpe_partnum partnum)
 	mutex_init(&stmpe->irq_lock);
 	mutex_init(&stmpe->lock);
 
+	if (!of_property_read_u32(np, "st,sample-time", &val))
+		stmpe->sample_time = val;
+	if (!of_property_read_u32(np, "st,mod-12b", &val))
+		stmpe->mod_12b = val;
+	if (!of_property_read_u32(np, "st,ref-sel", &val))
+		stmpe->ref_sel = val;
+	if (!of_property_read_u32(np, "st,adc-freq", &val))
+		stmpe->adc_freq = val;
+
 	stmpe->dev = ci->dev;
 	stmpe->client = ci->client;
 	stmpe->pdata = pdata;
@@ -1433,6 +1499,8 @@ int stmpe_remove(struct stmpe *stmpe)
 	if (!IS_ERR(stmpe->vcc))
 		regulator_disable(stmpe->vcc);
 
+	__stmpe_disable(stmpe, STMPE_BLOCK_ADC);
+
 	mfd_remove_devices(stmpe->dev);
 
 	return 0;
diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h
index c0353f6431f9..07f55aac9390 100644
--- a/include/linux/mfd/stmpe.h
+++ b/include/linux/mfd/stmpe.h
@@ -21,6 +21,9 @@
 #define STMPE_I_DRIVE(x)	(x & 0x1)
 #define STMPE_OP_MODE(x)	((x & 0x7) << 1)
 
+#define STMPE811_REG_ADC_CTRL1	0x20
+#define STMPE811_REG_ADC_CTRL2	0x21
+
 struct device;
 struct regulator;
 
@@ -134,6 +137,12 @@ struct stmpe {
 	u8 ier[2];
 	u8 oldier[2];
 	struct stmpe_platform_data *pdata;
+
+	/* For devices that use an ADC */
+	u8 sample_time;
+	u8 mod_12b;
+	u8 ref_sel;
+	u8 adc_freq;
 };
 
 extern int stmpe_reg_write(struct stmpe *stmpe, u8 reg, u8 data);
@@ -147,6 +156,7 @@ extern int stmpe_set_altfunc(struct stmpe *stmpe, u32 pins,
 			     enum stmpe_block block);
 extern int stmpe_enable(struct stmpe *stmpe, unsigned int blocks);
 extern int stmpe_disable(struct stmpe *stmpe, unsigned int blocks);
+extern int stmpe811_adc_common_init(struct stmpe *stmpe);
 
 #define STMPE_GPIO_NOREQ_811_TOUCH	(0xf0)
 
-- 
cgit v1.2.3


From 1d7ae53b152dbc5ba0a4f6a83ecc42ac66f52d11 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Wed, 12 Dec 2018 23:38:47 +0300
Subject: iommu: Introduce iotlb_sync_map callback

Introduce iotlb_sync_map() callback that is invoked in the end of
iommu_map(). This new callback allows IOMMU drivers to avoid syncing
after mapping of each contiguous chunk and sync only when the whole
mapping is completed, optimizing performance of the mapping operation.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 8 ++++++--
 include/linux/iommu.h | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3ed4db334341..ed0e63f2cd9b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1585,13 +1585,14 @@ static size_t iommu_pgsize(struct iommu_domain *domain,
 int iommu_map(struct iommu_domain *domain, unsigned long iova,
 	      phys_addr_t paddr, size_t size, int prot)
 {
+	const struct iommu_ops *ops = domain->ops;
 	unsigned long orig_iova = iova;
 	unsigned int min_pagesz;
 	size_t orig_size = size;
 	phys_addr_t orig_paddr = paddr;
 	int ret = 0;
 
-	if (unlikely(domain->ops->map == NULL ||
+	if (unlikely(ops->map == NULL ||
 		     domain->pgsize_bitmap == 0UL))
 		return -ENODEV;
 
@@ -1620,7 +1621,7 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
 		pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n",
 			 iova, &paddr, pgsize);
 
-		ret = domain->ops->map(domain, iova, paddr, pgsize, prot);
+		ret = ops->map(domain, iova, paddr, pgsize, prot);
 		if (ret)
 			break;
 
@@ -1629,6 +1630,9 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
 		size -= pgsize;
 	}
 
+	if (ops->iotlb_sync_map)
+		ops->iotlb_sync_map(domain);
+
 	/* unroll mapping in case something went wrong */
 	if (ret)
 		iommu_unmap(domain, orig_iova, orig_size - size);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e90da6b6f3d1..477ef47c357c 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -201,6 +201,7 @@ struct iommu_ops {
 	void (*flush_iotlb_all)(struct iommu_domain *domain);
 	void (*iotlb_range_add)(struct iommu_domain *domain,
 				unsigned long iova, size_t size);
+	void (*iotlb_sync_map)(struct iommu_domain *domain);
 	void (*iotlb_sync)(struct iommu_domain *domain);
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova);
 	int (*add_device)(struct device *dev);
-- 
cgit v1.2.3


From 96efa118c03648fdc76acad9ca8fe018a6be7145 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Wed, 12 Dec 2018 23:38:52 +0300
Subject: memory: tegra: Adapt to Tegra20 device-tree binding changes

The tegra20-mc device-tree binding has been changed, GART has been
squashed into Memory Controller and now the clock property is mandatory
for Tegra20, the DT compatible has been changed as well. Adapt driver to
the DT changes.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/memory/tegra/mc.c | 21 ++++++++-------------
 drivers/memory/tegra/mc.h |  6 ------
 include/soc/tegra/mc.h    |  2 +-
 3 files changed, 9 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/memory/tegra/mc.c b/drivers/memory/tegra/mc.c
index b99f3c620f6c..59db13287b47 100644
--- a/drivers/memory/tegra/mc.c
+++ b/drivers/memory/tegra/mc.c
@@ -51,7 +51,7 @@
 
 static const struct of_device_id tegra_mc_of_match[] = {
 #ifdef CONFIG_ARCH_TEGRA_2x_SOC
-	{ .compatible = "nvidia,tegra20-mc", .data = &tegra20_mc_soc },
+	{ .compatible = "nvidia,tegra20-mc-gart", .data = &tegra20_mc_soc },
 #endif
 #ifdef CONFIG_ARCH_TEGRA_3x_SOC
 	{ .compatible = "nvidia,tegra30-mc", .data = &tegra30_mc_soc },
@@ -638,24 +638,19 @@ static int tegra_mc_probe(struct platform_device *pdev)
 	if (IS_ERR(mc->regs))
 		return PTR_ERR(mc->regs);
 
+	mc->clk = devm_clk_get(&pdev->dev, "mc");
+	if (IS_ERR(mc->clk)) {
+		dev_err(&pdev->dev, "failed to get MC clock: %ld\n",
+			PTR_ERR(mc->clk));
+		return PTR_ERR(mc->clk);
+	}
+
 #ifdef CONFIG_ARCH_TEGRA_2x_SOC
 	if (mc->soc == &tegra20_mc_soc) {
-		res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-		mc->regs2 = devm_ioremap_resource(&pdev->dev, res);
-		if (IS_ERR(mc->regs2))
-			return PTR_ERR(mc->regs2);
-
 		isr = tegra20_mc_irq;
 	} else
 #endif
 	{
-		mc->clk = devm_clk_get(&pdev->dev, "mc");
-		if (IS_ERR(mc->clk)) {
-			dev_err(&pdev->dev, "failed to get MC clock: %ld\n",
-				PTR_ERR(mc->clk));
-			return PTR_ERR(mc->clk);
-		}
-
 		err = tegra_mc_setup_latency_allowance(mc);
 		if (err < 0) {
 			dev_err(&pdev->dev, "failed to setup latency allowance: %d\n",
diff --git a/drivers/memory/tegra/mc.h b/drivers/memory/tegra/mc.h
index 01065f12ebeb..9856f085e487 100644
--- a/drivers/memory/tegra/mc.h
+++ b/drivers/memory/tegra/mc.h
@@ -26,18 +26,12 @@
 
 static inline u32 mc_readl(struct tegra_mc *mc, unsigned long offset)
 {
-	if (mc->regs2 && offset >= 0x24)
-		return readl(mc->regs2 + offset - 0x3c);
-
 	return readl(mc->regs + offset);
 }
 
 static inline void mc_writel(struct tegra_mc *mc, u32 value,
 			     unsigned long offset)
 {
-	if (mc->regs2 && offset >= 0x24)
-		return writel(value, mc->regs2 + offset - 0x3c);
-
 	writel(value, mc->regs + offset);
 }
 
diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h
index b43f37fea096..db5bfdf589b4 100644
--- a/include/soc/tegra/mc.h
+++ b/include/soc/tegra/mc.h
@@ -144,7 +144,7 @@ struct tegra_mc_soc {
 struct tegra_mc {
 	struct device *dev;
 	struct tegra_smmu *smmu;
-	void __iomem *regs, *regs2;
+	void __iomem *regs;
 	struct clk *clk;
 	int irq;
 
-- 
cgit v1.2.3


From ce2785a75dbca27375f3723f4e697a2a8dc096ee Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Wed, 12 Dec 2018 23:38:56 +0300
Subject: iommu/tegra: gart: Integrate with Memory Controller driver

The device-tree binding has been changed. There is no separate GART device
anymore, it is squashed into the Memory Controller. Integrate GART module
with the MC in a way it is done for the SMMU on Tegra30+.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/Kconfig      |  1 +
 drivers/iommu/tegra-gart.c | 71 ++++++++++++----------------------------------
 drivers/memory/tegra/mc.c  | 43 ++++++++++++++++++++++++++++
 include/soc/tegra/mc.h     | 25 ++++++++++++++++
 4 files changed, 87 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index d9a25715650e..83c099bb7288 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -282,6 +282,7 @@ config ROCKCHIP_IOMMU
 config TEGRA_IOMMU_GART
 	bool "Tegra GART IOMMU Support"
 	depends on ARCH_TEGRA_2x_SOC
+	depends on TEGRA_MC
 	select IOMMU_API
 	help
 	  Enables support for remapping discontiguous physical memory
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index 835fea461c59..b35ffa312a83 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -19,16 +19,17 @@
  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#include <linux/init.h>
 #include <linux/io.h>
 #include <linux/iommu.h>
 #include <linux/list.h>
 #include <linux/moduleparam.h>
-#include <linux/of_device.h>
+#include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 
+#include <soc/tegra/mc.h>
+
 /* bitmap of the page sizes currently supported */
 #define GART_IOMMU_PGSIZES	(SZ_4K)
 
@@ -397,9 +398,8 @@ static const struct iommu_ops gart_iommu_ops = {
 	.iotlb_sync	= gart_iommu_sync,
 };
 
-static int tegra_gart_suspend(struct device *dev)
+int tegra_gart_suspend(struct gart_device *gart)
 {
-	struct gart_device *gart = dev_get_drvdata(dev);
 	unsigned long iova;
 	u32 *data = gart->savedata;
 	unsigned long flags;
@@ -411,9 +411,8 @@ static int tegra_gart_suspend(struct device *dev)
 	return 0;
 }
 
-static int tegra_gart_resume(struct device *dev)
+int tegra_gart_resume(struct gart_device *gart)
 {
-	struct gart_device *gart = dev_get_drvdata(dev);
 	unsigned long flags;
 
 	spin_lock_irqsave(&gart->pte_lock, flags);
@@ -422,41 +421,33 @@ static int tegra_gart_resume(struct device *dev)
 	return 0;
 }
 
-static int tegra_gart_probe(struct platform_device *pdev)
+struct gart_device *tegra_gart_probe(struct device *dev, struct tegra_mc *mc)
 {
 	struct gart_device *gart;
-	struct resource *res, *res_remap;
+	struct resource *res_remap;
 	void __iomem *gart_regs;
-	struct device *dev = &pdev->dev;
 	int ret;
 
 	BUILD_BUG_ON(PAGE_SHIFT != GART_PAGE_SHIFT);
 
 	/* the GART memory aperture is required */
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	res_remap = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	if (!res || !res_remap) {
+	res_remap = platform_get_resource(to_platform_device(dev),
+					  IORESOURCE_MEM, 1);
+	if (!res_remap) {
 		dev_err(dev, "GART memory aperture expected\n");
-		return -ENXIO;
+		return ERR_PTR(-ENXIO);
 	}
 
 	gart = devm_kzalloc(dev, sizeof(*gart), GFP_KERNEL);
 	if (!gart) {
 		dev_err(dev, "failed to allocate gart_device\n");
-		return -ENOMEM;
-	}
-
-	gart_regs = devm_ioremap(dev, res->start, resource_size(res));
-	if (!gart_regs) {
-		dev_err(dev, "failed to remap GART registers\n");
-		return -ENXIO;
+		return ERR_PTR(-ENOMEM);
 	}
 
-	ret = iommu_device_sysfs_add(&gart->iommu, &pdev->dev, NULL,
-				     dev_name(&pdev->dev));
+	ret = iommu_device_sysfs_add(&gart->iommu, dev, NULL, "gart");
 	if (ret) {
 		dev_err(dev, "Failed to register IOMMU in sysfs\n");
-		return ret;
+		return ERR_PTR(ret);
 	}
 
 	iommu_device_set_ops(&gart->iommu, &gart_iommu_ops);
@@ -468,7 +459,8 @@ static int tegra_gart_probe(struct platform_device *pdev)
 		goto remove_sysfs;
 	}
 
-	gart->dev = &pdev->dev;
+	gart->dev = dev;
+	gart_regs = mc->regs + GART_REG_BASE;
 	spin_lock_init(&gart->pte_lock);
 	spin_lock_init(&gart->client_lock);
 	INIT_LIST_HEAD(&gart->client);
@@ -483,46 +475,19 @@ static int tegra_gart_probe(struct platform_device *pdev)
 		goto unregister_iommu;
 	}
 
-	platform_set_drvdata(pdev, gart);
 	do_gart_setup(gart, NULL);
 
 	gart_handle = gart;
 
-	return 0;
+	return gart;
 
 unregister_iommu:
 	iommu_device_unregister(&gart->iommu);
 remove_sysfs:
 	iommu_device_sysfs_remove(&gart->iommu);
 
-	return ret;
-}
-
-static const struct dev_pm_ops tegra_gart_pm_ops = {
-	.suspend	= tegra_gart_suspend,
-	.resume		= tegra_gart_resume,
-};
-
-static const struct of_device_id tegra_gart_of_match[] = {
-	{ .compatible = "nvidia,tegra20-gart", },
-	{ },
-};
-
-static struct platform_driver tegra_gart_driver = {
-	.probe		= tegra_gart_probe,
-	.driver = {
-		.name	= "tegra-gart",
-		.pm	= &tegra_gart_pm_ops,
-		.of_match_table = tegra_gart_of_match,
-		.suppress_bind_attrs = true,
-	},
-};
-
-static int __init tegra_gart_init(void)
-{
-	return platform_driver_register(&tegra_gart_driver);
+	return ERR_PTR(ret);
 }
-subsys_initcall(tegra_gart_init);
 
 module_param(gart_debug, bool, 0644);
 MODULE_PARM_DESC(gart_debug, "Enable GART debugging");
diff --git a/drivers/memory/tegra/mc.c b/drivers/memory/tegra/mc.c
index 55ecfb2d8cfd..e684e234361a 100644
--- a/drivers/memory/tegra/mc.c
+++ b/drivers/memory/tegra/mc.c
@@ -702,13 +702,56 @@ static int tegra_mc_probe(struct platform_device *pdev)
 				PTR_ERR(mc->smmu));
 	}
 
+	if (IS_ENABLED(CONFIG_TEGRA_IOMMU_GART) && !mc->soc->smmu) {
+		mc->gart = tegra_gart_probe(&pdev->dev, mc);
+		if (IS_ERR(mc->gart)) {
+			dev_err(&pdev->dev, "failed to probe GART: %ld\n",
+				PTR_ERR(mc->gart));
+			mc->gart = NULL;
+		}
+	}
+
+	return 0;
+}
+
+static int tegra_mc_suspend(struct device *dev)
+{
+	struct tegra_mc *mc = dev_get_drvdata(dev);
+	int err;
+
+	if (IS_ENABLED(CONFIG_TEGRA_IOMMU_GART) && mc->gart) {
+		err = tegra_gart_suspend(mc->gart);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
+static int tegra_mc_resume(struct device *dev)
+{
+	struct tegra_mc *mc = dev_get_drvdata(dev);
+	int err;
+
+	if (IS_ENABLED(CONFIG_TEGRA_IOMMU_GART) && mc->gart) {
+		err = tegra_gart_resume(mc->gart);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static const struct dev_pm_ops tegra_mc_pm_ops = {
+	.suspend = tegra_mc_suspend,
+	.resume = tegra_mc_resume,
+};
+
 static struct platform_driver tegra_mc_driver = {
 	.driver = {
 		.name = "tegra-mc",
 		.of_match_table = tegra_mc_of_match,
+		.pm = &tegra_mc_pm_ops,
 		.suppress_bind_attrs = true,
 	},
 	.prevent_deferred_probe = true,
diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h
index db5bfdf589b4..e489a028ec9f 100644
--- a/include/soc/tegra/mc.h
+++ b/include/soc/tegra/mc.h
@@ -9,6 +9,7 @@
 #ifndef __SOC_TEGRA_MC_H__
 #define __SOC_TEGRA_MC_H__
 
+#include <linux/err.h>
 #include <linux/reset-controller.h>
 #include <linux/types.h>
 
@@ -77,6 +78,7 @@ struct tegra_smmu_soc {
 
 struct tegra_mc;
 struct tegra_smmu;
+struct gart_device;
 
 #ifdef CONFIG_TEGRA_IOMMU_SMMU
 struct tegra_smmu *tegra_smmu_probe(struct device *dev,
@@ -96,6 +98,28 @@ static inline void tegra_smmu_remove(struct tegra_smmu *smmu)
 }
 #endif
 
+#ifdef CONFIG_TEGRA_IOMMU_GART
+struct gart_device *tegra_gart_probe(struct device *dev, struct tegra_mc *mc);
+int tegra_gart_suspend(struct gart_device *gart);
+int tegra_gart_resume(struct gart_device *gart);
+#else
+static inline struct gart_device *
+tegra_gart_probe(struct device *dev, struct tegra_mc *mc)
+{
+	return ERR_PTR(-ENODEV);
+}
+
+static inline int tegra_gart_suspend(struct gart_device *gart)
+{
+	return -ENODEV;
+}
+
+static inline int tegra_gart_resume(struct gart_device *gart)
+{
+	return -ENODEV;
+}
+#endif
+
 struct tegra_mc_reset {
 	const char *name;
 	unsigned long id;
@@ -144,6 +168,7 @@ struct tegra_mc_soc {
 struct tegra_mc {
 	struct device *dev;
 	struct tegra_smmu *smmu;
+	struct gart_device *gart;
 	void __iomem *regs;
 	struct clk *clk;
 	int irq;
-- 
cgit v1.2.3


From 3eafbd3a7745e1579665cbb8c3959a74a666b8c9 Mon Sep 17 00:00:00 2001
From: Pascal PAILLET-LME <p.paillet@st.com>
Date: Mon, 14 Jan 2019 10:05:15 +0000
Subject: dt-bindings: mfd: Document STPMIC1

STPMIC1 is a PMIC from STMicroelectronics. The STPMIC1 integrates 10
regulators, 3 power switches, a watchdog and an input for a power on key.

Signed-off-by: Pascal Paillet <p.paillet@st.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 .../devicetree/bindings/mfd/st,stpmic1.txt         | 61 ++++++++++++++++++++++
 include/dt-bindings/mfd/st,stpmic1.h               | 50 ++++++++++++++++++
 2 files changed, 111 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mfd/st,stpmic1.txt
 create mode 100644 include/dt-bindings/mfd/st,stpmic1.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/mfd/st,stpmic1.txt b/Documentation/devicetree/bindings/mfd/st,stpmic1.txt
new file mode 100644
index 000000000000..afd45c089585
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/st,stpmic1.txt
@@ -0,0 +1,61 @@
+* STMicroelectronics STPMIC1 Power Management IC
+
+Required properties:
+- compatible:		: "st,stpmic1"
+- reg:			: The I2C slave address for the STPMIC1 chip.
+- interrupts:		: The interrupt line the device is connected to.
+- #interrupt-cells:	: Should be 1.
+- interrupt-controller:	: Marks the device node as an interrupt controller.
+			    Interrupt numbers are defined at
+			    dt-bindings/mfd/st,stpmic1.h.
+
+STPMIC1 consists in a varied group of sub-devices.
+Each sub-device binding is be described in own documentation file.
+
+Device			 Description
+------			------------
+st,stpmic1-onkey	: Power on key, see ../input/st,stpmic1-onkey.txt
+st,stpmic1-regulators	: Regulators, see ../regulator/st,stpmic1-regulator.txt
+st,stpmic1-wdt		: Watchdog, see ../watchdog/st,stpmic1-wdt.txt
+
+Example:
+
+#include <dt-bindings/mfd/st,stpmic1.h>
+
+pmic: pmic@33 {
+	compatible = "st,stpmic1";
+	reg = <0x33>;
+	interrupt-parent = <&gpioa>;
+	interrupts = <0 2>;
+
+	interrupt-controller;
+	#interrupt-cells = <2>;
+
+	onkey {
+		compatible = "st,stpmic1-onkey";
+		interrupts = <IT_PONKEY_F 0>,<IT_PONKEY_R 1>;
+		interrupt-names = "onkey-falling", "onkey-rising";
+		power-off-time-sec = <10>;
+	};
+
+	watchdog {
+		compatible = "st,stpmic1-wdt";
+	};
+
+	regulators {
+		compatible = "st,stpmic1-regulators";
+
+		vdd_core: buck1 {
+			regulator-name = "vdd_core";
+			regulator-boot-on;
+			regulator-min-microvolt = <700000>;
+			regulator-max-microvolt = <1200000>;
+		};
+		vdd: buck3 {
+			regulator-name = "vdd";
+			regulator-min-microvolt = <3300000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-boot-on;
+			regulator-pull-down;
+		};
+	};
diff --git a/include/dt-bindings/mfd/st,stpmic1.h b/include/dt-bindings/mfd/st,stpmic1.h
new file mode 100644
index 000000000000..321cd08797d9
--- /dev/null
+++ b/include/dt-bindings/mfd/st,stpmic1.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) STMicroelectronics 2018 - All Rights Reserved
+ * Author: Philippe Peurichard <philippe.peurichard@st.com>,
+ * Pascal Paillet <p.paillet@st.com> for STMicroelectronics.
+ */
+
+#ifndef __DT_BINDINGS_STPMIC1_H__
+#define __DT_BINDINGS_STPMIC1_H__
+
+/* IRQ definitions */
+#define IT_PONKEY_F	0
+#define IT_PONKEY_R	1
+#define IT_WAKEUP_F	2
+#define IT_WAKEUP_R	3
+#define IT_VBUS_OTG_F	4
+#define IT_VBUS_OTG_R	5
+#define IT_SWOUT_F	6
+#define IT_SWOUT_R	7
+
+#define IT_CURLIM_BUCK1	8
+#define IT_CURLIM_BUCK2	9
+#define IT_CURLIM_BUCK3	10
+#define IT_CURLIM_BUCK4	11
+#define IT_OCP_OTG	12
+#define IT_OCP_SWOUT	13
+#define IT_OCP_BOOST	14
+#define IT_OVP_BOOST	15
+
+#define IT_CURLIM_LDO1	16
+#define IT_CURLIM_LDO2	17
+#define IT_CURLIM_LDO3	18
+#define IT_CURLIM_LDO4	19
+#define IT_CURLIM_LDO5	20
+#define IT_CURLIM_LDO6	21
+#define IT_SHORT_SWOTG	22
+#define IT_SHORT_SWOUT	23
+
+#define IT_TWARN_F	24
+#define IT_TWARN_R	25
+#define IT_VINLOW_F	26
+#define IT_VINLOW_R	27
+#define IT_SWIN_F	30
+#define IT_SWIN_R	31
+
+/* BUCK MODES definitions */
+#define STPMIC1_BUCK_MODE_NORMAL 0
+#define STPMIC1_BUCK_MODE_LP 2
+
+#endif /* __DT_BINDINGS_STPMIC1_H__ */
-- 
cgit v1.2.3


From 51908d2e9b7c7730608a19f24fc8718af745bb2f Mon Sep 17 00:00:00 2001
From: Pascal PAILLET-LME <p.paillet@st.com>
Date: Mon, 14 Jan 2019 10:05:16 +0000
Subject: mfd: stpmic1: Add STPMIC1 driver

STPMIC1 is a PMIC from STMicroelectronics. The STPMIC1 integrates 10
regulators, 3 power switches, a watchdog and an input for a power on key.

Signed-off-by: Pascal Paillet <p.paillet@st.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig         |  16 ++++
 drivers/mfd/Makefile        |   1 +
 drivers/mfd/stpmic1.c       | 213 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/stpmic1.h | 212 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 442 insertions(+)
 create mode 100644 drivers/mfd/stpmic1.c
 create mode 100644 include/linux/mfd/stpmic1.h

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 8c5dfdce4326..0761cb83d174 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1871,6 +1871,22 @@ config MFD_STM32_TIMERS
 	  for PWM and IIO Timer. This driver allow to share the
 	  registers between the others drivers.
 
+config MFD_STPMIC1
+	tristate "Support for STPMIC1 PMIC"
+	depends on (I2C=y && OF)
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	select MFD_CORE
+	help
+	  Support for ST Microelectronics STPMIC1 PMIC. STPMIC1 has power on
+	  key, watchdog and regulator functionalities which are supported via
+	  the relevant subsystems. This driver provides core support for the
+	  STPMIC1. In order to use the actual functionaltiy of the device other
+	  drivers must be enabled.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called stpmic1.
+
 menu "Multimedia Capabilities Port drivers"
 	depends on ARCH_SA1100
 
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 12980a4ad460..a62fb0112d9f 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -233,6 +233,7 @@ obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI)	+= intel_soc_pmic_chtdc_ti.o
 obj-$(CONFIG_MFD_MT6397)	+= mt6397-core.o
 
 obj-$(CONFIG_MFD_ALTERA_A10SR)	+= altera-a10sr.o
+obj-$(CONFIG_MFD_STPMIC1)	+= stpmic1.o
 obj-$(CONFIG_MFD_SUN4I_GPADC)	+= sun4i-gpadc.o
 
 obj-$(CONFIG_MFD_STM32_LPTIMER)	+= stm32-lptimer.o
diff --git a/drivers/mfd/stpmic1.c b/drivers/mfd/stpmic1.c
new file mode 100644
index 000000000000..7dfbe8906cb8
--- /dev/null
+++ b/drivers/mfd/stpmic1.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) STMicroelectronics 2018
+// Author: Pascal Paillet <p.paillet@st.com>
+
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/stpmic1.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/pm_wakeirq.h>
+#include <linux/regmap.h>
+
+#include <dt-bindings/mfd/st,stpmic1.h>
+
+#define STPMIC1_MAIN_IRQ 0
+
+static const struct regmap_range stpmic1_readable_ranges[] = {
+	regmap_reg_range(TURN_ON_SR, VERSION_SR),
+	regmap_reg_range(SWOFF_PWRCTRL_CR, LDO6_STDBY_CR),
+	regmap_reg_range(BST_SW_CR, BST_SW_CR),
+	regmap_reg_range(INT_PENDING_R1, INT_PENDING_R4),
+	regmap_reg_range(INT_CLEAR_R1, INT_CLEAR_R4),
+	regmap_reg_range(INT_MASK_R1, INT_MASK_R4),
+	regmap_reg_range(INT_SET_MASK_R1, INT_SET_MASK_R4),
+	regmap_reg_range(INT_CLEAR_MASK_R1, INT_CLEAR_MASK_R4),
+	regmap_reg_range(INT_SRC_R1, INT_SRC_R1),
+};
+
+static const struct regmap_range stpmic1_writeable_ranges[] = {
+	regmap_reg_range(SWOFF_PWRCTRL_CR, LDO6_STDBY_CR),
+	regmap_reg_range(BST_SW_CR, BST_SW_CR),
+	regmap_reg_range(INT_CLEAR_R1, INT_CLEAR_R4),
+	regmap_reg_range(INT_SET_MASK_R1, INT_SET_MASK_R4),
+	regmap_reg_range(INT_CLEAR_MASK_R1, INT_CLEAR_MASK_R4),
+};
+
+static const struct regmap_range stpmic1_volatile_ranges[] = {
+	regmap_reg_range(TURN_ON_SR, VERSION_SR),
+	regmap_reg_range(WCHDG_CR, WCHDG_CR),
+	regmap_reg_range(INT_PENDING_R1, INT_PENDING_R4),
+	regmap_reg_range(INT_SRC_R1, INT_SRC_R4),
+};
+
+static const struct regmap_access_table stpmic1_readable_table = {
+	.yes_ranges = stpmic1_readable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(stpmic1_readable_ranges),
+};
+
+static const struct regmap_access_table stpmic1_writeable_table = {
+	.yes_ranges = stpmic1_writeable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(stpmic1_writeable_ranges),
+};
+
+static const struct regmap_access_table stpmic1_volatile_table = {
+	.yes_ranges = stpmic1_volatile_ranges,
+	.n_yes_ranges = ARRAY_SIZE(stpmic1_volatile_ranges),
+};
+
+const struct regmap_config stpmic1_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.cache_type = REGCACHE_RBTREE,
+	.max_register = PMIC_MAX_REGISTER_ADDRESS,
+	.rd_table = &stpmic1_readable_table,
+	.wr_table = &stpmic1_writeable_table,
+	.volatile_table = &stpmic1_volatile_table,
+};
+
+static const struct regmap_irq stpmic1_irqs[] = {
+	REGMAP_IRQ_REG(IT_PONKEY_F, 0, 0x01),
+	REGMAP_IRQ_REG(IT_PONKEY_R, 0, 0x02),
+	REGMAP_IRQ_REG(IT_WAKEUP_F, 0, 0x04),
+	REGMAP_IRQ_REG(IT_WAKEUP_R, 0, 0x08),
+	REGMAP_IRQ_REG(IT_VBUS_OTG_F, 0, 0x10),
+	REGMAP_IRQ_REG(IT_VBUS_OTG_R, 0, 0x20),
+	REGMAP_IRQ_REG(IT_SWOUT_F, 0, 0x40),
+	REGMAP_IRQ_REG(IT_SWOUT_R, 0, 0x80),
+
+	REGMAP_IRQ_REG(IT_CURLIM_BUCK1, 1, 0x01),
+	REGMAP_IRQ_REG(IT_CURLIM_BUCK2, 1, 0x02),
+	REGMAP_IRQ_REG(IT_CURLIM_BUCK3, 1, 0x04),
+	REGMAP_IRQ_REG(IT_CURLIM_BUCK4, 1, 0x08),
+	REGMAP_IRQ_REG(IT_OCP_OTG, 1, 0x10),
+	REGMAP_IRQ_REG(IT_OCP_SWOUT, 1, 0x20),
+	REGMAP_IRQ_REG(IT_OCP_BOOST, 1, 0x40),
+	REGMAP_IRQ_REG(IT_OVP_BOOST, 1, 0x80),
+
+	REGMAP_IRQ_REG(IT_CURLIM_LDO1, 2, 0x01),
+	REGMAP_IRQ_REG(IT_CURLIM_LDO2, 2, 0x02),
+	REGMAP_IRQ_REG(IT_CURLIM_LDO3, 2, 0x04),
+	REGMAP_IRQ_REG(IT_CURLIM_LDO4, 2, 0x08),
+	REGMAP_IRQ_REG(IT_CURLIM_LDO5, 2, 0x10),
+	REGMAP_IRQ_REG(IT_CURLIM_LDO6, 2, 0x20),
+	REGMAP_IRQ_REG(IT_SHORT_SWOTG, 2, 0x40),
+	REGMAP_IRQ_REG(IT_SHORT_SWOUT, 2, 0x80),
+
+	REGMAP_IRQ_REG(IT_TWARN_F, 3, 0x01),
+	REGMAP_IRQ_REG(IT_TWARN_R, 3, 0x02),
+	REGMAP_IRQ_REG(IT_VINLOW_F, 3, 0x04),
+	REGMAP_IRQ_REG(IT_VINLOW_R, 3, 0x08),
+	REGMAP_IRQ_REG(IT_SWIN_F, 3, 0x40),
+	REGMAP_IRQ_REG(IT_SWIN_R, 3, 0x80),
+};
+
+static const struct regmap_irq_chip stpmic1_regmap_irq_chip = {
+	.name = "pmic_irq",
+	.status_base = INT_PENDING_R1,
+	.mask_base = INT_CLEAR_MASK_R1,
+	.unmask_base = INT_SET_MASK_R1,
+	.ack_base = INT_CLEAR_R1,
+	.num_regs = STPMIC1_PMIC_NUM_IRQ_REGS,
+	.irqs = stpmic1_irqs,
+	.num_irqs = ARRAY_SIZE(stpmic1_irqs),
+};
+
+static int stpmic1_probe(struct i2c_client *i2c,
+			 const struct i2c_device_id *id)
+{
+	struct stpmic1 *ddata;
+	struct device *dev = &i2c->dev;
+	int ret;
+	struct device_node *np = dev->of_node;
+	u32 reg;
+
+	ddata = devm_kzalloc(dev, sizeof(struct stpmic1), GFP_KERNEL);
+	if (!ddata)
+		return -ENOMEM;
+
+	i2c_set_clientdata(i2c, ddata);
+	ddata->dev = dev;
+
+	ddata->regmap = devm_regmap_init_i2c(i2c, &stpmic1_regmap_config);
+	if (IS_ERR(ddata->regmap))
+		return PTR_ERR(ddata->regmap);
+
+	ddata->irq = of_irq_get(np, STPMIC1_MAIN_IRQ);
+	if (ddata->irq < 0) {
+		dev_err(dev, "Failed to get main IRQ: %d\n", ddata->irq);
+		return ddata->irq;
+	}
+
+	ret = regmap_read(ddata->regmap, VERSION_SR, &reg);
+	if (ret) {
+		dev_err(dev, "Unable to read PMIC version\n");
+		return ret;
+	}
+	dev_info(dev, "PMIC Chip Version: 0x%x\n", reg);
+
+	/* Initialize PMIC IRQ Chip & associated IRQ domains */
+	ret = devm_regmap_add_irq_chip(dev, ddata->regmap, ddata->irq,
+				       IRQF_ONESHOT | IRQF_SHARED,
+				       0, &stpmic1_regmap_irq_chip,
+				       &ddata->irq_data);
+	if (ret) {
+		dev_err(dev, "IRQ Chip registration failed: %d\n", ret);
+		return ret;
+	}
+
+	return devm_of_platform_populate(dev);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int stpmic1_suspend(struct device *dev)
+{
+	struct i2c_client *i2c = to_i2c_client(dev);
+	struct stpmic1 *pmic_dev = i2c_get_clientdata(i2c);
+
+	disable_irq(pmic_dev->irq);
+
+	return 0;
+}
+
+static int stpmic1_resume(struct device *dev)
+{
+	struct i2c_client *i2c = to_i2c_client(dev);
+	struct stpmic1 *pmic_dev = i2c_get_clientdata(i2c);
+	int ret;
+
+	ret = regcache_sync(pmic_dev->regmap);
+	if (ret)
+		return ret;
+
+	enable_irq(pmic_dev->irq);
+
+	return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(stpmic1_pm, stpmic1_suspend, stpmic1_resume);
+
+static const struct of_device_id stpmic1_of_match[] = {
+	{ .compatible = "st,stpmic1", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, stpmic1_of_match);
+
+static struct i2c_driver stpmic1_driver = {
+	.driver = {
+		.name = "stpmic1",
+		.of_match_table = of_match_ptr(stpmic1_of_match),
+		.pm = &stpmic1_pm,
+	},
+	.probe = stpmic1_probe,
+};
+
+module_i2c_driver(stpmic1_driver);
+
+MODULE_DESCRIPTION("STPMIC1 PMIC Driver");
+MODULE_AUTHOR("Pascal Paillet <p.paillet@st.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/stpmic1.h b/include/linux/mfd/stpmic1.h
new file mode 100644
index 000000000000..fa3f99f7e9a1
--- /dev/null
+++ b/include/linux/mfd/stpmic1.h
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) STMicroelectronics 2018 - All Rights Reserved
+ * Author: Philippe Peurichard <philippe.peurichard@st.com>,
+ * Pascal Paillet <p.paillet@st.com> for STMicroelectronics.
+ */
+
+#ifndef __LINUX_MFD_STPMIC1_H
+#define __LINUX_MFD_STPMIC1_H
+
+#define TURN_ON_SR		0x1
+#define TURN_OFF_SR		0x2
+#define ICC_LDO_TURN_OFF_SR	0x3
+#define ICC_BUCK_TURN_OFF_SR	0x4
+#define RREQ_STATE_SR		0x5
+#define VERSION_SR		0x6
+
+#define SWOFF_PWRCTRL_CR	0x10
+#define PADS_PULL_CR		0x11
+#define BUCKS_PD_CR		0x12
+#define LDO14_PD_CR		0x13
+#define LDO56_VREF_PD_CR	0x14
+#define VBUS_DET_VIN_CR		0x15
+#define PKEY_TURNOFF_CR		0x16
+#define BUCKS_MASK_RANK_CR	0x17
+#define BUCKS_MASK_RESET_CR	0x18
+#define LDOS_MASK_RANK_CR	0x19
+#define LDOS_MASK_RESET_CR	0x1A
+#define WCHDG_CR		0x1B
+#define WCHDG_TIMER_CR		0x1C
+#define BUCKS_ICCTO_CR		0x1D
+#define LDOS_ICCTO_CR		0x1E
+
+#define BUCK1_ACTIVE_CR		0x20
+#define BUCK2_ACTIVE_CR		0x21
+#define BUCK3_ACTIVE_CR		0x22
+#define BUCK4_ACTIVE_CR		0x23
+#define VREF_DDR_ACTIVE_CR	0x24
+#define LDO1_ACTIVE_CR		0x25
+#define LDO2_ACTIVE_CR		0x26
+#define LDO3_ACTIVE_CR		0x27
+#define LDO4_ACTIVE_CR		0x28
+#define LDO5_ACTIVE_CR		0x29
+#define LDO6_ACTIVE_CR		0x2A
+
+#define BUCK1_STDBY_CR		0x30
+#define BUCK2_STDBY_CR		0x31
+#define BUCK3_STDBY_CR		0x32
+#define BUCK4_STDBY_CR		0x33
+#define VREF_DDR_STDBY_CR	0x34
+#define LDO1_STDBY_CR		0x35
+#define LDO2_STDBY_CR		0x36
+#define LDO3_STDBY_CR		0x37
+#define LDO4_STDBY_CR		0x38
+#define LDO5_STDBY_CR		0x39
+#define LDO6_STDBY_CR		0x3A
+
+#define BST_SW_CR		0x40
+
+#define INT_PENDING_R1		0x50
+#define INT_PENDING_R2		0x51
+#define INT_PENDING_R3		0x52
+#define INT_PENDING_R4		0x53
+
+#define INT_DBG_LATCH_R1	0x60
+#define INT_DBG_LATCH_R2	0x61
+#define INT_DBG_LATCH_R3	0x62
+#define INT_DBG_LATCH_R4	0x63
+
+#define INT_CLEAR_R1		0x70
+#define INT_CLEAR_R2		0x71
+#define INT_CLEAR_R3		0x72
+#define INT_CLEAR_R4		0x73
+
+#define INT_MASK_R1		0x80
+#define INT_MASK_R2		0x81
+#define INT_MASK_R3		0x82
+#define INT_MASK_R4		0x83
+
+#define INT_SET_MASK_R1		0x90
+#define INT_SET_MASK_R2		0x91
+#define INT_SET_MASK_R3		0x92
+#define INT_SET_MASK_R4		0x93
+
+#define INT_CLEAR_MASK_R1	0xA0
+#define INT_CLEAR_MASK_R2	0xA1
+#define INT_CLEAR_MASK_R3	0xA2
+#define INT_CLEAR_MASK_R4	0xA3
+
+#define INT_SRC_R1		0xB0
+#define INT_SRC_R2		0xB1
+#define INT_SRC_R3		0xB2
+#define INT_SRC_R4		0xB3
+
+#define PMIC_MAX_REGISTER_ADDRESS INT_SRC_R4
+
+#define STPMIC1_PMIC_NUM_IRQ_REGS 4
+
+#define TURN_OFF_SR_ICC_EVENT	0x08
+
+#define LDO_VOLTAGE_MASK		GENMASK(6, 2)
+#define BUCK_VOLTAGE_MASK		GENMASK(7, 2)
+#define LDO_BUCK_VOLTAGE_SHIFT		2
+
+#define LDO_ENABLE_MASK			BIT(0)
+#define BUCK_ENABLE_MASK		BIT(0)
+
+#define BUCK_HPLP_ENABLE_MASK		BIT(1)
+#define BUCK_HPLP_SHIFT			1
+
+#define STDBY_ENABLE_MASK  BIT(0)
+
+#define BUCKS_PD_CR_REG_MASK	GENMASK(7, 0)
+#define BUCK_MASK_RANK_REGISTER_MASK	GENMASK(3, 0)
+#define BUCK_MASK_RESET_REGISTER_MASK	GENMASK(3, 0)
+#define LDO1234_PULL_DOWN_REGISTER_MASK	GENMASK(7, 0)
+#define LDO56_VREF_PD_CR_REG_MASK	GENMASK(5, 0)
+#define LDO_MASK_RANK_REGISTER_MASK	GENMASK(5, 0)
+#define LDO_MASK_RESET_REGISTER_MASK	GENMASK(5, 0)
+
+#define BUCK1_PULL_DOWN_REG		BUCKS_PD_CR
+#define BUCK1_PULL_DOWN_MASK		BIT(0)
+#define BUCK2_PULL_DOWN_REG		BUCKS_PD_CR
+#define BUCK2_PULL_DOWN_MASK		BIT(2)
+#define BUCK3_PULL_DOWN_REG		BUCKS_PD_CR
+#define BUCK3_PULL_DOWN_MASK		BIT(4)
+#define BUCK4_PULL_DOWN_REG		BUCKS_PD_CR
+#define BUCK4_PULL_DOWN_MASK		BIT(6)
+
+#define LDO1_PULL_DOWN_REG		LDO14_PD_CR
+#define LDO1_PULL_DOWN_MASK		BIT(0)
+#define LDO2_PULL_DOWN_REG		LDO14_PD_CR
+#define LDO2_PULL_DOWN_MASK		BIT(2)
+#define LDO3_PULL_DOWN_REG		LDO14_PD_CR
+#define LDO3_PULL_DOWN_MASK		BIT(4)
+#define LDO4_PULL_DOWN_REG		LDO14_PD_CR
+#define LDO4_PULL_DOWN_MASK		BIT(6)
+#define LDO5_PULL_DOWN_REG		LDO56_VREF_PD_CR
+#define LDO5_PULL_DOWN_MASK		BIT(0)
+#define LDO6_PULL_DOWN_REG		LDO56_VREF_PD_CR
+#define LDO6_PULL_DOWN_MASK		BIT(2)
+#define VREF_DDR_PULL_DOWN_REG		LDO56_VREF_PD_CR
+#define VREF_DDR_PULL_DOWN_MASK		BIT(4)
+
+#define BUCKS_ICCTO_CR_REG_MASK	GENMASK(6, 0)
+#define LDOS_ICCTO_CR_REG_MASK	GENMASK(5, 0)
+
+#define LDO_BYPASS_MASK			BIT(7)
+
+/* Main PMIC Control Register
+ * SWOFF_PWRCTRL_CR
+ * Address : 0x10
+ */
+#define ICC_EVENT_ENABLED		BIT(4)
+#define PWRCTRL_POLARITY_HIGH		BIT(3)
+#define PWRCTRL_PIN_VALID		BIT(2)
+#define RESTART_REQUEST_ENABLED		BIT(1)
+#define SOFTWARE_SWITCH_OFF_ENABLED	BIT(0)
+
+/* Main PMIC PADS Control Register
+ * PADS_PULL_CR
+ * Address : 0x11
+ */
+#define WAKEUP_DETECTOR_DISABLED	BIT(4)
+#define PWRCTRL_PD_ACTIVE		BIT(3)
+#define PWRCTRL_PU_ACTIVE		BIT(2)
+#define WAKEUP_PD_ACTIVE		BIT(1)
+#define PONKEY_PU_INACTIVE		BIT(0)
+
+/* Main PMIC VINLOW Control Register
+ * VBUS_DET_VIN_CRC DMSC
+ * Address : 0x15
+ */
+#define SWIN_DETECTOR_ENABLED		BIT(7)
+#define SWOUT_DETECTOR_ENABLED		BIT(6)
+#define VINLOW_ENABLED			BIT(0)
+#define VINLOW_CTRL_REG_MASK		GENMASK(7, 0)
+
+/* USB Control Register
+ * Address : 0x40
+ */
+#define BOOST_OVP_DISABLED		BIT(7)
+#define VBUS_OTG_DETECTION_DISABLED	BIT(6)
+#define SW_OUT_DISCHARGE		BIT(5)
+#define VBUS_OTG_DISCHARGE		BIT(4)
+#define OCP_LIMIT_HIGH			BIT(3)
+#define SWIN_SWOUT_ENABLED		BIT(2)
+#define USBSW_OTG_SWITCH_ENABLED	BIT(1)
+#define BOOST_ENABLED			BIT(0)
+
+/* PKEY_TURNOFF_CR
+ * Address : 0x16
+ */
+#define PONKEY_PWR_OFF			BIT(7)
+#define PONKEY_CC_FLAG_CLEAR		BIT(6)
+#define PONKEY_TURNOFF_TIMER_MASK	GENMASK(3, 0)
+#define PONKEY_TURNOFF_MASK		GENMASK(7, 0)
+
+/*
+ * struct stpmic1 - stpmic1 master device for sub-drivers
+ * @dev: master device of the chip (can be used to access platform data)
+ * @irq: main IRQ number
+ * @regmap_irq_chip_data: irq chip data
+ */
+struct stpmic1 {
+	struct device *dev;
+	struct regmap *regmap;
+	int irq;
+	struct regmap_irq_chip_data *irq_data;
+};
+
+#endif /*  __LINUX_MFD_STPMIC1_H */
-- 
cgit v1.2.3


From c1d5fb019f65e1093c77734356f97b6e7ac4d629 Mon Sep 17 00:00:00 2001
From: "Matwey V. Kornilov" <matwey@sai.msu.ru>
Date: Fri, 9 Nov 2018 14:03:26 -0500
Subject: media: usb: pwc: Introduce TRACE_EVENTs for pwc_isoc_handler()

There were reports that PWC-based webcams don't work at some
embedded ARM platforms. [1] Isochronous transfer handler seems to
work too long leading to the issues in MUSB USB host subsystem.
Also note, that urb->giveback() handlers are still called with
disabled interrupts. In order to be able to measure performance of
PWC driver, traces are introduced in URB handler section.

[1] https://www.spinics.net/lists/linux-usb/msg165735.html

Signed-off-by: Matwey V. Kornilov <matwey@sai.msu.ru>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/usb/pwc/pwc-if.c |  7 +++++
 include/trace/events/pwc.h     | 65 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 include/trace/events/pwc.h

(limited to 'include')

diff --git a/drivers/media/usb/pwc/pwc-if.c b/drivers/media/usb/pwc/pwc-if.c
index 72704f4d5330..53c111bd5a22 100644
--- a/drivers/media/usb/pwc/pwc-if.c
+++ b/drivers/media/usb/pwc/pwc-if.c
@@ -76,6 +76,9 @@
 #include "pwc-dec23.h"
 #include "pwc-dec1.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/pwc.h>
+
 /* Function prototypes and driver templates */
 
 /* hotplug device table support */
@@ -260,6 +263,8 @@ static void pwc_isoc_handler(struct urb *urb)
 	int i, fst, flen;
 	unsigned char *iso_buf = NULL;
 
+	trace_pwc_handler_enter(urb, pdev);
+
 	if (urb->status == -ENOENT || urb->status == -ECONNRESET ||
 	    urb->status == -ESHUTDOWN) {
 		PWC_DEBUG_OPEN("URB (%p) unlinked %ssynchronously.\n",
@@ -348,6 +353,8 @@ static void pwc_isoc_handler(struct urb *urb)
 	}
 
 handler_end:
+	trace_pwc_handler_exit(urb, pdev);
+
 	i = usb_submit_urb(urb, GFP_ATOMIC);
 	if (i != 0)
 		PWC_ERROR("Error (%d) re-submitting urb in pwc_isoc_handler.\n", i);
diff --git a/include/trace/events/pwc.h b/include/trace/events/pwc.h
new file mode 100644
index 000000000000..a2da764a3b41
--- /dev/null
+++ b/include/trace/events/pwc.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_PWC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PWC_H
+
+#include <linux/usb.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pwc
+
+TRACE_EVENT(pwc_handler_enter,
+	TP_PROTO(struct urb *urb, struct pwc_device *pdev),
+	TP_ARGS(urb, pdev),
+	TP_STRUCT__entry(
+		__field(struct urb*, urb)
+		__field(struct pwc_frame_buf*, fbuf)
+		__field(int, urb__status)
+		__field(u32, urb__actual_length)
+		__field(int, fbuf__filled)
+		__string(name, pdev->v4l2_dev.name)
+	),
+	TP_fast_assign(
+		__entry->urb = urb;
+		__entry->fbuf = pdev->fill_buf;
+		__entry->urb__status = urb->status;
+		__entry->urb__actual_length = urb->actual_length;
+		__entry->fbuf__filled = (pdev->fill_buf
+					 ? pdev->fill_buf->filled : 0);
+		__assign_str(name, pdev->v4l2_dev.name);
+	),
+	TP_printk("dev=%s (fbuf=%p filled=%d) urb=%p (status=%d actual_length=%u)",
+		__get_str(name),
+		__entry->fbuf,
+		__entry->fbuf__filled,
+		__entry->urb,
+		__entry->urb__status,
+		__entry->urb__actual_length)
+);
+
+TRACE_EVENT(pwc_handler_exit,
+	TP_PROTO(struct urb *urb, struct pwc_device *pdev),
+	TP_ARGS(urb, pdev),
+	TP_STRUCT__entry(
+		__field(struct urb*, urb)
+		__field(struct pwc_frame_buf*, fbuf)
+		__field(int, fbuf__filled)
+		__string(name, pdev->v4l2_dev.name)
+	),
+	TP_fast_assign(
+		__entry->urb = urb;
+		__entry->fbuf = pdev->fill_buf;
+		__entry->fbuf__filled = pdev->fill_buf->filled;
+		__assign_str(name, pdev->v4l2_dev.name);
+	),
+	TP_printk(" dev=%s (fbuf=%p filled=%d) urb=%p",
+		__get_str(name),
+		__entry->fbuf,
+		__entry->fbuf__filled,
+		__entry->urb)
+);
+
+#endif /* _TRACE_PWC_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 66e988e96b31d0c06471234cabb0a77316841fc3 Mon Sep 17 00:00:00 2001
From: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
Date: Thu, 13 Dec 2018 08:31:07 -0500
Subject: media: Change Andrzej Pietrasiewicz's e-mail address

My @samsung.com address is going to cease existing soon, so change it to
an address which can actually be used to contact me.

Signed-off-by: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/common/videobuf2/videobuf2-dma-sg.c | 2 +-
 drivers/media/platform/s5p-jpeg/jpeg-core.c       | 4 ++--
 drivers/media/platform/s5p-jpeg/jpeg-core.h       | 2 +-
 drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.c     | 2 +-
 drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.h     | 2 +-
 drivers/media/platform/s5p-jpeg/jpeg-regs.h       | 2 +-
 include/media/videobuf2-dma-sg.h                  | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
index 015e737095cd..f02876d971d7 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2010 Samsung Electronics
  *
- * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ * Author: Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/drivers/media/platform/s5p-jpeg/jpeg-core.c b/drivers/media/platform/s5p-jpeg/jpeg-core.c
index 3f9000b70385..580d6d02557b 100644
--- a/drivers/media/platform/s5p-jpeg/jpeg-core.c
+++ b/drivers/media/platform/s5p-jpeg/jpeg-core.c
@@ -3,7 +3,7 @@
  * Copyright (c) 2011-2014 Samsung Electronics Co., Ltd.
  *		http://www.samsung.com
  *
- * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ * Author: Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>
  * Author: Jacek Anaszewski <j.anaszewski@samsung.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -3220,7 +3220,7 @@ static struct platform_driver s5p_jpeg_driver = {
 
 module_platform_driver(s5p_jpeg_driver);
 
-MODULE_AUTHOR("Andrzej Pietrasiewicz <andrzej.p@samsung.com>");
+MODULE_AUTHOR("Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>");
 MODULE_AUTHOR("Jacek Anaszewski <j.anaszewski@samsung.com>");
 MODULE_DESCRIPTION("Samsung JPEG codec driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/media/platform/s5p-jpeg/jpeg-core.h b/drivers/media/platform/s5p-jpeg/jpeg-core.h
index a46465e10351..90fda4b720eb 100644
--- a/drivers/media/platform/s5p-jpeg/jpeg-core.h
+++ b/drivers/media/platform/s5p-jpeg/jpeg-core.h
@@ -3,7 +3,7 @@
  * Copyright (c) 2011 Samsung Electronics Co., Ltd.
  *		http://www.samsung.com
  *
- * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ * Author: Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.c b/drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.c
index b5f20e722b63..59c6263a71bf 100644
--- a/drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.c
+++ b/drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.c
@@ -3,7 +3,7 @@
  * Copyright (c) 2011 Samsung Electronics Co., Ltd.
  *		http://www.samsung.com
  *
- * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ * Author: Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.h b/drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.h
index f208fa3ed738..bfe746f8f750 100644
--- a/drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.h
+++ b/drivers/media/platform/s5p-jpeg/jpeg-hw-s5p.h
@@ -3,7 +3,7 @@
  * Copyright (c) 2011 Samsung Electronics Co., Ltd.
  *		http://www.samsung.com
  *
- * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ * Author: Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/drivers/media/platform/s5p-jpeg/jpeg-regs.h b/drivers/media/platform/s5p-jpeg/jpeg-regs.h
index df790b10140c..574f0e8021e5 100644
--- a/drivers/media/platform/s5p-jpeg/jpeg-regs.h
+++ b/drivers/media/platform/s5p-jpeg/jpeg-regs.h
@@ -5,7 +5,7 @@
  * Copyright (c) 2011-2014 Samsung Electronics Co., Ltd.
  *		http://www.samsung.com
  *
- * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ * Author: Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>
  * Author: Jacek Anaszewski <j.anaszewski@samsung.com>
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/include/media/videobuf2-dma-sg.h b/include/media/videobuf2-dma-sg.h
index 52afa0e2bb17..f28fcb0cfac7 100644
--- a/include/media/videobuf2-dma-sg.h
+++ b/include/media/videobuf2-dma-sg.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2010 Samsung Electronics
  *
- * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ * Author: Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
-- 
cgit v1.2.3


From 4b837c6d7ee771f68a30f362c9f68171a95be222 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 14 Jan 2019 09:01:54 -0500
Subject: media: v4l: uAPI: V4L2_BUF_TYPE_META_OUTPUT is an output buffer type

V4L2_BUF_TYPE_META_OUTPUT was added by commit 72148d1a57e7
("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT") but the patch
missed adding the type to the macro telling whether a given type is an
output type or not. Do that now. Getting this wrong leads to handling the
buffer as a capture buffer in a lot of places.

Fixes: 72148d1a57e7 ("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT")

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/videodev2.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index d6eed479c3a6..c0c36c165bf4 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -161,7 +161,8 @@ enum v4l2_buf_type {
 	 || (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY	\
 	 || (type) == V4L2_BUF_TYPE_VBI_OUTPUT			\
 	 || (type) == V4L2_BUF_TYPE_SLICED_VBI_OUTPUT		\
-	 || (type) == V4L2_BUF_TYPE_SDR_OUTPUT)
+	 || (type) == V4L2_BUF_TYPE_SDR_OUTPUT			\
+	 || (type) == V4L2_BUF_TYPE_META_OUTPUT)
 
 enum v4l2_tuner_type {
 	V4L2_TUNER_RADIO	     = 1,
-- 
cgit v1.2.3


From 50656bad786d001b294764e9f047c5d5b3e4db75 Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Thu, 10 Jan 2019 11:56:09 -0500
Subject: media: v4l2-ctrl: Add control to enable h.264 constrained intra
 prediction

Allow to enable h.264 constrained intra prediction (macroblocks using
intra prediction modes are not allowed to use residual data and decoded
samples of neighboring macroblocks coded using inter prediction modes).
This control directly corresponds to the constrained_intra_pred_flag
field in the h.264 picture parameter set.

Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/extended-controls.rst | 4 ++++
 drivers/media/v4l2-core/v4l2-ctrls.c               | 2 ++
 include/uapi/linux/v4l2-controls.h                 | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst
index af4273aa5e85..235d0c293983 100644
--- a/Documentation/media/uapi/v4l/extended-controls.rst
+++ b/Documentation/media/uapi/v4l/extended-controls.rst
@@ -1154,6 +1154,10 @@ enum v4l2_mpeg_video_h264_entropy_mode -
 ``V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM (boolean)``
     Enable 8X8 transform for H264. Applicable to the H264 encoder.
 
+``V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (boolean)``
+    Enable constrained intra prediction for H264. Applicable to the H264
+    encoder.
+
 ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB (integer)``
     Cyclic intra macroblock refresh. This is the number of continuous
     macroblocks refreshed every frame. Each frame a successive set of
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 8c47d8f00429..50b56185c02e 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -825,6 +825,8 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER:return "H264 Number of HC Layers";
 	case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP:
 								return "H264 Set QP Value for HC Layers";
+	case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION:
+								return "H264 Constrained Intra Pred";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP:		return "MPEG4 I-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP:		return "MPEG4 P-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP:		return "MPEG4 B-Frame QP Value";
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 3dcfc6148f99..fd65c710b144 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -533,6 +533,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type {
 };
 #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER	(V4L2_CID_MPEG_BASE+381)
 #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP	(V4L2_CID_MPEG_BASE+382)
+#define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION	(V4L2_CID_MPEG_BASE+383)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP	(V4L2_CID_MPEG_BASE+400)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP	(V4L2_CID_MPEG_BASE+401)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP	(V4L2_CID_MPEG_BASE+402)
-- 
cgit v1.2.3


From d034696cbe5a6e00f76ca4b7869c6cdef66aebd5 Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Thu, 10 Jan 2019 11:56:10 -0500
Subject: media: v4l2-ctrl: Add control for h.264 chroma qp offset

Allow to add fixed quantization parameter offset between luma and
chroma quantization parameters. This control directly corresponds
to the chroma_qp_index_offset field of the h.264 picture parameter
set.

Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/extended-controls.rst | 5 +++++
 drivers/media/v4l2-core/v4l2-ctrls.c               | 1 +
 include/uapi/linux/v4l2-controls.h                 | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst
index 235d0c293983..00934efdc9e4 100644
--- a/Documentation/media/uapi/v4l/extended-controls.rst
+++ b/Documentation/media/uapi/v4l/extended-controls.rst
@@ -1158,6 +1158,11 @@ enum v4l2_mpeg_video_h264_entropy_mode -
     Enable constrained intra prediction for H264. Applicable to the H264
     encoder.
 
+``V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET (integer)``
+    Specify the offset that should be added to the luma quantization
+    parameter to determine the chroma quantization parameter. Applicable
+    to the H264 encoder.
+
 ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB (integer)``
     Cyclic intra macroblock refresh. This is the number of continuous
     macroblocks refreshed every frame. Each frame a successive set of
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 50b56185c02e..99308dac2daa 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -827,6 +827,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 								return "H264 Set QP Value for HC Layers";
 	case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION:
 								return "H264 Constrained Intra Pred";
+	case V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET:	return "H264 Chroma QP Index Offset";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP:		return "MPEG4 I-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP:		return "MPEG4 P-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP:		return "MPEG4 B-Frame QP Value";
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index fd65c710b144..06479f2fb3ae 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -534,6 +534,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type {
 #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER	(V4L2_CID_MPEG_BASE+381)
 #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP	(V4L2_CID_MPEG_BASE+382)
 #define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION	(V4L2_CID_MPEG_BASE+383)
+#define V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET		(V4L2_CID_MPEG_BASE+384)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP	(V4L2_CID_MPEG_BASE+400)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP	(V4L2_CID_MPEG_BASE+401)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP	(V4L2_CID_MPEG_BASE+402)
-- 
cgit v1.2.3


From 1c3721b1f22286033abeda30b7e12439b083ed0f Mon Sep 17 00:00:00 2001
From: Steve Longerbeam <slongerbeam@gmail.com>
Date: Wed, 9 Jan 2019 13:30:04 -0500
Subject: media: videodev2.h: Add more field helper macros

Adds two helper macros:

V4L2_FIELD_IS_SEQUENTIAL: returns true if the given field type is
'sequential', that is a full frame is transmitted, or exists in
memory, as all top field lines followed by all bottom field lines,
or vice-versa.

V4L2_FIELD_IS_INTERLACED: returns true if the given field type is
'interlaced', that is a full frame is transmitted, or exists in
memory, as top field lines interlaced with bottom field lines.

Signed-off-by: Steve Longerbeam <slongerbeam@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/videodev2.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index c0c36c165bf4..9a920f071ff9 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -130,6 +130,13 @@ enum v4l2_field {
 	((field) == V4L2_FIELD_BOTTOM ||\
 	 (field) == V4L2_FIELD_TOP ||\
 	 (field) == V4L2_FIELD_ALTERNATE)
+#define V4L2_FIELD_IS_INTERLACED(field) \
+	((field) == V4L2_FIELD_INTERLACED ||\
+	 (field) == V4L2_FIELD_INTERLACED_TB ||\
+	 (field) == V4L2_FIELD_INTERLACED_BT)
+#define V4L2_FIELD_IS_SEQUENTIAL(field) \
+	((field) == V4L2_FIELD_SEQ_TB ||\
+	 (field) == V4L2_FIELD_SEQ_BT)
 
 enum v4l2_buf_type {
 	V4L2_BUF_TYPE_VIDEO_CAPTURE        = 1,
-- 
cgit v1.2.3


From fc8c723852380b0f4349bb06cf553a6bba47462e Mon Sep 17 00:00:00 2001
From: Steve Longerbeam <slongerbeam@gmail.com>
Date: Wed, 9 Jan 2019 13:30:05 -0500
Subject: media: gpu: ipu-csi: Swap fields according to input/output field
 types

The function ipu_csi_init_interface() was inverting the F-bit for
NTSC case, in the CCIR_CODE_1/2 registers. The result being that
for NTSC bottom-top field order, the CSI would swap fields and
capture in top-bottom order.

Instead, base field swap on the field order of the input to the CSI,
and the field order of the requested output. If the input/output
fields are sequential but different, swap fields, otherwise do
not swap. This requires passing both the input and output mbus
frame formats to ipu_csi_init_interface().

Move this code to a new private function ipu_csi_set_bt_interlaced_codes()
that programs the CCIR_CODE_1/2 registers for interlaced BT.656 (and
possibly interlaced BT.1120 in the future).

When detecting input video standard from the input frame width/height,
make sure to double height if input field type is alternate, since
in that case input height only includes lines for one field.

Signed-off-by: Steve Longerbeam <slongerbeam@gmail.com>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/gpu/ipu-v3/ipu-csi.c              | 126 ++++++++++++++++++++----------
 drivers/staging/media/imx/imx-media-csi.c |   7 +-
 include/video/imx-ipu-v3.h                |   5 +-
 3 files changed, 89 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/ipu-v3/ipu-csi.c b/drivers/gpu/ipu-v3/ipu-csi.c
index aa0e30a2ba18..d1e575571a8d 100644
--- a/drivers/gpu/ipu-v3/ipu-csi.c
+++ b/drivers/gpu/ipu-v3/ipu-csi.c
@@ -325,12 +325,21 @@ static int mbus_code_to_bus_cfg(struct ipu_csi_bus_config *cfg, u32 mbus_code,
 	return 0;
 }
 
+/* translate alternate field mode based on given standard */
+static inline enum v4l2_field
+ipu_csi_translate_field(enum v4l2_field field, v4l2_std_id std)
+{
+	return (field != V4L2_FIELD_ALTERNATE) ? field :
+		((std & V4L2_STD_525_60) ?
+		 V4L2_FIELD_SEQ_BT : V4L2_FIELD_SEQ_TB);
+}
+
 /*
  * Fill a CSI bus config struct from mbus_config and mbus_framefmt.
  */
 static int fill_csi_bus_cfg(struct ipu_csi_bus_config *csicfg,
-				 struct v4l2_mbus_config *mbus_cfg,
-				 struct v4l2_mbus_framefmt *mbus_fmt)
+			    const struct v4l2_mbus_config *mbus_cfg,
+			    const struct v4l2_mbus_framefmt *mbus_fmt)
 {
 	int ret;
 
@@ -374,22 +383,76 @@ static int fill_csi_bus_cfg(struct ipu_csi_bus_config *csicfg,
 	return 0;
 }
 
+static int
+ipu_csi_set_bt_interlaced_codes(struct ipu_csi *csi,
+				const struct v4l2_mbus_framefmt *infmt,
+				const struct v4l2_mbus_framefmt *outfmt,
+				v4l2_std_id std)
+{
+	enum v4l2_field infield, outfield;
+	bool swap_fields;
+
+	/* get translated field type of input and output */
+	infield = ipu_csi_translate_field(infmt->field, std);
+	outfield = ipu_csi_translate_field(outfmt->field, std);
+
+	/*
+	 * Write the H-V-F codes the CSI will match against the
+	 * incoming data for start/end of active and blanking
+	 * field intervals. If input and output field types are
+	 * sequential but not the same (one is SEQ_BT and the other
+	 * is SEQ_TB), swap the F-bit so that the CSI will capture
+	 * field 1 lines before field 0 lines.
+	 */
+	swap_fields = (V4L2_FIELD_IS_SEQUENTIAL(infield) &&
+		       V4L2_FIELD_IS_SEQUENTIAL(outfield) &&
+		       infield != outfield);
+
+	if (!swap_fields) {
+		/*
+		 * Field0BlankEnd  = 110, Field0BlankStart  = 010
+		 * Field0ActiveEnd = 100, Field0ActiveStart = 000
+		 * Field1BlankEnd  = 111, Field1BlankStart  = 011
+		 * Field1ActiveEnd = 101, Field1ActiveStart = 001
+		 */
+		ipu_csi_write(csi, 0x40596 | CSI_CCIR_ERR_DET_EN,
+			      CSI_CCIR_CODE_1);
+		ipu_csi_write(csi, 0xD07DF, CSI_CCIR_CODE_2);
+	} else {
+		dev_dbg(csi->ipu->dev, "capture field swap\n");
+
+		/* same as above but with F-bit inverted */
+		ipu_csi_write(csi, 0xD07DF | CSI_CCIR_ERR_DET_EN,
+			      CSI_CCIR_CODE_1);
+		ipu_csi_write(csi, 0x40596, CSI_CCIR_CODE_2);
+	}
+
+	ipu_csi_write(csi, 0xFF0000, CSI_CCIR_CODE_3);
+
+	return 0;
+}
+
+
 int ipu_csi_init_interface(struct ipu_csi *csi,
-			   struct v4l2_mbus_config *mbus_cfg,
-			   struct v4l2_mbus_framefmt *mbus_fmt)
+			   const struct v4l2_mbus_config *mbus_cfg,
+			   const struct v4l2_mbus_framefmt *infmt,
+			   const struct v4l2_mbus_framefmt *outfmt)
 {
 	struct ipu_csi_bus_config cfg;
 	unsigned long flags;
 	u32 width, height, data = 0;
+	v4l2_std_id std;
 	int ret;
 
-	ret = fill_csi_bus_cfg(&cfg, mbus_cfg, mbus_fmt);
+	ret = fill_csi_bus_cfg(&cfg, mbus_cfg, infmt);
 	if (ret < 0)
 		return ret;
 
 	/* set default sensor frame width and height */
-	width = mbus_fmt->width;
-	height = mbus_fmt->height;
+	width = infmt->width;
+	height = infmt->height;
+	if (infmt->field == V4L2_FIELD_ALTERNATE)
+		height *= 2;
 
 	/* Set the CSI_SENS_CONF register remaining fields */
 	data |= cfg.data_width << CSI_SENS_CONF_DATA_WIDTH_SHIFT |
@@ -416,42 +479,22 @@ int ipu_csi_init_interface(struct ipu_csi *csi,
 		ipu_csi_write(csi, 0xFF0000, CSI_CCIR_CODE_3);
 		break;
 	case IPU_CSI_CLK_MODE_CCIR656_INTERLACED:
-		if (mbus_fmt->width == 720 && mbus_fmt->height == 576) {
-			/*
-			 * PAL case
-			 *
-			 * Field0BlankEnd = 0x6, Field0BlankStart = 0x2,
-			 * Field0ActiveEnd = 0x4, Field0ActiveStart = 0
-			 * Field1BlankEnd = 0x7, Field1BlankStart = 0x3,
-			 * Field1ActiveEnd = 0x5, Field1ActiveStart = 0x1
-			 */
-			height = 625; /* framelines for PAL */
-
-			ipu_csi_write(csi, 0x40596 | CSI_CCIR_ERR_DET_EN,
-					  CSI_CCIR_CODE_1);
-			ipu_csi_write(csi, 0xD07DF, CSI_CCIR_CODE_2);
-			ipu_csi_write(csi, 0xFF0000, CSI_CCIR_CODE_3);
-		} else if (mbus_fmt->width == 720 && mbus_fmt->height == 480) {
-			/*
-			 * NTSC case
-			 *
-			 * Field0BlankEnd = 0x7, Field0BlankStart = 0x3,
-			 * Field0ActiveEnd = 0x5, Field0ActiveStart = 0x1
-			 * Field1BlankEnd = 0x6, Field1BlankStart = 0x2,
-			 * Field1ActiveEnd = 0x4, Field1ActiveStart = 0
-			 */
-			height = 525; /* framelines for NTSC */
-
-			ipu_csi_write(csi, 0xD07DF | CSI_CCIR_ERR_DET_EN,
-					  CSI_CCIR_CODE_1);
-			ipu_csi_write(csi, 0x40596, CSI_CCIR_CODE_2);
-			ipu_csi_write(csi, 0xFF0000, CSI_CCIR_CODE_3);
+		if (width == 720 && height == 480) {
+			std = V4L2_STD_NTSC;
+			height = 525;
+		} else if (width == 720 && height == 576) {
+			std = V4L2_STD_PAL;
+			height = 625;
 		} else {
 			dev_err(csi->ipu->dev,
-				"Unsupported CCIR656 interlaced video mode\n");
-			spin_unlock_irqrestore(&csi->lock, flags);
-			return -EINVAL;
+				"Unsupported interlaced video mode\n");
+			ret = -EINVAL;
+			goto out_unlock;
 		}
+
+		ret = ipu_csi_set_bt_interlaced_codes(csi, infmt, outfmt, std);
+		if (ret)
+			goto out_unlock;
 		break;
 	case IPU_CSI_CLK_MODE_CCIR1120_PROGRESSIVE_DDR:
 	case IPU_CSI_CLK_MODE_CCIR1120_PROGRESSIVE_SDR:
@@ -476,9 +519,10 @@ int ipu_csi_init_interface(struct ipu_csi *csi,
 	dev_dbg(csi->ipu->dev, "CSI_ACT_FRM_SIZE = 0x%08X\n",
 		ipu_csi_read(csi, CSI_ACT_FRM_SIZE));
 
+out_unlock:
 	spin_unlock_irqrestore(&csi->lock, flags);
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ipu_csi_init_interface);
 
diff --git a/drivers/staging/media/imx/imx-media-csi.c b/drivers/staging/media/imx/imx-media-csi.c
index 4223f8d418ae..c2a8d9cd31b7 100644
--- a/drivers/staging/media/imx/imx-media-csi.c
+++ b/drivers/staging/media/imx/imx-media-csi.c
@@ -679,12 +679,7 @@ static int csi_setup(struct csi_priv *priv)
 		priv->upstream_ep.bus.parallel.flags :
 		priv->upstream_ep.bus.mipi_csi2.flags;
 
-	/*
-	 * we need to pass input frame to CSI interface, but
-	 * with translated field type from output format
-	 */
 	if_fmt = *infmt;
-	if_fmt.field = outfmt->field;
 	crop = priv->crop;
 
 	/*
@@ -702,7 +697,7 @@ static int csi_setup(struct csi_priv *priv)
 			     priv->crop.width == 2 * priv->compose.width,
 			     priv->crop.height == 2 * priv->compose.height);
 
-	ipu_csi_init_interface(priv->csi, &mbus_cfg, &if_fmt);
+	ipu_csi_init_interface(priv->csi, &mbus_cfg, &if_fmt, outfmt);
 
 	ipu_csi_set_dest(priv->csi, priv->dest);
 
diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h
index e582e8e7527a..bbc8481f567d 100644
--- a/include/video/imx-ipu-v3.h
+++ b/include/video/imx-ipu-v3.h
@@ -354,8 +354,9 @@ int ipu_prg_channel_configure(struct ipuv3_channel *ipu_chan,
  */
 struct ipu_csi;
 int ipu_csi_init_interface(struct ipu_csi *csi,
-			   struct v4l2_mbus_config *mbus_cfg,
-			   struct v4l2_mbus_framefmt *mbus_fmt);
+			   const struct v4l2_mbus_config *mbus_cfg,
+			   const struct v4l2_mbus_framefmt *infmt,
+			   const struct v4l2_mbus_framefmt *outfmt);
 bool ipu_csi_is_interlaced(struct ipu_csi *csi);
 void ipu_csi_get_window(struct ipu_csi *csi, struct v4l2_rect *w);
 void ipu_csi_set_window(struct ipu_csi *csi, struct v4l2_rect *w);
-- 
cgit v1.2.3


From 9b5c8d5ffb20cc4f4b1476cb1ad57e0f9b934788 Mon Sep 17 00:00:00 2001
From: Steve Longerbeam <slongerbeam@gmail.com>
Date: Wed, 9 Jan 2019 13:30:06 -0500
Subject: media: gpu: ipu-v3: Add planar support to interlaced scan

To support interlaced scan with planar formats, cpmem SLUV must
be programmed with the correct chroma line stride. For full and
partial planar 4:2:2 (YUV422P, NV16), chroma line stride must
be doubled. For full and partial planar 4:2:0 (YUV420, YVU420, NV12),
chroma line stride must _not_ be doubled, since a single chroma line
is shared by two luma lines.

Signed-off-by: Steve Longerbeam <slongerbeam@gmail.com>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/gpu/ipu-v3/ipu-cpmem.c              | 26 ++++++++++++++++++++++++--
 drivers/staging/media/imx/imx-ic-prpencvf.c |  3 ++-
 drivers/staging/media/imx/imx-media-csi.c   |  3 ++-
 include/video/imx-ipu-v3.h                  |  3 ++-
 4 files changed, 30 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/ipu-v3/ipu-cpmem.c b/drivers/gpu/ipu-v3/ipu-cpmem.c
index 163fadb8a33a..d047a6867c59 100644
--- a/drivers/gpu/ipu-v3/ipu-cpmem.c
+++ b/drivers/gpu/ipu-v3/ipu-cpmem.c
@@ -277,9 +277,10 @@ void ipu_cpmem_set_uv_offset(struct ipuv3_channel *ch, u32 u_off, u32 v_off)
 }
 EXPORT_SYMBOL_GPL(ipu_cpmem_set_uv_offset);
 
-void ipu_cpmem_interlaced_scan(struct ipuv3_channel *ch, int stride)
+void ipu_cpmem_interlaced_scan(struct ipuv3_channel *ch, int stride,
+			       u32 pixelformat)
 {
-	u32 ilo, sly;
+	u32 ilo, sly, sluv;
 
 	if (stride < 0) {
 		stride = -stride;
@@ -290,9 +291,30 @@ void ipu_cpmem_interlaced_scan(struct ipuv3_channel *ch, int stride)
 
 	sly = (stride * 2) - 1;
 
+	switch (pixelformat) {
+	case V4L2_PIX_FMT_YUV420:
+	case V4L2_PIX_FMT_YVU420:
+		sluv = stride / 2 - 1;
+		break;
+	case V4L2_PIX_FMT_NV12:
+		sluv = stride - 1;
+		break;
+	case V4L2_PIX_FMT_YUV422P:
+		sluv = stride - 1;
+		break;
+	case V4L2_PIX_FMT_NV16:
+		sluv = stride * 2 - 1;
+		break;
+	default:
+		sluv = 0;
+		break;
+	}
+
 	ipu_ch_param_write_field(ch, IPU_FIELD_SO, 1);
 	ipu_ch_param_write_field(ch, IPU_FIELD_ILO, ilo);
 	ipu_ch_param_write_field(ch, IPU_FIELD_SLY, sly);
+	if (sluv)
+		ipu_ch_param_write_field(ch, IPU_FIELD_SLUV, sluv);
 };
 EXPORT_SYMBOL_GPL(ipu_cpmem_interlaced_scan);
 
diff --git a/drivers/staging/media/imx/imx-ic-prpencvf.c b/drivers/staging/media/imx/imx-ic-prpencvf.c
index 28f41caba05d..af7224846bd5 100644
--- a/drivers/staging/media/imx/imx-ic-prpencvf.c
+++ b/drivers/staging/media/imx/imx-ic-prpencvf.c
@@ -412,7 +412,8 @@ static int prp_setup_channel(struct prp_priv *priv,
 	if (image.pix.field == V4L2_FIELD_NONE &&
 	    V4L2_FIELD_HAS_BOTH(infmt->field) &&
 	    channel == priv->out_ch)
-		ipu_cpmem_interlaced_scan(channel, image.pix.bytesperline);
+		ipu_cpmem_interlaced_scan(channel, image.pix.bytesperline,
+					  image.pix.pixelformat);
 
 	ret = ipu_ic_task_idma_init(priv->ic, channel,
 				    image.pix.width, image.pix.height,
diff --git a/drivers/staging/media/imx/imx-media-csi.c b/drivers/staging/media/imx/imx-media-csi.c
index c2a8d9cd31b7..da4808348845 100644
--- a/drivers/staging/media/imx/imx-media-csi.c
+++ b/drivers/staging/media/imx/imx-media-csi.c
@@ -512,7 +512,8 @@ static int csi_idmac_setup_channel(struct csi_priv *priv)
 	if (image.pix.field == V4L2_FIELD_NONE &&
 	    V4L2_FIELD_HAS_BOTH(infmt->field))
 		ipu_cpmem_interlaced_scan(priv->idmac_ch,
-					  image.pix.bytesperline);
+					  image.pix.bytesperline,
+					  image.pix.pixelformat);
 
 	ipu_idmac_set_double_buffer(priv->idmac_ch, true);
 
diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h
index bbc8481f567d..c887f4bee5f8 100644
--- a/include/video/imx-ipu-v3.h
+++ b/include/video/imx-ipu-v3.h
@@ -258,7 +258,8 @@ void ipu_cpmem_set_stride(struct ipuv3_channel *ch, int stride);
 void ipu_cpmem_set_high_priority(struct ipuv3_channel *ch);
 void ipu_cpmem_set_buffer(struct ipuv3_channel *ch, int bufnum, dma_addr_t buf);
 void ipu_cpmem_set_uv_offset(struct ipuv3_channel *ch, u32 u_off, u32 v_off);
-void ipu_cpmem_interlaced_scan(struct ipuv3_channel *ch, int stride);
+void ipu_cpmem_interlaced_scan(struct ipuv3_channel *ch, int stride,
+			       u32 pixelformat);
 void ipu_cpmem_set_axi_id(struct ipuv3_channel *ch, u32 id);
 int ipu_cpmem_get_burstsize(struct ipuv3_channel *ch);
 void ipu_cpmem_set_burstsize(struct ipuv3_channel *ch, int burstsize);
-- 
cgit v1.2.3


From 6d7fbce7da0cd06ff3f3f30e009a15a6243f0bc0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Jan 2019 12:02:57 -0500
Subject: kill kernfs_pin_sb()

unused now and impossible to use safely anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/kernfs/mount.c      | 30 ------------------------------
 include/linux/kernfs.h |  1 -
 2 files changed, 31 deletions(-)

(limited to 'include')

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d71c9405874a..4d303047a4f8 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -377,36 +377,6 @@ void kernfs_kill_sb(struct super_block *sb)
 	kfree(info);
 }
 
-/**
- * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
- * @kernfs_root: the kernfs_root in question
- * @ns: the namespace tag
- *
- * Pin the superblock so the superblock won't be destroyed in subsequent
- * operations.  This can be used to block ->kill_sb() which may be useful
- * for kernfs users which dynamically manage superblocks.
- *
- * Returns NULL if there's no superblock associated to this kernfs_root, or
- * -EINVAL if the superblock is being freed.
- */
-struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
-{
-	struct kernfs_super_info *info;
-	struct super_block *sb = NULL;
-
-	mutex_lock(&kernfs_mutex);
-	list_for_each_entry(info, &root->supers, node) {
-		if (info->ns == ns) {
-			sb = info->sb;
-			if (!atomic_inc_not_zero(&info->sb->s_active))
-				sb = ERR_PTR(-EINVAL);
-			break;
-		}
-	}
-	mutex_unlock(&kernfs_mutex);
-	return sb;
-}
-
 void __init kernfs_init(void)
 {
 
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 5b36b1287a5a..44acb4c3659c 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -357,7 +357,6 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 			       struct kernfs_root *root, unsigned long magic,
 			       bool *new_sb_created, const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
-struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 
 void kernfs_init(void);
 
-- 
cgit v1.2.3


From 7527a7b157d1191b23562ed70154ae93bd65f845 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 17 Jan 2019 20:14:15 +0200
Subject: IB/core: Simplify rdma cgroup registration

RDMA cgroup registration routine always returns success, so simplify
function to be void and run clang formatter over whole CONFIG_CGROUP_RDMA
art of core_priv.h.

This reduces unwinding error path for regular registration and future net
namespace change functionality for rdma device.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cgroup.c    |  5 ++---
 drivers/infiniband/core/core_priv.h | 17 +++++++++++------
 drivers/infiniband/core/device.c    |  8 +-------
 include/linux/cgroup_rdma.h         |  2 +-
 kernel/cgroup/rdma.c                |  5 +----
 5 files changed, 16 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
index 126ac5f99db7..388fd04e5f63 100644
--- a/drivers/infiniband/core/cgroup.c
+++ b/drivers/infiniband/core/cgroup.c
@@ -21,12 +21,11 @@
  * Register with the rdma cgroup. Should be called before
  * exposing rdma device to user space applications to avoid
  * resource accounting leak.
- * Returns 0 on success or otherwise failure code.
  */
-int ib_device_register_rdmacg(struct ib_device *device)
+void ib_device_register_rdmacg(struct ib_device *device)
 {
 	device->cg_device.name = device->name;
-	return rdmacg_register_device(&device->cg_device);
+	rdmacg_register_device(&device->cg_device);
 }
 
 /**
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index aca75c74e451..42a49982f66e 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -115,7 +115,7 @@ void ib_cache_cleanup_one(struct ib_device *device);
 void ib_cache_release_one(struct ib_device *device);
 
 #ifdef CONFIG_CGROUP_RDMA
-int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_register_rdmacg(struct ib_device *device);
 void ib_device_unregister_rdmacg(struct ib_device *device);
 
 int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
@@ -126,21 +126,26 @@ void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
 			struct ib_device *device,
 			enum rdmacg_resource_type resource_index);
 #else
-static inline int ib_device_register_rdmacg(struct ib_device *device)
-{ return 0; }
+static inline void ib_device_register_rdmacg(struct ib_device *device)
+{
+}
 
 static inline void ib_device_unregister_rdmacg(struct ib_device *device)
-{ }
+{
+}
 
 static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
 				       struct ib_device *device,
 				       enum rdmacg_resource_type resource_index)
-{ return 0; }
+{
+	return 0;
+}
 
 static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
 				      struct ib_device *device,
 				      enum rdmacg_resource_type resource_index)
-{ }
+{
+}
 #endif
 
 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 4a9aa6d10c5e..200431c540f2 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -599,12 +599,7 @@ int ib_register_device(struct ib_device *device, const char *name)
 
 	device->index = __dev_new_index();
 
-	ret = ib_device_register_rdmacg(device);
-	if (ret) {
-		dev_warn(&device->dev,
-			 "Couldn't register device with rdma cgroup\n");
-		goto dev_cleanup;
-	}
+	ib_device_register_rdmacg(device);
 
 	ret = ib_device_register_sysfs(device);
 	if (ret) {
@@ -627,7 +622,6 @@ int ib_register_device(struct ib_device *device, const char *name)
 
 cg_cleanup:
 	ib_device_unregister_rdmacg(device);
-dev_cleanup:
 	cleanup_device(device);
 out:
 	mutex_unlock(&device_mutex);
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
index e94290b29e99..ef1bae2983f3 100644
--- a/include/linux/cgroup_rdma.h
+++ b/include/linux/cgroup_rdma.h
@@ -39,7 +39,7 @@ struct rdmacg_device {
  * APIs for RDMA/IB stack to publish when a device wants to
  * participate in resource accounting
  */
-int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_register_device(struct rdmacg_device *device);
 void rdmacg_unregister_device(struct rdmacg_device *device);
 
 /* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index d3bbb757ee49..1d75ae7f1cb7 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -313,10 +313,8 @@ EXPORT_SYMBOL(rdmacg_try_charge);
  * If IB stack wish a device to participate in rdma cgroup resource
  * tracking, it must invoke this API to register with rdma cgroup before
  * any user space application can start using the RDMA resources.
- * Returns 0 on success or EINVAL when table length given is beyond
- * supported size.
  */
-int rdmacg_register_device(struct rdmacg_device *device)
+void rdmacg_register_device(struct rdmacg_device *device)
 {
 	INIT_LIST_HEAD(&device->dev_node);
 	INIT_LIST_HEAD(&device->rpools);
@@ -324,7 +322,6 @@ int rdmacg_register_device(struct rdmacg_device *device)
 	mutex_lock(&rdmacg_mutex);
 	list_add_tail(&device->dev_node, &rdmacg_devices);
 	mutex_unlock(&rdmacg_mutex);
-	return 0;
 }
 EXPORT_SYMBOL(rdmacg_register_device);
 
-- 
cgit v1.2.3


From f5d782d46aa5d4dd369e6560ce5227136b58926f Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sebastian.reichel@collabora.com>
Date: Thu, 13 Dec 2018 02:38:58 +0100
Subject: power: supply: isp1704: switch to gpiod API

This migrates isp1704 driver from old GPIO API to new descriptor
based GPIO API and drops useless platform data as a side-effect.

Migration is simple, since all mainline users are DT based and
DT API does not change. Out of tree users of the platform data
need to migrate to gpiod_lookup_table as described here:

Documentation/driver-api/gpio/board.rst

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/isp1704_charger.c | 60 +++++++---------------------------
 include/linux/power/isp1704_charger.h  | 30 -----------------
 2 files changed, 12 insertions(+), 78 deletions(-)
 delete mode 100644 include/linux/power/isp1704_charger.h

(limited to 'include')

diff --git a/drivers/power/supply/isp1704_charger.c b/drivers/power/supply/isp1704_charger.c
index 95af5f305838..a63cb5dcfa08 100644
--- a/drivers/power/supply/isp1704_charger.c
+++ b/drivers/power/supply/isp1704_charger.c
@@ -30,13 +30,12 @@
 #include <linux/power_supply.h>
 #include <linux/delay.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 
+#include <linux/gpio/consumer.h>
 #include <linux/usb/otg.h>
 #include <linux/usb/ulpi.h>
 #include <linux/usb/ch9.h>
 #include <linux/usb/gadget.h>
-#include <linux/power/isp1704_charger.h>
 
 /* Vendor specific Power Control register */
 #define ISP1704_PWR_CTRL		0x3d
@@ -60,6 +59,7 @@ struct isp1704_charger {
 	struct device			*dev;
 	struct power_supply		*psy;
 	struct power_supply_desc	psy_desc;
+	struct gpio_desc		*enable_gpio;
 	struct usb_phy			*phy;
 	struct notifier_block		nb;
 	struct work_struct		work;
@@ -81,18 +81,9 @@ static inline int isp1704_write(struct isp1704_charger *isp, u32 reg, u32 val)
 	return usb_phy_io_write(isp->phy, val, reg);
 }
 
-/*
- * Disable/enable the power from the isp1704 if a function for it
- * has been provided with platform data.
- */
 static void isp1704_charger_set_power(struct isp1704_charger *isp, bool on)
 {
-	struct isp1704_charger_data	*board = isp->dev->platform_data;
-
-	if (board && board->set_power)
-		board->set_power(on);
-	else if (board)
-		gpio_set_value(board->enable_gpio, on);
+	gpiod_set_value(isp->enable_gpio, on);
 }
 
 /*
@@ -405,46 +396,19 @@ static int isp1704_charger_probe(struct platform_device *pdev)
 	int			ret = -ENODEV;
 	struct power_supply_config psy_cfg = {};
 
-	struct isp1704_charger_data *pdata = dev_get_platdata(&pdev->dev);
-	struct device_node *np = pdev->dev.of_node;
-
-	if (np) {
-		int gpio = of_get_named_gpio(np, "nxp,enable-gpio", 0);
-
-		if (gpio < 0) {
-			dev_err(&pdev->dev, "missing DT GPIO nxp,enable-gpio\n");
-			return gpio;
-		}
-
-		pdata = devm_kzalloc(&pdev->dev,
-			sizeof(struct isp1704_charger_data), GFP_KERNEL);
-		if (!pdata) {
-			ret = -ENOMEM;
-			goto fail0;
-		}
-		pdata->enable_gpio = gpio;
-
-		dev_info(&pdev->dev, "init gpio %d\n", pdata->enable_gpio);
-
-		ret = devm_gpio_request_one(&pdev->dev, pdata->enable_gpio,
-					GPIOF_OUT_INIT_HIGH, "isp1704_reset");
-		if (ret) {
-			dev_err(&pdev->dev, "gpio request failed\n");
-			goto fail0;
-		}
-	}
-
-	if (!pdata) {
-		dev_err(&pdev->dev, "missing platform data!\n");
-		return -ENODEV;
-	}
-
-
 	isp = devm_kzalloc(&pdev->dev, sizeof(*isp), GFP_KERNEL);
 	if (!isp)
 		return -ENOMEM;
 
-	if (np)
+	isp->enable_gpio = devm_gpiod_get(&pdev->dev, "nxp,enable",
+					  GPIOD_OUT_HIGH);
+	if (IS_ERR(isp->enable_gpio)) {
+		ret = PTR_ERR(isp->enable_gpio);
+		dev_err(&pdev->dev, "Could not get reset gpio: %d\n", ret);
+		return ret;
+	}
+
+	if (pdev->dev.of_node)
 		isp->phy = devm_usb_get_phy_by_phandle(&pdev->dev, "usb-phy", 0);
 	else
 		isp->phy = devm_usb_get_phy(&pdev->dev, USB_PHY_TYPE_USB2);
diff --git a/include/linux/power/isp1704_charger.h b/include/linux/power/isp1704_charger.h
deleted file mode 100644
index 0105d9e7af85..000000000000
--- a/include/linux/power/isp1704_charger.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * ISP1704 USB Charger Detection driver
- *
- * Copyright (C) 2011 Nokia Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-
-#ifndef __ISP1704_CHARGER_H
-#define __ISP1704_CHARGER_H
-
-struct isp1704_charger_data {
-	void		(*set_power)(bool on);
-	int		enable_gpio;
-};
-
-#endif
-- 
cgit v1.2.3


From 2a6efbc6da5d248c4fbe7fdb7de7f3eff1de95a4 Mon Sep 17 00:00:00 2001
From: Fabrizio Castro <fabrizio.castro@bp.renesas.com>
Date: Wed, 16 Jan 2019 19:45:45 +0000
Subject: clk: renesas: r8a774c0: Add missing CANFD clock

This patch adds the missing CANFD clock to the r8a774c0 specific
clock driver.

Signed-off-by: Fabrizio Castro <fabrizio.castro@bp.renesas.com>
Reviewed-by: Chris Paterson <Chris.Paterson2@renesas.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/clk/renesas/r8a774c0-cpg-mssr.c       | 4 ++++
 include/dt-bindings/clock/r8a774c0-cpg-mssr.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/clk/renesas/r8a774c0-cpg-mssr.c b/drivers/clk/renesas/r8a774c0-cpg-mssr.c
index 10b96895d452..28bcc8105d57 100644
--- a/drivers/clk/renesas/r8a774c0-cpg-mssr.c
+++ b/drivers/clk/renesas/r8a774c0-cpg-mssr.c
@@ -33,6 +33,7 @@ enum clk_ids {
 	CLK_PLL1,
 	CLK_PLL3,
 	CLK_PLL0D4,
+	CLK_PLL0D6,
 	CLK_PLL0D8,
 	CLK_PLL0D20,
 	CLK_PLL0D24,
@@ -61,6 +62,7 @@ static const struct cpg_core_clk r8a774c0_core_clks[] __initconst = {
 
 	DEF_FIXED(".pll0",     CLK_PLL0,           CLK_MAIN,	   1, 100),
 	DEF_FIXED(".pll0d4",   CLK_PLL0D4,         CLK_PLL0,       4, 1),
+	DEF_FIXED(".pll0d6",   CLK_PLL0D6,         CLK_PLL0,       6, 1),
 	DEF_FIXED(".pll0d8",   CLK_PLL0D8,         CLK_PLL0,       8, 1),
 	DEF_FIXED(".pll0d20",  CLK_PLL0D20,        CLK_PLL0,      20, 1),
 	DEF_FIXED(".pll0d24",  CLK_PLL0D24,        CLK_PLL0,      24, 1),
@@ -112,6 +114,7 @@ static const struct cpg_core_clk r8a774c0_core_clks[] __initconst = {
 	DEF_GEN3_PE("s3d2c",   R8A774C0_CLK_S3D2C, CLK_S3, 2, CLK_PE, 2),
 	DEF_GEN3_PE("s3d4c",   R8A774C0_CLK_S3D4C, CLK_S3, 4, CLK_PE, 4),
 
+	DEF_DIV6P1("canfd",    R8A774C0_CLK_CANFD, CLK_PLL0D6, 0x244),
 	DEF_DIV6P1("csi0",     R8A774C0_CLK_CSI0,  CLK_PLL1D2, 0x00c),
 	DEF_DIV6P1("mso",      R8A774C0_CLK_MSO,   CLK_PLL1D2, 0x014),
 
@@ -187,6 +190,7 @@ static const struct mssr_mod_clk r8a774c0_mod_clks[] __initconst = {
 	DEF_MOD("gpio2",		 910,	R8A774C0_CLK_S3D4),
 	DEF_MOD("gpio1",		 911,	R8A774C0_CLK_S3D4),
 	DEF_MOD("gpio0",		 912,	R8A774C0_CLK_S3D4),
+	DEF_MOD("can-fd",		 914,	R8A774C0_CLK_S3D2),
 	DEF_MOD("can-if1",		 915,	R8A774C0_CLK_S3D4),
 	DEF_MOD("can-if0",		 916,	R8A774C0_CLK_S3D4),
 	DEF_MOD("i2c6",			 918,	R8A774C0_CLK_S3D2),
diff --git a/include/dt-bindings/clock/r8a774c0-cpg-mssr.h b/include/dt-bindings/clock/r8a774c0-cpg-mssr.h
index 8fe51b6aca28..8ad9cd6be8e9 100644
--- a/include/dt-bindings/clock/r8a774c0-cpg-mssr.h
+++ b/include/dt-bindings/clock/r8a774c0-cpg-mssr.h
@@ -56,5 +56,6 @@
 #define R8A774C0_CLK_CSI0		45
 #define R8A774C0_CLK_CP			46
 #define R8A774C0_CLK_CPEX		47
+#define R8A774C0_CLK_CANFD		48
 
 #endif /* __DT_BINDINGS_CLOCK_R8A774C0_CPG_MSSR_H__ */
-- 
cgit v1.2.3


From 9d034e151b407cbd2c66bc4c48b423f814533374 Mon Sep 17 00:00:00 2001
From: Fabrizio Castro <fabrizio.castro@bp.renesas.com>
Date: Thu, 17 Jan 2019 14:54:14 +0000
Subject: clk: renesas: r8a774a1: Add missing CANFD clock

This patch adds the missing CANFD clock to the r8a774a1 specific
clock driver.

Signed-off-by: Fabrizio Castro <fabrizio.castro@bp.renesas.com>
Reviewed-by: Chris Paterson <Chris.Paterson2@renesas.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/clk/renesas/r8a774a1-cpg-mssr.c       | 2 ++
 include/dt-bindings/clock/r8a774a1-cpg-mssr.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/clk/renesas/r8a774a1-cpg-mssr.c b/drivers/clk/renesas/r8a774a1-cpg-mssr.c
index 10e852518870..e10374119a91 100644
--- a/drivers/clk/renesas/r8a774a1-cpg-mssr.c
+++ b/drivers/clk/renesas/r8a774a1-cpg-mssr.c
@@ -102,6 +102,7 @@ static const struct cpg_core_clk r8a774a1_core_clks[] __initconst = {
 	DEF_FIXED("cp",         R8A774A1_CLK_CP,    CLK_EXTAL,      2, 1),
 	DEF_FIXED("cpex",       R8A774A1_CLK_CPEX,  CLK_EXTAL,      2, 1),
 
+	DEF_DIV6P1("canfd",     R8A774A1_CLK_CANFD, CLK_PLL1_DIV4, 0x244),
 	DEF_DIV6P1("csi0",      R8A774A1_CLK_CSI0,  CLK_PLL1_DIV4, 0x00c),
 	DEF_DIV6P1("mso",       R8A774A1_CLK_MSO,   CLK_PLL1_DIV4, 0x014),
 	DEF_DIV6P1("hdmi",      R8A774A1_CLK_HDMI,  CLK_PLL1_DIV4, 0x250),
@@ -191,6 +192,7 @@ static const struct mssr_mod_clk r8a774a1_mod_clks[] __initconst = {
 	DEF_MOD("gpio2",		 910,	R8A774A1_CLK_S3D4),
 	DEF_MOD("gpio1",		 911,	R8A774A1_CLK_S3D4),
 	DEF_MOD("gpio0",		 912,	R8A774A1_CLK_S3D4),
+	DEF_MOD("can-fd",		 914,	R8A774A1_CLK_S3D2),
 	DEF_MOD("can-if1",		 915,	R8A774A1_CLK_S3D4),
 	DEF_MOD("can-if0",		 916,	R8A774A1_CLK_S3D4),
 	DEF_MOD("i2c6",			 918,	R8A774A1_CLK_S0D6),
diff --git a/include/dt-bindings/clock/r8a774a1-cpg-mssr.h b/include/dt-bindings/clock/r8a774a1-cpg-mssr.h
index 9bc5d45ff4b5..e355363f40c2 100644
--- a/include/dt-bindings/clock/r8a774a1-cpg-mssr.h
+++ b/include/dt-bindings/clock/r8a774a1-cpg-mssr.h
@@ -54,5 +54,6 @@
 #define R8A774A1_CLK_CPEX		43
 #define R8A774A1_CLK_R			44
 #define R8A774A1_CLK_OSC		45
+#define R8A774A1_CLK_CANFD		46
 
 #endif /* __DT_BINDINGS_CLOCK_R8A774A1_CPG_MSSR_H__ */
-- 
cgit v1.2.3


From 386a35eb70569b3158392eb573fe42589a669da4 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Thu, 17 Jan 2019 11:33:42 -0200
Subject: media: tw9910.h: remove obsolete soc_camera.h include.

This include isn't use anymore, so drop it.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/media/i2c/tw9910.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/media/i2c/tw9910.h b/include/media/i2c/tw9910.h
index bec8f7bce745..2f93799d5a21 100644
--- a/include/media/i2c/tw9910.h
+++ b/include/media/i2c/tw9910.h
@@ -16,8 +16,6 @@
 #ifndef __TW9910_H__
 #define __TW9910_H__
 
-#include <media/soc_camera.h>
-
 /**
  * tw9910_mpout_pin - MPOUT (multi-purpose output) pin functions
  */
-- 
cgit v1.2.3


From 43a445f188e1a4d58e1be052b23afc9aacccbdd8 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Thu, 17 Jan 2019 11:41:23 -0200
Subject: media: sh_mobile_ceu_camera: remove obsolete soc_camera driver

This driver got converted to not depend on soc_camera in commit
32e5a70dc8f4 ("media: platform: Add Renesas CEU driver").

There's no sense in keeping the old version there.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/platform/soc_camera/Kconfig          |    9 -
 drivers/media/platform/soc_camera/Makefile         |    3 -
 .../platform/soc_camera/sh_mobile_ceu_camera.c     | 1810 --------------------
 include/media/drv-intf/sh_mobile_ceu.h             |   29 -
 4 files changed, 1851 deletions(-)
 delete mode 100644 drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
 delete mode 100644 include/media/drv-intf/sh_mobile_ceu.h

(limited to 'include')

diff --git a/drivers/media/platform/soc_camera/Kconfig b/drivers/media/platform/soc_camera/Kconfig
index 669d116b8f09..94907d0611d6 100644
--- a/drivers/media/platform/soc_camera/Kconfig
+++ b/drivers/media/platform/soc_camera/Kconfig
@@ -15,12 +15,3 @@ config SOC_CAMERA_PLATFORM
 	depends on SOC_CAMERA
 	help
 	  This is a generic SoC camera platform driver, useful for testing
-
-config VIDEO_SH_MOBILE_CEU
-	tristate "SuperH Mobile CEU Interface driver"
-	depends on VIDEO_DEV && SOC_CAMERA && HAVE_CLK
-	depends on ARCH_SHMOBILE || COMPILE_TEST
-	select VIDEOBUF2_DMA_CONTIG
-	select SOC_CAMERA_SCALE_CROP
-	---help---
-	  This is a v4l2 driver for the SuperH Mobile CEU Interface
diff --git a/drivers/media/platform/soc_camera/Makefile b/drivers/media/platform/soc_camera/Makefile
index 07a451e8b228..aa4b855a0882 100644
--- a/drivers/media/platform/soc_camera/Makefile
+++ b/drivers/media/platform/soc_camera/Makefile
@@ -4,6 +4,3 @@ obj-$(CONFIG_SOC_CAMERA_SCALE_CROP)	+= soc_scale_crop.o
 # a platform subdevice driver stub, allowing to support cameras by adding a
 # couple of callback functions to the board code
 obj-$(CONFIG_SOC_CAMERA_PLATFORM)	+= soc_camera_platform.o
-
-# soc-camera host drivers have to be linked after camera drivers
-obj-$(CONFIG_VIDEO_SH_MOBILE_CEU)	+= sh_mobile_ceu_camera.o
diff --git a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
deleted file mode 100644
index 6803f744e307..000000000000
--- a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
+++ /dev/null
@@ -1,1810 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * V4L2 Driver for SuperH Mobile CEU interface
- *
- * Copyright (C) 2008 Magnus Damm
- *
- * Based on V4L2 Driver for PXA camera host - "pxa_camera.c",
- *
- * Copyright (C) 2006, Sascha Hauer, Pengutronix
- * Copyright (C) 2008, Guennadi Liakhovetski <kernel@pengutronix.de>
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <linux/completion.h>
-#include <linux/delay.h>
-#include <linux/dma-mapping.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/moduleparam.h>
-#include <linux/of.h>
-#include <linux/time.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/videodev2.h>
-#include <linux/pm_runtime.h>
-#include <linux/sched.h>
-
-#include <media/v4l2-async.h>
-#include <media/v4l2-common.h>
-#include <media/v4l2-dev.h>
-#include <media/soc_camera.h>
-#include <media/drv-intf/sh_mobile_ceu.h>
-#include <media/videobuf2-dma-contig.h>
-#include <media/v4l2-mediabus.h>
-#include <media/drv-intf/soc_mediabus.h>
-
-#include "soc_scale_crop.h"
-
-/* register offsets for sh7722 / sh7723 */
-
-#define CAPSR  0x00 /* Capture start register */
-#define CAPCR  0x04 /* Capture control register */
-#define CAMCR  0x08 /* Capture interface control register */
-#define CMCYR  0x0c /* Capture interface cycle  register */
-#define CAMOR  0x10 /* Capture interface offset register */
-#define CAPWR  0x14 /* Capture interface width register */
-#define CAIFR  0x18 /* Capture interface input format register */
-#define CSTCR  0x20 /* Camera strobe control register (<= sh7722) */
-#define CSECR  0x24 /* Camera strobe emission count register (<= sh7722) */
-#define CRCNTR 0x28 /* CEU register control register */
-#define CRCMPR 0x2c /* CEU register forcible control register */
-#define CFLCR  0x30 /* Capture filter control register */
-#define CFSZR  0x34 /* Capture filter size clip register */
-#define CDWDR  0x38 /* Capture destination width register */
-#define CDAYR  0x3c /* Capture data address Y register */
-#define CDACR  0x40 /* Capture data address C register */
-#define CDBYR  0x44 /* Capture data bottom-field address Y register */
-#define CDBCR  0x48 /* Capture data bottom-field address C register */
-#define CBDSR  0x4c /* Capture bundle destination size register */
-#define CFWCR  0x5c /* Firewall operation control register */
-#define CLFCR  0x60 /* Capture low-pass filter control register */
-#define CDOCR  0x64 /* Capture data output control register */
-#define CDDCR  0x68 /* Capture data complexity level register */
-#define CDDAR  0x6c /* Capture data complexity level address register */
-#define CEIER  0x70 /* Capture event interrupt enable register */
-#define CETCR  0x74 /* Capture event flag clear register */
-#define CSTSR  0x7c /* Capture status register */
-#define CSRTR  0x80 /* Capture software reset register */
-#define CDSSR  0x84 /* Capture data size register */
-#define CDAYR2 0x90 /* Capture data address Y register 2 */
-#define CDACR2 0x94 /* Capture data address C register 2 */
-#define CDBYR2 0x98 /* Capture data bottom-field address Y register 2 */
-#define CDBCR2 0x9c /* Capture data bottom-field address C register 2 */
-
-#undef DEBUG_GEOMETRY
-#ifdef DEBUG_GEOMETRY
-#define dev_geo	dev_info
-#else
-#define dev_geo	dev_dbg
-#endif
-
-/* per video frame buffer */
-struct sh_mobile_ceu_buffer {
-	struct vb2_v4l2_buffer vb; /* v4l buffer must be first */
-	struct list_head queue;
-};
-
-struct sh_mobile_ceu_dev {
-	struct soc_camera_host ici;
-
-	unsigned int irq;
-	void __iomem *base;
-	size_t video_limit;
-	size_t buf_total;
-
-	spinlock_t lock;		/* Protects video buffer lists */
-	struct list_head capture;
-	struct vb2_v4l2_buffer *active;
-
-	struct sh_mobile_ceu_info *pdata;
-	struct completion complete;
-
-	u32 cflcr;
-
-	/* static max sizes either from platform data or default */
-	int max_width;
-	int max_height;
-
-	enum v4l2_field field;
-	int sequence;
-	unsigned long flags;
-
-	unsigned int image_mode:1;
-	unsigned int is_16bit:1;
-	unsigned int frozen:1;
-};
-
-struct sh_mobile_ceu_cam {
-	/* CEU offsets within the camera output, before the CEU scaler */
-	unsigned int ceu_left;
-	unsigned int ceu_top;
-	/* Client output, as seen by the CEU */
-	unsigned int width;
-	unsigned int height;
-	/*
-	 * User window from S_SELECTION / G_SELECTION, produced by client cropping and
-	 * scaling, CEU scaling and CEU cropping, mapped back onto the client
-	 * input window
-	 */
-	struct v4l2_rect subrect;
-	/* Camera cropping rectangle */
-	struct v4l2_rect rect;
-	const struct soc_mbus_pixelfmt *extra_fmt;
-	u32 code;
-};
-
-static struct sh_mobile_ceu_buffer *to_ceu_vb(struct vb2_v4l2_buffer *vbuf)
-{
-	return container_of(vbuf, struct sh_mobile_ceu_buffer, vb);
-}
-
-static void ceu_write(struct sh_mobile_ceu_dev *priv,
-		      unsigned long reg_offs, u32 data)
-{
-	iowrite32(data, priv->base + reg_offs);
-}
-
-static u32 ceu_read(struct sh_mobile_ceu_dev *priv, unsigned long reg_offs)
-{
-	return ioread32(priv->base + reg_offs);
-}
-
-static int sh_mobile_ceu_soft_reset(struct sh_mobile_ceu_dev *pcdev)
-{
-	int i, success = 0;
-
-	ceu_write(pcdev, CAPSR, 1 << 16); /* reset */
-
-	/* wait CSTSR.CPTON bit */
-	for (i = 0; i < 1000; i++) {
-		if (!(ceu_read(pcdev, CSTSR) & 1)) {
-			success++;
-			break;
-		}
-		udelay(1);
-	}
-
-	/* wait CAPSR.CPKIL bit */
-	for (i = 0; i < 1000; i++) {
-		if (!(ceu_read(pcdev, CAPSR) & (1 << 16))) {
-			success++;
-			break;
-		}
-		udelay(1);
-	}
-
-	if (2 != success) {
-		dev_warn(pcdev->ici.v4l2_dev.dev, "soft reset time out\n");
-		return -EIO;
-	}
-
-	return 0;
-}
-
-/*
- *  Videobuf operations
- */
-
-/*
- * .queue_setup() is called to check, whether the driver can accept the
- *		  requested number of buffers and to fill in plane sizes
- *		  for the current frame format if required
- */
-static int sh_mobile_ceu_videobuf_setup(struct vb2_queue *vq,
-			unsigned int *count, unsigned int *num_planes,
-			unsigned int sizes[], struct device *alloc_devs[])
-{
-	struct soc_camera_device *icd = soc_camera_from_vb2q(vq);
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-
-	if (!vq->num_buffers)
-		pcdev->sequence = 0;
-
-	if (!*count)
-		*count = 2;
-
-	/* Called from VIDIOC_REQBUFS or in compatibility mode */
-	if (!*num_planes)
-		sizes[0] = icd->sizeimage;
-	else if (sizes[0] < icd->sizeimage)
-		return -EINVAL;
-
-	/* If *num_planes != 0, we have already verified *count. */
-	if (pcdev->video_limit) {
-		size_t size = PAGE_ALIGN(sizes[0]) * *count;
-
-		if (size + pcdev->buf_total > pcdev->video_limit)
-			*count = (pcdev->video_limit - pcdev->buf_total) /
-				PAGE_ALIGN(sizes[0]);
-	}
-
-	*num_planes = 1;
-
-	dev_dbg(icd->parent, "count=%d, size=%u\n", *count, sizes[0]);
-
-	return 0;
-}
-
-#define CEU_CETCR_MAGIC 0x0317f313 /* acknowledge magical interrupt sources */
-#define CEU_CETCR_IGRW (1 << 4) /* prohibited register access interrupt bit */
-#define CEU_CEIER_CPEIE (1 << 0) /* one-frame capture end interrupt */
-#define CEU_CEIER_VBP   (1 << 20) /* vbp error */
-#define CEU_CAPCR_CTNCP (1 << 16) /* continuous capture mode (if set) */
-#define CEU_CEIER_MASK (CEU_CEIER_CPEIE | CEU_CEIER_VBP)
-
-
-/*
- * return value doesn't reflex the success/failure to queue the new buffer,
- * but rather the status of the previous buffer.
- */
-static int sh_mobile_ceu_capture(struct sh_mobile_ceu_dev *pcdev)
-{
-	struct soc_camera_device *icd = pcdev->ici.icd;
-	dma_addr_t phys_addr_top, phys_addr_bottom;
-	unsigned long top1, top2;
-	unsigned long bottom1, bottom2;
-	u32 status;
-	bool planar;
-	int ret = 0;
-
-	/*
-	 * The hardware is _very_ picky about this sequence. Especially
-	 * the CEU_CETCR_MAGIC value. It seems like we need to acknowledge
-	 * several not-so-well documented interrupt sources in CETCR.
-	 */
-	ceu_write(pcdev, CEIER, ceu_read(pcdev, CEIER) & ~CEU_CEIER_MASK);
-	status = ceu_read(pcdev, CETCR);
-	ceu_write(pcdev, CETCR, ~status & CEU_CETCR_MAGIC);
-	if (!pcdev->frozen)
-		ceu_write(pcdev, CEIER, ceu_read(pcdev, CEIER) | CEU_CEIER_MASK);
-	ceu_write(pcdev, CAPCR, ceu_read(pcdev, CAPCR) & ~CEU_CAPCR_CTNCP);
-	ceu_write(pcdev, CETCR, CEU_CETCR_MAGIC ^ CEU_CETCR_IGRW);
-
-	/*
-	 * When a VBP interrupt occurs, a capture end interrupt does not occur
-	 * and the image of that frame is not captured correctly. So, soft reset
-	 * is needed here.
-	 */
-	if (status & CEU_CEIER_VBP) {
-		sh_mobile_ceu_soft_reset(pcdev);
-		ret = -EIO;
-	}
-
-	if (pcdev->frozen) {
-		complete(&pcdev->complete);
-		return ret;
-	}
-
-	if (!pcdev->active)
-		return ret;
-
-	if (V4L2_FIELD_INTERLACED_BT == pcdev->field) {
-		top1	= CDBYR;
-		top2	= CDBCR;
-		bottom1	= CDAYR;
-		bottom2	= CDACR;
-	} else {
-		top1	= CDAYR;
-		top2	= CDACR;
-		bottom1	= CDBYR;
-		bottom2	= CDBCR;
-	}
-
-	phys_addr_top =
-		vb2_dma_contig_plane_dma_addr(&pcdev->active->vb2_buf, 0);
-
-	switch (icd->current_fmt->host_fmt->fourcc) {
-	case V4L2_PIX_FMT_NV12:
-	case V4L2_PIX_FMT_NV21:
-	case V4L2_PIX_FMT_NV16:
-	case V4L2_PIX_FMT_NV61:
-		planar = true;
-		break;
-	default:
-		planar = false;
-	}
-
-	ceu_write(pcdev, top1, phys_addr_top);
-	if (V4L2_FIELD_NONE != pcdev->field) {
-		phys_addr_bottom = phys_addr_top + icd->bytesperline;
-		ceu_write(pcdev, bottom1, phys_addr_bottom);
-	}
-
-	if (planar) {
-		phys_addr_top += icd->bytesperline * icd->user_height;
-		ceu_write(pcdev, top2, phys_addr_top);
-		if (V4L2_FIELD_NONE != pcdev->field) {
-			phys_addr_bottom = phys_addr_top + icd->bytesperline;
-			ceu_write(pcdev, bottom2, phys_addr_bottom);
-		}
-	}
-
-	ceu_write(pcdev, CAPSR, 0x1); /* start capture */
-
-	return ret;
-}
-
-static int sh_mobile_ceu_videobuf_prepare(struct vb2_buffer *vb)
-{
-	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
-	struct sh_mobile_ceu_buffer *buf = to_ceu_vb(vbuf);
-
-	/* Added list head initialization on alloc */
-	WARN(!list_empty(&buf->queue), "Buffer %p on queue!\n", vb);
-
-	return 0;
-}
-
-static void sh_mobile_ceu_videobuf_queue(struct vb2_buffer *vb)
-{
-	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
-	struct soc_camera_device *icd = soc_camera_from_vb2q(vb->vb2_queue);
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	struct sh_mobile_ceu_buffer *buf = to_ceu_vb(vbuf);
-	unsigned long size;
-
-	size = icd->sizeimage;
-
-	if (vb2_plane_size(vb, 0) < size) {
-		dev_err(icd->parent, "Buffer #%d too small (%lu < %lu)\n",
-			vb->index, vb2_plane_size(vb, 0), size);
-		goto error;
-	}
-
-	vb2_set_plane_payload(vb, 0, size);
-
-	dev_dbg(icd->parent, "%s (vb=0x%p) 0x%p %lu\n", __func__,
-		vb, vb2_plane_vaddr(vb, 0), vb2_get_plane_payload(vb, 0));
-
-#ifdef DEBUG
-	/*
-	 * This can be useful if you want to see if we actually fill
-	 * the buffer with something
-	 */
-	if (vb2_plane_vaddr(vb, 0))
-		memset(vb2_plane_vaddr(vb, 0), 0xaa, vb2_get_plane_payload(vb, 0));
-#endif
-
-	spin_lock_irq(&pcdev->lock);
-	list_add_tail(&buf->queue, &pcdev->capture);
-
-	if (!pcdev->active) {
-		/*
-		 * Because there were no active buffer at this moment,
-		 * we are not interested in the return value of
-		 * sh_mobile_ceu_capture here.
-		 */
-		pcdev->active = vbuf;
-		sh_mobile_ceu_capture(pcdev);
-	}
-	spin_unlock_irq(&pcdev->lock);
-
-	return;
-
-error:
-	vb2_buffer_done(vb, VB2_BUF_STATE_ERROR);
-}
-
-static void sh_mobile_ceu_videobuf_release(struct vb2_buffer *vb)
-{
-	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
-	struct soc_camera_device *icd = soc_camera_from_vb2q(vb->vb2_queue);
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_buffer *buf = to_ceu_vb(vbuf);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-
-	spin_lock_irq(&pcdev->lock);
-
-	if (pcdev->active == vbuf) {
-		/* disable capture (release DMA buffer), reset */
-		ceu_write(pcdev, CAPSR, 1 << 16);
-		pcdev->active = NULL;
-	}
-
-	/*
-	 * Doesn't hurt also if the list is empty, but it hurts, if queuing the
-	 * buffer failed, and .buf_init() hasn't been called
-	 */
-	if (buf->queue.next)
-		list_del_init(&buf->queue);
-
-	pcdev->buf_total -= PAGE_ALIGN(vb2_plane_size(vb, 0));
-	dev_dbg(icd->parent, "%s() %zu bytes buffers\n", __func__,
-		pcdev->buf_total);
-
-	spin_unlock_irq(&pcdev->lock);
-}
-
-static int sh_mobile_ceu_videobuf_init(struct vb2_buffer *vb)
-{
-	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
-	struct soc_camera_device *icd = soc_camera_from_vb2q(vb->vb2_queue);
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-
-	pcdev->buf_total += PAGE_ALIGN(vb2_plane_size(vb, 0));
-	dev_dbg(icd->parent, "%s() %zu bytes buffers\n", __func__,
-		pcdev->buf_total);
-
-	/* This is for locking debugging only */
-	INIT_LIST_HEAD(&to_ceu_vb(vbuf)->queue);
-	return 0;
-}
-
-static void sh_mobile_ceu_stop_streaming(struct vb2_queue *q)
-{
-	struct soc_camera_device *icd = soc_camera_from_vb2q(q);
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	struct list_head *buf_head, *tmp;
-	struct vb2_v4l2_buffer *vbuf;
-
-	spin_lock_irq(&pcdev->lock);
-
-	pcdev->active = NULL;
-
-	list_for_each_safe(buf_head, tmp, &pcdev->capture) {
-		vbuf = &list_entry(buf_head, struct sh_mobile_ceu_buffer,
-				   queue)->vb;
-		vb2_buffer_done(&vbuf->vb2_buf, VB2_BUF_STATE_DONE);
-		list_del_init(buf_head);
-	}
-
-	spin_unlock_irq(&pcdev->lock);
-
-	sh_mobile_ceu_soft_reset(pcdev);
-}
-
-static const struct vb2_ops sh_mobile_ceu_videobuf_ops = {
-	.queue_setup	= sh_mobile_ceu_videobuf_setup,
-	.buf_prepare	= sh_mobile_ceu_videobuf_prepare,
-	.buf_queue	= sh_mobile_ceu_videobuf_queue,
-	.buf_cleanup	= sh_mobile_ceu_videobuf_release,
-	.buf_init	= sh_mobile_ceu_videobuf_init,
-	.wait_prepare	= vb2_ops_wait_prepare,
-	.wait_finish	= vb2_ops_wait_finish,
-	.stop_streaming	= sh_mobile_ceu_stop_streaming,
-};
-
-static irqreturn_t sh_mobile_ceu_irq(int irq, void *data)
-{
-	struct sh_mobile_ceu_dev *pcdev = data;
-	struct vb2_v4l2_buffer *vbuf;
-	int ret;
-
-	spin_lock(&pcdev->lock);
-
-	vbuf = pcdev->active;
-	if (!vbuf)
-		/* Stale interrupt from a released buffer */
-		goto out;
-
-	list_del_init(&to_ceu_vb(vbuf)->queue);
-
-	if (!list_empty(&pcdev->capture))
-		pcdev->active = &list_entry(pcdev->capture.next,
-					    struct sh_mobile_ceu_buffer, queue)->vb;
-	else
-		pcdev->active = NULL;
-
-	ret = sh_mobile_ceu_capture(pcdev);
-	vbuf->vb2_buf.timestamp = ktime_get_ns();
-	if (!ret) {
-		vbuf->field = pcdev->field;
-		vbuf->sequence = pcdev->sequence++;
-	}
-	vb2_buffer_done(&vbuf->vb2_buf,
-			ret < 0 ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE);
-
-out:
-	spin_unlock(&pcdev->lock);
-
-	return IRQ_HANDLED;
-}
-
-static int sh_mobile_ceu_add_device(struct soc_camera_device *icd)
-{
-	dev_info(icd->parent,
-		 "SuperH Mobile CEU driver attached to camera %d\n",
-		 icd->devnum);
-
-	return 0;
-}
-
-static void sh_mobile_ceu_remove_device(struct soc_camera_device *icd)
-{
-	dev_info(icd->parent,
-		 "SuperH Mobile CEU driver detached from camera %d\n",
-		 icd->devnum);
-}
-
-/* Called with .host_lock held */
-static int sh_mobile_ceu_clock_start(struct soc_camera_host *ici)
-{
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-
-	pm_runtime_get_sync(ici->v4l2_dev.dev);
-
-	pcdev->buf_total = 0;
-
-	sh_mobile_ceu_soft_reset(pcdev);
-
-	return 0;
-}
-
-/* Called with .host_lock held */
-static void sh_mobile_ceu_clock_stop(struct soc_camera_host *ici)
-{
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-
-	/* disable capture, disable interrupts */
-	ceu_write(pcdev, CEIER, 0);
-	sh_mobile_ceu_soft_reset(pcdev);
-
-	/* make sure active buffer is canceled */
-	spin_lock_irq(&pcdev->lock);
-	if (pcdev->active) {
-		list_del_init(&to_ceu_vb(pcdev->active)->queue);
-		vb2_buffer_done(&pcdev->active->vb2_buf, VB2_BUF_STATE_ERROR);
-		pcdev->active = NULL;
-	}
-	spin_unlock_irq(&pcdev->lock);
-
-	pm_runtime_put(ici->v4l2_dev.dev);
-}
-
-/*
- * See chapter 29.4.12 "Capture Filter Control Register (CFLCR)"
- * in SH7722 Hardware Manual
- */
-static unsigned int size_dst(unsigned int src, unsigned int scale)
-{
-	unsigned int mant_pre = scale >> 12;
-	if (!src || !scale)
-		return src;
-	return ((mant_pre + 2 * (src - 1)) / (2 * mant_pre) - 1) *
-		mant_pre * 4096 / scale + 1;
-}
-
-static u16 calc_scale(unsigned int src, unsigned int *dst)
-{
-	u16 scale;
-
-	if (src == *dst)
-		return 0;
-
-	scale = (src * 4096 / *dst) & ~7;
-
-	while (scale > 4096 && size_dst(src, scale) < *dst)
-		scale -= 8;
-
-	*dst = size_dst(src, scale);
-
-	return scale;
-}
-
-/* rect is guaranteed to not exceed the scaled camera rectangle */
-static void sh_mobile_ceu_set_rect(struct soc_camera_device *icd)
-{
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_cam *cam = icd->host_priv;
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	unsigned int height, width, cdwdr_width, in_width, in_height;
-	unsigned int left_offset, top_offset;
-	u32 camor;
-
-	dev_geo(icd->parent, "Crop %ux%u@%u:%u\n",
-		icd->user_width, icd->user_height, cam->ceu_left, cam->ceu_top);
-
-	left_offset	= cam->ceu_left;
-	top_offset	= cam->ceu_top;
-
-	WARN_ON(icd->user_width & 3 || icd->user_height & 3);
-
-	width = icd->user_width;
-
-	if (pcdev->image_mode) {
-		in_width = cam->width;
-		if (!pcdev->is_16bit) {
-			in_width *= 2;
-			left_offset *= 2;
-		}
-	} else {
-		unsigned int w_factor;
-
-		switch (icd->current_fmt->host_fmt->packing) {
-		case SOC_MBUS_PACKING_2X8_PADHI:
-			w_factor = 2;
-			break;
-		default:
-			w_factor = 1;
-		}
-
-		in_width = cam->width * w_factor;
-		left_offset *= w_factor;
-	}
-
-	cdwdr_width = icd->bytesperline;
-
-	height = icd->user_height;
-	in_height = cam->height;
-	if (V4L2_FIELD_NONE != pcdev->field) {
-		height = (height / 2) & ~3;
-		in_height /= 2;
-		top_offset /= 2;
-		cdwdr_width *= 2;
-	}
-
-	/* Set CAMOR, CAPWR, CFSZR, take care of CDWDR */
-	camor = left_offset | (top_offset << 16);
-
-	dev_geo(icd->parent,
-		"CAMOR 0x%x, CAPWR 0x%x, CFSZR 0x%x, CDWDR 0x%x\n", camor,
-		(in_height << 16) | in_width, (height << 16) | width,
-		cdwdr_width);
-
-	ceu_write(pcdev, CAMOR, camor);
-	ceu_write(pcdev, CAPWR, (in_height << 16) | in_width);
-	/* CFSZR clipping is applied _after_ the scaling filter (CFLCR) */
-	ceu_write(pcdev, CFSZR, (height << 16) | width);
-	ceu_write(pcdev, CDWDR, cdwdr_width);
-}
-
-static u32 capture_save_reset(struct sh_mobile_ceu_dev *pcdev)
-{
-	u32 capsr = ceu_read(pcdev, CAPSR);
-	ceu_write(pcdev, CAPSR, 1 << 16); /* reset, stop capture */
-	return capsr;
-}
-
-static void capture_restore(struct sh_mobile_ceu_dev *pcdev, u32 capsr)
-{
-	unsigned long timeout = jiffies + 10 * HZ;
-
-	/*
-	 * Wait until the end of the current frame. It can take a long time,
-	 * but if it has been aborted by a CAPSR reset, it shoule exit sooner.
-	 */
-	while ((ceu_read(pcdev, CSTSR) & 1) && time_before(jiffies, timeout))
-		msleep(1);
-
-	if (time_after(jiffies, timeout)) {
-		dev_err(pcdev->ici.v4l2_dev.dev,
-			"Timeout waiting for frame end! Interface problem?\n");
-		return;
-	}
-
-	/* Wait until reset clears, this shall not hang... */
-	while (ceu_read(pcdev, CAPSR) & (1 << 16))
-		udelay(10);
-
-	/* Anything to restore? */
-	if (capsr & ~(1 << 16))
-		ceu_write(pcdev, CAPSR, capsr);
-}
-
-#define CEU_BUS_FLAGS (V4L2_MBUS_MASTER |	\
-		V4L2_MBUS_PCLK_SAMPLE_RISING |	\
-		V4L2_MBUS_HSYNC_ACTIVE_HIGH |	\
-		V4L2_MBUS_HSYNC_ACTIVE_LOW |	\
-		V4L2_MBUS_VSYNC_ACTIVE_HIGH |	\
-		V4L2_MBUS_VSYNC_ACTIVE_LOW |	\
-		V4L2_MBUS_DATA_ACTIVE_HIGH)
-
-/* Capture is not running, no interrupts, no locking needed */
-static int sh_mobile_ceu_set_bus_param(struct soc_camera_device *icd)
-{
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	struct v4l2_subdev *sd = soc_camera_to_subdev(icd);
-	struct sh_mobile_ceu_cam *cam = icd->host_priv;
-	struct v4l2_mbus_config cfg = {.type = V4L2_MBUS_PARALLEL,};
-	unsigned long value, common_flags = CEU_BUS_FLAGS;
-	u32 capsr = capture_save_reset(pcdev);
-	unsigned int yuv_lineskip;
-	int ret;
-
-	/*
-	 * If the client doesn't implement g_mbus_config, we just use our
-	 * platform data
-	 */
-	ret = v4l2_subdev_call(sd, video, g_mbus_config, &cfg);
-	if (!ret) {
-		common_flags = soc_mbus_config_compatible(&cfg,
-							  common_flags);
-		if (!common_flags)
-			return -EINVAL;
-	} else if (ret != -ENOIOCTLCMD) {
-		return ret;
-	}
-
-	/* Make choises, based on platform preferences */
-	if ((common_flags & V4L2_MBUS_HSYNC_ACTIVE_HIGH) &&
-	    (common_flags & V4L2_MBUS_HSYNC_ACTIVE_LOW)) {
-		if (pcdev->flags & SH_CEU_FLAG_HSYNC_LOW)
-			common_flags &= ~V4L2_MBUS_HSYNC_ACTIVE_HIGH;
-		else
-			common_flags &= ~V4L2_MBUS_HSYNC_ACTIVE_LOW;
-	}
-
-	if ((common_flags & V4L2_MBUS_VSYNC_ACTIVE_HIGH) &&
-	    (common_flags & V4L2_MBUS_VSYNC_ACTIVE_LOW)) {
-		if (pcdev->flags & SH_CEU_FLAG_VSYNC_LOW)
-			common_flags &= ~V4L2_MBUS_VSYNC_ACTIVE_HIGH;
-		else
-			common_flags &= ~V4L2_MBUS_VSYNC_ACTIVE_LOW;
-	}
-
-	cfg.flags = common_flags;
-	ret = v4l2_subdev_call(sd, video, s_mbus_config, &cfg);
-	if (ret < 0 && ret != -ENOIOCTLCMD)
-		return ret;
-
-	if (icd->current_fmt->host_fmt->bits_per_sample > 8)
-		pcdev->is_16bit = 1;
-	else
-		pcdev->is_16bit = 0;
-
-	ceu_write(pcdev, CRCNTR, 0);
-	ceu_write(pcdev, CRCMPR, 0);
-
-	value = 0x00000010; /* data fetch by default */
-	yuv_lineskip = 0x10;
-
-	switch (icd->current_fmt->host_fmt->fourcc) {
-	case V4L2_PIX_FMT_NV12:
-	case V4L2_PIX_FMT_NV21:
-		/* convert 4:2:2 -> 4:2:0 */
-		yuv_lineskip = 0; /* skip for NV12/21, no skip for NV16/61 */
-		/* fall-through */
-	case V4L2_PIX_FMT_NV16:
-	case V4L2_PIX_FMT_NV61:
-		switch (cam->code) {
-		case MEDIA_BUS_FMT_UYVY8_2X8:
-			value = 0x00000000; /* Cb0, Y0, Cr0, Y1 */
-			break;
-		case MEDIA_BUS_FMT_VYUY8_2X8:
-			value = 0x00000100; /* Cr0, Y0, Cb0, Y1 */
-			break;
-		case MEDIA_BUS_FMT_YUYV8_2X8:
-			value = 0x00000200; /* Y0, Cb0, Y1, Cr0 */
-			break;
-		case MEDIA_BUS_FMT_YVYU8_2X8:
-			value = 0x00000300; /* Y0, Cr0, Y1, Cb0 */
-			break;
-		default:
-			BUG();
-		}
-	}
-
-	if (icd->current_fmt->host_fmt->fourcc == V4L2_PIX_FMT_NV21 ||
-	    icd->current_fmt->host_fmt->fourcc == V4L2_PIX_FMT_NV61)
-		value ^= 0x00000100; /* swap U, V to change from NV1x->NVx1 */
-
-	value |= common_flags & V4L2_MBUS_VSYNC_ACTIVE_LOW ? 1 << 1 : 0;
-	value |= common_flags & V4L2_MBUS_HSYNC_ACTIVE_LOW ? 1 << 0 : 0;
-
-	if (pcdev->is_16bit)
-		value |= 1 << 12;
-	else if (pcdev->flags & SH_CEU_FLAG_LOWER_8BIT)
-		value |= 2 << 12;
-
-	ceu_write(pcdev, CAMCR, value);
-
-	ceu_write(pcdev, CAPCR, 0x00300000);
-
-	switch (pcdev->field) {
-	case V4L2_FIELD_INTERLACED_TB:
-		value = 0x101;
-		break;
-	case V4L2_FIELD_INTERLACED_BT:
-		value = 0x102;
-		break;
-	default:
-		value = 0;
-		break;
-	}
-	ceu_write(pcdev, CAIFR, value);
-
-	sh_mobile_ceu_set_rect(icd);
-	mdelay(1);
-
-	dev_geo(icd->parent, "CFLCR 0x%x\n", pcdev->cflcr);
-	ceu_write(pcdev, CFLCR, pcdev->cflcr);
-
-	/*
-	 * A few words about byte order (observed in Big Endian mode)
-	 *
-	 * In data fetch mode bytes are received in chunks of 8 bytes.
-	 * D0, D1, D2, D3, D4, D5, D6, D7 (D0 received first)
-	 *
-	 * The data is however by default written to memory in reverse order:
-	 * D7, D6, D5, D4, D3, D2, D1, D0 (D7 written to lowest byte)
-	 *
-	 * The lowest three bits of CDOCR allows us to do swapping,
-	 * using 7 we swap the data bytes to match the incoming order:
-	 * D0, D1, D2, D3, D4, D5, D6, D7
-	 */
-	value = 0x00000007 | yuv_lineskip;
-
-	ceu_write(pcdev, CDOCR, value);
-	ceu_write(pcdev, CFWCR, 0); /* keep "datafetch firewall" disabled */
-
-	capture_restore(pcdev, capsr);
-
-	/* not in bundle mode: skip CBDSR, CDAYR2, CDACR2, CDBYR2, CDBCR2 */
-	return 0;
-}
-
-static int sh_mobile_ceu_try_bus_param(struct soc_camera_device *icd,
-				       unsigned char buswidth)
-{
-	struct v4l2_subdev *sd = soc_camera_to_subdev(icd);
-	unsigned long common_flags = CEU_BUS_FLAGS;
-	struct v4l2_mbus_config cfg = {.type = V4L2_MBUS_PARALLEL,};
-	int ret;
-
-	ret = v4l2_subdev_call(sd, video, g_mbus_config, &cfg);
-	if (!ret)
-		common_flags = soc_mbus_config_compatible(&cfg,
-							  common_flags);
-	else if (ret != -ENOIOCTLCMD)
-		return ret;
-
-	if (!common_flags || buswidth > 16)
-		return -EINVAL;
-
-	return 0;
-}
-
-static const struct soc_mbus_pixelfmt sh_mobile_ceu_formats[] = {
-	{
-		.fourcc			= V4L2_PIX_FMT_NV12,
-		.name			= "NV12",
-		.bits_per_sample	= 8,
-		.packing		= SOC_MBUS_PACKING_1_5X8,
-		.order			= SOC_MBUS_ORDER_LE,
-		.layout			= SOC_MBUS_LAYOUT_PLANAR_2Y_C,
-	}, {
-		.fourcc			= V4L2_PIX_FMT_NV21,
-		.name			= "NV21",
-		.bits_per_sample	= 8,
-		.packing		= SOC_MBUS_PACKING_1_5X8,
-		.order			= SOC_MBUS_ORDER_LE,
-		.layout			= SOC_MBUS_LAYOUT_PLANAR_2Y_C,
-	}, {
-		.fourcc			= V4L2_PIX_FMT_NV16,
-		.name			= "NV16",
-		.bits_per_sample	= 8,
-		.packing		= SOC_MBUS_PACKING_2X8_PADHI,
-		.order			= SOC_MBUS_ORDER_LE,
-		.layout			= SOC_MBUS_LAYOUT_PLANAR_Y_C,
-	}, {
-		.fourcc			= V4L2_PIX_FMT_NV61,
-		.name			= "NV61",
-		.bits_per_sample	= 8,
-		.packing		= SOC_MBUS_PACKING_2X8_PADHI,
-		.order			= SOC_MBUS_ORDER_LE,
-		.layout			= SOC_MBUS_LAYOUT_PLANAR_Y_C,
-	},
-};
-
-/* This will be corrected as we get more formats */
-static bool sh_mobile_ceu_packing_supported(const struct soc_mbus_pixelfmt *fmt)
-{
-	return	fmt->packing == SOC_MBUS_PACKING_NONE ||
-		(fmt->bits_per_sample == 8 &&
-		 fmt->packing == SOC_MBUS_PACKING_1_5X8) ||
-		(fmt->bits_per_sample == 8 &&
-		 fmt->packing == SOC_MBUS_PACKING_2X8_PADHI) ||
-		(fmt->bits_per_sample > 8 &&
-		 fmt->packing == SOC_MBUS_PACKING_EXTEND16);
-}
-
-static struct soc_camera_device *ctrl_to_icd(struct v4l2_ctrl *ctrl)
-{
-	return container_of(ctrl->handler, struct soc_camera_device,
-							ctrl_handler);
-}
-
-static int sh_mobile_ceu_s_ctrl(struct v4l2_ctrl *ctrl)
-{
-	struct soc_camera_device *icd = ctrl_to_icd(ctrl);
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-
-	switch (ctrl->id) {
-	case V4L2_CID_SHARPNESS:
-		switch (icd->current_fmt->host_fmt->fourcc) {
-		case V4L2_PIX_FMT_NV12:
-		case V4L2_PIX_FMT_NV21:
-		case V4L2_PIX_FMT_NV16:
-		case V4L2_PIX_FMT_NV61:
-			ceu_write(pcdev, CLFCR, !ctrl->val);
-			return 0;
-		}
-		break;
-	}
-
-	return -EINVAL;
-}
-
-static const struct v4l2_ctrl_ops sh_mobile_ceu_ctrl_ops = {
-	.s_ctrl = sh_mobile_ceu_s_ctrl,
-};
-
-static int sh_mobile_ceu_get_formats(struct soc_camera_device *icd, unsigned int idx,
-				     struct soc_camera_format_xlate *xlate)
-{
-	struct v4l2_subdev *sd = soc_camera_to_subdev(icd);
-	struct device *dev = icd->parent;
-	struct soc_camera_host *ici = to_soc_camera_host(dev);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	int ret, k, n;
-	int formats = 0;
-	struct sh_mobile_ceu_cam *cam;
-	struct v4l2_subdev_mbus_code_enum code = {
-		.which = V4L2_SUBDEV_FORMAT_ACTIVE,
-		.index = idx,
-	};
-	const struct soc_mbus_pixelfmt *fmt;
-
-	ret = v4l2_subdev_call(sd, pad, enum_mbus_code, NULL, &code);
-	if (ret < 0)
-		/* No more formats */
-		return 0;
-
-	fmt = soc_mbus_get_fmtdesc(code.code);
-	if (!fmt) {
-		dev_warn(dev, "unsupported format code #%u: %d\n", idx, code.code);
-		return 0;
-	}
-
-	ret = sh_mobile_ceu_try_bus_param(icd, fmt->bits_per_sample);
-	if (ret < 0)
-		return 0;
-
-	if (!icd->host_priv) {
-		struct v4l2_subdev_format fmt = {
-			.which = V4L2_SUBDEV_FORMAT_ACTIVE,
-		};
-		struct v4l2_mbus_framefmt *mf = &fmt.format;
-		struct v4l2_rect rect;
-		int shift = 0;
-
-		/* Add our control */
-		v4l2_ctrl_new_std(&icd->ctrl_handler, &sh_mobile_ceu_ctrl_ops,
-				  V4L2_CID_SHARPNESS, 0, 1, 1, 1);
-		if (icd->ctrl_handler.error)
-			return icd->ctrl_handler.error;
-
-		/* FIXME: subwindow is lost between close / open */
-
-		/* Cache current client geometry */
-		ret = soc_camera_client_g_rect(sd, &rect);
-		if (ret < 0)
-			return ret;
-
-		/* First time */
-		ret = v4l2_subdev_call(sd, pad, get_fmt, NULL, &fmt);
-		if (ret < 0)
-			return ret;
-
-		/*
-		 * All currently existing CEU implementations support 2560x1920
-		 * or larger frames. If the sensor is proposing too big a frame,
-		 * don't bother with possibly supportred by the CEU larger
-		 * sizes, just try VGA multiples. If needed, this can be
-		 * adjusted in the future.
-		 */
-		while ((mf->width > pcdev->max_width ||
-			mf->height > pcdev->max_height) && shift < 4) {
-			/* Try 2560x1920, 1280x960, 640x480, 320x240 */
-			mf->width	= 2560 >> shift;
-			mf->height	= 1920 >> shift;
-			ret = v4l2_device_call_until_err(sd->v4l2_dev,
-					soc_camera_grp_id(icd), pad,
-					set_fmt, NULL, &fmt);
-			if (ret < 0)
-				return ret;
-			shift++;
-		}
-
-		if (shift == 4) {
-			dev_err(dev, "Failed to configure the client below %ux%x\n",
-				mf->width, mf->height);
-			return -EIO;
-		}
-
-		dev_geo(dev, "camera fmt %ux%u\n", mf->width, mf->height);
-
-		cam = kzalloc(sizeof(*cam), GFP_KERNEL);
-		if (!cam)
-			return -ENOMEM;
-
-		/* We are called with current camera crop, initialise subrect with it */
-		cam->rect	= rect;
-		cam->subrect	= rect;
-
-		cam->width	= mf->width;
-		cam->height	= mf->height;
-
-		icd->host_priv = cam;
-	} else {
-		cam = icd->host_priv;
-	}
-
-	/* Beginning of a pass */
-	if (!idx)
-		cam->extra_fmt = NULL;
-
-	switch (code.code) {
-	case MEDIA_BUS_FMT_UYVY8_2X8:
-	case MEDIA_BUS_FMT_VYUY8_2X8:
-	case MEDIA_BUS_FMT_YUYV8_2X8:
-	case MEDIA_BUS_FMT_YVYU8_2X8:
-		if (cam->extra_fmt)
-			break;
-
-		/*
-		 * Our case is simple so far: for any of the above four camera
-		 * formats we add all our four synthesized NV* formats, so,
-		 * just marking the device with a single flag suffices. If
-		 * the format generation rules are more complex, you would have
-		 * to actually hang your already added / counted formats onto
-		 * the host_priv pointer and check whether the format you're
-		 * going to add now is already there.
-		 */
-		cam->extra_fmt = sh_mobile_ceu_formats;
-
-		n = ARRAY_SIZE(sh_mobile_ceu_formats);
-		formats += n;
-		for (k = 0; xlate && k < n; k++) {
-			xlate->host_fmt	= &sh_mobile_ceu_formats[k];
-			xlate->code	= code.code;
-			xlate++;
-			dev_dbg(dev, "Providing format %s using code %d\n",
-				sh_mobile_ceu_formats[k].name, code.code);
-		}
-		break;
-	default:
-		if (!sh_mobile_ceu_packing_supported(fmt))
-			return 0;
-	}
-
-	/* Generic pass-through */
-	formats++;
-	if (xlate) {
-		xlate->host_fmt	= fmt;
-		xlate->code	= code.code;
-		xlate++;
-		dev_dbg(dev, "Providing format %s in pass-through mode\n",
-			fmt->name);
-	}
-
-	return formats;
-}
-
-static void sh_mobile_ceu_put_formats(struct soc_camera_device *icd)
-{
-	kfree(icd->host_priv);
-	icd->host_priv = NULL;
-}
-
-#define scale_down(size, scale) soc_camera_shift_scale(size, 12, scale)
-#define calc_generic_scale(in, out) soc_camera_calc_scale(in, 12, out)
-
-/*
- * CEU can scale and crop, but we don't want to waste bandwidth and kill the
- * framerate by always requesting the maximum image from the client. See
- * Documentation/media/v4l-drivers/sh_mobile_ceu_camera.rst for a description of
- * scaling and cropping algorithms and for the meaning of referenced here steps.
- */
-static int sh_mobile_ceu_set_selection(struct soc_camera_device *icd,
-				       struct v4l2_selection *sel)
-{
-	struct v4l2_rect *rect = &sel->r;
-	struct device *dev = icd->parent;
-	struct soc_camera_host *ici = to_soc_camera_host(dev);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	struct v4l2_selection cam_sel;
-	struct sh_mobile_ceu_cam *cam = icd->host_priv;
-	struct v4l2_rect *cam_rect = &cam_sel.r;
-	struct v4l2_subdev *sd = soc_camera_to_subdev(icd);
-	struct v4l2_subdev_format fmt = {
-		.which = V4L2_SUBDEV_FORMAT_ACTIVE,
-	};
-	struct v4l2_mbus_framefmt *mf = &fmt.format;
-	unsigned int scale_cam_h, scale_cam_v, scale_ceu_h, scale_ceu_v,
-		out_width, out_height;
-	int interm_width, interm_height;
-	u32 capsr, cflcr;
-	int ret;
-
-	dev_geo(dev, "S_SELECTION(%ux%u@%u:%u)\n", rect->width, rect->height,
-		rect->left, rect->top);
-
-	/* During camera cropping its output window can change too, stop CEU */
-	capsr = capture_save_reset(pcdev);
-	dev_dbg(dev, "CAPSR 0x%x, CFLCR 0x%x\n", capsr, pcdev->cflcr);
-
-	/*
-	 * 1. - 2. Apply iterative camera S_SELECTION for new input window, read back
-	 * actual camera rectangle.
-	 */
-	ret = soc_camera_client_s_selection(sd, sel, &cam_sel,
-				       &cam->rect, &cam->subrect);
-	if (ret < 0)
-		return ret;
-
-	dev_geo(dev, "1-2: camera cropped to %ux%u@%u:%u\n",
-		cam_rect->width, cam_rect->height,
-		cam_rect->left, cam_rect->top);
-
-	/* On success cam_crop contains current camera crop */
-
-	/* 3. Retrieve camera output window */
-	ret = v4l2_subdev_call(sd, pad, get_fmt, NULL, &fmt);
-	if (ret < 0)
-		return ret;
-
-	if (mf->width > pcdev->max_width || mf->height > pcdev->max_height)
-		return -EINVAL;
-
-	/* 4. Calculate camera scales */
-	scale_cam_h	= calc_generic_scale(cam_rect->width, mf->width);
-	scale_cam_v	= calc_generic_scale(cam_rect->height, mf->height);
-
-	/* Calculate intermediate window */
-	interm_width	= scale_down(rect->width, scale_cam_h);
-	interm_height	= scale_down(rect->height, scale_cam_v);
-
-	if (interm_width < icd->user_width) {
-		u32 new_scale_h;
-
-		new_scale_h = calc_generic_scale(rect->width, icd->user_width);
-
-		mf->width = scale_down(cam_rect->width, new_scale_h);
-	}
-
-	if (interm_height < icd->user_height) {
-		u32 new_scale_v;
-
-		new_scale_v = calc_generic_scale(rect->height, icd->user_height);
-
-		mf->height = scale_down(cam_rect->height, new_scale_v);
-	}
-
-	if (interm_width < icd->user_width || interm_height < icd->user_height) {
-		ret = v4l2_device_call_until_err(sd->v4l2_dev,
-					soc_camera_grp_id(icd), pad,
-					set_fmt, NULL, &fmt);
-		if (ret < 0)
-			return ret;
-
-		dev_geo(dev, "New camera output %ux%u\n", mf->width, mf->height);
-		scale_cam_h	= calc_generic_scale(cam_rect->width, mf->width);
-		scale_cam_v	= calc_generic_scale(cam_rect->height, mf->height);
-		interm_width	= scale_down(rect->width, scale_cam_h);
-		interm_height	= scale_down(rect->height, scale_cam_v);
-	}
-
-	/* Cache camera output window */
-	cam->width	= mf->width;
-	cam->height	= mf->height;
-
-	if (pcdev->image_mode) {
-		out_width	= min(interm_width, icd->user_width);
-		out_height	= min(interm_height, icd->user_height);
-	} else {
-		out_width	= interm_width;
-		out_height	= interm_height;
-	}
-
-	/*
-	 * 5. Calculate CEU scales from camera scales from results of (5) and
-	 *    the user window
-	 */
-	scale_ceu_h	= calc_scale(interm_width, &out_width);
-	scale_ceu_v	= calc_scale(interm_height, &out_height);
-
-	dev_geo(dev, "5: CEU scales %u:%u\n", scale_ceu_h, scale_ceu_v);
-
-	/* Apply CEU scales. */
-	cflcr = scale_ceu_h | (scale_ceu_v << 16);
-	if (cflcr != pcdev->cflcr) {
-		pcdev->cflcr = cflcr;
-		ceu_write(pcdev, CFLCR, cflcr);
-	}
-
-	icd->user_width	 = out_width & ~3;
-	icd->user_height = out_height & ~3;
-	/* Offsets are applied at the CEU scaling filter input */
-	cam->ceu_left	 = scale_down(rect->left - cam_rect->left, scale_cam_h) & ~1;
-	cam->ceu_top	 = scale_down(rect->top - cam_rect->top, scale_cam_v) & ~1;
-
-	/* 6. Use CEU cropping to crop to the new window. */
-	sh_mobile_ceu_set_rect(icd);
-
-	cam->subrect = *rect;
-
-	dev_geo(dev, "6: CEU cropped to %ux%u@%u:%u\n",
-		icd->user_width, icd->user_height,
-		cam->ceu_left, cam->ceu_top);
-
-	/* Restore capture. The CE bit can be cleared by the hardware */
-	if (pcdev->active)
-		capsr |= 1;
-	capture_restore(pcdev, capsr);
-
-	/* Even if only camera cropping succeeded */
-	return ret;
-}
-
-static int sh_mobile_ceu_get_selection(struct soc_camera_device *icd,
-				       struct v4l2_selection *sel)
-{
-	struct sh_mobile_ceu_cam *cam = icd->host_priv;
-
-	sel->r = cam->subrect;
-
-	return 0;
-}
-
-/* Similar to set_crop multistage iterative algorithm */
-static int sh_mobile_ceu_set_fmt(struct soc_camera_device *icd,
-				 struct v4l2_format *f)
-{
-	struct device *dev = icd->parent;
-	struct soc_camera_host *ici = to_soc_camera_host(dev);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	struct sh_mobile_ceu_cam *cam = icd->host_priv;
-	struct v4l2_pix_format *pix = &f->fmt.pix;
-	struct v4l2_mbus_framefmt mf;
-	__u32 pixfmt = pix->pixelformat;
-	const struct soc_camera_format_xlate *xlate;
-	unsigned int ceu_sub_width = pcdev->max_width,
-		ceu_sub_height = pcdev->max_height;
-	u16 scale_v, scale_h;
-	int ret;
-	bool image_mode;
-	enum v4l2_field field;
-
-	switch (pix->field) {
-	default:
-		pix->field = V4L2_FIELD_NONE;
-		/* fall-through */
-	case V4L2_FIELD_INTERLACED_TB:
-	case V4L2_FIELD_INTERLACED_BT:
-	case V4L2_FIELD_NONE:
-		field = pix->field;
-		break;
-	case V4L2_FIELD_INTERLACED:
-		field = V4L2_FIELD_INTERLACED_TB;
-		break;
-	}
-
-	xlate = soc_camera_xlate_by_fourcc(icd, pixfmt);
-	if (!xlate) {
-		dev_warn(dev, "Format %x not found\n", pixfmt);
-		return -EINVAL;
-	}
-
-	/* 1.-4. Calculate desired client output geometry */
-	soc_camera_calc_client_output(icd, &cam->rect, &cam->subrect, pix, &mf, 12);
-	mf.field	= pix->field;
-	mf.colorspace	= pix->colorspace;
-	mf.code		= xlate->code;
-
-	switch (pixfmt) {
-	case V4L2_PIX_FMT_NV12:
-	case V4L2_PIX_FMT_NV21:
-	case V4L2_PIX_FMT_NV16:
-	case V4L2_PIX_FMT_NV61:
-		image_mode = true;
-		break;
-	default:
-		image_mode = false;
-	}
-
-	dev_geo(dev, "S_FMT(pix=0x%x, fld 0x%x, code 0x%x, %ux%u)\n", pixfmt, mf.field, mf.code,
-		pix->width, pix->height);
-
-	dev_geo(dev, "4: request camera output %ux%u\n", mf.width, mf.height);
-
-	/* 5. - 9. */
-	ret = soc_camera_client_scale(icd, &cam->rect, &cam->subrect,
-				&mf, &ceu_sub_width, &ceu_sub_height,
-				image_mode && V4L2_FIELD_NONE == field, 12);
-
-	dev_geo(dev, "5-9: client scale return %d\n", ret);
-
-	/* Done with the camera. Now see if we can improve the result */
-
-	dev_geo(dev, "fmt %ux%u, requested %ux%u\n",
-		mf.width, mf.height, pix->width, pix->height);
-	if (ret < 0)
-		return ret;
-
-	if (mf.code != xlate->code)
-		return -EINVAL;
-
-	/* 9. Prepare CEU crop */
-	cam->width = mf.width;
-	cam->height = mf.height;
-
-	/* 10. Use CEU scaling to scale to the requested user window. */
-
-	/* We cannot scale up */
-	if (pix->width > ceu_sub_width)
-		ceu_sub_width = pix->width;
-
-	if (pix->height > ceu_sub_height)
-		ceu_sub_height = pix->height;
-
-	pix->colorspace = mf.colorspace;
-
-	if (image_mode) {
-		/* Scale pix->{width x height} down to width x height */
-		scale_h		= calc_scale(ceu_sub_width, &pix->width);
-		scale_v		= calc_scale(ceu_sub_height, &pix->height);
-	} else {
-		pix->width	= ceu_sub_width;
-		pix->height	= ceu_sub_height;
-		scale_h		= 0;
-		scale_v		= 0;
-	}
-
-	pcdev->cflcr = scale_h | (scale_v << 16);
-
-	/*
-	 * We have calculated CFLCR, the actual configuration will be performed
-	 * in sh_mobile_ceu_set_bus_param()
-	 */
-
-	dev_geo(dev, "10: W: %u : 0x%x = %u, H: %u : 0x%x = %u\n",
-		ceu_sub_width, scale_h, pix->width,
-		ceu_sub_height, scale_v, pix->height);
-
-	cam->code		= xlate->code;
-	icd->current_fmt	= xlate;
-
-	pcdev->field = field;
-	pcdev->image_mode = image_mode;
-
-	/* CFSZR requirement */
-	pix->width	&= ~3;
-	pix->height	&= ~3;
-
-	return 0;
-}
-
-#define CEU_CHDW_MAX	8188U	/* Maximum line stride */
-
-static int sh_mobile_ceu_try_fmt(struct soc_camera_device *icd,
-				 struct v4l2_format *f)
-{
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	const struct soc_camera_format_xlate *xlate;
-	struct v4l2_pix_format *pix = &f->fmt.pix;
-	struct v4l2_subdev *sd = soc_camera_to_subdev(icd);
-	struct v4l2_subdev_pad_config pad_cfg;
-	struct v4l2_subdev_format format = {
-		.which = V4L2_SUBDEV_FORMAT_TRY,
-	};
-	struct v4l2_mbus_framefmt *mf = &format.format;
-	__u32 pixfmt = pix->pixelformat;
-	int width, height;
-	int ret;
-
-	dev_geo(icd->parent, "TRY_FMT(pix=0x%x, %ux%u)\n",
-		 pixfmt, pix->width, pix->height);
-
-	xlate = soc_camera_xlate_by_fourcc(icd, pixfmt);
-	if (!xlate) {
-		xlate = icd->current_fmt;
-		dev_dbg(icd->parent, "Format %x not found, keeping %x\n",
-			pixfmt, xlate->host_fmt->fourcc);
-		pixfmt = xlate->host_fmt->fourcc;
-		pix->pixelformat = pixfmt;
-		pix->colorspace = icd->colorspace;
-	}
-
-	/* FIXME: calculate using depth and bus width */
-
-	/* CFSZR requires height and width to be 4-pixel aligned */
-	v4l_bound_align_image(&pix->width, 2, pcdev->max_width, 2,
-			      &pix->height, 4, pcdev->max_height, 2, 0);
-
-	width = pix->width;
-	height = pix->height;
-
-	/* limit to sensor capabilities */
-	mf->width	= pix->width;
-	mf->height	= pix->height;
-	mf->field	= pix->field;
-	mf->code	= xlate->code;
-	mf->colorspace	= pix->colorspace;
-
-	ret = v4l2_device_call_until_err(sd->v4l2_dev, soc_camera_grp_id(icd),
-					 pad, set_fmt, &pad_cfg, &format);
-	if (ret < 0)
-		return ret;
-
-	pix->width	= mf->width;
-	pix->height	= mf->height;
-	pix->field	= mf->field;
-	pix->colorspace	= mf->colorspace;
-
-	switch (pixfmt) {
-	case V4L2_PIX_FMT_NV12:
-	case V4L2_PIX_FMT_NV21:
-	case V4L2_PIX_FMT_NV16:
-	case V4L2_PIX_FMT_NV61:
-		/* FIXME: check against rect_max after converting soc-camera */
-		/* We can scale precisely, need a bigger image from camera */
-		if (pix->width < width || pix->height < height) {
-			/*
-			 * We presume, the sensor behaves sanely, i.e., if
-			 * requested a bigger rectangle, it will not return a
-			 * smaller one.
-			 */
-			mf->width = pcdev->max_width;
-			mf->height = pcdev->max_height;
-			ret = v4l2_device_call_until_err(sd->v4l2_dev,
-					soc_camera_grp_id(icd), pad,
-					set_fmt, &pad_cfg, &format);
-			if (ret < 0) {
-				/* Shouldn't actually happen... */
-				dev_err(icd->parent,
-					"FIXME: client try_fmt() = %d\n", ret);
-				return ret;
-			}
-		}
-		/* We will scale exactly */
-		if (mf->width > width)
-			pix->width = width;
-		if (mf->height > height)
-			pix->height = height;
-
-		pix->bytesperline = max(pix->bytesperline, pix->width);
-		pix->bytesperline = min(pix->bytesperline, CEU_CHDW_MAX);
-		pix->bytesperline &= ~3;
-		break;
-
-	default:
-		/* Configurable stride isn't supported in pass-through mode. */
-		pix->bytesperline  = 0;
-	}
-
-	pix->width	&= ~3;
-	pix->height	&= ~3;
-	pix->sizeimage	= 0;
-
-	dev_geo(icd->parent, "%s(): return %d, fmt 0x%x, %ux%u\n",
-		__func__, ret, pix->pixelformat, pix->width, pix->height);
-
-	return ret;
-}
-
-static int sh_mobile_ceu_set_liveselection(struct soc_camera_device *icd,
-					   struct v4l2_selection *sel)
-{
-	struct v4l2_subdev *sd = soc_camera_to_subdev(icd);
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	struct sh_mobile_ceu_dev *pcdev = ici->priv;
-	u32 out_width = icd->user_width, out_height = icd->user_height;
-	int ret;
-
-	/* Freeze queue */
-	pcdev->frozen = 1;
-	/* Wait for frame */
-	ret = wait_for_completion_interruptible(&pcdev->complete);
-	/* Stop the client */
-	ret = v4l2_subdev_call(sd, video, s_stream, 0);
-	if (ret < 0)
-		dev_warn(icd->parent,
-			 "Client failed to stop the stream: %d\n", ret);
-	else
-		/* Do the crop, if it fails, there's nothing more we can do */
-		sh_mobile_ceu_set_selection(icd, sel);
-
-	dev_geo(icd->parent, "Output after crop: %ux%u\n", icd->user_width, icd->user_height);
-
-	if (icd->user_width != out_width || icd->user_height != out_height) {
-		struct v4l2_format f = {
-			.type	= V4L2_BUF_TYPE_VIDEO_CAPTURE,
-			.fmt.pix	= {
-				.width		= out_width,
-				.height		= out_height,
-				.pixelformat	= icd->current_fmt->host_fmt->fourcc,
-				.field		= pcdev->field,
-				.colorspace	= icd->colorspace,
-			},
-		};
-		ret = sh_mobile_ceu_set_fmt(icd, &f);
-		if (!ret && (out_width != f.fmt.pix.width ||
-			     out_height != f.fmt.pix.height))
-			ret = -EINVAL;
-		if (!ret) {
-			icd->user_width		= out_width & ~3;
-			icd->user_height	= out_height & ~3;
-			ret = sh_mobile_ceu_set_bus_param(icd);
-		}
-	}
-
-	/* Thaw the queue */
-	pcdev->frozen = 0;
-	spin_lock_irq(&pcdev->lock);
-	sh_mobile_ceu_capture(pcdev);
-	spin_unlock_irq(&pcdev->lock);
-	/* Start the client */
-	ret = v4l2_subdev_call(sd, video, s_stream, 1);
-	return ret;
-}
-
-static __poll_t sh_mobile_ceu_poll(struct file *file, poll_table *pt)
-{
-	struct soc_camera_device *icd = file->private_data;
-
-	return vb2_poll(&icd->vb2_vidq, file, pt);
-}
-
-static int sh_mobile_ceu_querycap(struct soc_camera_host *ici,
-				  struct v4l2_capability *cap)
-{
-	strscpy(cap->card, "SuperH_Mobile_CEU", sizeof(cap->card));
-	strscpy(cap->driver, "sh_mobile_ceu", sizeof(cap->driver));
-	strscpy(cap->bus_info, "platform:sh_mobile_ceu", sizeof(cap->bus_info));
-	cap->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING;
-	cap->capabilities = cap->device_caps | V4L2_CAP_DEVICE_CAPS;
-
-	return 0;
-}
-
-static int sh_mobile_ceu_init_videobuf(struct vb2_queue *q,
-				       struct soc_camera_device *icd)
-{
-	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-
-	q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-	q->io_modes = VB2_MMAP | VB2_USERPTR;
-	q->drv_priv = icd;
-	q->ops = &sh_mobile_ceu_videobuf_ops;
-	q->mem_ops = &vb2_dma_contig_memops;
-	q->buf_struct_size = sizeof(struct sh_mobile_ceu_buffer);
-	q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
-	q->lock = &ici->host_lock;
-	q->dev = ici->v4l2_dev.dev;
-
-	return vb2_queue_init(q);
-}
-
-static struct soc_camera_host_ops sh_mobile_ceu_host_ops = {
-	.owner		= THIS_MODULE,
-	.add		= sh_mobile_ceu_add_device,
-	.remove		= sh_mobile_ceu_remove_device,
-	.clock_start	= sh_mobile_ceu_clock_start,
-	.clock_stop	= sh_mobile_ceu_clock_stop,
-	.get_formats	= sh_mobile_ceu_get_formats,
-	.put_formats	= sh_mobile_ceu_put_formats,
-	.get_selection	= sh_mobile_ceu_get_selection,
-	.set_selection	= sh_mobile_ceu_set_selection,
-	.set_liveselection	= sh_mobile_ceu_set_liveselection,
-	.set_fmt	= sh_mobile_ceu_set_fmt,
-	.try_fmt	= sh_mobile_ceu_try_fmt,
-	.poll		= sh_mobile_ceu_poll,
-	.querycap	= sh_mobile_ceu_querycap,
-	.set_bus_param	= sh_mobile_ceu_set_bus_param,
-	.init_videobuf2	= sh_mobile_ceu_init_videobuf,
-};
-
-struct bus_wait {
-	struct notifier_block	notifier;
-	struct completion	completion;
-	struct device		*dev;
-};
-
-static int bus_notify(struct notifier_block *nb,
-		      unsigned long action, void *data)
-{
-	struct device *dev = data;
-	struct bus_wait *wait = container_of(nb, struct bus_wait, notifier);
-
-	if (wait->dev != dev)
-		return NOTIFY_DONE;
-
-	switch (action) {
-	case BUS_NOTIFY_UNBOUND_DRIVER:
-		/* Protect from module unloading */
-		wait_for_completion(&wait->completion);
-		return NOTIFY_OK;
-	}
-	return NOTIFY_DONE;
-}
-
-static int sh_mobile_ceu_probe(struct platform_device *pdev)
-{
-	struct sh_mobile_ceu_dev *pcdev;
-	struct resource *res;
-	void __iomem *base;
-	unsigned int irq;
-	int err;
-	struct bus_wait wait = {
-		.completion = COMPLETION_INITIALIZER_ONSTACK(wait.completion),
-		.notifier.notifier_call = bus_notify,
-	};
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	irq = platform_get_irq(pdev, 0);
-	if (!res || (int)irq <= 0) {
-		dev_err(&pdev->dev, "Not enough CEU platform resources.\n");
-		return -ENODEV;
-	}
-
-	pcdev = devm_kzalloc(&pdev->dev, sizeof(*pcdev), GFP_KERNEL);
-	if (!pcdev) {
-		dev_err(&pdev->dev, "Could not allocate pcdev\n");
-		return -ENOMEM;
-	}
-
-	INIT_LIST_HEAD(&pcdev->capture);
-	spin_lock_init(&pcdev->lock);
-	init_completion(&pcdev->complete);
-
-	pcdev->pdata = pdev->dev.platform_data;
-	if (!pcdev->pdata && !pdev->dev.of_node) {
-		dev_err(&pdev->dev, "CEU platform data not set.\n");
-		return -EINVAL;
-	}
-
-	/* TODO: implement per-device bus flags */
-	if (pcdev->pdata) {
-		pcdev->max_width = pcdev->pdata->max_width;
-		pcdev->max_height = pcdev->pdata->max_height;
-		pcdev->flags = pcdev->pdata->flags;
-	}
-	pcdev->field = V4L2_FIELD_NONE;
-
-	if (!pcdev->max_width) {
-		unsigned int v;
-		err = of_property_read_u32(pdev->dev.of_node, "renesas,max-width", &v);
-		if (!err)
-			pcdev->max_width = v;
-
-		if (!pcdev->max_width)
-			pcdev->max_width = 2560;
-	}
-	if (!pcdev->max_height) {
-		unsigned int v;
-		err = of_property_read_u32(pdev->dev.of_node, "renesas,max-height", &v);
-		if (!err)
-			pcdev->max_height = v;
-
-		if (!pcdev->max_height)
-			pcdev->max_height = 1920;
-	}
-
-	base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(base))
-		return PTR_ERR(base);
-
-	pcdev->irq = irq;
-	pcdev->base = base;
-	pcdev->video_limit = 0; /* only enabled if second resource exists */
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	if (res) {
-		err = dma_declare_coherent_memory(&pdev->dev, res->start,
-						  res->start,
-						  resource_size(res),
-						  DMA_MEMORY_EXCLUSIVE);
-		if (err) {
-			dev_err(&pdev->dev, "Unable to declare CEU memory.\n");
-			return err;
-		}
-
-		pcdev->video_limit = resource_size(res);
-	}
-
-	/* request irq */
-	err = devm_request_irq(&pdev->dev, pcdev->irq, sh_mobile_ceu_irq,
-			       0, dev_name(&pdev->dev), pcdev);
-	if (err) {
-		dev_err(&pdev->dev, "Unable to register CEU interrupt.\n");
-		goto exit_release_mem;
-	}
-
-	pm_suspend_ignore_children(&pdev->dev, true);
-	pm_runtime_enable(&pdev->dev);
-	pm_runtime_resume(&pdev->dev);
-
-	pcdev->ici.priv = pcdev;
-	pcdev->ici.v4l2_dev.dev = &pdev->dev;
-	pcdev->ici.nr = pdev->id;
-	pcdev->ici.drv_name = dev_name(&pdev->dev);
-	pcdev->ici.ops = &sh_mobile_ceu_host_ops;
-	pcdev->ici.capabilities = SOCAM_HOST_CAP_STRIDE;
-
-	if (pcdev->pdata && pcdev->pdata->asd_sizes) {
-		pcdev->ici.asd = pcdev->pdata->asd;
-		pcdev->ici.asd_sizes = pcdev->pdata->asd_sizes;
-	}
-
-	err = soc_camera_host_register(&pcdev->ici);
-	if (err)
-		goto exit_free_clk;
-
-	return 0;
-
-exit_free_clk:
-	pm_runtime_disable(&pdev->dev);
-exit_release_mem:
-	if (platform_get_resource(pdev, IORESOURCE_MEM, 1))
-		dma_release_declared_memory(&pdev->dev);
-	return err;
-}
-
-static int sh_mobile_ceu_remove(struct platform_device *pdev)
-{
-	struct soc_camera_host *soc_host = to_soc_camera_host(&pdev->dev);
-
-	soc_camera_host_unregister(soc_host);
-	pm_runtime_disable(&pdev->dev);
-	if (platform_get_resource(pdev, IORESOURCE_MEM, 1))
-		dma_release_declared_memory(&pdev->dev);
-
-	return 0;
-}
-
-static int sh_mobile_ceu_runtime_nop(struct device *dev)
-{
-	/* Runtime PM callback shared between ->runtime_suspend()
-	 * and ->runtime_resume(). Simply returns success.
-	 *
-	 * This driver re-initializes all registers after
-	 * pm_runtime_get_sync() anyway so there is no need
-	 * to save and restore registers here.
-	 */
-	return 0;
-}
-
-static const struct dev_pm_ops sh_mobile_ceu_dev_pm_ops = {
-	.runtime_suspend = sh_mobile_ceu_runtime_nop,
-	.runtime_resume = sh_mobile_ceu_runtime_nop,
-};
-
-static const struct of_device_id sh_mobile_ceu_of_match[] = {
-	{ .compatible = "renesas,sh-mobile-ceu" },
-	{ }
-};
-MODULE_DEVICE_TABLE(of, sh_mobile_ceu_of_match);
-
-static struct platform_driver sh_mobile_ceu_driver = {
-	.driver		= {
-		.name	= "sh_mobile_ceu",
-		.pm	= &sh_mobile_ceu_dev_pm_ops,
-		.of_match_table = sh_mobile_ceu_of_match,
-	},
-	.probe		= sh_mobile_ceu_probe,
-	.remove		= sh_mobile_ceu_remove,
-};
-
-module_platform_driver(sh_mobile_ceu_driver);
-
-MODULE_DESCRIPTION("SuperH Mobile CEU driver");
-MODULE_AUTHOR("Magnus Damm");
-MODULE_LICENSE("GPL");
-MODULE_VERSION("0.1.0");
-MODULE_ALIAS("platform:sh_mobile_ceu");
diff --git a/include/media/drv-intf/sh_mobile_ceu.h b/include/media/drv-intf/sh_mobile_ceu.h
deleted file mode 100644
index 555f0ecc0fde..000000000000
--- a/include/media/drv-intf/sh_mobile_ceu.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_SH_MOBILE_CEU_H__
-#define __ASM_SH_MOBILE_CEU_H__
-
-#define SH_CEU_FLAG_USE_8BIT_BUS	(1 << 0) /* use  8bit bus width */
-#define SH_CEU_FLAG_USE_16BIT_BUS	(1 << 1) /* use 16bit bus width */
-#define SH_CEU_FLAG_HSYNC_LOW		(1 << 2) /* default High if possible */
-#define SH_CEU_FLAG_VSYNC_LOW		(1 << 3) /* default High if possible */
-#define SH_CEU_FLAG_LOWER_8BIT		(1 << 4) /* default upper 8bit */
-
-struct device;
-struct resource;
-
-struct sh_mobile_ceu_companion {
-	u32		num_resources;
-	struct resource	*resource;
-	int		id;
-	void		*platform_data;
-};
-
-struct sh_mobile_ceu_info {
-	unsigned long flags;
-	int max_width;
-	int max_height;
-	struct v4l2_async_subdev **asd;	/* Flat array, arranged in groups */
-	unsigned int *asd_sizes;	/* 0-terminated array pf asd group sizes */
-};
-
-#endif /* __ASM_SH_MOBILE_CEU_H__ */
-- 
cgit v1.2.3


From dc60a4cfb77c891f67f31953025208067b05883c Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Thu, 17 Jan 2019 11:47:55 -0200
Subject: media: soc_camera_platform: remove obsolete soc_camera test driver

This is a test stub driver for soc_camera. Since soc_camera is
being deprecated (and in fact, nobody is using it anymore)
there's no sense in keeping this test driver.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/platform/soc_camera/Kconfig          |   6 -
 drivers/media/platform/soc_camera/Makefile         |   4 -
 .../platform/soc_camera/soc_camera_platform.c      | 188 ---------------------
 .../platform_data/media/soc_camera_platform.h      |  83 ---------
 4 files changed, 281 deletions(-)
 delete mode 100644 drivers/media/platform/soc_camera/soc_camera_platform.c
 delete mode 100644 include/linux/platform_data/media/soc_camera_platform.h

(limited to 'include')

diff --git a/drivers/media/platform/soc_camera/Kconfig b/drivers/media/platform/soc_camera/Kconfig
index d471d34b884c..8f9b3bac5450 100644
--- a/drivers/media/platform/soc_camera/Kconfig
+++ b/drivers/media/platform/soc_camera/Kconfig
@@ -6,9 +6,3 @@ config SOC_CAMERA
 	  SoC Camera is a common API to several cameras, not connecting
 	  over a bus like PCI or USB. For example some i2c camera connected
 	  directly to the data bus of an SoC.
-
-config SOC_CAMERA_PLATFORM
-	tristate "platform camera support"
-	depends on SOC_CAMERA
-	help
-	  This is a generic SoC camera platform driver, useful for testing
diff --git a/drivers/media/platform/soc_camera/Makefile b/drivers/media/platform/soc_camera/Makefile
index 2cb7022e073b..85d5e74f3b2b 100644
--- a/drivers/media/platform/soc_camera/Makefile
+++ b/drivers/media/platform/soc_camera/Makefile
@@ -1,5 +1 @@
 obj-$(CONFIG_SOC_CAMERA)		+= soc_camera.o soc_mediabus.o
-
-# a platform subdevice driver stub, allowing to support cameras by adding a
-# couple of callback functions to the board code
-obj-$(CONFIG_SOC_CAMERA_PLATFORM)	+= soc_camera_platform.o
diff --git a/drivers/media/platform/soc_camera/soc_camera_platform.c b/drivers/media/platform/soc_camera/soc_camera_platform.c
deleted file mode 100644
index 79fbe1fea95f..000000000000
--- a/drivers/media/platform/soc_camera/soc_camera_platform.c
+++ /dev/null
@@ -1,188 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Generic Platform Camera Driver
- *
- * Copyright (C) 2008 Magnus Damm
- * Based on mt9m001 driver,
- * Copyright (C) 2008, Guennadi Liakhovetski <kernel@pengutronix.de>
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/platform_device.h>
-#include <linux/videodev2.h>
-#include <media/v4l2-subdev.h>
-#include <media/soc_camera.h>
-#include <linux/platform_data/media/soc_camera_platform.h>
-
-struct soc_camera_platform_priv {
-	struct v4l2_subdev subdev;
-};
-
-static struct soc_camera_platform_priv *get_priv(struct platform_device *pdev)
-{
-	struct v4l2_subdev *subdev = platform_get_drvdata(pdev);
-	return container_of(subdev, struct soc_camera_platform_priv, subdev);
-}
-
-static int soc_camera_platform_s_stream(struct v4l2_subdev *sd, int enable)
-{
-	struct soc_camera_platform_info *p = v4l2_get_subdevdata(sd);
-	return p->set_capture(p, enable);
-}
-
-static int soc_camera_platform_fill_fmt(struct v4l2_subdev *sd,
-		struct v4l2_subdev_pad_config *cfg,
-		struct v4l2_subdev_format *format)
-{
-	struct soc_camera_platform_info *p = v4l2_get_subdevdata(sd);
-	struct v4l2_mbus_framefmt *mf = &format->format;
-
-	mf->width	= p->format.width;
-	mf->height	= p->format.height;
-	mf->code	= p->format.code;
-	mf->colorspace	= p->format.colorspace;
-	mf->field	= p->format.field;
-
-	return 0;
-}
-
-static int soc_camera_platform_s_power(struct v4l2_subdev *sd, int on)
-{
-	struct soc_camera_platform_info *p = v4l2_get_subdevdata(sd);
-
-	return soc_camera_set_power(p->icd->control, &p->icd->sdesc->subdev_desc, NULL, on);
-}
-
-static const struct v4l2_subdev_core_ops platform_subdev_core_ops = {
-	.s_power = soc_camera_platform_s_power,
-};
-
-static int soc_camera_platform_enum_mbus_code(struct v4l2_subdev *sd,
-		struct v4l2_subdev_pad_config *cfg,
-		struct v4l2_subdev_mbus_code_enum *code)
-{
-	struct soc_camera_platform_info *p = v4l2_get_subdevdata(sd);
-
-	if (code->pad || code->index)
-		return -EINVAL;
-
-	code->code = p->format.code;
-	return 0;
-}
-
-static int soc_camera_platform_get_selection(struct v4l2_subdev *sd,
-		struct v4l2_subdev_pad_config *cfg,
-		struct v4l2_subdev_selection *sel)
-{
-	struct soc_camera_platform_info *p = v4l2_get_subdevdata(sd);
-
-	if (sel->which != V4L2_SUBDEV_FORMAT_ACTIVE)
-		return -EINVAL;
-
-	switch (sel->target) {
-	case V4L2_SEL_TGT_CROP_BOUNDS:
-	case V4L2_SEL_TGT_CROP_DEFAULT:
-	case V4L2_SEL_TGT_CROP:
-		sel->r.left = 0;
-		sel->r.top = 0;
-		sel->r.width = p->format.width;
-		sel->r.height = p->format.height;
-		return 0;
-	default:
-		return -EINVAL;
-	}
-}
-
-static int soc_camera_platform_g_mbus_config(struct v4l2_subdev *sd,
-					     struct v4l2_mbus_config *cfg)
-{
-	struct soc_camera_platform_info *p = v4l2_get_subdevdata(sd);
-
-	cfg->flags = p->mbus_param;
-	cfg->type = p->mbus_type;
-
-	return 0;
-}
-
-static const struct v4l2_subdev_video_ops platform_subdev_video_ops = {
-	.s_stream	= soc_camera_platform_s_stream,
-	.g_mbus_config	= soc_camera_platform_g_mbus_config,
-};
-
-static const struct v4l2_subdev_pad_ops platform_subdev_pad_ops = {
-	.enum_mbus_code = soc_camera_platform_enum_mbus_code,
-	.get_selection	= soc_camera_platform_get_selection,
-	.get_fmt	= soc_camera_platform_fill_fmt,
-	.set_fmt	= soc_camera_platform_fill_fmt,
-};
-
-static const struct v4l2_subdev_ops platform_subdev_ops = {
-	.core	= &platform_subdev_core_ops,
-	.video	= &platform_subdev_video_ops,
-	.pad	= &platform_subdev_pad_ops,
-};
-
-static int soc_camera_platform_probe(struct platform_device *pdev)
-{
-	struct soc_camera_host *ici;
-	struct soc_camera_platform_priv *priv;
-	struct soc_camera_platform_info *p = pdev->dev.platform_data;
-	struct soc_camera_device *icd;
-
-	if (!p)
-		return -EINVAL;
-
-	if (!p->icd) {
-		dev_err(&pdev->dev,
-			"Platform has not set soc_camera_device pointer!\n");
-		return -EINVAL;
-	}
-
-	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
-	if (!priv)
-		return -ENOMEM;
-
-	icd = p->icd;
-
-	/* soc-camera convention: control's drvdata points to the subdev */
-	platform_set_drvdata(pdev, &priv->subdev);
-	/* Set the control device reference */
-	icd->control = &pdev->dev;
-
-	ici = to_soc_camera_host(icd->parent);
-
-	v4l2_subdev_init(&priv->subdev, &platform_subdev_ops);
-	v4l2_set_subdevdata(&priv->subdev, p);
-	strscpy(priv->subdev.name, dev_name(&pdev->dev),
-		sizeof(priv->subdev.name));
-
-	return v4l2_device_register_subdev(&ici->v4l2_dev, &priv->subdev);
-}
-
-static int soc_camera_platform_remove(struct platform_device *pdev)
-{
-	struct soc_camera_platform_priv *priv = get_priv(pdev);
-	struct soc_camera_platform_info *p = v4l2_get_subdevdata(&priv->subdev);
-
-	p->icd->control = NULL;
-	v4l2_device_unregister_subdev(&priv->subdev);
-	return 0;
-}
-
-static struct platform_driver soc_camera_platform_driver = {
-	.driver		= {
-		.name	= "soc_camera_platform",
-	},
-	.probe		= soc_camera_platform_probe,
-	.remove		= soc_camera_platform_remove,
-};
-
-module_platform_driver(soc_camera_platform_driver);
-
-MODULE_DESCRIPTION("SoC Camera Platform driver");
-MODULE_AUTHOR("Magnus Damm");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:soc_camera_platform");
diff --git a/include/linux/platform_data/media/soc_camera_platform.h b/include/linux/platform_data/media/soc_camera_platform.h
deleted file mode 100644
index 1e5065dab430..000000000000
--- a/include/linux/platform_data/media/soc_camera_platform.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Generic Platform Camera Driver Header
- *
- * Copyright (C) 2008 Magnus Damm
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __SOC_CAMERA_H__
-#define __SOC_CAMERA_H__
-
-#include <linux/videodev2.h>
-#include <media/soc_camera.h>
-#include <media/v4l2-mediabus.h>
-
-struct device;
-
-struct soc_camera_platform_info {
-	const char *format_name;
-	unsigned long format_depth;
-	struct v4l2_mbus_framefmt format;
-	unsigned long mbus_param;
-	enum v4l2_mbus_type mbus_type;
-	struct soc_camera_device *icd;
-	int (*set_capture)(struct soc_camera_platform_info *info, int enable);
-};
-
-static inline void soc_camera_platform_release(struct platform_device **pdev)
-{
-	*pdev = NULL;
-}
-
-static inline int soc_camera_platform_add(struct soc_camera_device *icd,
-					  struct platform_device **pdev,
-					  struct soc_camera_link *plink,
-					  void (*release)(struct device *dev),
-					  int id)
-{
-	struct soc_camera_subdev_desc *ssdd =
-		(struct soc_camera_subdev_desc *)plink;
-	struct soc_camera_platform_info *info = ssdd->drv_priv;
-	int ret;
-
-	if (&icd->sdesc->subdev_desc != ssdd)
-		return -ENODEV;
-
-	if (*pdev)
-		return -EBUSY;
-
-	*pdev = platform_device_alloc("soc_camera_platform", id);
-	if (!*pdev)
-		return -ENOMEM;
-
-	info->icd = icd;
-
-	(*pdev)->dev.platform_data = info;
-	(*pdev)->dev.release = release;
-
-	ret = platform_device_add(*pdev);
-	if (ret < 0) {
-		platform_device_put(*pdev);
-		*pdev = NULL;
-		info->icd = NULL;
-	}
-
-	return ret;
-}
-
-static inline void soc_camera_platform_del(const struct soc_camera_device *icd,
-					   struct platform_device *pdev,
-					   const struct soc_camera_link *plink)
-{
-	const struct soc_camera_subdev_desc *ssdd =
-		(const struct soc_camera_subdev_desc *)plink;
-	if (&icd->sdesc->subdev_desc != ssdd || !pdev)
-		return;
-
-	platform_device_unregister(pdev);
-}
-
-#endif /* __SOC_CAMERA_H__ */
-- 
cgit v1.2.3


From 534fd7aac56a7994d16032f32123def9923e339f Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Sun, 13 Jan 2019 16:01:17 +0200
Subject: IB/mlx5: Manage indirection mkey upon DEVX flow for ODP

Manage indirection mkey upon DEVX flow to support ODP.

To support a page fault event on the indirection mkey it needs to be part
of the device mkey radix tree.

Both the creation and the deletion flows for a DEVX object which is
indirection mkey were adapted to handle that.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c    | 89 +++++++++++++++++++++++++++++++++++-
 drivers/infiniband/hw/mlx5/main.c    |  1 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  6 +++
 include/linux/mlx5/driver.h          |  1 +
 4 files changed, 96 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index b7ff2138ac2a..bbf9a26d8fa6 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -17,12 +17,18 @@
 #define UVERBS_MODULE_NAME mlx5_ib
 #include <rdma/uverbs_named_ioctl.h>
 
+enum devx_obj_flags {
+	DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0,
+};
+
 #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
 struct devx_obj {
 	struct mlx5_core_dev	*mdev;
 	u64			obj_id;
 	u32			dinlen; /* destroy inbox length */
 	u32			dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
+	u32			flags;
+	struct mlx5_ib_devx_mr	devx_mr;
 };
 
 struct devx_umem {
@@ -1011,6 +1017,36 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
 	}
 }
 
+static int devx_handle_mkey_indirect(struct devx_obj *obj,
+				     struct mlx5_ib_dev *dev,
+				     void *in, void *out)
+{
+	struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
+	struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr;
+	unsigned long flags;
+	struct mlx5_core_mkey *mkey;
+	void *mkc;
+	u8 key;
+	int err;
+
+	mkey = &devx_mr->mmkey;
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	key = MLX5_GET(mkc, mkc, mkey_7_0);
+	mkey->key = mlx5_idx_to_mkey(
+			MLX5_GET(create_mkey_out, out, mkey_index)) | key;
+	mkey->type = MLX5_MKEY_INDIRECT_DEVX;
+	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
+	mkey->size = MLX5_GET64(mkc, mkc, len);
+	mkey->pd = MLX5_GET(mkc, mkc, pd);
+	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
+
+	write_lock_irqsave(&table->lock, flags);
+	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key),
+				mkey);
+	write_unlock_irqrestore(&table->lock, flags);
+	return err;
+}
+
 static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
 				   struct devx_obj *obj,
 				   void *in, int in_len)
@@ -1030,13 +1066,45 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
 	access_mode |= MLX5_GET(mkc, mkc, access_mode_4_2) << 2;
 
 	if (access_mode == MLX5_MKC_ACCESS_MODE_KLMS ||
-		access_mode == MLX5_MKC_ACCESS_MODE_KSM)
+		access_mode == MLX5_MKC_ACCESS_MODE_KSM) {
+		if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+			obj->flags |= DEVX_OBJ_FLAGS_INDIRECT_MKEY;
 		return 0;
+	}
 
 	MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1);
 	return 0;
 }
 
+static void devx_free_indirect_mkey(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct devx_obj, devx_mr.rcu));
+}
+
+/* This function to delete from the radix tree needs to be called before
+ * destroying the underlying mkey. Otherwise a race might occur in case that
+ * other thread will get the same mkey before this one will be deleted,
+ * in that case it will fail via inserting to the tree its own data.
+ *
+ * Note:
+ * An error in the destroy is not expected unless there is some other indirect
+ * mkey which points to this one. In a kernel cleanup flow it will be just
+ * destroyed in the iterative destruction call. In a user flow, in case
+ * the application didn't close in the expected order it's its own problem,
+ * the mkey won't be part of the tree, in both cases the kernel is safe.
+ */
+static void devx_cleanup_mkey(struct devx_obj *obj)
+{
+	struct mlx5_mkey_table *table = &obj->mdev->priv.mkey_table;
+	struct mlx5_core_mkey *del_mkey;
+	unsigned long flags;
+
+	write_lock_irqsave(&table->lock, flags);
+	del_mkey = radix_tree_delete(&table->tree,
+				     mlx5_base_mkey(obj->devx_mr.mmkey.key));
+	write_unlock_irqrestore(&table->lock, flags);
+}
+
 static int devx_obj_cleanup(struct ib_uobject *uobject,
 			    enum rdma_remove_reason why)
 {
@@ -1044,10 +1112,21 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
 	struct devx_obj *obj = uobject->object;
 	int ret;
 
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
+		devx_cleanup_mkey(obj);
+
 	ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
 	if (ib_is_destroy_retryable(ret, why, uobject))
 		return ret;
 
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+		struct mlx5_ib_dev *dev = to_mdev(uobject->context->device);
+
+		call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
+			  devx_free_indirect_mkey);
+		return ret;
+	}
+
 	kfree(obj);
 	return ret;
 }
@@ -1108,6 +1187,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 				   &obj_id);
 	WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
 
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+		err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
+		if (err)
+			goto obj_destroy;
+	}
+
 	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len);
 	if (err)
 		goto obj_destroy;
@@ -1116,6 +1201,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 	return 0;
 
 obj_destroy:
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
+		devx_cleanup_mkey(obj);
 	mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
 obj_free:
 	kfree(obj);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 61064b7171fc..ae00f994673b 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5724,6 +5724,7 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		srcu_barrier(&dev->mr_srcu);
 		cleanup_srcu_struct(&dev->mr_srcu);
 		drain_workqueue(dev->advise_mr_wq);
 		destroy_workqueue(dev->advise_mr_wq);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b0a37ca2a714..819207190343 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -602,6 +602,12 @@ struct mlx5_ib_mw {
 	int			ndescs;
 };
 
+struct mlx5_ib_devx_mr {
+	struct mlx5_core_mkey	mmkey;
+	int			ndescs;
+	struct rcu_head		rcu;
+};
+
 struct mlx5_ib_umr_context {
 	struct ib_cqe		cqe;
 	enum ib_wc_status	status;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b6f5839f129a..619d6fee96a1 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -364,6 +364,7 @@ struct mlx5_core_sig_ctx {
 enum {
 	MLX5_MKEY_MR = 1,
 	MLX5_MKEY_MW,
+	MLX5_MKEY_INDIRECT_DEVX,
 };
 
 struct mlx5_core_mkey {
-- 
cgit v1.2.3


From 51c48b310183ab6ba5419edfc6a8de889cc04521 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Sat, 19 Jan 2019 11:35:04 -0600
Subject: PCI: Probe bridge window attributes once at enumeration-time

pci_bridge_check_ranges() determines whether a bridge supports the optional
I/O and prefetchable memory windows and sets the flag bits in the bridge
resources.  This *could* be done once during enumeration except that the
resource allocation code completely clears the flag bits, e.g., in the
pci_assign_unassigned_bridge_resources() path.

The problem with pci_bridge_check_ranges() in the resource allocation path
is that we may allocate resources after devices have been claimed by
drivers, and pci_bridge_check_ranges() *changes* the window registers to
determine whether they're writable.  This may break concurrent accesses to
devices behind the bridge.

Add a new pci_read_bridge_windows() to determine whether a bridge supports
the optional windows, call it once during enumeration, remember the
results, and change pci_bridge_check_ranges() so it doesn't touch the
bridge windows but sets the flag bits based on those remembered results.

Link: https://lore.kernel.org/linux-pci/1506151482-113560-1-git-send-email-wangzhou1@hisilicon.com
Link: https://lists.gnu.org/archive/html/qemu-devel/2018-12/msg02082.html
Reported-by: Yandong Xu <xuyandong2@huawei.com>
Tested-by: Yandong Xu <xuyandong2@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Ofer Hayut <ofer@lightbitslabs.com>
Cc: Roy Shterman <roys@lightbitslabs.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Zhou Wang <wangzhou1@hisilicon.com>
---
 drivers/pci/probe.c     | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/setup-bus.c | 45 ++++--------------------------------------
 include/linux/pci.h     |  3 +++
 3 files changed, 59 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 257b9f6f2ebb..2ef8b954c65a 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -348,6 +348,57 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
 	}
 }
 
+static void pci_read_bridge_windows(struct pci_dev *bridge)
+{
+	u16 io;
+	u32 pmem, tmp;
+
+	pci_read_config_word(bridge, PCI_IO_BASE, &io);
+	if (!io) {
+		pci_write_config_word(bridge, PCI_IO_BASE, 0xe0f0);
+		pci_read_config_word(bridge, PCI_IO_BASE, &io);
+		pci_write_config_word(bridge, PCI_IO_BASE, 0x0);
+	}
+	if (io)
+		bridge->io_window = 1;
+
+	/*
+	 * DECchip 21050 pass 2 errata: the bridge may miss an address
+	 * disconnect boundary by one PCI data phase.  Workaround: do not
+	 * use prefetching on this device.
+	 */
+	if (bridge->vendor == PCI_VENDOR_ID_DEC && bridge->device == 0x0001)
+		return;
+
+	pci_read_config_dword(bridge, PCI_PREF_MEMORY_BASE, &pmem);
+	if (!pmem) {
+		pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE,
+					       0xffe0fff0);
+		pci_read_config_dword(bridge, PCI_PREF_MEMORY_BASE, &pmem);
+		pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, 0x0);
+	}
+	if (!pmem)
+		return;
+
+	bridge->pref_window = 1;
+
+	if ((pmem & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64) {
+
+		/*
+		 * Bridge claims to have a 64-bit prefetchable memory
+		 * window; verify that the upper bits are actually
+		 * writable.
+		 */
+		pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32, &pmem);
+		pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32,
+				       0xffffffff);
+		pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32, &tmp);
+		pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, pmem);
+		if (tmp)
+			bridge->pref_64_window = 1;
+	}
+}
+
 static void pci_read_bridge_io(struct pci_bus *child)
 {
 	struct pci_dev *dev = child->self;
@@ -1739,6 +1790,7 @@ int pci_setup_device(struct pci_dev *dev)
 		pci_read_irq(dev);
 		dev->transparent = ((dev->class & 0xff) == 1);
 		pci_read_bases(dev, 2, PCI_ROM_ADDRESS1);
+		pci_read_bridge_windows(dev);
 		set_pcie_hotplug_bridge(dev);
 		pos = pci_find_capability(dev, PCI_CAP_ID_SSVID);
 		if (pos) {
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index ed960436df5e..1941bb0a6c13 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -735,58 +735,21 @@ int pci_claim_bridge_resource(struct pci_dev *bridge, int i)
    base/limit registers must be read-only and read as 0. */
 static void pci_bridge_check_ranges(struct pci_bus *bus)
 {
-	u16 io;
-	u32 pmem;
 	struct pci_dev *bridge = bus->self;
-	struct resource *b_res;
+	struct resource *b_res = &bridge->resource[PCI_BRIDGE_RESOURCES];
 
-	b_res = &bridge->resource[PCI_BRIDGE_RESOURCES];
 	b_res[1].flags |= IORESOURCE_MEM;
 
-	pci_read_config_word(bridge, PCI_IO_BASE, &io);
-	if (!io) {
-		pci_write_config_word(bridge, PCI_IO_BASE, 0xe0f0);
-		pci_read_config_word(bridge, PCI_IO_BASE, &io);
-		pci_write_config_word(bridge, PCI_IO_BASE, 0x0);
-	}
-	if (io)
+	if (bridge->io_window)
 		b_res[0].flags |= IORESOURCE_IO;
 
-	/*  DECchip 21050 pass 2 errata: the bridge may miss an address
-	    disconnect boundary by one PCI data phase.
-	    Workaround: do not use prefetching on this device. */
-	if (bridge->vendor == PCI_VENDOR_ID_DEC && bridge->device == 0x0001)
-		return;
-
-	pci_read_config_dword(bridge, PCI_PREF_MEMORY_BASE, &pmem);
-	if (!pmem) {
-		pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE,
-					       0xffe0fff0);
-		pci_read_config_dword(bridge, PCI_PREF_MEMORY_BASE, &pmem);
-		pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, 0x0);
-	}
-	if (pmem) {
+	if (bridge->pref_window) {
 		b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH;
-		if ((pmem & PCI_PREF_RANGE_TYPE_MASK) ==
-		    PCI_PREF_RANGE_TYPE_64) {
+		if (bridge->pref_64_window) {
 			b_res[2].flags |= IORESOURCE_MEM_64;
 			b_res[2].flags |= PCI_PREF_RANGE_TYPE_64;
 		}
 	}
-
-	/* double check if bridge does support 64 bit pref */
-	if (b_res[2].flags & IORESOURCE_MEM_64) {
-		u32 mem_base_hi, tmp;
-		pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32,
-					 &mem_base_hi);
-		pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32,
-					       0xffffffff);
-		pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32, &tmp);
-		if (!tmp)
-			b_res[2].flags &= ~IORESOURCE_MEM_64;
-		pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32,
-				       mem_base_hi);
-	}
 }
 
 /* Helper function for sizing routines: find first available
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 65f1d8c2f082..40b327b814aa 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -373,6 +373,9 @@ struct pci_dev {
 	bool		match_driver;		/* Skip attaching driver */
 
 	unsigned int	transparent:1;		/* Subtractive decode bridge */
+	unsigned int	io_window:1;		/* Bridge has I/O window */
+	unsigned int	pref_window:1;		/* Bridge has pref mem window */
+	unsigned int	pref_64_window:1;	/* Pref mem window is 64-bit */
 	unsigned int	multifunction:1;	/* Multi-function device */
 
 	unsigned int	is_busmaster:1;		/* Is busmaster */
-- 
cgit v1.2.3


From 643fa9612bf1a29153eee46fd398117632f93cbe Mon Sep 17 00:00:00 2001
From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Date: Wed, 12 Dec 2018 15:20:12 +0530
Subject: fscrypt: remove filesystem specific build config option

In order to have a common code base for fscrypt "post read" processing
for all filesystems which support encryption, this commit removes
filesystem specific build config option (e.g. CONFIG_EXT4_FS_ENCRYPTION)
and replaces it with a build option (i.e. CONFIG_FS_ENCRYPTION) whose
value affects all the filesystems making use of fscrypt.

Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 Documentation/filesystems/fscrypt.rst   |   4 +-
 arch/mips/configs/generic_defconfig     |   2 +-
 arch/nds32/configs/defconfig            |   2 +-
 arch/s390/configs/debug_defconfig       |   2 +-
 arch/s390/configs/performance_defconfig |   2 +-
 fs/crypto/Kconfig                       |   5 +-
 fs/crypto/fscrypt_private.h             |   1 -
 fs/ext4/Kconfig                         |  15 --
 fs/ext4/dir.c                           |   2 -
 fs/ext4/ext4.h                          |   7 +-
 fs/ext4/inode.c                         |   8 +-
 fs/ext4/ioctl.c                         |   4 +-
 fs/ext4/namei.c                         |  10 +-
 fs/ext4/page-io.c                       |   6 +-
 fs/ext4/readpage.c                      |   2 +-
 fs/ext4/super.c                         |   6 +-
 fs/ext4/sysfs.c                         |   4 +-
 fs/f2fs/Kconfig                         |  12 +-
 fs/f2fs/f2fs.h                          |   7 +-
 fs/f2fs/super.c                         |   8 +-
 fs/f2fs/sysfs.c                         |   4 +-
 fs/ubifs/Kconfig                        |  12 +-
 fs/ubifs/Makefile                       |   2 +-
 fs/ubifs/ioctl.c                        |   4 +-
 fs/ubifs/sb.c                           |   2 +-
 fs/ubifs/super.c                        |   2 +-
 fs/ubifs/ubifs.h                        |   5 +-
 include/linux/fs.h                      |   4 +-
 include/linux/fscrypt.h                 | 416 +++++++++++++++++++++++++++++++-
 include/linux/fscrypt_notsupp.h         | 231 ------------------
 include/linux/fscrypt_supp.h            | 204 ----------------
 31 files changed, 460 insertions(+), 535 deletions(-)
 delete mode 100644 include/linux/fscrypt_notsupp.h
 delete mode 100644 include/linux/fscrypt_supp.h

(limited to 'include')

diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 3a7b60521b94..43dd989e2a3f 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -343,9 +343,9 @@ FS_IOC_SET_ENCRYPTION_POLICY can fail with the following errors:
 - ``ENOTEMPTY``: the file is unencrypted and is a nonempty directory
 - ``ENOTTY``: this type of filesystem does not implement encryption
 - ``EOPNOTSUPP``: the kernel was not configured with encryption
-  support for this filesystem, or the filesystem superblock has not
+  support for filesystems, or the filesystem superblock has not
   had encryption enabled on it.  (For example, to use encryption on an
-  ext4 filesystem, CONFIG_EXT4_ENCRYPTION must be enabled in the
+  ext4 filesystem, CONFIG_FS_ENCRYPTION must be enabled in the
   kernel config, and the superblock must have had the "encrypt"
   feature flag enabled using ``tune2fs -O encrypt`` or ``mkfs.ext4 -O
   encrypt``.)
diff --git a/arch/mips/configs/generic_defconfig b/arch/mips/configs/generic_defconfig
index 7c138dab87df..5d80521e5d5a 100644
--- a/arch/mips/configs/generic_defconfig
+++ b/arch/mips/configs/generic_defconfig
@@ -59,7 +59,7 @@ CONFIG_HID_MONTEREY=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
-CONFIG_EXT4_ENCRYPTION=y
+CONFIG_FS_ENCRYPTION=y
 CONFIG_FANOTIFY=y
 CONFIG_FUSE_FS=y
 CONFIG_CUSE=y
diff --git a/arch/nds32/configs/defconfig b/arch/nds32/configs/defconfig
index 2546d8770785..65ce9259081b 100644
--- a/arch/nds32/configs/defconfig
+++ b/arch/nds32/configs/defconfig
@@ -74,7 +74,7 @@ CONFIG_GENERIC_PHY=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
-CONFIG_EXT4_ENCRYPTION=y
+CONFIG_FS_ENCRYPTION=y
 CONFIG_FUSE_FS=y
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index c69cb04b7a59..9824c7bad9d4 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -500,7 +500,6 @@ CONFIG_S390_AP_IOMMU=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
-CONFIG_EXT4_ENCRYPTION=y
 CONFIG_JBD2_DEBUG=y
 CONFIG_JFS_FS=m
 CONFIG_JFS_POSIX_ACL=y
@@ -520,6 +519,7 @@ CONFIG_BTRFS_DEBUG=y
 CONFIG_NILFS2_FS=m
 CONFIG_FS_DAX=y
 CONFIG_EXPORTFS_BLOCK_OPS=y
+CONFIG_FS_ENCRYPTION=y
 CONFIG_FANOTIFY=y
 CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index 32f539dc9c19..4fcbe5792744 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -497,7 +497,6 @@ CONFIG_S390_AP_IOMMU=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
-CONFIG_EXT4_ENCRYPTION=y
 CONFIG_JBD2_DEBUG=y
 CONFIG_JFS_FS=m
 CONFIG_JFS_POSIX_ACL=y
@@ -515,6 +514,7 @@ CONFIG_BTRFS_FS_POSIX_ACL=y
 CONFIG_NILFS2_FS=m
 CONFIG_FS_DAX=y
 CONFIG_EXPORTFS_BLOCK_OPS=y
+CONFIG_FS_ENCRYPTION=y
 CONFIG_FANOTIFY=y
 CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 284b589b4774..f0de238000c0 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,5 +1,5 @@
 config FS_ENCRYPTION
-	tristate "FS Encryption (Per-file encryption)"
+	bool "FS Encryption (Per-file encryption)"
 	select CRYPTO
 	select CRYPTO_AES
 	select CRYPTO_CBC
@@ -12,4 +12,5 @@ config FS_ENCRYPTION
 	  Enable encryption of files and directories.  This
 	  feature is similar to ecryptfs, but it is more memory
 	  efficient since it avoids caching the encrypted and
-	  decrypted pages in the page cache.
+	  decrypted pages in the page cache.  Currently Ext4,
+	  F2FS and UBIFS make use of this feature.
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7424f851eb5c..7da276159593 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,7 +12,6 @@
 #ifndef _FSCRYPT_PRIVATE_H
 #define _FSCRYPT_PRIVATE_H
 
-#define __FS_HAS_ENCRYPTION 1
 #include <linux/fscrypt.h>
 #include <crypto/hash.h>
 
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index a453cc87082b..031e5a82d556 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -96,21 +96,6 @@ config EXT4_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
-config EXT4_ENCRYPTION
-	bool "Ext4 Encryption"
-	depends on EXT4_FS
-	select FS_ENCRYPTION
-	help
-	  Enable encryption of ext4 files and directories.  This
-	  feature is similar to ecryptfs, but it is more memory
-	  efficient since it avoids caching the encrypted and
-	  decrypted pages in the page cache.
-
-config EXT4_FS_ENCRYPTION
-	bool
-	default y
-	depends on EXT4_ENCRYPTION
-
 config EXT4_DEBUG
 	bool "EXT4 debugging support"
 	depends on EXT4_FS
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fb7a64ea5679..0ccd51f72048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -283,9 +283,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 done:
 	err = 0;
 errout:
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
 	fscrypt_fname_free_buffer(&fstr);
-#endif
 	brelse(bh);
 	return err;
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index afdb9ad8be0e..5012ddb6daf9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -40,7 +40,6 @@
 #include <linux/compat.h>
 #endif
 
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
 #include <linux/fscrypt.h>
 
 #include <linux/compiler.h>
@@ -1326,7 +1325,7 @@ struct ext4_super_block {
 #define EXT4_MF_FS_ABORTED		0x0002	/* Fatal error detected */
 #define EXT4_MF_TEST_DUMMY_ENCRYPTION	0x0004
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 #define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
 						EXT4_MF_TEST_DUMMY_ENCRYPTION))
 #else
@@ -2051,7 +2050,7 @@ struct ext4_filename {
 	const struct qstr *usr_fname;
 	struct fscrypt_str disk_name;
 	struct dx_hash_info hinfo;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	struct fscrypt_str crypto_buf;
 #endif
 };
@@ -2279,7 +2278,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
 					      struct ext4_group_desc *gdp);
 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static inline int ext4_fname_setup_filename(struct inode *dir,
 			const struct qstr *iname,
 			int lookup, struct ext4_filename *fname)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71bd2d28f58d..4356ef6d728e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1150,7 +1150,7 @@ int do_journal_get_write_access(handle_t *handle,
 	return ret;
 }
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 				  get_block_t *get_block)
 {
@@ -1302,7 +1302,7 @@ retry_journal:
 	/* In case writeback began while the page was unlocked */
 	wait_for_stable_page(page);
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	if (ext4_should_dioread_nolock(inode))
 		ret = ext4_block_write_begin(page, pos, len,
 					     ext4_get_block_unwritten);
@@ -3104,7 +3104,7 @@ retry_journal:
 	/* In case writeback began while the page was unlocked */
 	wait_for_stable_page(page);
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	ret = ext4_block_write_begin(page, pos, len,
 				     ext4_da_get_block_prep);
 #else
@@ -3879,7 +3879,7 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
 		return 0;
 #endif
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d37dafa1d133..d26bcac291bb 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -210,7 +210,7 @@ journal_err_out:
 	return err;
 }
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static int uuid_is_zero(__u8 u[16])
 {
 	int	i;
@@ -978,7 +978,7 @@ resizefs_out:
 		return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
 
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 		int err, err2;
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		handle_t *handle;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index be6cb69beb12..980166a8122a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -612,7 +612,7 @@ static struct stats dx_show_leaf(struct inode *dir,
 		{
 			if (show_names)
 			{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 				int len;
 				char *name;
 				struct fscrypt_str fname_crypto_str =
@@ -984,7 +984,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 	top = (struct ext4_dir_entry_2 *) ((char *) de +
 					   dir->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	/* Check if the directory is encrypted */
 	if (IS_ENCRYPTED(dir)) {
 		err = fscrypt_get_encryption_info(dir);
@@ -1047,7 +1047,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 	}
 errout:
 	brelse(bh);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	fscrypt_fname_free_buffer(&fname_crypto_str);
 #endif
 	return count;
@@ -1267,7 +1267,7 @@ static inline bool ext4_match(const struct ext4_filename *fname,
 
 	f.usr_fname = fname->usr_fname;
 	f.disk_name = fname->disk_name;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	f.crypto_buf = fname->crypto_buf;
 #endif
 	return fscrypt_match_name(&f, de->name, de->name_len);
@@ -1498,7 +1498,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 	ext4_lblk_t block;
 	int retval;
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	*res_dir = NULL;
 #endif
 	frame = dx_probe(fname, dir, NULL, frames);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index c398b55da854..b9d6cabe2ea8 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -66,7 +66,7 @@ static void ext4_finish_bio(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 		struct page *data_page = NULL;
 #endif
 		struct buffer_head *bh, *head;
@@ -78,7 +78,7 @@ static void ext4_finish_bio(struct bio *bio)
 		if (!page)
 			continue;
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 		if (!page->mapping) {
 			/* The bounce data pages are unmapped. */
 			data_page = page;
@@ -111,7 +111,7 @@ static void ext4_finish_bio(struct bio *bio)
 		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
 		local_irq_restore(flags);
 		if (!under_io) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 			if (data_page)
 				fscrypt_restore_control_page(data_page);
 #endif
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 52d3ff5a9db1..b18881eb8da6 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -49,7 +49,7 @@
 
 static inline bool ext4_bio_encrypted(struct bio *bio)
 {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	return unlikely(bio->bi_private != NULL);
 #else
 	return false;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fb12d3c17c1b..60da0a6e4d86 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1232,7 +1232,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
 	return try_to_free_buffers(page);
 }
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
 {
 	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
@@ -1922,7 +1922,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		*journal_ioprio =
 			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
 	} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 		sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
 		ext4_msg(sb, KERN_WARNING,
 			 "Test dummy encryption mode enabled");
@@ -4167,7 +4167,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &ext4_sops;
 	sb->s_export_op = &ext4_export_ops;
 	sb->s_xattr = ext4_xattr_handlers;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	sb->s_cop = &ext4_cryptops;
 #endif
 #ifdef CONFIG_QUOTA
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9212a026a1f1..5e4e78fc0b3a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -224,7 +224,7 @@ static struct attribute *ext4_attrs[] = {
 EXT4_ATTR_FEATURE(lazy_itable_init);
 EXT4_ATTR_FEATURE(batched_discard);
 EXT4_ATTR_FEATURE(meta_bg_resize);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 EXT4_ATTR_FEATURE(encryption);
 #endif
 EXT4_ATTR_FEATURE(metadata_csum_seed);
@@ -233,7 +233,7 @@ static struct attribute *ext4_feat_attrs[] = {
 	ATTR_LIST(lazy_itable_init),
 	ATTR_LIST(batched_discard),
 	ATTR_LIST(meta_bg_resize),
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	ATTR_LIST(encryption),
 #endif
 	ATTR_LIST(metadata_csum_seed),
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 9a20ef42fadd..e57cc754d543 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -3,6 +3,7 @@ config F2FS_FS
 	depends on BLOCK
 	select CRYPTO
 	select CRYPTO_CRC32
+	select F2FS_FS_XATTR if FS_ENCRYPTION
 	help
 	  F2FS is based on Log-structured File System (LFS), which supports
 	  versatile "flash-friendly" features. The design has been focused on
@@ -70,17 +71,6 @@ config F2FS_CHECK_FS
 
 	  If you want to improve the performance, say N.
 
-config F2FS_FS_ENCRYPTION
-	bool "F2FS Encryption"
-	depends on F2FS_FS
-	depends on F2FS_FS_XATTR
-	select FS_ENCRYPTION
-	help
-	  Enable encryption of f2fs files and directories.  This
-	  feature is similar to ecryptfs, but it is more memory
-	  efficient since it avoids caching the encrypted and
-	  decrypted pages in the page cache.
-
 config F2FS_IO_TRACE
 	bool "F2FS IO tracer"
 	depends on F2FS_FS
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9ef6f38e51cc..95cc885ccb2f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,7 +24,6 @@
 #include <linux/quotaops.h>
 #include <crypto/hash.h>
 
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION)
 #include <linux/fscrypt.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
@@ -1137,7 +1136,7 @@ enum fsync_mode {
 	FSYNC_MODE_NOBARRIER,	/* fsync behaves nobarrier based on posix */
 };
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 #define DUMMY_ENCRYPTION_ENABLED(sbi) \
 			(unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
 #else
@@ -3470,7 +3469,7 @@ static inline bool f2fs_encrypted_file(struct inode *inode)
 
 static inline void f2fs_set_encrypted_inode(struct inode *inode)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	file_set_encrypt(inode);
 	f2fs_set_inode_flags(inode);
 #endif
@@ -3549,7 +3548,7 @@ static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
 
 static inline bool f2fs_may_encrypt(struct inode *inode)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	umode_t mode = inode->i_mode;
 
 	return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c46a1d4318d4..0f3db3a8e5cb 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -757,7 +757,7 @@ static int parse_options(struct super_block *sb, char *options)
 			kvfree(name);
 			break;
 		case Opt_test_dummy_encryption:
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 			if (!f2fs_sb_has_encrypt(sbi)) {
 				f2fs_msg(sb, KERN_ERR, "Encrypt feature is off");
 				return -EINVAL;
@@ -1390,7 +1390,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",whint_mode=%s", "user-based");
 	else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
 		seq_printf(seq, ",whint_mode=%s", "fs-based");
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	if (F2FS_OPTION(sbi).test_dummy_encryption)
 		seq_puts(seq, ",test_dummy_encryption");
 #endif
@@ -2154,7 +2154,7 @@ static const struct super_operations f2fs_sops = {
 	.remount_fs	= f2fs_remount,
 };
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
 {
 	return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
@@ -3116,7 +3116,7 @@ try_onemore:
 #endif
 
 	sb->s_op = &f2fs_sops;
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	sb->s_cop = &f2fs_cryptops;
 #endif
 	sb->s_xattr = f2fs_xattr_handlers;
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 0575edbe3ed6..70da6801c86f 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -431,7 +431,7 @@ F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
 F2FS_GENERAL_RO_ATTR(features);
 F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
 #endif
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -492,7 +492,7 @@ static struct attribute *f2fs_attrs[] = {
 };
 
 static struct attribute *f2fs_feat_attrs[] = {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	ATTR_LIST(encryption),
 #endif
 #ifdef CONFIG_BLK_DEV_ZONED
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index bc1e082d921d..9da2f135121b 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -8,6 +8,7 @@ config UBIFS_FS
 	select CRYPTO_LZO if UBIFS_FS_LZO
 	select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
 	select CRYPTO_HASH_INFO
+	select UBIFS_FS_XATTR if FS_ENCRYPTION
 	depends on MTD_UBI
 	help
 	  UBIFS is a file system for flash devices which works on top of UBI.
@@ -60,17 +61,6 @@ config UBIFS_FS_XATTR
 
 	  If unsure, say Y.
 
-config UBIFS_FS_ENCRYPTION
-	bool "UBIFS Encryption"
-	depends on UBIFS_FS_XATTR && BLOCK
-	select FS_ENCRYPTION
-	default n
-	help
-	  Enable encryption of UBIFS files and directories. This
-	  feature is similar to ecryptfs, but it is more memory
-	  efficient since it avoids caching the encrypted and
-	  decrypted pages in the page cache.
-
 config UBIFS_FS_SECURITY
 	bool "UBIFS Security Labels"
 	depends on UBIFS_FS_XATTR
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 5f838319c8d5..5c4b845754a7 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -6,6 +6,6 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
 ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
 ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o
 ubifs-y += misc.o
-ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o
+ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o
 ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
 ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 0164bcc827f8..0f9c362a3402 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -185,7 +185,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return err;
 	}
 	case FS_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 		struct ubifs_info *c = inode->i_sb->s_fs_info;
 
 		err = ubifs_enable_encryption(c);
@@ -198,7 +198,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 #endif
 	}
 	case FS_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 		return fscrypt_ioctl_get_policy(file, (void __user *)arg);
 #else
 		return -EOPNOTSUPP;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 3da90c951c23..67fac1e8adfb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -748,7 +748,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
 		goto out;
 	}
 
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
 	if (c->encrypted) {
 		ubifs_err(c, "file system contains encrypted files but UBIFS"
 			     " was built without crypto support.");
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1fac1133dadd..8dc2818fdd84 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2146,7 +2146,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_UBIFS_FS_XATTR
 	sb->s_xattr = ubifs_xattr_handlers;
 #endif
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 	sb->s_cop = &ubifs_crypt_operations;
 #endif
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 38401adaa00d..1ae12900e01d 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -43,7 +43,6 @@
 #include <crypto/hash.h>
 #include <crypto/algapi.h>
 
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION)
 #include <linux/fscrypt.h>
 
 #include "ubifs-media.h"
@@ -142,7 +141,7 @@
  */
 #define WORST_COMPR_FACTOR 2
 
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
 #define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
 #else
 #define UBIFS_CIPHER_BLOCK_SIZE 0
@@ -2072,7 +2071,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
 #include "misc.h"
 #include "key.h"
 
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
 static inline int ubifs_encrypt(const struct inode *inode,
 				struct ubifs_data_node *dn,
 				unsigned int in_len, unsigned int *out_len,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 811c77743dad..ba7889bb9ef6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -698,7 +698,7 @@ struct inode {
 	struct fsnotify_mark_connector __rcu	*i_fsnotify_marks;
 #endif
 
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+#ifdef CONFIG_FS_ENCRYPTION
 	struct fscrypt_info	*i_crypt_info;
 #endif
 
@@ -1403,7 +1403,7 @@ struct super_block {
 	void                    *s_security;
 #endif
 	const struct xattr_handler **s_xattr;
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+#ifdef CONFIG_FS_ENCRYPTION
 	const struct fscrypt_operations	*s_cop;
 #endif
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 952ab97af325..eec604840568 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -2,9 +2,8 @@
 /*
  * fscrypt.h: declarations for per-file encryption
  *
- * Filesystems that implement per-file encryption include this header
- * file with the __FS_HAS_ENCRYPTION set according to whether that filesystem
- * is being built with encryption support or not.
+ * Filesystems that implement per-file encryption must include this header
+ * file.
  *
  * Copyright (C) 2015, Google, Inc.
  *
@@ -15,6 +14,8 @@
 #define _LINUX_FSCRYPT_H
 
 #include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
 
 #define FS_CRYPTO_BLOCK_SIZE		16
 
@@ -42,11 +43,410 @@ struct fscrypt_name {
 /* Maximum value for the third parameter of fscrypt_operations.set_context(). */
 #define FSCRYPT_SET_CONTEXT_MAX_SIZE	28
 
-#if __FS_HAS_ENCRYPTION
-#include <linux/fscrypt_supp.h>
-#else
-#include <linux/fscrypt_notsupp.h>
-#endif
+#ifdef CONFIG_FS_ENCRYPTION
+/*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_OWN_PAGES (1U << 1)
+
+/*
+ * crypto operations for filesystems
+ */
+struct fscrypt_operations {
+	unsigned int flags;
+	const char *key_prefix;
+	int (*get_context)(struct inode *, void *, size_t);
+	int (*set_context)(struct inode *, const void *, size_t, void *);
+	bool (*dummy_context)(struct inode *);
+	bool (*empty_dir)(struct inode *);
+	unsigned int max_namelen;
+};
+
+struct fscrypt_ctx {
+	union {
+		struct {
+			struct page *bounce_page;	/* Ciphertext page */
+			struct page *control_page;	/* Original page  */
+		} w;
+		struct {
+			struct bio *bio;
+			struct work_struct work;
+		} r;
+		struct list_head free_list;	/* Free list */
+	};
+	u8 flags;				/* Flags */
+};
+
+static inline bool fscrypt_has_encryption_key(const struct inode *inode)
+{
+	return (inode->i_crypt_info != NULL);
+}
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+	return inode->i_sb->s_cop->dummy_context &&
+		inode->i_sb->s_cop->dummy_context(inode);
+}
+
+/* crypto.c */
+extern void fscrypt_enqueue_decrypt_work(struct work_struct *);
+extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
+extern void fscrypt_release_ctx(struct fscrypt_ctx *);
+extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
+						unsigned int, unsigned int,
+						u64, gfp_t);
+extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
+				unsigned int, u64);
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+	return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+}
+
+extern void fscrypt_restore_control_page(struct page *);
+
+/* policy.c */
+extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
+extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
+extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
+extern int fscrypt_inherit_context(struct inode *, struct inode *,
+					void *, bool);
+/* keyinfo.c */
+extern int fscrypt_get_encryption_info(struct inode *);
+extern void fscrypt_put_encryption_info(struct inode *);
+
+/* fname.c */
+extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
+				int lookup, struct fscrypt_name *);
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+	kfree(fname->crypto_buf.name);
+}
+
+extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
+				struct fscrypt_str *);
+extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
+extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
+			const struct fscrypt_str *, struct fscrypt_str *);
+
+#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE	32
+
+/* Extracts the second-to-last ciphertext block; see explanation below */
+#define FSCRYPT_FNAME_DIGEST(name, len)	\
+	((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \
+			     FS_CRYPTO_BLOCK_SIZE))
+
+#define FSCRYPT_FNAME_DIGEST_SIZE	FS_CRYPTO_BLOCK_SIZE
+
+/**
+ * fscrypt_digested_name - alternate identifier for an on-disk filename
+ *
+ * When userspace lists an encrypted directory without access to the key,
+ * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE
+ * bytes are shown in this abbreviated form (base64-encoded) rather than as the
+ * full ciphertext (base64-encoded).  This is necessary to allow supporting
+ * filenames up to NAME_MAX bytes, since base64 encoding expands the length.
+ *
+ * To make it possible for filesystems to still find the correct directory entry
+ * despite not knowing the full on-disk name, we encode any filesystem-specific
+ * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups,
+ * followed by the second-to-last ciphertext block of the filename.  Due to the
+ * use of the CBC-CTS encryption mode, the second-to-last ciphertext block
+ * depends on the full plaintext.  (Note that ciphertext stealing causes the
+ * last two blocks to appear "flipped".)  This makes accidental collisions very
+ * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they
+ * share the same filesystem-specific hashes.
+ *
+ * However, this scheme isn't immune to intentional collisions, which can be
+ * created by anyone able to create arbitrary plaintext filenames and view them
+ * without the key.  Making the "digest" be a real cryptographic hash like
+ * SHA-256 over the full ciphertext would prevent this, although it would be
+ * less efficient and harder to implement, especially since the filesystem would
+ * need to calculate it for each directory entry examined during a search.
+ */
+struct fscrypt_digested_name {
+	u32 hash;
+	u32 minor_hash;
+	u8 digest[FSCRYPT_FNAME_DIGEST_SIZE];
+};
+
+/**
+ * fscrypt_match_name() - test whether the given name matches a directory entry
+ * @fname: the name being searched for
+ * @de_name: the name from the directory entry
+ * @de_name_len: the length of @de_name in bytes
+ *
+ * Normally @fname->disk_name will be set, and in that case we simply compare
+ * that to the name stored in the directory entry.  The only exception is that
+ * if we don't have the key for an encrypted directory and a filename in it is
+ * very long, then we won't have the full disk_name and we'll instead need to
+ * match against the fscrypt_digested_name.
+ *
+ * Return: %true if the name matches, otherwise %false.
+ */
+static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
+				      const u8 *de_name, u32 de_name_len)
+{
+	if (unlikely(!fname->disk_name.name)) {
+		const struct fscrypt_digested_name *n =
+			(const void *)fname->crypto_buf.name;
+		if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_'))
+			return false;
+		if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)
+			return false;
+		return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len),
+			       n->digest, FSCRYPT_FNAME_DIGEST_SIZE);
+	}
+
+	if (de_name_len != fname->disk_name.len)
+		return false;
+	return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
+}
+
+/* bio.c */
+extern void fscrypt_decrypt_bio(struct bio *);
+extern void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx,
+					struct bio *bio);
+extern void fscrypt_pullback_bio_page(struct page **, bool);
+extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
+				 unsigned int);
+
+/* hooks.c */
+extern int fscrypt_file_open(struct inode *inode, struct file *filp);
+extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir);
+extern int __fscrypt_prepare_rename(struct inode *old_dir,
+				    struct dentry *old_dentry,
+				    struct inode *new_dir,
+				    struct dentry *new_dentry,
+				    unsigned int flags);
+extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry);
+extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
+				     unsigned int max_len,
+				     struct fscrypt_str *disk_link);
+extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
+				     unsigned int len,
+				     struct fscrypt_str *disk_link);
+extern const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
+				       unsigned int max_size,
+				       struct delayed_call *done);
+#else  /* !CONFIG_FS_ENCRYPTION */
+
+static inline bool fscrypt_has_encryption_key(const struct inode *inode)
+{
+	return false;
+}
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+	return false;
+}
+
+/* crypto.c */
+static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
+{
+}
+
+static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode,
+						  gfp_t gfp_flags)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+	return;
+}
+
+static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
+						struct page *page,
+						unsigned int len,
+						unsigned int offs,
+						u64 lblk_num, gfp_t gfp_flags)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline int fscrypt_decrypt_page(const struct inode *inode,
+				       struct page *page,
+				       unsigned int len, unsigned int offs,
+				       u64 lblk_num)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+	WARN_ON_ONCE(1);
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void fscrypt_restore_control_page(struct page *page)
+{
+	return;
+}
+
+/* policy.c */
+static inline int fscrypt_ioctl_set_policy(struct file *filp,
+					   const void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_has_permitted_context(struct inode *parent,
+						struct inode *child)
+{
+	return 0;
+}
+
+static inline int fscrypt_inherit_context(struct inode *parent,
+					  struct inode *child,
+					  void *fs_data, bool preload)
+{
+	return -EOPNOTSUPP;
+}
+
+/* keyinfo.c */
+static inline int fscrypt_get_encryption_info(struct inode *inode)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_put_encryption_info(struct inode *inode)
+{
+	return;
+}
+
+ /* fname.c */
+static inline int fscrypt_setup_filename(struct inode *dir,
+					 const struct qstr *iname,
+					 int lookup, struct fscrypt_name *fname)
+{
+	if (IS_ENCRYPTED(dir))
+		return -EOPNOTSUPP;
+
+	memset(fname, 0, sizeof(struct fscrypt_name));
+	fname->usr_fname = iname;
+	fname->disk_name.name = (unsigned char *)iname->name;
+	fname->disk_name.len = iname->len;
+	return 0;
+}
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+	return;
+}
+
+static inline int fscrypt_fname_alloc_buffer(const struct inode *inode,
+					     u32 max_encrypted_len,
+					     struct fscrypt_str *crypto_str)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
+{
+	return;
+}
+
+static inline int fscrypt_fname_disk_to_usr(struct inode *inode,
+					    u32 hash, u32 minor_hash,
+					    const struct fscrypt_str *iname,
+					    struct fscrypt_str *oname)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
+				      const u8 *de_name, u32 de_name_len)
+{
+	/* Encryption support disabled; use standard comparison */
+	if (de_name_len != fname->disk_name.len)
+		return false;
+	return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
+}
+
+/* bio.c */
+static inline void fscrypt_decrypt_bio(struct bio *bio)
+{
+}
+
+static inline void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx,
+					       struct bio *bio)
+{
+}
+
+static inline void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+	return;
+}
+
+static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+					sector_t pblk, unsigned int len)
+{
+	return -EOPNOTSUPP;
+}
+
+/* hooks.c */
+
+static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
+{
+	if (IS_ENCRYPTED(inode))
+		return -EOPNOTSUPP;
+	return 0;
+}
+
+static inline int __fscrypt_prepare_link(struct inode *inode,
+					 struct inode *dir)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_prepare_rename(struct inode *old_dir,
+					   struct dentry *old_dentry,
+					   struct inode *new_dir,
+					   struct dentry *new_dentry,
+					   unsigned int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_prepare_lookup(struct inode *dir,
+					   struct dentry *dentry)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_prepare_symlink(struct inode *dir,
+					    unsigned int len,
+					    unsigned int max_len,
+					    struct fscrypt_str *disk_link)
+{
+	return -EOPNOTSUPP;
+}
+
+
+static inline int __fscrypt_encrypt_symlink(struct inode *inode,
+					    const char *target,
+					    unsigned int len,
+					    struct fscrypt_str *disk_link)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline const char *fscrypt_get_symlink(struct inode *inode,
+					      const void *caddr,
+					      unsigned int max_size,
+					      struct delayed_call *done)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+#endif	/* !CONFIG_FS_ENCRYPTION */
 
 /**
  * fscrypt_require_key - require an inode's encryption key
diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h
deleted file mode 100644
index ee8b43e4c15a..000000000000
--- a/include/linux/fscrypt_notsupp.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * fscrypt_notsupp.h
- *
- * This stubs out the fscrypt functions for filesystems configured without
- * encryption support.
- *
- * Do not include this file directly. Use fscrypt.h instead!
- */
-#ifndef _LINUX_FSCRYPT_H
-#error "Incorrect include of linux/fscrypt_notsupp.h!"
-#endif
-
-#ifndef _LINUX_FSCRYPT_NOTSUPP_H
-#define _LINUX_FSCRYPT_NOTSUPP_H
-
-static inline bool fscrypt_has_encryption_key(const struct inode *inode)
-{
-	return false;
-}
-
-static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
-{
-	return false;
-}
-
-/* crypto.c */
-static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
-{
-}
-
-static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode,
-						  gfp_t gfp_flags)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
-{
-	return;
-}
-
-static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
-						struct page *page,
-						unsigned int len,
-						unsigned int offs,
-						u64 lblk_num, gfp_t gfp_flags)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline int fscrypt_decrypt_page(const struct inode *inode,
-				       struct page *page,
-				       unsigned int len, unsigned int offs,
-				       u64 lblk_num)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline struct page *fscrypt_control_page(struct page *page)
-{
-	WARN_ON_ONCE(1);
-	return ERR_PTR(-EINVAL);
-}
-
-static inline void fscrypt_restore_control_page(struct page *page)
-{
-	return;
-}
-
-/* policy.c */
-static inline int fscrypt_ioctl_set_policy(struct file *filp,
-					   const void __user *arg)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int fscrypt_has_permitted_context(struct inode *parent,
-						struct inode *child)
-{
-	return 0;
-}
-
-static inline int fscrypt_inherit_context(struct inode *parent,
-					  struct inode *child,
-					  void *fs_data, bool preload)
-{
-	return -EOPNOTSUPP;
-}
-
-/* keyinfo.c */
-static inline int fscrypt_get_encryption_info(struct inode *inode)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void fscrypt_put_encryption_info(struct inode *inode)
-{
-	return;
-}
-
- /* fname.c */
-static inline int fscrypt_setup_filename(struct inode *dir,
-					 const struct qstr *iname,
-					 int lookup, struct fscrypt_name *fname)
-{
-	if (IS_ENCRYPTED(dir))
-		return -EOPNOTSUPP;
-
-	memset(fname, 0, sizeof(struct fscrypt_name));
-	fname->usr_fname = iname;
-	fname->disk_name.name = (unsigned char *)iname->name;
-	fname->disk_name.len = iname->len;
-	return 0;
-}
-
-static inline void fscrypt_free_filename(struct fscrypt_name *fname)
-{
-	return;
-}
-
-static inline int fscrypt_fname_alloc_buffer(const struct inode *inode,
-					     u32 max_encrypted_len,
-					     struct fscrypt_str *crypto_str)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
-{
-	return;
-}
-
-static inline int fscrypt_fname_disk_to_usr(struct inode *inode,
-					    u32 hash, u32 minor_hash,
-					    const struct fscrypt_str *iname,
-					    struct fscrypt_str *oname)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
-				      const u8 *de_name, u32 de_name_len)
-{
-	/* Encryption support disabled; use standard comparison */
-	if (de_name_len != fname->disk_name.len)
-		return false;
-	return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
-}
-
-/* bio.c */
-static inline void fscrypt_decrypt_bio(struct bio *bio)
-{
-}
-
-static inline void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx,
-					       struct bio *bio)
-{
-}
-
-static inline void fscrypt_pullback_bio_page(struct page **page, bool restore)
-{
-	return;
-}
-
-static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
-					sector_t pblk, unsigned int len)
-{
-	return -EOPNOTSUPP;
-}
-
-/* hooks.c */
-
-static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
-{
-	if (IS_ENCRYPTED(inode))
-		return -EOPNOTSUPP;
-	return 0;
-}
-
-static inline int __fscrypt_prepare_link(struct inode *inode,
-					 struct inode *dir)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int __fscrypt_prepare_rename(struct inode *old_dir,
-					   struct dentry *old_dentry,
-					   struct inode *new_dir,
-					   struct dentry *new_dentry,
-					   unsigned int flags)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int __fscrypt_prepare_lookup(struct inode *dir,
-					   struct dentry *dentry)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int __fscrypt_prepare_symlink(struct inode *dir,
-					    unsigned int len,
-					    unsigned int max_len,
-					    struct fscrypt_str *disk_link)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int __fscrypt_encrypt_symlink(struct inode *inode,
-					    const char *target,
-					    unsigned int len,
-					    struct fscrypt_str *disk_link)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline const char *fscrypt_get_symlink(struct inode *inode,
-					      const void *caddr,
-					      unsigned int max_size,
-					      struct delayed_call *done)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-#endif	/* _LINUX_FSCRYPT_NOTSUPP_H */
diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h
deleted file mode 100644
index 6456c6b2005f..000000000000
--- a/include/linux/fscrypt_supp.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * fscrypt_supp.h
- *
- * Do not include this file directly. Use fscrypt.h instead!
- */
-#ifndef _LINUX_FSCRYPT_H
-#error "Incorrect include of linux/fscrypt_supp.h!"
-#endif
-
-#ifndef _LINUX_FSCRYPT_SUPP_H
-#define _LINUX_FSCRYPT_SUPP_H
-
-#include <linux/mm.h>
-#include <linux/slab.h>
-
-/*
- * fscrypt superblock flags
- */
-#define FS_CFLG_OWN_PAGES (1U << 1)
-
-/*
- * crypto operations for filesystems
- */
-struct fscrypt_operations {
-	unsigned int flags;
-	const char *key_prefix;
-	int (*get_context)(struct inode *, void *, size_t);
-	int (*set_context)(struct inode *, const void *, size_t, void *);
-	bool (*dummy_context)(struct inode *);
-	bool (*empty_dir)(struct inode *);
-	unsigned int max_namelen;
-};
-
-struct fscrypt_ctx {
-	union {
-		struct {
-			struct page *bounce_page;	/* Ciphertext page */
-			struct page *control_page;	/* Original page  */
-		} w;
-		struct {
-			struct bio *bio;
-			struct work_struct work;
-		} r;
-		struct list_head free_list;	/* Free list */
-	};
-	u8 flags;				/* Flags */
-};
-
-static inline bool fscrypt_has_encryption_key(const struct inode *inode)
-{
-	return (inode->i_crypt_info != NULL);
-}
-
-static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
-{
-	return inode->i_sb->s_cop->dummy_context &&
-		inode->i_sb->s_cop->dummy_context(inode);
-}
-
-/* crypto.c */
-extern void fscrypt_enqueue_decrypt_work(struct work_struct *);
-extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
-extern void fscrypt_release_ctx(struct fscrypt_ctx *);
-extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
-						unsigned int, unsigned int,
-						u64, gfp_t);
-extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
-				unsigned int, u64);
-
-static inline struct page *fscrypt_control_page(struct page *page)
-{
-	return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
-}
-
-extern void fscrypt_restore_control_page(struct page *);
-
-/* policy.c */
-extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
-extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
-extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
-extern int fscrypt_inherit_context(struct inode *, struct inode *,
-					void *, bool);
-/* keyinfo.c */
-extern int fscrypt_get_encryption_info(struct inode *);
-extern void fscrypt_put_encryption_info(struct inode *);
-
-/* fname.c */
-extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
-				int lookup, struct fscrypt_name *);
-
-static inline void fscrypt_free_filename(struct fscrypt_name *fname)
-{
-	kfree(fname->crypto_buf.name);
-}
-
-extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
-				struct fscrypt_str *);
-extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
-extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
-			const struct fscrypt_str *, struct fscrypt_str *);
-
-#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE	32
-
-/* Extracts the second-to-last ciphertext block; see explanation below */
-#define FSCRYPT_FNAME_DIGEST(name, len)	\
-	((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \
-			     FS_CRYPTO_BLOCK_SIZE))
-
-#define FSCRYPT_FNAME_DIGEST_SIZE	FS_CRYPTO_BLOCK_SIZE
-
-/**
- * fscrypt_digested_name - alternate identifier for an on-disk filename
- *
- * When userspace lists an encrypted directory without access to the key,
- * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE
- * bytes are shown in this abbreviated form (base64-encoded) rather than as the
- * full ciphertext (base64-encoded).  This is necessary to allow supporting
- * filenames up to NAME_MAX bytes, since base64 encoding expands the length.
- *
- * To make it possible for filesystems to still find the correct directory entry
- * despite not knowing the full on-disk name, we encode any filesystem-specific
- * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups,
- * followed by the second-to-last ciphertext block of the filename.  Due to the
- * use of the CBC-CTS encryption mode, the second-to-last ciphertext block
- * depends on the full plaintext.  (Note that ciphertext stealing causes the
- * last two blocks to appear "flipped".)  This makes accidental collisions very
- * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they
- * share the same filesystem-specific hashes.
- *
- * However, this scheme isn't immune to intentional collisions, which can be
- * created by anyone able to create arbitrary plaintext filenames and view them
- * without the key.  Making the "digest" be a real cryptographic hash like
- * SHA-256 over the full ciphertext would prevent this, although it would be
- * less efficient and harder to implement, especially since the filesystem would
- * need to calculate it for each directory entry examined during a search.
- */
-struct fscrypt_digested_name {
-	u32 hash;
-	u32 minor_hash;
-	u8 digest[FSCRYPT_FNAME_DIGEST_SIZE];
-};
-
-/**
- * fscrypt_match_name() - test whether the given name matches a directory entry
- * @fname: the name being searched for
- * @de_name: the name from the directory entry
- * @de_name_len: the length of @de_name in bytes
- *
- * Normally @fname->disk_name will be set, and in that case we simply compare
- * that to the name stored in the directory entry.  The only exception is that
- * if we don't have the key for an encrypted directory and a filename in it is
- * very long, then we won't have the full disk_name and we'll instead need to
- * match against the fscrypt_digested_name.
- *
- * Return: %true if the name matches, otherwise %false.
- */
-static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
-				      const u8 *de_name, u32 de_name_len)
-{
-	if (unlikely(!fname->disk_name.name)) {
-		const struct fscrypt_digested_name *n =
-			(const void *)fname->crypto_buf.name;
-		if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_'))
-			return false;
-		if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)
-			return false;
-		return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len),
-			       n->digest, FSCRYPT_FNAME_DIGEST_SIZE);
-	}
-
-	if (de_name_len != fname->disk_name.len)
-		return false;
-	return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
-}
-
-/* bio.c */
-extern void fscrypt_decrypt_bio(struct bio *);
-extern void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx,
-					struct bio *bio);
-extern void fscrypt_pullback_bio_page(struct page **, bool);
-extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
-				 unsigned int);
-
-/* hooks.c */
-extern int fscrypt_file_open(struct inode *inode, struct file *filp);
-extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir);
-extern int __fscrypt_prepare_rename(struct inode *old_dir,
-				    struct dentry *old_dentry,
-				    struct inode *new_dir,
-				    struct dentry *new_dentry,
-				    unsigned int flags);
-extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry);
-extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
-				     unsigned int max_len,
-				     struct fscrypt_str *disk_link);
-extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
-				     unsigned int len,
-				     struct fscrypt_str *disk_link);
-extern const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
-				       unsigned int max_size,
-				       struct delayed_call *done);
-
-#endif	/* _LINUX_FSCRYPT_SUPP_H */
-- 
cgit v1.2.3


From f5e55e777cc93eae1416f0fa4908e8846b6d7825 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 22 Jan 2019 16:20:21 -0800
Subject: fscrypt: return -EXDEV for incompatible rename or link into encrypted
 dir

Currently, trying to rename or link a regular file, directory, or
symlink into an encrypted directory fails with EPERM when the source
file is unencrypted or is encrypted with a different encryption policy,
and is on the same mountpoint.  It is correct for the operation to fail,
but the choice of EPERM breaks tools like 'mv' that know to copy rather
than rename if they see EXDEV, but don't know what to do with EPERM.

Our original motivation for EPERM was to encourage users to securely
handle their data.  Encrypting files by "moving" them into an encrypted
directory can be insecure because the unencrypted data may remain in
free space on disk, where it can later be recovered by an attacker.
It's much better to encrypt the data from the start, or at least try to
securely delete the source data e.g. using the 'shred' program.

However, the current behavior hasn't been effective at achieving its
goal because users tend to be confused, hack around it, and complain;
see e.g. https://github.com/google/fscrypt/issues/76.  And in some cases
it's actually inconsistent or unnecessary.  For example, 'mv'-ing files
between differently encrypted directories doesn't work even in cases
where it can be secure, such as when in userspace the same passphrase
protects both directories.  Yet, you *can* already 'mv' unencrypted
files into an encrypted directory if the source files are on a different
mountpoint, even though doing so is often insecure.

There are probably better ways to teach users to securely handle their
files.  For example, the 'fscrypt' userspace tool could provide a
command that migrates unencrypted files into an encrypted directory,
acting like 'shred' on the source files and providing appropriate
warnings depending on the type of the source filesystem and disk.

Receiving errors on unimportant files might also force some users to
disable encryption, thus making the behavior counterproductive.  It's
desirable to make encryption as unobtrusive as possible.

Therefore, change the error code from EPERM to EXDEV so that tools
looking for EXDEV will fall back to a copy.

This, of course, doesn't prevent users from still doing the right things
to securely manage their files.  Note that this also matches the
behavior when a file is renamed between two project quota hierarchies;
so there's precedent for using EXDEV for things other than mountpoints.

xfstests generic/398 will require an update with this change.

[Rewritten from an earlier patch series by Michael Halcrow.]

Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Joe Richey <joerichey@google.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 Documentation/filesystems/fscrypt.rst | 12 ++++++++++--
 fs/crypto/hooks.c                     |  6 +++---
 fs/crypto/policy.c                    |  3 +--
 include/linux/fscrypt.h               |  4 ++--
 4 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 43dd989e2a3f..08c23b60e016 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -451,10 +451,18 @@ astute users may notice some differences in behavior:
 - Unencrypted files, or files encrypted with a different encryption
   policy (i.e. different key, modes, or flags), cannot be renamed or
   linked into an encrypted directory; see `Encryption policy
-  enforcement`_.  Attempts to do so will fail with EPERM.  However,
+  enforcement`_.  Attempts to do so will fail with EXDEV.  However,
   encrypted files can be renamed within an encrypted directory, or
   into an unencrypted directory.
 
+  Note: "moving" an unencrypted file into an encrypted directory, e.g.
+  with the `mv` program, is implemented in userspace by a copy
+  followed by a delete.  Be aware that the original unencrypted data
+  may remain recoverable from free space on the disk; prefer to keep
+  all files encrypted from the very beginning.  The `shred` program
+  may be used to overwrite the source files but isn't guaranteed to be
+  effective on all filesystems and storage devices.
+
 - Direct I/O is not supported on encrypted files.  Attempts to use
   direct I/O on such files will fall back to buffered I/O.
 
@@ -541,7 +549,7 @@ not be encrypted.
 Except for those special files, it is forbidden to have unencrypted
 files, or files encrypted with a different encryption policy, in an
 encrypted directory tree.  Attempts to link or rename such a file into
-an encrypted directory will fail with EPERM.  This is also enforced
+an encrypted directory will fail with EXDEV.  This is also enforced
 during ->lookup() to provide limited protection against offline
 attacks that try to disable or downgrade encryption in known locations
 where applications may later write sensitive data.  It is recommended
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 926e5df20ec3..56debb1fcf5e 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -58,7 +58,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
 		return err;
 
 	if (!fscrypt_has_permitted_context(dir, inode))
-		return -EPERM;
+		return -EXDEV;
 
 	return 0;
 }
@@ -82,13 +82,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (IS_ENCRYPTED(new_dir) &&
 		    !fscrypt_has_permitted_context(new_dir,
 						   d_inode(old_dentry)))
-			return -EPERM;
+			return -EXDEV;
 
 		if ((flags & RENAME_EXCHANGE) &&
 		    IS_ENCRYPTED(old_dir) &&
 		    !fscrypt_has_permitted_context(old_dir,
 						   d_inode(new_dentry)))
-			return -EPERM;
+			return -EXDEV;
 	}
 	return 0;
 }
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f490de921ce8..bd7eaf9b3f00 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -151,8 +151,7 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
  * malicious offline violations of this constraint, while the link and rename
  * checks are needed to prevent online violations of this constraint.
  *
- * Return: 1 if permitted, 0 if forbidden.  If forbidden, the caller must fail
- * the filesystem operation with EPERM.
+ * Return: 1 if permitted, 0 if forbidden.
  */
 int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 {
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index eec604840568..e5194fc3983e 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -489,7 +489,7 @@ static inline int fscrypt_require_key(struct inode *inode)
  * in an encrypted directory tree use the same encryption policy.
  *
  * Return: 0 on success, -ENOKEY if the directory's encryption key is missing,
- * -EPERM if the link would result in an inconsistent encryption policy, or
+ * -EXDEV if the link would result in an inconsistent encryption policy, or
  * another -errno code.
  */
 static inline int fscrypt_prepare_link(struct dentry *old_dentry,
@@ -519,7 +519,7 @@ static inline int fscrypt_prepare_link(struct dentry *old_dentry,
  * We also verify that the rename will not violate the constraint that all files
  * in an encrypted directory tree use the same encryption policy.
  *
- * Return: 0 on success, -ENOKEY if an encryption key is missing, -EPERM if the
+ * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the
  * rename would cause inconsistent encryption policies, or another -errno code.
  */
 static inline int fscrypt_prepare_rename(struct inode *old_dir,
-- 
cgit v1.2.3


From ef74f70e5a10cc2a78cc5529e564170cabcda9af Mon Sep 17 00:00:00 2001
From: Brian Masney <masneyb@onstation.org>
Date: Sat, 19 Jan 2019 15:42:42 -0500
Subject: gpio: add irq domain activate/deactivate functions

This adds the two new functions gpiochip_irq_domain_activate and
gpiochip_irq_domain_deactivate that can be used as the activate and
deactivate functions in the struct irq_domain_ops. This is for
situations where only gpiochip_{lock,unlock}_as_irq needs to be called.
SPMI and SSBI GPIO are two users that will initially use these
functions.

Signed-off-by: Brian Masney <masneyb@onstation.org>
Suggested-by: Stephen Boyd <sboyd@kernel.org>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/gpio/driver.h |  5 +++++
 2 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 1651d7f0a303..361a09c8138a 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1775,6 +1775,43 @@ static const struct irq_domain_ops gpiochip_domain_ops = {
 	.xlate	= irq_domain_xlate_twocell,
 };
 
+/**
+ * gpiochip_irq_domain_activate() - Lock a GPIO to be used as an IRQ
+ * @domain: The IRQ domain used by this IRQ chip
+ * @data: Outermost irq_data associated with the IRQ
+ * @reserve: If set, only reserve an interrupt vector instead of assigning one
+ *
+ * This function is a wrapper that calls gpiochip_lock_as_irq() and is to be
+ * used as the activate function for the &struct irq_domain_ops. The host_data
+ * for the IRQ domain must be the &struct gpio_chip.
+ */
+int gpiochip_irq_domain_activate(struct irq_domain *domain,
+				 struct irq_data *data, bool reserve)
+{
+	struct gpio_chip *chip = domain->host_data;
+
+	return gpiochip_lock_as_irq(chip, data->hwirq);
+}
+EXPORT_SYMBOL_GPL(gpiochip_irq_domain_activate);
+
+/**
+ * gpiochip_irq_domain_deactivate() - Unlock a GPIO used as an IRQ
+ * @domain: The IRQ domain used by this IRQ chip
+ * @data: Outermost irq_data associated with the IRQ
+ *
+ * This function is a wrapper that will call gpiochip_unlock_as_irq() and is to
+ * be used as the deactivate function for the &struct irq_domain_ops. The
+ * host_data for the IRQ domain must be the &struct gpio_chip.
+ */
+void gpiochip_irq_domain_deactivate(struct irq_domain *domain,
+				    struct irq_data *data)
+{
+	struct gpio_chip *chip = domain->host_data;
+
+	return gpiochip_unlock_as_irq(chip, data->hwirq);
+}
+EXPORT_SYMBOL_GPL(gpiochip_irq_domain_deactivate);
+
 static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset)
 {
 	if (!gpiochip_irqchip_irq_valid(chip, offset))
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 07cddbf45186..01497910f023 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -472,6 +472,11 @@ int gpiochip_irq_map(struct irq_domain *d, unsigned int irq,
 		     irq_hw_number_t hwirq);
 void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq);
 
+int gpiochip_irq_domain_activate(struct irq_domain *domain,
+				 struct irq_data *data, bool reserve);
+void gpiochip_irq_domain_deactivate(struct irq_domain *domain,
+				    struct irq_data *data);
+
 void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip,
 		struct irq_chip *irqchip,
 		unsigned int parent_irq,
-- 
cgit v1.2.3


From da6a496a34f2fdcab14362cdc5068aac385e7b47 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Tue, 22 Jan 2019 09:16:08 +0200
Subject: IB/mlx5: Ranges in implicit ODP MR inherit its write access

A sub-range in ODP implicit MR should take its write permission from the
MR and not be set always to allow.

Fixes: d07d1d70ce1a ("IB/umem: Update on demand page (ODP) support")
Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/umem_odp.c | 5 +++--
 drivers/infiniband/hw/mlx5/odp.c   | 2 +-
 include/rdma/ib_umem_odp.h         | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index dc1d7cb15cfa..eb8a5eb65bfa 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -332,9 +332,10 @@ static void put_per_mm(struct ib_umem_odp *umem_odp)
 	mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
 }
 
-struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
+struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
 				      unsigned long addr, size_t size)
 {
+	struct ib_ucontext_per_mm *per_mm = root->per_mm;
 	struct ib_ucontext *ctx = per_mm->context;
 	struct ib_umem_odp *odp_data;
 	struct ib_umem *umem;
@@ -349,7 +350,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
 	umem->length     = size;
 	umem->address    = addr;
 	umem->page_shift = PAGE_SHIFT;
-	umem->writable   = 1;
+	umem->writable   = root->umem.writable;
 	umem->is_odp = 1;
 	odp_data->per_mm = per_mm;
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 3abdfc3584c0..3e0d5885c026 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -439,7 +439,7 @@ next_mr:
 		if (nentries)
 			nentries++;
 	} else {
-		odp = ib_alloc_odp_umem(odp_mr->per_mm, addr,
+		odp = ib_alloc_odp_umem(odp_mr, addr,
 					MLX5_IMR_MTT_SIZE);
 		if (IS_ERR(odp)) {
 			mutex_unlock(&odp_mr->umem_mutex);
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index d3725cf13ecd..d0024f53626e 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -116,7 +116,7 @@ struct ib_ucontext_per_mm {
 };
 
 int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access);
-struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
+struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem,
 				      unsigned long addr, size_t size);
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
 
-- 
cgit v1.2.3


From 61b2fe3c62e5269408e264b2348f96467246d537 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Tue, 22 Jan 2019 09:16:09 +0200
Subject: IB/mlx5: Remove dead code

When CONFIG_INFINIBAND_ON_DEMAND_PAGING is not set there is no caller to
ib_alloc_odp_umem() so let's remove it.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/rdma/ib_umem_odp.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index d0024f53626e..dadc96dea39c 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -169,12 +169,6 @@ static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
 	return -EINVAL;
 }
 
-static inline struct ib_umem_odp *
-ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size)
-{
-	return ERR_PTR(-EINVAL);
-}
-
 static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
-- 
cgit v1.2.3


From 9359a8cbcc77d4afb07444f341e39015d232bd55 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 25 Jan 2019 16:30:35 +0200
Subject: RAS: Use consistent types for UUIDs

The commit

  297b64c74385 ("ras: acpi / apei: generate trace event for unrecognized CPER section")

brought inconsistency in UUID types which are used across the RAS
subsystem.

Fix this by using guid_t everywhere.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Thomas Tai <thomas.tai@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Tyler Baicar <tbaicar@codeaurora.org>
Link: https://lkml.kernel.org/r/20190125143035.81589-1-andriy.shevchenko@linux.intel.com
---
 drivers/ras/ras.c       | 2 +-
 include/ras/ras_event.h | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 3f38907320dc..95540ea8dd9d 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -14,7 +14,7 @@
 #define TRACE_INCLUDE_PATH ../../include/ras
 #include <ras/ras_event.h>
 
-void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id,
+void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
 			    const char *fru_text, const u8 sev, const u8 *err,
 			    const u32 len)
 {
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index a0794632fd01..36c5c5e38c1d 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -27,7 +27,7 @@
 TRACE_EVENT(extlog_mem_event,
 	TP_PROTO(struct cper_sec_mem_err *mem,
 		 u32 err_seq,
-		 const uuid_le *fru_id,
+		 const guid_t *fru_id,
 		 const char *fru_text,
 		 u8 sev),
 
@@ -39,7 +39,7 @@ TRACE_EVENT(extlog_mem_event,
 		__field(u8, sev)
 		__field(u64, pa)
 		__field(u8, pa_mask_lsb)
-		__field_struct(uuid_le, fru_id)
+		__field_struct(guid_t, fru_id)
 		__string(fru_text, fru_text)
 		__field_struct(struct cper_mem_err_compact, data)
 	),
@@ -218,8 +218,8 @@ TRACE_EVENT(arm_event,
  */
 TRACE_EVENT(non_standard_event,
 
-	TP_PROTO(const uuid_le *sec_type,
-		 const uuid_le *fru_id,
+	TP_PROTO(const guid_t *sec_type,
+		 const guid_t *fru_id,
 		 const char *fru_text,
 		 const u8 sev,
 		 const u8 *err,
-- 
cgit v1.2.3


From 9b28aa1d0eae1be1016c8f4ba504545caff01da3 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@mellanox.com>
Date: Wed, 12 Dec 2018 23:59:13 +0000
Subject: platform_data/mlxreg: Document fixes for core platform data

Remove "led" from the description, since the structure
"mlxreg_core_platform_data" is used not only for led data.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 include/linux/platform_data/mlxreg.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index 19f5cb618c55..d823713f94ec 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -107,9 +107,9 @@ struct mlxreg_core_item {
 /**
  * struct mlxreg_core_platform_data - platform data:
  *
- * @led_data: led private data;
+ * @data: instance private data;
  * @regmap: register map of parent device;
- * @counter: number of led instances;
+ * @counter: number of instances;
  */
 struct mlxreg_core_platform_data {
 	struct mlxreg_core_data *data;
-- 
cgit v1.2.3


From 946e4e02b11889cb161b15ff4712a8ba21a50eb6 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@mellanox.com>
Date: Wed, 12 Dec 2018 23:59:14 +0000
Subject: platform_data/mlxreg: Add capability field to core platform data

Add capability field to "mlxreg_core_platform_data" structure.
The purpose of this register is to provide additional info to platform
driver through the atribute related capability register.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 include/linux/platform_data/mlxreg.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index d823713f94ec..1b2f86f96743 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -61,6 +61,7 @@ struct mlxreg_hotplug_device {
  * @reg: attribute register;
  * @mask: attribute access mask;
  * @bit: attribute effective bit;
+ * @capability: attribute capability register;
  * @mode: access mode;
  * @np - pointer to node platform associated with attribute;
  * @hpdev - hotplug device data;
@@ -72,6 +73,7 @@ struct mlxreg_core_data {
 	u32 reg;
 	u32 mask;
 	u32 bit;
+	u32 capability;
 	umode_t	mode;
 	struct device_node *np;
 	struct mlxreg_hotplug_device hpdev;
-- 
cgit v1.2.3


From 87eff9af7efb154cc4a940ed12efc803a0bf3fba Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vz@mleia.com>
Date: Tue, 22 Jan 2019 23:18:21 +0200
Subject: pinctrl: remove pinctrl/machine.h inclusion from pinctrl/pinconf.h

The change adds explicit inclusion of linux/pinctrl/machine.h header
to the only needed pinctrl-madera-core.c file, and therefore inclusion
of pinctrl/machine.h header from pinctrl/pinconf.h can be removed.

The change is preparatory to a follow-up reversal of commit f07512e615dd
("pinctrl/pinconfig: add debug interface").

Signed-off-by: Vladimir Zapolskiy <vz@mleia.com>
Cc: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by Richard Fitzgerald <rf@opensource.cirrus.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/cirrus/pinctrl-madera-core.c | 1 +
 include/linux/pinctrl/pinconf.h              | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/pinctrl/cirrus/pinctrl-madera-core.c b/drivers/pinctrl/cirrus/pinctrl-madera-core.c
index a5dda832024a..7c9694593f79 100644
--- a/drivers/pinctrl/cirrus/pinctrl-madera-core.c
+++ b/drivers/pinctrl/cirrus/pinctrl-madera-core.c
@@ -14,6 +14,7 @@
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
+#include <linux/pinctrl/machine.h>
 #include <linux/pinctrl/pinctrl.h>
 #include <linux/pinctrl/pinmux.h>
 #include <linux/pinctrl/pinconf.h>
diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index 8dd85d302b90..109468d9d849 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -14,8 +14,6 @@
 
 #ifdef CONFIG_PINCONF
 
-#include <linux/pinctrl/machine.h>
-
 struct pinctrl_dev;
 struct seq_file;
 
-- 
cgit v1.2.3


From e73339037f6b6d65e84f5fd42e56dd3cdf0d9e9c Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vz@mleia.com>
Date: Tue, 22 Jan 2019 23:18:22 +0200
Subject: pinctrl: remove unused 'pinconf-config' debugfs interface

The main goal of the change is to remove .pin_config_dbg_parse_modify
callback before a driver with its support appears. So far the in-kernel
interface did not attract any users since its introduction 5 years ago.

Originally .pin_config_dbg_parse_modify callback and the associated
'pinconf-config' debugfs file were introduced in commit f07512e615dd
("pinctrl/pinconfig: add debug interface"), a short description of
'pinconf-config' usage for debugging can be expressed this way:

Write to 'pinconf-config' (see pinconf_dbg_config_write() function):

% echo -n modify $map_type $device_name $state_name $pin_name $config > \
	/sys/kernel/debug/pinctrl/$pinctrl/pinconf-config

It supposes to update a global (therefore single!) 'pinconf_dbg_conf'
variable with an alternative setting, the arguments should match
an existing pinconf device and some registered pinctrl mapping 'map':

* $map_type is either 'config_pin' or 'config_group', it should match
  'map->type' value of PIN_MAP_TYPE_CONFIGS_PIN or
   PIN_MAP_TYPE_CONFIGS_GROUP accordingly,
* $device_name should match 'map->dev_name' string value,
* $state_name should match 'map->name' string value,
* $pin_name should match 'map->data.configs.group_or_pin' string value,

If all above has matched, then $config is a new value to be set by calling
pinconfops->pin_config_dbg_parse_modify(pctldev, config, matched_config).

After a successful write into 'pinconf-config' a user can read the file
to get information about that single modified pin configuration.

The fact is .pin_config_dbg_parse_modify callback has never been defined
in 'struct pinconf_ops' of any pinconf driver, thus an actual modification
of a pin or group state on any present pinconf controller does not happen,
and it declares that all related code is no more than dead code.

I discovered the issue while attempting to add .pin_config_dbg_parse_modify
support in some drivers and found that too short 'MAX_NAME_LEN' set by

  drivers/pinctrl/pinconf.c:372:#define MAX_NAME_LEN 15

is practically insufficient to store a regular pinctrl device name,
which are like 'e6060000.pin-controller-sh-pfc' or pin names like
'MX6QDL_PAD_ENET_REF_CLK', thus it is another indicator that the code
is barely usable, insufficiently tested and unprepossessing.

Of course it might be possible to increase MAX_NAME_LEN, and then add
.pin_config_dbg_parse_modify callbacks to the drivers, but the whole
idea of such a limited debug option looks inviable. A more flexible
way to functionally substitute the original approach is to implicitly
or explicitly use pinctrl_select_state() function whenever needed.

Signed-off-by: Vladimir Zapolskiy <vz@mleia.com>
Cc: Laurent Meunier <laurent.meunier@st.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinconf.c       | 222 ----------------------------------------
 include/linux/pinctrl/pinconf.h |   4 -
 2 files changed, 226 deletions(-)

(limited to 'include')

diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c
index 2c7229380f08..2678603df14b 100644
--- a/drivers/pinctrl/pinconf.c
+++ b/drivers/pinctrl/pinconf.c
@@ -17,7 +17,6 @@
 #include <linux/slab.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
-#include <linux/uaccess.h>
 #include <linux/pinctrl/machine.h>
 #include <linux/pinctrl/pinctrl.h>
 #include <linux/pinctrl/pinconf.h>
@@ -369,225 +368,6 @@ static int pinconf_groups_show(struct seq_file *s, void *what)
 DEFINE_SHOW_ATTRIBUTE(pinconf_pins);
 DEFINE_SHOW_ATTRIBUTE(pinconf_groups);
 
-#define MAX_NAME_LEN 15
-
-struct dbg_cfg {
-	enum pinctrl_map_type map_type;
-	char dev_name[MAX_NAME_LEN + 1];
-	char state_name[MAX_NAME_LEN + 1];
-	char pin_name[MAX_NAME_LEN + 1];
-};
-
-/*
- * Goal is to keep this structure as global in order to simply read the
- * pinconf-config file after a write to check config is as expected
- */
-static struct dbg_cfg pinconf_dbg_conf;
-
-/**
- * pinconf_dbg_config_print() - display the pinctrl config from the pinctrl
- * map, of the dev/pin/state that was last written to pinconf-config file.
- * @s: string filled in  with config description
- * @d: not used
- */
-static int pinconf_dbg_config_print(struct seq_file *s, void *d)
-{
-	struct pinctrl_maps *maps_node;
-	const struct pinctrl_map *map;
-	const struct pinctrl_map *found = NULL;
-	struct pinctrl_dev *pctldev;
-	struct dbg_cfg *dbg = &pinconf_dbg_conf;
-	int i;
-
-	mutex_lock(&pinctrl_maps_mutex);
-
-	/* Parse the pinctrl map and look for the elected pin/state */
-	for_each_maps(maps_node, i, map) {
-		if (map->type != dbg->map_type)
-			continue;
-		if (strcmp(map->dev_name, dbg->dev_name))
-			continue;
-		if (strcmp(map->name, dbg->state_name))
-			continue;
-
-		if (!strcmp(map->data.configs.group_or_pin, dbg->pin_name)) {
-			/* We found the right pin */
-			found = map;
-			break;
-		}
-	}
-
-	if (!found) {
-		seq_printf(s, "No config found for dev/state/pin, expected:\n");
-		seq_printf(s, "Searched dev:%s\n", dbg->dev_name);
-		seq_printf(s, "Searched state:%s\n", dbg->state_name);
-		seq_printf(s, "Searched pin:%s\n", dbg->pin_name);
-		seq_printf(s, "Use: modify config_pin <devname> "\
-				"<state> <pinname> <value>\n");
-		goto exit;
-	}
-
-	pctldev = get_pinctrl_dev_from_devname(found->ctrl_dev_name);
-	seq_printf(s, "Dev %s has config of %s in state %s:\n",
-		   dbg->dev_name, dbg->pin_name, dbg->state_name);
-	pinconf_show_config(s, pctldev, found->data.configs.configs,
-			    found->data.configs.num_configs);
-
-exit:
-	mutex_unlock(&pinctrl_maps_mutex);
-
-	return 0;
-}
-
-/**
- * pinconf_dbg_config_write() - modify the pinctrl config in the pinctrl
- * map, of a dev/pin/state entry based on user entries to pinconf-config
- * @user_buf: contains the modification request with expected format:
- *     modify <config> <devicename> <state> <name> <newvalue>
- * modify is literal string, alternatives like add/delete not supported yet
- * <config> is the configuration to be changed. Supported configs are
- *     "config_pin" or "config_group", alternatives like config_mux are not
- *     supported yet.
- * <devicename> <state> <name> are values that should match the pinctrl-maps
- * <newvalue> reflects the new config and is driver dependent
- */
-static ssize_t pinconf_dbg_config_write(struct file *file,
-	const char __user *user_buf, size_t count, loff_t *ppos)
-{
-	struct pinctrl_maps *maps_node;
-	const struct pinctrl_map *map;
-	const struct pinctrl_map *found = NULL;
-	struct pinctrl_dev *pctldev;
-	const struct pinconf_ops *confops = NULL;
-	struct dbg_cfg *dbg = &pinconf_dbg_conf;
-	const struct pinctrl_map_configs *configs;
-	char config[MAX_NAME_LEN + 1];
-	char buf[128];
-	char *b = &buf[0];
-	int buf_size;
-	char *token;
-	int i;
-
-	/* Get userspace string and assure termination */
-	buf_size = min(count, sizeof(buf) - 1);
-	if (copy_from_user(buf, user_buf, buf_size))
-		return -EFAULT;
-	buf[buf_size] = 0;
-
-	/*
-	 * need to parse entry and extract parameters:
-	 * modify configs_pin devicename state pinname newvalue
-	 */
-
-	/* Get arg: 'modify' */
-	token = strsep(&b, " ");
-	if (!token)
-		return -EINVAL;
-	if (strcmp(token, "modify"))
-		return -EINVAL;
-
-	/*
-	 * Get arg type: "config_pin" and "config_group"
-	 *                types are supported so far
-	 */
-	token = strsep(&b, " ");
-	if (!token)
-		return -EINVAL;
-	if (!strcmp(token, "config_pin"))
-		dbg->map_type = PIN_MAP_TYPE_CONFIGS_PIN;
-	else if (!strcmp(token, "config_group"))
-		dbg->map_type = PIN_MAP_TYPE_CONFIGS_GROUP;
-	else
-		return -EINVAL;
-
-	/* get arg 'device_name' */
-	token = strsep(&b, " ");
-	if (!token)
-		return -EINVAL;
-	if (strlen(token) >= MAX_NAME_LEN)
-		return -EINVAL;
-	strncpy(dbg->dev_name, token, MAX_NAME_LEN);
-
-	/* get arg 'state_name' */
-	token = strsep(&b, " ");
-	if (!token)
-		return -EINVAL;
-	if (strlen(token) >= MAX_NAME_LEN)
-		return -EINVAL;
-	strncpy(dbg->state_name, token, MAX_NAME_LEN);
-
-	/* get arg 'pin_name' */
-	token = strsep(&b, " ");
-	if (!token)
-		return -EINVAL;
-	if (strlen(token) >= MAX_NAME_LEN)
-		return -EINVAL;
-	strncpy(dbg->pin_name, token, MAX_NAME_LEN);
-
-	/* get new_value of config' */
-	token = strsep(&b, " ");
-	if (!token)
-		return -EINVAL;
-	if (strlen(token) >= MAX_NAME_LEN)
-		return -EINVAL;
-	strncpy(config, token, MAX_NAME_LEN);
-
-	mutex_lock(&pinctrl_maps_mutex);
-
-	/* Parse the pinctrl map and look for the selected dev/state/pin */
-	for_each_maps(maps_node, i, map) {
-		if (strcmp(map->dev_name, dbg->dev_name))
-			continue;
-		if (map->type != dbg->map_type)
-			continue;
-		if (strcmp(map->name, dbg->state_name))
-			continue;
-
-		/*  we found the right pin / state, so overwrite config */
-		if (!strcmp(map->data.configs.group_or_pin, dbg->pin_name)) {
-			found = map;
-			break;
-		}
-	}
-
-	if (!found) {
-		count = -EINVAL;
-		goto exit;
-	}
-
-	pctldev = get_pinctrl_dev_from_devname(found->ctrl_dev_name);
-	if (pctldev)
-		confops = pctldev->desc->confops;
-
-	if (confops && confops->pin_config_dbg_parse_modify) {
-		configs = &found->data.configs;
-		for (i = 0; i < configs->num_configs; i++) {
-			confops->pin_config_dbg_parse_modify(pctldev,
-						     config,
-						     &configs->configs[i]);
-		}
-	}
-
-exit:
-	mutex_unlock(&pinctrl_maps_mutex);
-
-	return count;
-}
-
-static int pinconf_dbg_config_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pinconf_dbg_config_print, inode->i_private);
-}
-
-static const struct file_operations pinconf_dbg_pinconfig_fops = {
-	.open = pinconf_dbg_config_open,
-	.write = pinconf_dbg_config_write,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-	.owner = THIS_MODULE,
-};
-
 void pinconf_init_device_debugfs(struct dentry *devroot,
 			 struct pinctrl_dev *pctldev)
 {
@@ -595,8 +375,6 @@ void pinconf_init_device_debugfs(struct dentry *devroot,
 			    devroot, pctldev, &pinconf_pins_fops);
 	debugfs_create_file("pinconf-groups", S_IFREG | S_IRUGO,
 			    devroot, pctldev, &pinconf_groups_fops);
-	debugfs_create_file("pinconf-config",  (S_IRUGO | S_IWUSR | S_IWGRP),
-			    devroot, pctldev, &pinconf_dbg_pinconfig_fops);
 }
 
 #endif
diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index 109468d9d849..93c9dd133e9d 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -29,7 +29,6 @@ struct seq_file;
  * @pin_config_group_get: get configurations for an entire pin group; should
  *	return -ENOTSUPP and -EINVAL using the same rules as pin_config_get.
  * @pin_config_group_set: configure all pins in a group
- * @pin_config_dbg_parse_modify: optional debugfs to modify a pin configuration
  * @pin_config_dbg_show: optional debugfs display hook that will provide
  *	per-device info for a certain pin in debugfs
  * @pin_config_group_dbg_show: optional debugfs display hook that will provide
@@ -55,9 +54,6 @@ struct pinconf_ops {
 				     unsigned selector,
 				     unsigned long *configs,
 				     unsigned num_configs);
-	int (*pin_config_dbg_parse_modify) (struct pinctrl_dev *pctldev,
-					   const char *arg,
-					   unsigned long *config);
 	void (*pin_config_dbg_show) (struct pinctrl_dev *pctldev,
 				     struct seq_file *s,
 				     unsigned offset);
-- 
cgit v1.2.3


From 59abc8cc05856cb31d6e07a8e25f0d749d559233 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 23 Jan 2019 11:08:05 -0800
Subject: scsi: libsas: Remove scsi_to_u32()

Since the function scsi_to_u32() is identical to get_unaligned_be32(),
change all scsi_to_u32() calls into get_unaligned_be32() calls.

Cc: Jian Luo <luojian5@huawei.com>
Cc: John Garry <john.garry@huawei.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: John Garry <john.garry@huawei.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libsas/sas_expander.c | 9 +++++----
 include/scsi/scsi.h                | 6 ------
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 83e3715d30e4..b13b245791b0 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -25,6 +25,7 @@
 #include <linux/scatterlist.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <asm/unaligned.h>
 
 #include "sas_internal.h"
 
@@ -696,10 +697,10 @@ int sas_smp_get_phy_events(struct sas_phy *phy)
 	if (res)
 		goto out;
 
-	phy->invalid_dword_count = scsi_to_u32(&resp[12]);
-	phy->running_disparity_error_count = scsi_to_u32(&resp[16]);
-	phy->loss_of_dword_sync_count = scsi_to_u32(&resp[20]);
-	phy->phy_reset_problem_count = scsi_to_u32(&resp[24]);
+	phy->invalid_dword_count = get_unaligned_be32(&resp[12]);
+	phy->running_disparity_error_count = get_unaligned_be32(&resp[16]);
+	phy->loss_of_dword_sync_count = get_unaligned_be32(&resp[20]);
+	phy->phy_reset_problem_count = get_unaligned_be32(&resp[24]);
 
  out:
 	kfree(req);
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index eb7853c1a23b..5339baadc082 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -274,10 +274,4 @@ static inline int scsi_is_wlun(u64 lun)
 /* Used to obtain the PCI location of a device */
 #define SCSI_IOCTL_GET_PCI		0x5387
 
-/* Pull a u32 out of a SCSI message (using BE SCSI conventions) */
-static inline __u32 scsi_to_u32(__u8 *ptr)
-{
-	return (ptr[0]<<24) + (ptr[1]<<16) + (ptr[2]<<8) + ptr[3];
-}
-
 #endif /* _SCSI_SCSI_H */
-- 
cgit v1.2.3


From 6bf8f22aea0ddd93af822aed8afeeee4acdf7694 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 22 Jan 2019 08:29:56 +0200
Subject: IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD

Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD and its initial implementation.

This object is from type class FD and will be used to read DEVX async
commands completion.

The core layer should allow the driver to set object from type FD in a
safe mode, this option was added with a matching comment in place.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c      |  1 +
 drivers/infiniband/core/uverbs_uapi.c    | 15 ++++--
 drivers/infiniband/hw/mlx5/devx.c        | 83 ++++++++++++++++++++++++++++++++
 include/rdma/uverbs_types.h              |  1 +
 include/uapi/rdma/mlx5_user_ioctl_cmds.h |  9 ++++
 5 files changed, 104 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 6c4747e61d2b..a260d2f8e0b7 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -801,6 +801,7 @@ void uverbs_close_fd(struct file *f)
 	/* Pairs with filp->private_data in alloc_begin_fd_uobject */
 	uverbs_uobject_put(uobj);
 }
+EXPORT_SYMBOL(uverbs_close_fd);
 
 /*
  * Drop the ucontext off the ufile and completely disconnect it from the
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index 9ae08e4b78a3..7a987acf0c0b 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -188,13 +188,18 @@ static int uapi_merge_obj_tree(struct uverbs_api *uapi,
 		obj_elm->type_attrs = obj->type_attrs;
 		obj_elm->type_class = obj->type_attrs->type_class;
 		/*
-		 * Today drivers are only permitted to use idr_class
-		 * types. They cannot use FD types because we currently have
-		 * no way to revoke the fops pointer after device
-		 * disassociation.
+		 * Today drivers are only permitted to use idr_class and
+		 * fd_class types. We can revoke the IDR types during
+		 * disassociation, and the FD types require the driver to use
+		 * struct file_operations.owner to prevent the driver module
+		 * code from unloading while the file is open. This provides
+		 * enough safety that uverbs_close_fd() will continue to work.
+		 * Drivers using FD are responsible to handle disassociation of
+		 * the device on their own.
 		 */
 		if (WARN_ON(is_driver &&
-			    obj->type_attrs->type_class != &uverbs_idr_class))
+			    obj->type_attrs->type_class != &uverbs_idr_class &&
+			    obj->type_attrs->type_class != &uverbs_fd_class))
 			return -EINVAL;
 	}
 
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 5a588f3cfb1b..9933bcf83a6b 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1168,6 +1168,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
 			      cmd_out, cmd_out_len);
 }
 
+struct devx_async_event_queue {
+	spinlock_t		lock;
+	wait_queue_head_t	poll_wait;
+	struct list_head	event_list;
+};
+
+struct devx_async_cmd_event_file {
+	struct ib_uobject		uobj;
+	struct devx_async_event_queue	ev_queue;
+};
+
+static void devx_init_event_queue(struct devx_async_event_queue *ev_queue)
+{
+	spin_lock_init(&ev_queue->lock);
+	INIT_LIST_HEAD(&ev_queue->event_list);
+	init_waitqueue_head(&ev_queue->poll_wait);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct devx_async_cmd_event_file *ev_file;
+
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE);
+
+	ev_file = container_of(uobj, struct devx_async_cmd_event_file,
+			       uobj);
+	devx_init_event_queue(&ev_file->ev_queue);
+	return 0;
+}
+
 static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 			 struct uverbs_attr_bundle *attrs,
 			 struct devx_umem *obj)
@@ -1313,6 +1345,38 @@ static int devx_umem_cleanup(struct ib_uobject *uobject,
 	return 0;
 }
 
+static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
+					 size_t count, loff_t *pos)
+{
+	return -EINVAL;
+}
+
+static int devx_async_cmd_event_close(struct inode *inode, struct file *filp)
+{
+	uverbs_close_fd(filp);
+	return 0;
+}
+
+static __poll_t devx_async_cmd_event_poll(struct file *filp,
+					      struct poll_table_struct *wait)
+{
+	return 0;
+}
+
+const struct file_operations devx_async_cmd_event_fops = {
+	.owner	 = THIS_MODULE,
+	.read	 = devx_async_cmd_event_read,
+	.poll    = devx_async_cmd_event_poll,
+	.release = devx_async_cmd_event_close,
+	.llseek	 = no_llseek,
+};
+
+static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
+						   enum rdma_remove_reason why)
+{
+	return 0;
+};
+
 DECLARE_UVERBS_NAMED_METHOD(
 	MLX5_IB_METHOD_DEVX_UMEM_REG,
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE,
@@ -1440,6 +1504,22 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM,
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG));
 
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC,
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+	MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+	UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file),
+			     devx_hot_unplug_async_cmd_event_file,
+			     &devx_async_cmd_event_fops, "[devx_async_cmd]",
+			     O_RDONLY),
+	&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC));
+
 static bool devx_is_supported(struct ib_device *device)
 {
 	struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1457,5 +1537,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
 		MLX5_IB_OBJECT_DEVX_UMEM,
 		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
 	{},
 };
diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
index acb1bfa3cc99..175d761695e1 100644
--- a/include/rdma/uverbs_types.h
+++ b/include/rdma/uverbs_types.h
@@ -157,6 +157,7 @@ struct uverbs_obj_fd_type {
 
 extern const struct uverbs_obj_type_class uverbs_idr_class;
 extern const struct uverbs_obj_type_class uverbs_fd_class;
+void uverbs_close_fd(struct file *f);
 
 #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) -	\
 				   sizeof(char))
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index b8d121d457f1..6ceae29d77cd 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -113,11 +113,20 @@ enum mlx5_ib_devx_umem_methods {
 	MLX5_IB_METHOD_DEVX_UMEM_DEREG,
 };
 
+enum mlx5_ib_devx_async_cmd_fd_alloc_attrs {
+	MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+};
+
+enum mlx5_ib_devx_async_cmd_fd_methods {
+	MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
+};
+
 enum mlx5_ib_objects {
 	MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_OBJECT_DEVX_OBJ,
 	MLX5_IB_OBJECT_DEVX_UMEM,
 	MLX5_IB_OBJECT_FLOW_MATCHER,
+	MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
 };
 
 enum mlx5_ib_flow_matcher_create_attrs {
-- 
cgit v1.2.3


From a124edba26270697540f1058bfcd490c1c65b116 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 22 Jan 2019 08:29:57 +0200
Subject: IB/mlx5: Introduce async DEVX obj query API

Introduce async DEVX obj query API to get the command response back to
user space once it's ready without blocking when calling the firmware.

The event's data includes a header with some meta data then the firmware
output command data.

The header includes:
- The input 'wr_id' to let application recognizing the response.

The input FD attribute is used to have the event data ready on.
Downstream patches from this series will implement the file ops to let
application read it.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c         | 164 +++++++++++++++++++++++++++++-
 include/uapi/rdma/mlx5_user_ioctl_cmds.h  |   9 ++
 include/uapi/rdma/mlx5_user_ioctl_verbs.h |   5 +
 3 files changed, 177 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 9933bcf83a6b..9ca116155f9c 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -8,6 +8,7 @@
 #include <rdma/uverbs_types.h>
 #include <rdma/uverbs_ioctl.h>
 #include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
 #include <rdma/ib_umem.h>
 #include <rdma/uverbs_std_types.h>
 #include <linux/mlx5/driver.h>
@@ -17,6 +18,16 @@
 #define UVERBS_MODULE_NAME mlx5_ib
 #include <rdma/uverbs_named_ioctl.h>
 
+struct devx_async_data {
+	struct mlx5_ib_dev *mdev;
+	struct list_head list;
+	struct ib_uobject *fd_uobj;
+	struct mlx5_async_work cb_work;
+	u16 cmd_out_len;
+	/* must be last field in this structure */
+	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
+};
+
 #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
 struct devx_obj {
 	struct mlx5_core_dev	*mdev;
@@ -1172,11 +1183,13 @@ struct devx_async_event_queue {
 	spinlock_t		lock;
 	wait_queue_head_t	poll_wait;
 	struct list_head	event_list;
+	atomic_t		bytes_in_use;
 };
 
 struct devx_async_cmd_event_file {
 	struct ib_uobject		uobj;
 	struct devx_async_event_queue	ev_queue;
+	struct mlx5_async_ctx		async_ctx;
 };
 
 static void devx_init_event_queue(struct devx_async_event_queue *ev_queue)
@@ -1184,6 +1197,7 @@ static void devx_init_event_queue(struct devx_async_event_queue *ev_queue)
 	spin_lock_init(&ev_queue->lock);
 	INIT_LIST_HEAD(&ev_queue->event_list);
 	init_waitqueue_head(&ev_queue->poll_wait);
+	atomic_set(&ev_queue->bytes_in_use, 0);
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
@@ -1193,13 +1207,123 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
 
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE);
+	struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
 
 	ev_file = container_of(uobj, struct devx_async_cmd_event_file,
 			       uobj);
 	devx_init_event_queue(&ev_file->ev_queue);
+	mlx5_cmd_init_async_ctx(mdev->mdev, &ev_file->async_ctx);
 	return 0;
 }
 
+static void devx_query_callback(int status, struct mlx5_async_work *context)
+{
+	struct devx_async_data *async_data =
+		container_of(context, struct devx_async_data, cb_work);
+	struct ib_uobject *fd_uobj = async_data->fd_uobj;
+	struct devx_async_cmd_event_file *ev_file;
+	struct devx_async_event_queue *ev_queue;
+	unsigned long flags;
+
+	ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
+			       uobj);
+	ev_queue = &ev_file->ev_queue;
+
+	spin_lock_irqsave(&ev_queue->lock, flags);
+	list_add_tail(&async_data->list, &ev_queue->event_list);
+	spin_unlock_irqrestore(&ev_queue->lock, flags);
+
+	wake_up_interruptible(&ev_queue->poll_wait);
+	fput(fd_uobj->object);
+}
+
+#define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
+	struct uverbs_attr_bundle *attrs)
+{
+	void *cmd_in = uverbs_attr_get_alloced_ptr(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN);
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+				attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE);
+	u16 cmd_out_len;
+	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+	struct ib_uobject *fd_uobj;
+	int err;
+	int uid;
+	struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+	struct devx_async_cmd_event_file *ev_file;
+	struct devx_async_data *async_data;
+
+	uid = devx_get_uid(c, cmd_in);
+	if (uid < 0)
+		return uid;
+
+	if (!devx_is_obj_query_cmd(cmd_in))
+		return -EINVAL;
+
+	err = uverbs_get_const(&cmd_out_len, attrs,
+			       MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN);
+	if (err)
+		return err;
+
+	if (!devx_is_valid_obj_id(uobj, cmd_in))
+		return -EINVAL;
+
+	fd_uobj = uverbs_attr_get_uobject(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD);
+	if (IS_ERR(fd_uobj))
+		return PTR_ERR(fd_uobj);
+
+	ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
+			       uobj);
+
+	if (atomic_add_return(cmd_out_len, &ev_file->ev_queue.bytes_in_use) >
+			MAX_ASYNC_BYTES_IN_USE) {
+		atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use);
+		return -EAGAIN;
+	}
+
+	async_data = kvzalloc(struct_size(async_data, hdr.out_data,
+					  cmd_out_len), GFP_KERNEL);
+	if (!async_data) {
+		err = -ENOMEM;
+		goto sub_bytes;
+	}
+
+	err = uverbs_copy_from(&async_data->hdr.wr_id, attrs,
+			       MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID);
+	if (err)
+		goto free_async;
+
+	async_data->cmd_out_len = cmd_out_len;
+	async_data->mdev = mdev;
+	async_data->fd_uobj = fd_uobj;
+
+	get_file(fd_uobj->object);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
+	err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in,
+		    uverbs_attr_get_len(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN),
+		    async_data->hdr.out_data,
+		    async_data->cmd_out_len,
+		    devx_query_callback, &async_data->cb_work);
+
+	if (err)
+		goto cb_err;
+
+	return 0;
+
+cb_err:
+	fput(fd_uobj->object);
+free_async:
+	kvfree(async_data);
+sub_bytes:
+	atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use);
+	return err;
+}
+
 static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 			 struct uverbs_attr_bundle *attrs,
 			 struct devx_umem *obj)
@@ -1353,6 +1477,17 @@ static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
 
 static int devx_async_cmd_event_close(struct inode *inode, struct file *filp)
 {
+	struct ib_uobject *uobj = filp->private_data;
+	struct devx_async_cmd_event_file *comp_ev_file = container_of(
+		uobj, struct devx_async_cmd_event_file, uobj);
+	struct devx_async_data *entry, *tmp;
+
+	spin_lock_irq(&comp_ev_file->ev_queue.lock);
+	list_for_each_entry_safe(entry, tmp,
+				 &comp_ev_file->ev_queue.event_list, list)
+		kvfree(entry);
+	spin_unlock_irq(&comp_ev_file->ev_queue.lock);
+
 	uverbs_close_fd(filp);
 	return 0;
 }
@@ -1374,6 +1509,11 @@ const struct file_operations devx_async_cmd_event_fops = {
 static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
 						   enum rdma_remove_reason why)
 {
+	struct devx_async_cmd_event_file *comp_ev_file =
+		container_of(uobj, struct devx_async_cmd_event_file,
+			     uobj);
+
+	mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx);
 	return 0;
 };
 
@@ -1487,6 +1627,27 @@ DECLARE_UVERBS_NAMED_METHOD(
 		UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)),
 		UA_MANDATORY));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE,
+			UVERBS_IDR_ANY_OBJECT,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN,
+		UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN,
+		u16, UA_MANDATORY),
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD,
+		MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+		UVERBS_ACCESS_READ,
+		UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID,
+		UVERBS_ATTR_TYPE(u64),
+		UA_MANDATORY));
+
 DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX,
 			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER),
 			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR),
@@ -1497,7 +1658,8 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ,
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY),
-			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY));
+			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY));
 
 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM,
 			    UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup),
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index 6ceae29d77cd..8149d224030b 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -84,6 +84,14 @@ enum mlx5_ib_devx_obj_query_attrs {
 	MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT,
 };
 
+enum mlx5_ib_devx_obj_query_async_attrs {
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN,
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD,
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID,
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN,
+};
+
 enum  mlx5_ib_devx_query_eqn_attrs {
 	MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN,
@@ -94,6 +102,7 @@ enum mlx5_ib_devx_obj_methods {
 	MLX5_IB_METHOD_DEVX_OBJ_DESTROY,
 	MLX5_IB_METHOD_DEVX_OBJ_MODIFY,
 	MLX5_IB_METHOD_DEVX_OBJ_QUERY,
+	MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY,
 };
 
 enum mlx5_ib_devx_umem_reg_attrs {
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index 4ef62c0e8452..4a701033b93f 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -51,5 +51,10 @@ enum mlx5_ib_uapi_flow_action_packet_reformat_type {
 	MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3,
 };
 
+struct mlx5_ib_uapi_devx_async_cmd_hdr {
+	__aligned_u64	wr_id;
+	__u8		out_data[];
+};
+
 #endif
 
-- 
cgit v1.2.3


From 1194c4133195dfcb6c5fc0935d54bbed872a5285 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Tue, 29 Jan 2019 00:56:17 +0000
Subject: nfit: Add Hyper-V NVDIMM DSM command set to white list

Add the Hyper-V _DSM command set to the white list of NVDIMM command
sets.

This command set is documented at http://www.uefi.org/RFIC_LIST
(see "Virtual NVDIMM 0x1901").

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c   | 17 ++++++++++++++---
 drivers/acpi/nfit/nfit.h   |  6 +++++-
 include/uapi/linux/ndctl.h |  1 +
 3 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 1598e3a121a6..4a7e8b1fa43b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1846,9 +1846,17 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	dev_set_drvdata(&adev_dimm->dev, nfit_mem);
 
 	/*
-	 * Until standardization materializes we need to consider 4
-	 * different command sets.  Note, that checking for function0 (bit0)
-	 * tells us if any commands are reachable through this GUID.
+	 * There are 4 "legacy" NVDIMM command sets
+	 * (NVDIMM_FAMILY_{INTEL,MSFT,HPE1,HPE2}) that were created before
+	 * an EFI working group was established to constrain this
+	 * proliferation. The nfit driver probes for the supported command
+	 * set by GUID. Note, if you're a platform developer looking to add
+	 * a new command set to this probe, consider using an existing set,
+	 * or otherwise seek approval to publish the command set at
+	 * http://www.uefi.org/RFIC_LIST.
+	 *
+	 * Note, that checking for function0 (bit0) tells us if any commands
+	 * are reachable through this GUID.
 	 */
 	for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
 		if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
@@ -1871,6 +1879,8 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 			dsm_mask &= ~(1 << 8);
 	} else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) {
 		dsm_mask = 0xffffffff;
+	} else if (nfit_mem->family == NVDIMM_FAMILY_HYPERV) {
+		dsm_mask = 0x1f;
 	} else {
 		dev_dbg(dev, "unknown dimm command family\n");
 		nfit_mem->family = -1;
@@ -3714,6 +3724,7 @@ static __init int nfit_init(void)
 	guid_parse(UUID_NFIT_DIMM_N_HPE1, &nfit_uuid[NFIT_DEV_DIMM_N_HPE1]);
 	guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]);
 	guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]);
+	guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]);
 
 	nfit_wq = create_singlethread_workqueue("nfit");
 	if (!nfit_wq)
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 33691aecfcee..4de167b4f76f 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -34,11 +34,14 @@
 /* https://msdn.microsoft.com/library/windows/hardware/mt604741 */
 #define UUID_NFIT_DIMM_N_MSFT "1ee68b36-d4bd-4a1a-9a16-4f8e53d46e05"
 
+/* http://www.uefi.org/RFIC_LIST (see "Virtual NVDIMM 0x1901") */
+#define UUID_NFIT_DIMM_N_HYPERV "5746c5f2-a9a2-4264-ad0e-e4ddc9e09e80"
+
 #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
 		| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
-#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
 
 #define NVDIMM_STANDARD_CMDMASK \
 (1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
@@ -94,6 +97,7 @@ enum nfit_uuids {
 	NFIT_DEV_DIMM_N_HPE1 = NVDIMM_FAMILY_HPE1,
 	NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2,
 	NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT,
+	NFIT_DEV_DIMM_N_HYPERV = NVDIMM_FAMILY_HYPERV,
 	NFIT_SPA_VOLATILE,
 	NFIT_SPA_PM,
 	NFIT_SPA_DCR,
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index f57c9e434d2d..de5d90212409 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -243,6 +243,7 @@ struct nd_cmd_pkg {
 #define NVDIMM_FAMILY_HPE1 1
 #define NVDIMM_FAMILY_HPE2 2
 #define NVDIMM_FAMILY_MSFT 3
+#define NVDIMM_FAMILY_HYPERV 4
 
 #define ND_IOCTL_CALL			_IOWR(ND_IOCTL, ND_CMD_CALL,\
 					struct nd_cmd_pkg)
-- 
cgit v1.2.3


From 0b5cb3300ae59ed7e93b465dfa2384a6a4df8eb4 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 22 Jan 2019 10:25:20 -0800
Subject: RDMA/srp: Increase max_segment_size

The default behavior of the SCSI core is to set the block layer request
queue parameter max_segment_size to 64 KB. That means that elements of
scatterlists are limited to 64 KB. Since RDMA adapters support larger
sizes, increase max_segment_size for the SRP initiator.

Notes:
- The SCSI max_segment_size parameter was introduced in kernel v5.0. See
  also commit 50c2e9107f17 ("scsi: introduce a max_segment_size
  host_template parameters").
- Some other block drivers already set max_segment_size to UINT_MAX,
  e.g. nbd and rbd.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/srp/ib_srp.c |  1 +
 include/rdma/ib_verbs.h             | 13 +++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 0f855294ff3f..29fe46dbdbf1 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3823,6 +3823,7 @@ static ssize_t srp_create_target(struct device *dev,
 	target_host->max_id      = 1;
 	target_host->max_lun     = -1LL;
 	target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb;
+	target_host->max_segment_size = ib_dma_max_seg_size(ibdev);
 
 	target = host_to_target(target_host);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 94b6e1dd4dab..71ea144ec823 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3715,6 +3715,19 @@ static inline unsigned int ib_sg_dma_len(struct ib_device *dev,
 	return sg_dma_len(sg);
 }
 
+/**
+ * ib_dma_max_seg_size - Return the size limit of a single DMA transfer
+ * @dev: The device to query
+ *
+ * The returned value represents a size in bytes.
+ */
+static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev)
+{
+	struct device_dma_parameters *p = dev->dma_device->dma_parms;
+
+	return p ? p->max_segment_size : UINT_MAX;
+}
+
 /**
  * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU
  * @dev: The device for which the DMA address was created
-- 
cgit v1.2.3


From 9bc61ab18b1d41f26dc06b9e6d3c203e65f83fe6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 4 Nov 2018 03:19:03 -0500
Subject: vfs: Introduce fs_context, switch vfs_kern_mount() to it.

Introduce a filesystem context concept to be used during superblock
creation for mount and superblock reconfiguration for remount.  This is
allocated at the beginning of the mount procedure and into it is placed:

 (1) Filesystem type.

 (2) Namespaces.

 (3) Source/Device names (there may be multiple).

 (4) Superblock flags (SB_*).

 (5) Security details.

 (6) Filesystem-specific data, as set by the mount options.

Accessor functions are then provided to set up a context, parameterise it
from monolithic mount data (the data page passed to mount(2)) and tear it
down again.

A legacy wrapper is provided that implements what will be the basic
operations, wrapping access to filesystems that aren't yet aware of the
fs_context.

Finally, vfs_kern_mount() is changed to make use of the fs_context and
mount_fs() is replaced by vfs_get_tree(), called from vfs_kern_mount().
[AV -- add missing kstrdup()]
[AV -- put_cred() can be unconditional - fc->cred can't be NULL]
[AV -- take legacy_validate() contents into legacy_parse_monolithic()]
[AV -- merge KERNEL_MOUNT and USER_MOUNT]
[AV -- don't unlock superblock on success return from vfs_get_tree()]
[AV -- kill 'reference' argument of init_fs_context()]

Signed-off-by: David Howells <dhowells@redhat.com>
Co-developed-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Makefile                |   3 +-
 fs/fs_context.c            | 182 +++++++++++++++++++++++++++++++++++++++++++++
 fs/internal.h              |   9 ++-
 fs/namespace.c             |  46 ++++++++----
 fs/super.c                 |  50 ++++++-------
 include/linux/fs_context.h |  64 ++++++++++++++++
 6 files changed, 310 insertions(+), 44 deletions(-)
 create mode 100644 fs/fs_context.c
 create mode 100644 include/linux/fs_context.h

(limited to 'include')

diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..5563cf34f7c2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
-		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
+		fs_context.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/fs_context.c b/fs/fs_context.c
new file mode 100644
index 000000000000..4294091b689d
--- /dev/null
+++ b/fs/fs_context.c
@@ -0,0 +1,182 @@
+/* Provide a way to create a superblock configuration context within the kernel
+ * that allows a superblock to be set up prior to mounting.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/fs_context.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/security.h>
+#include <linux/mnt_namespace.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <net/net_namespace.h>
+#include "mount.h"
+#include "internal.h"
+
+struct legacy_fs_context {
+	char			*legacy_data;	/* Data page for legacy filesystems */
+	size_t			data_size;
+};
+
+static int legacy_init_fs_context(struct fs_context *fc);
+
+/**
+ * alloc_fs_context - Create a filesystem context.
+ * @fs_type: The filesystem type.
+ * @reference: The dentry from which this one derives (or NULL)
+ * @sb_flags: Filesystem/superblock flags (SB_*)
+ * @sb_flags_mask: Applicable members of @sb_flags
+ * @purpose: The purpose that this configuration shall be used for.
+ *
+ * Open a filesystem and create a mount context.  The mount context is
+ * initialised with the supplied flags and, if a submount/automount from
+ * another superblock (referred to by @reference) is supplied, may have
+ * parameters such as namespaces copied across from that superblock.
+ */
+static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
+				      struct dentry *reference,
+				      unsigned int sb_flags,
+				      unsigned int sb_flags_mask,
+				      enum fs_context_purpose purpose)
+{
+	struct fs_context *fc;
+	int ret = -ENOMEM;
+
+	fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+	if (!fc)
+		return ERR_PTR(-ENOMEM);
+
+	fc->purpose	= purpose;
+	fc->sb_flags	= sb_flags;
+	fc->sb_flags_mask = sb_flags_mask;
+	fc->fs_type	= get_filesystem(fs_type);
+	fc->cred	= get_current_cred();
+	fc->net_ns	= get_net(current->nsproxy->net_ns);
+
+	switch (purpose) {
+	case FS_CONTEXT_FOR_MOUNT:
+		fc->user_ns = get_user_ns(fc->cred->user_ns);
+		break;
+	}
+
+	ret = legacy_init_fs_context(fc);
+	if (ret < 0)
+		goto err_fc;
+	fc->need_free = true;
+	return fc;
+
+err_fc:
+	put_fs_context(fc);
+	return ERR_PTR(ret);
+}
+
+struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
+					unsigned int sb_flags)
+{
+	return alloc_fs_context(fs_type, NULL, sb_flags, 0,
+					FS_CONTEXT_FOR_MOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_mount);
+
+static void legacy_fs_context_free(struct fs_context *fc);
+/**
+ * put_fs_context - Dispose of a superblock configuration context.
+ * @fc: The context to dispose of.
+ */
+void put_fs_context(struct fs_context *fc)
+{
+	struct super_block *sb;
+
+	if (fc->root) {
+		sb = fc->root->d_sb;
+		dput(fc->root);
+		fc->root = NULL;
+		deactivate_super(sb);
+	}
+
+	if (fc->need_free)
+		legacy_fs_context_free(fc);
+
+	security_free_mnt_opts(&fc->security);
+	if (fc->net_ns)
+		put_net(fc->net_ns);
+	put_user_ns(fc->user_ns);
+	put_cred(fc->cred);
+	kfree(fc->subtype);
+	put_filesystem(fc->fs_type);
+	kfree(fc->source);
+	kfree(fc);
+}
+EXPORT_SYMBOL(put_fs_context);
+
+/*
+ * Free the config for a filesystem that doesn't support fs_context.
+ */
+static void legacy_fs_context_free(struct fs_context *fc)
+{
+	kfree(fc->fs_private);
+}
+
+/*
+ * Add monolithic mount data.
+ */
+static int legacy_parse_monolithic(struct fs_context *fc, void *data)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+	ctx->legacy_data = data;
+	if (!ctx->legacy_data)
+		return 0;
+	if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
+		return 0;
+	return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
+}
+
+/*
+ * Get a mountable root with the legacy mount command.
+ */
+int legacy_get_tree(struct fs_context *fc)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+	struct super_block *sb;
+	struct dentry *root;
+
+	root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
+				      fc->source, ctx->legacy_data);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	sb = root->d_sb;
+	BUG_ON(!sb);
+
+	fc->root = root;
+	return 0;
+}
+
+/*
+ * Initialise a legacy context for a filesystem that doesn't support
+ * fs_context.
+ */
+static int legacy_init_fs_context(struct fs_context *fc)
+{
+	fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+	if (!fc->fs_private)
+		return -ENOMEM;
+	return 0;
+}
+
+int parse_monolithic_mount_data(struct fs_context *fc, void *data)
+{
+	return legacy_parse_monolithic(fc, data);
+}
diff --git a/fs/internal.h b/fs/internal.h
index d410186bc369..f85c3212d25d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -17,6 +17,7 @@ struct linux_binprm;
 struct path;
 struct mount;
 struct shrink_control;
+struct fs_context;
 
 /*
  * block_dev.c
@@ -51,6 +52,12 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
  */
 extern void __init chrdev_init(void);
 
+/*
+ * fs_context.c
+ */
+extern int legacy_get_tree(struct fs_context *fc);
+extern int parse_monolithic_mount_data(struct fs_context *, void *);
+
 /*
  * namei.c
  */
@@ -101,8 +108,6 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
 extern bool trylock_super(struct super_block *sb);
-extern struct dentry *mount_fs(struct file_system_type *,
-			       int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);
 
 /*
diff --git a/fs/namespace.c b/fs/namespace.c
index f0b8a8ca08df..3f2fd7a34733 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
 #include <linux/task_work.h>
 #include <linux/sched/task.h>
 #include <uapi/linux/mount.h>
+#include <linux/fs_context.h>
 
 #include "pnode.h"
 #include "internal.h"
@@ -940,36 +941,53 @@ static struct mount *skip_mnt_tree(struct mount *p)
 	return p;
 }
 
-struct vfsmount *
-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+struct vfsmount *vfs_kern_mount(struct file_system_type *type,
+				int flags, const char *name,
+				void *data)
 {
+	struct fs_context *fc;
 	struct mount *mnt;
-	struct dentry *root;
+	int ret = 0;
 
 	if (!type)
 		return ERR_PTR(-ENODEV);
 
+	fc = fs_context_for_mount(type, flags);
+	if (IS_ERR(fc))
+		return ERR_CAST(fc);
+
+	if (name) {
+		fc->source = kstrdup(name, GFP_KERNEL);
+		if (!fc->source)
+			ret = -ENOMEM;
+	}
+	if (!ret)
+		ret = parse_monolithic_mount_data(fc, data);
+	if (!ret)
+		ret = vfs_get_tree(fc);
+	if (ret) {
+		put_fs_context(fc);
+		return ERR_PTR(ret);
+	}
+	up_write(&fc->root->d_sb->s_umount);
 	mnt = alloc_vfsmnt(name);
-	if (!mnt)
+	if (!mnt) {
+		put_fs_context(fc);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	if (flags & SB_KERNMOUNT)
 		mnt->mnt.mnt_flags = MNT_INTERNAL;
 
-	root = mount_fs(type, flags, name, data);
-	if (IS_ERR(root)) {
-		mnt_free_id(mnt);
-		free_vfsmnt(mnt);
-		return ERR_CAST(root);
-	}
-
-	mnt->mnt.mnt_root = root;
-	mnt->mnt.mnt_sb = root->d_sb;
+	atomic_inc(&fc->root->d_sb->s_active);
+	mnt->mnt.mnt_root = dget(fc->root);
+	mnt->mnt.mnt_sb = fc->root->d_sb;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	mnt->mnt_parent = mnt;
 	lock_mount_hash();
-	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
+	list_add_tail(&mnt->mnt_instance, &fc->root->d_sb->s_mounts);
 	unlock_mount_hash();
+	put_fs_context(fc);
 	return &mnt->mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
diff --git a/fs/super.c b/fs/super.c
index 48e25eba8465..fc3887277ad1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -35,6 +35,7 @@
 #include <linux/fsnotify.h>
 #include <linux/lockdep.h>
 #include <linux/user_namespace.h>
+#include <linux/fs_context.h>
 #include <uapi/linux/mount.h>
 #include "internal.h"
 
@@ -1241,27 +1242,24 @@ struct dentry *mount_single(struct file_system_type *fs_type,
 }
 EXPORT_SYMBOL(mount_single);
 
-struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_get_tree - Get the mountable root
+ * @fc: The superblock configuration context.
+ *
+ * The filesystem is invoked to get or create a superblock which can then later
+ * be used for mounting.  The filesystem places a pointer to the root to be
+ * used for mounting in @fc->root.
+ */
+int vfs_get_tree(struct fs_context *fc)
 {
-	struct dentry *root;
 	struct super_block *sb;
-	int error = -ENOMEM;
-	void *sec_opts = NULL;
+	int error;
 
-	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
-		error = security_sb_eat_lsm_opts(data, &sec_opts);
-		if (error)
-			return ERR_PTR(error);
-	}
+	error = legacy_get_tree(fc);
+	if (error < 0)
+		return error;
 
-	root = type->mount(type, flags, name, data);
-	if (IS_ERR(root)) {
-		error = PTR_ERR(root);
-		goto out_free_secdata;
-	}
-	sb = root->d_sb;
-	BUG_ON(!sb);
+	sb = fc->root->d_sb;
 	WARN_ON(!sb->s_bdi);
 
 	/*
@@ -1273,11 +1271,11 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	smp_wmb();
 	sb->s_flags |= SB_BORN;
 
-	error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
+	error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
 	if (error)
 		goto out_sb;
 
-	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) {
+	if (!(fc->sb_flags & (MS_KERNMOUNT|MS_SUBMOUNT))) {
 		error = security_sb_kern_mount(sb);
 		if (error)
 			goto out_sb;
@@ -1290,18 +1288,16 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	 * violate this rule.
 	 */
 	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
-		"negative value (%lld)\n", type->name, sb->s_maxbytes);
+		"negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);
 
-	up_write(&sb->s_umount);
-	security_free_mnt_opts(&sec_opts);
-	return root;
+	return 0;
 out_sb:
-	dput(root);
+	dput(fc->root);
+	fc->root = NULL;
 	deactivate_locked_super(sb);
-out_free_secdata:
-	security_free_mnt_opts(&sec_opts);
-	return ERR_PTR(error);
+	return error;
 }
+EXPORT_SYMBOL(vfs_get_tree);
 
 /*
  * Setup private BDI for given superblock. It gets automatically cleaned up
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
new file mode 100644
index 000000000000..9805514444c9
--- /dev/null
+++ b/include/linux/fs_context.h
@@ -0,0 +1,64 @@
+/* Filesystem superblock creation and reconfiguration context.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_FS_CONTEXT_H
+#define _LINUX_FS_CONTEXT_H
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/security.h>
+
+struct cred;
+struct dentry;
+struct file_operations;
+struct file_system_type;
+struct net;
+struct user_namespace;
+
+enum fs_context_purpose {
+	FS_CONTEXT_FOR_MOUNT,		/* New superblock for explicit mount */
+};
+
+/*
+ * Filesystem context for holding the parameters used in the creation or
+ * reconfiguration of a superblock.
+ *
+ * Superblock creation fills in ->root whereas reconfiguration begins with this
+ * already set.
+ *
+ * See Documentation/filesystems/mounting.txt
+ */
+struct fs_context {
+	struct file_system_type	*fs_type;
+	void			*fs_private;	/* The filesystem's context */
+	struct dentry		*root;		/* The root and superblock */
+	struct user_namespace	*user_ns;	/* The user namespace for this mount */
+	struct net		*net_ns;	/* The network namespace for this mount */
+	const struct cred	*cred;		/* The mounter's credentials */
+	const char		*source;	/* The source name (eg. dev path) */
+	const char		*subtype;	/* The subtype to set on the superblock */
+	void			*security;	/* Linux S&M options */
+	unsigned int		sb_flags;	/* Proposed superblock flags (SB_*) */
+	unsigned int		sb_flags_mask;	/* Superblock flags that were changed */
+	enum fs_context_purpose	purpose:8;
+	bool			need_free:1;	/* Need to call ops->free() */
+};
+
+/*
+ * fs_context manipulation functions.
+ */
+extern struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
+						unsigned int sb_flags);
+
+extern int vfs_get_tree(struct fs_context *fc);
+extern void put_fs_context(struct fs_context *fc);
+
+#endif /* _LINUX_FS_CONTEXT_H */
-- 
cgit v1.2.3


From 8f2918898eb5fe25845dde7f4a77bda0e2966e05 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 4 Nov 2018 06:48:34 -0500
Subject: new helpers: vfs_create_mount(), fc_mount()

Create a new helper, vfs_create_mount(), that creates a detached vfsmount
object from an fs_context that has a superblock attached to it.

Almost all uses will be paired with immediately preceding vfs_get_tree();
add a helper for such combination.

Switch vfs_kern_mount() to use this.

NOTE: mild behaviour change; passing NULL as 'device name' to
something like procfs will change /proc/*/mountstats - "device none"
instead on "no device".  That is consistent with /proc/mounts et.al.

[do'h - EXPORT_SYMBOL_GPL slipped in by mistake; removed]
[AV -- remove confused comment from vfs_create_mount()]
[AV -- removed the second argument]

Reviewed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 76 +++++++++++++++++++++++++++++++++++----------------
 include/linux/mount.h |  3 ++
 2 files changed, 55 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index 3f2fd7a34733..156771f5745a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -941,12 +941,59 @@ static struct mount *skip_mnt_tree(struct mount *p)
 	return p;
 }
 
+/**
+ * vfs_create_mount - Create a mount for a configured superblock
+ * @fc: The configuration context with the superblock attached
+ *
+ * Create a mount to an already configured superblock.  If necessary, the
+ * caller should invoke vfs_get_tree() before calling this.
+ *
+ * Note that this does not attach the mount to anything.
+ */
+struct vfsmount *vfs_create_mount(struct fs_context *fc)
+{
+	struct mount *mnt;
+
+	if (!fc->root)
+		return ERR_PTR(-EINVAL);
+
+	mnt = alloc_vfsmnt(fc->source ?: "none");
+	if (!mnt)
+		return ERR_PTR(-ENOMEM);
+
+	if (fc->sb_flags & SB_KERNMOUNT)
+		mnt->mnt.mnt_flags = MNT_INTERNAL;
+
+	atomic_inc(&fc->root->d_sb->s_active);
+	mnt->mnt.mnt_sb		= fc->root->d_sb;
+	mnt->mnt.mnt_root	= dget(fc->root);
+	mnt->mnt_mountpoint	= mnt->mnt.mnt_root;
+	mnt->mnt_parent		= mnt;
+
+	lock_mount_hash();
+	list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
+	unlock_mount_hash();
+	return &mnt->mnt;
+}
+EXPORT_SYMBOL(vfs_create_mount);
+
+struct vfsmount *fc_mount(struct fs_context *fc)
+{
+	int err = vfs_get_tree(fc);
+	if (!err) {
+		up_write(&fc->root->d_sb->s_umount);
+		return vfs_create_mount(fc);
+	}
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(fc_mount);
+
 struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				int flags, const char *name,
 				void *data)
 {
 	struct fs_context *fc;
-	struct mount *mnt;
+	struct vfsmount *mnt;
 	int ret = 0;
 
 	if (!type)
@@ -964,31 +1011,12 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 	if (!ret)
 		ret = parse_monolithic_mount_data(fc, data);
 	if (!ret)
-		ret = vfs_get_tree(fc);
-	if (ret) {
-		put_fs_context(fc);
-		return ERR_PTR(ret);
-	}
-	up_write(&fc->root->d_sb->s_umount);
-	mnt = alloc_vfsmnt(name);
-	if (!mnt) {
-		put_fs_context(fc);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	if (flags & SB_KERNMOUNT)
-		mnt->mnt.mnt_flags = MNT_INTERNAL;
+		mnt = fc_mount(fc);
+	else
+		mnt = ERR_PTR(ret);
 
-	atomic_inc(&fc->root->d_sb->s_active);
-	mnt->mnt.mnt_root = dget(fc->root);
-	mnt->mnt.mnt_sb = fc->root->d_sb;
-	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
-	mnt->mnt_parent = mnt;
-	lock_mount_hash();
-	list_add_tail(&mnt->mnt_instance, &fc->root->d_sb->s_mounts);
-	unlock_mount_hash();
 	put_fs_context(fc);
-	return &mnt->mnt;
+	return mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 037eed52164b..9197ddbf35fb 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -21,6 +21,7 @@ struct super_block;
 struct vfsmount;
 struct dentry;
 struct mnt_namespace;
+struct fs_context;
 
 #define MNT_NOSUID	0x01
 #define MNT_NODEV	0x02
@@ -88,6 +89,8 @@ struct path;
 extern struct vfsmount *clone_private_mount(const struct path *path);
 
 struct file_system_type;
+extern struct vfsmount *fc_mount(struct fs_context *fc);
+extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				      int flags, const char *name,
 				      void *data);
-- 
cgit v1.2.3


From a0c9a8b8fd9fd572b0d60276beb2142c8f59f9b8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 4 Nov 2018 07:18:51 -0500
Subject: teach vfs_get_tree() to handle subtype, switch do_new_mount() to it

Roll the handling of subtypes into do_new_mount() and vfs_get_tree().  The
former determines any subtype string and hangs it off the fs_context; the
latter applies it.

Make do_new_mount() create, parameterise and commit an fs_context and
create a mount for itself rather than calling vfs_kern_mount().

[AV -- missing kstrdup()]
[AV -- ... and no kstrdup() if we get to setting ->s_submount - we
simply transfer it from fc, leaving NULL behind]
[AV -- constify ->s_submount, while we are at it]

Reviewed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c     | 77 ++++++++++++++++++++++++++++++++----------------------
 fs/super.c         |  5 ++++
 include/linux/fs.h |  2 +-
 3 files changed, 52 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index 156771f5745a..0354cb6ac2d3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2479,29 +2479,6 @@ out:
 	return err;
 }
 
-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
-{
-	int err;
-	const char *subtype = strchr(fstype, '.');
-	if (subtype) {
-		subtype++;
-		err = -EINVAL;
-		if (!subtype[0])
-			goto err;
-	} else
-		subtype = "";
-
-	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
-	err = -ENOMEM;
-	if (!mnt->mnt_sb->s_subtype)
-		goto err;
-	return mnt;
-
- err:
-	mntput(mnt);
-	return ERR_PTR(err);
-}
-
 /*
  * add a mount into a namespace's mount tree
  */
@@ -2557,7 +2534,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 {
 	struct file_system_type *type;
 	struct vfsmount *mnt;
-	int err;
+	struct fs_context *fc;
+	const char *subtype = NULL;
+	int err = 0;
 
 	if (!fstype)
 		return -EINVAL;
@@ -2566,23 +2545,59 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 	if (!type)
 		return -ENODEV;
 
-	mnt = vfs_kern_mount(type, sb_flags, name, data);
-	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
-	    !mnt->mnt_sb->s_subtype)
-		mnt = fs_set_subtype(mnt, fstype);
+	if (type->fs_flags & FS_HAS_SUBTYPE) {
+		subtype = strchr(fstype, '.');
+		if (subtype) {
+			subtype++;
+			if (!*subtype) {
+				put_filesystem(type);
+				return -EINVAL;
+			}
+		} else {
+			subtype = "";
+		}
+	}
 
+	fc = fs_context_for_mount(type, sb_flags);
 	put_filesystem(type);
-	if (IS_ERR(mnt))
-		return PTR_ERR(mnt);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
+
+	if (subtype) {
+		fc->subtype = kstrdup(subtype, GFP_KERNEL);
+		if (!fc->subtype)
+			err = -ENOMEM;
+	}
+	if (!err && name) {
+		fc->source = kstrdup(name, GFP_KERNEL);
+		if (!fc->source)
+			err = -ENOMEM;
+	}
+	if (!err)
+		err = parse_monolithic_mount_data(fc, data);
+	if (!err)
+		err = vfs_get_tree(fc);
+	if (err)
+		goto out;
+
+	up_write(&fc->root->d_sb->s_umount);
+	mnt = vfs_create_mount(fc);
+	if (IS_ERR(mnt)) {
+		err = PTR_ERR(mnt);
+		goto out;
+	}
 
 	if (mount_too_revealing(mnt, &mnt_flags)) {
 		mntput(mnt);
-		return -EPERM;
+		err = -EPERM;
+		goto out;
 	}
 
 	err = do_add_mount(real_mount(mnt), path, mnt_flags);
 	if (err)
 		mntput(mnt);
+out:
+	put_fs_context(fc);
 	return err;
 }
 
diff --git a/fs/super.c b/fs/super.c
index fc3887277ad1..b91b6df05b67 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1262,6 +1262,11 @@ int vfs_get_tree(struct fs_context *fc)
 	sb = fc->root->d_sb;
 	WARN_ON(!sb->s_bdi);
 
+	if (fc->subtype && !sb->s_subtype) {
+		sb->s_subtype = fc->subtype;
+		fc->subtype = NULL;
+	}
+
 	/*
 	 * Write barrier is for super_cache_count(). We place it before setting
 	 * SB_BORN as the data dependency between the two functions is the
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 811c77743dad..36fff12ab890 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1447,7 +1447,7 @@ struct super_block {
 	 * Filesystem subtype.  If non-empty the filesystem type field
 	 * in /proc/mounts will be "type.subtype"
 	 */
-	char *s_subtype;
+	const char *s_subtype;
 
 	const struct dentry_operations *s_d_op; /* default d_op for dentries */
 
-- 
cgit v1.2.3


From 8d0347f6c3a9d4953ddd636a31c6584da082e084 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 4 Nov 2018 09:28:36 -0500
Subject: convert do_remount_sb() to fs_context

Replace do_remount_sb() with a function, reconfigure_super(), that's
fs_context aware.  The fs_context is expected to be parameterised already
and have ->root pointing to the superblock to be reconfigured.

A legacy wrapper is provided that is intended to be called from the
fs_context ops when those appear, but for now is called directly from
reconfigure_super().  This wrapper invokes the ->remount_fs() superblock op
for the moment.  It is intended that the remount_fs() op will be phased
out.

The fs_context->purpose is set to FS_CONTEXT_FOR_RECONFIGURE to indicate
that the context is being used for reconfiguration.

do_umount_root() is provided to consolidate remount-to-R/O for umount and
emergency remount by creating a context and invoking reconfiguration.

do_remount(), do_umount() and do_emergency_remount_callback() are switched
to use the new process.

[AV -- fold UMOUNT and EMERGENCY_REMOUNT in; fixes the
umount / bug, gets rid of pointless complexity]
[AV -- set ->net_ns in all cases; nfs remount will need that]
[AV -- shift security_sb_remount() call into reconfigure_super(); the callers
that didn't do security_sb_remount() have NULL fc->security anyway, so it's
a no-op for them]

Signed-off-by: David Howells <dhowells@redhat.com>
Co-developed-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_context.c            |  35 ++++++++++++++-
 fs/internal.h              |   3 +-
 fs/namespace.c             |  61 ++++++++++++++++----------
 fs/super.c                 | 107 +++++++++++++++++++++++++++++++--------------
 include/linux/fs.h         |   1 +
 include/linux/fs_context.h |   4 ++
 6 files changed, 152 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/fs/fs_context.c b/fs/fs_context.c
index 857cd46a687b..5e2c3aba1dd8 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -69,6 +69,13 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
 	case FS_CONTEXT_FOR_MOUNT:
 		fc->user_ns = get_user_ns(fc->cred->user_ns);
 		break;
+	case FS_CONTEXT_FOR_RECONFIGURE:
+		/* We don't pin any namespaces as the superblock's
+		 * subscriptions cannot be changed at this point.
+		 */
+		atomic_inc(&reference->d_sb->s_active);
+		fc->root = dget(reference);
+		break;
 	}
 
 	ret = legacy_init_fs_context(fc);
@@ -90,6 +97,15 @@ struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
 }
 EXPORT_SYMBOL(fs_context_for_mount);
 
+struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
+					unsigned int sb_flags,
+					unsigned int sb_flags_mask)
+{
+	return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags,
+				sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE);
+}
+EXPORT_SYMBOL(fs_context_for_reconfigure);
+
 void fc_drop_locked(struct fs_context *fc)
 {
 	struct super_block *sb = fc->root->d_sb;
@@ -99,6 +115,7 @@ void fc_drop_locked(struct fs_context *fc)
 }
 
 static void legacy_fs_context_free(struct fs_context *fc);
+
 /**
  * put_fs_context - Dispose of a superblock configuration context.
  * @fc: The context to dispose of.
@@ -118,8 +135,7 @@ void put_fs_context(struct fs_context *fc)
 		legacy_fs_context_free(fc);
 
 	security_free_mnt_opts(&fc->security);
-	if (fc->net_ns)
-		put_net(fc->net_ns);
+	put_net(fc->net_ns);
 	put_user_ns(fc->user_ns);
 	put_cred(fc->cred);
 	kfree(fc->subtype);
@@ -172,6 +188,21 @@ int legacy_get_tree(struct fs_context *fc)
 	return 0;
 }
 
+/*
+ * Handle remount.
+ */
+int legacy_reconfigure(struct fs_context *fc)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+	struct super_block *sb = fc->root->d_sb;
+
+	if (!sb->s_op->remount_fs)
+		return 0;
+
+	return sb->s_op->remount_fs(sb, &fc->sb_flags,
+				    ctx ? ctx->legacy_data : NULL);
+}
+
 /*
  * Initialise a legacy context for a filesystem that doesn't support
  * fs_context.
diff --git a/fs/internal.h b/fs/internal.h
index 6af26d897034..016a5b8dd305 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -56,6 +56,7 @@ extern void __init chrdev_init(void);
  * fs_context.c
  */
 extern int legacy_get_tree(struct fs_context *fc);
+extern int legacy_reconfigure(struct fs_context *fc);
 extern int parse_monolithic_mount_data(struct fs_context *, void *);
 extern void fc_drop_locked(struct fs_context *);
 
@@ -107,7 +108,7 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
 /*
  * super.c
  */
-extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int reconfigure_super(struct fs_context *);
 extern bool trylock_super(struct super_block *sb);
 extern struct super_block *user_get_super(dev_t);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 750500c6c33d..931228d8518a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1489,6 +1489,29 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 
 static void shrink_submounts(struct mount *mnt);
 
+static int do_umount_root(struct super_block *sb)
+{
+	int ret = 0;
+
+	down_write(&sb->s_umount);
+	if (!sb_rdonly(sb)) {
+		struct fs_context *fc;
+
+		fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
+						SB_RDONLY);
+		if (IS_ERR(fc)) {
+			ret = PTR_ERR(fc);
+		} else {
+			ret = parse_monolithic_mount_data(fc, NULL);
+			if (!ret)
+				ret = reconfigure_super(fc);
+			put_fs_context(fc);
+		}
+	}
+	up_write(&sb->s_umount);
+	return ret;
+}
+
 static int do_umount(struct mount *mnt, int flags)
 {
 	struct super_block *sb = mnt->mnt.mnt_sb;
@@ -1554,11 +1577,7 @@ static int do_umount(struct mount *mnt, int flags)
 		 */
 		if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
 			return -EPERM;
-		down_write(&sb->s_umount);
-		if (!sb_rdonly(sb))
-			retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
-		up_write(&sb->s_umount);
-		return retval;
+		return do_umount_root(sb);
 	}
 
 	namespace_lock();
@@ -2367,7 +2386,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
 	int err;
 	struct super_block *sb = path->mnt->mnt_sb;
 	struct mount *mnt = real_mount(path->mnt);
-	void *sec_opts = NULL;
+	struct fs_context *fc;
 
 	if (!check_mnt(mnt))
 		return -EINVAL;
@@ -2378,24 +2397,22 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
 	if (!can_change_locked_flags(mnt, mnt_flags))
 		return -EPERM;
 
-	if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) {
-		err = security_sb_eat_lsm_opts(data, &sec_opts);
-		if (err)
-			return err;
-	}
-	err = security_sb_remount(sb, sec_opts);
-	security_free_mnt_opts(&sec_opts);
-	if (err)
-		return err;
+	fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
 
-	down_write(&sb->s_umount);
-	err = -EPERM;
-	if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
-		err = do_remount_sb(sb, sb_flags, data, 0);
-		if (!err)
-			set_mount_attributes(mnt, mnt_flags);
+	err = parse_monolithic_mount_data(fc, data);
+	if (!err) {
+		down_write(&sb->s_umount);
+		err = -EPERM;
+		if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+			err = reconfigure_super(fc);
+			if (!err)
+				set_mount_attributes(mnt, mnt_flags);
+		}
+		up_write(&sb->s_umount);
 	}
-	up_write(&sb->s_umount);
+	put_fs_context(fc);
 	return err;
 }
 
diff --git a/fs/super.c b/fs/super.c
index 11e2a6cb3baf..50553233dd15 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -836,28 +836,35 @@ rescan:
 }
 
 /**
- *	do_remount_sb - asks filesystem to change mount options.
- *	@sb:	superblock in question
- *	@sb_flags: revised superblock flags
- *	@data:	the rest of options
- *      @force: whether or not to force the change
+ * reconfigure_super - asks filesystem to change superblock parameters
+ * @fc: The superblock and configuration
  *
- *	Alters the mount options of a mounted file system.
+ * Alters the configuration parameters of a live superblock.
  */
-int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
+int reconfigure_super(struct fs_context *fc)
 {
+	struct super_block *sb = fc->root->d_sb;
 	int retval;
-	int remount_ro;
+	bool remount_ro = false;
+	bool force = fc->sb_flags & SB_FORCE;
 
+	if (fc->sb_flags_mask & ~MS_RMT_MASK)
+		return -EINVAL;
 	if (sb->s_writers.frozen != SB_UNFROZEN)
 		return -EBUSY;
 
+	retval = security_sb_remount(sb, fc->security);
+	if (retval)
+		return retval;
+
+	if (fc->sb_flags_mask & SB_RDONLY) {
 #ifdef CONFIG_BLOCK
-	if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
-		return -EACCES;
+		if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
+			return -EACCES;
 #endif
 
-	remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+		remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+	}
 
 	if (remount_ro) {
 		if (!hlist_empty(&sb->s_pins)) {
@@ -868,13 +875,14 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
 				return 0;
 			if (sb->s_writers.frozen != SB_UNFROZEN)
 				return -EBUSY;
-			remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+			remount_ro = !sb_rdonly(sb);
 		}
 	}
 	shrink_dcache_sb(sb);
 
-	/* If we are remounting RDONLY and current sb is read/write,
-	   make sure there are no rw files opened */
+	/* If we are reconfiguring to RDONLY and current sb is read/write,
+	 * make sure there are no files open for writing.
+	 */
 	if (remount_ro) {
 		if (force) {
 			sb->s_readonly_remount = 1;
@@ -886,17 +894,17 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
 		}
 	}
 
-	if (sb->s_op->remount_fs) {
-		retval = sb->s_op->remount_fs(sb, &sb_flags, data);
-		if (retval) {
-			if (!force)
-				goto cancel_readonly;
-			/* If forced remount, go ahead despite any errors */
-			WARN(1, "forced remount of a %s fs returned %i\n",
-			     sb->s_type->name, retval);
-		}
+	retval = legacy_reconfigure(fc);
+	if (retval) {
+		if (!force)
+			goto cancel_readonly;
+		/* If forced remount, go ahead despite any errors */
+		WARN(1, "forced remount of a %s fs returned %i\n",
+		     sb->s_type->name, retval);
 	}
-	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK);
+
+	WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
+				 (fc->sb_flags & fc->sb_flags_mask)));
 	/* Needs to be ordered wrt mnt_is_readonly() */
 	smp_wmb();
 	sb->s_readonly_remount = 0;
@@ -923,10 +931,15 @@ static void do_emergency_remount_callback(struct super_block *sb)
 	down_write(&sb->s_umount);
 	if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
 	    !sb_rdonly(sb)) {
-		/*
-		 * What lock protects sb->s_flags??
-		 */
-		do_remount_sb(sb, SB_RDONLY, NULL, 1);
+		struct fs_context *fc;
+
+		fc = fs_context_for_reconfigure(sb->s_root,
+					SB_RDONLY | SB_FORCE, SB_RDONLY);
+		if (!IS_ERR(fc)) {
+			if (parse_monolithic_mount_data(fc, NULL) == 0)
+				(void)reconfigure_super(fc);
+			put_fs_context(fc);
+		}
 	}
 	up_write(&sb->s_umount);
 }
@@ -1213,6 +1226,31 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
 }
 EXPORT_SYMBOL(mount_nodev);
 
+static int reconfigure_single(struct super_block *s,
+			      int flags, void *data)
+{
+	struct fs_context *fc;
+	int ret;
+
+	/* The caller really need to be passing fc down into mount_single(),
+	 * then a chunk of this can be removed.  [Bollocks -- AV]
+	 * Better yet, reconfiguration shouldn't happen, but rather the second
+	 * mount should be rejected if the parameters are not compatible.
+	 */
+	fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
+
+	ret = parse_monolithic_mount_data(fc, data);
+	if (ret < 0)
+		goto out;
+
+	ret = reconfigure_super(fc);
+out:
+	put_fs_context(fc);
+	return ret;
+}
+
 static int compare_single(struct super_block *s, void *p)
 {
 	return 1;
@@ -1230,13 +1268,14 @@ struct dentry *mount_single(struct file_system_type *fs_type,
 		return ERR_CAST(s);
 	if (!s->s_root) {
 		error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
-		if (error) {
-			deactivate_locked_super(s);
-			return ERR_PTR(error);
-		}
-		s->s_flags |= SB_ACTIVE;
+		if (!error)
+			s->s_flags |= SB_ACTIVE;
 	} else {
-		do_remount_sb(s, flags, data, 0);
+		error = reconfigure_single(s, flags, data);
+	}
+	if (unlikely(error)) {
+		deactivate_locked_super(s);
+		return ERR_PTR(error);
 	}
 	return dget(s->s_root);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 36fff12ab890..c65d02c5c512 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1337,6 +1337,7 @@ extern int send_sigurg(struct fown_struct *fown);
 
 /* These sb flags are internal to the kernel */
 #define SB_SUBMOUNT     (1<<26)
+#define SB_FORCE    	(1<<27)
 #define SB_NOSEC	(1<<28)
 #define SB_BORN		(1<<29)
 #define SB_ACTIVE	(1<<30)
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 9805514444c9..98772f882a3e 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -25,6 +25,7 @@ struct user_namespace;
 
 enum fs_context_purpose {
 	FS_CONTEXT_FOR_MOUNT,		/* New superblock for explicit mount */
+	FS_CONTEXT_FOR_RECONFIGURE,	/* Superblock reconfiguration (remount) */
 };
 
 /*
@@ -57,6 +58,9 @@ struct fs_context {
  */
 extern struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
 						unsigned int sb_flags);
+extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
+						unsigned int sb_flags,
+						unsigned int sb_flags_mask);
 
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
-- 
cgit v1.2.3


From e1a91586d5da6f879b6dd385a2e7227bf1653570 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Dec 2018 16:25:31 -0500
Subject: fs_context flavour for submounts

This is an eventual replacement for vfs_submount() uses.  Unlike the
"mount" and "remount" cases, the users of that thing are not in VFS -
they are buried in various ->d_automount() instances and rather than
converting them all at once we introduce the (thankfully small and
simple) infrastructure here and deal with the prospective users in
afs, nfs, etc. parts of the series.

Here we just introduce a new constructor (fs_context_for_submount())
along with the corresponding enum constant to be put into fc->purpose
for those.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_context.c            | 10 ++++++++++
 include/linux/fs_context.h |  3 +++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/fs/fs_context.c b/fs/fs_context.c
index 5e2c3aba1dd8..2bd652b6e848 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -69,6 +69,9 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
 	case FS_CONTEXT_FOR_MOUNT:
 		fc->user_ns = get_user_ns(fc->cred->user_ns);
 		break;
+	case FS_CONTEXT_FOR_SUBMOUNT:
+		fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
+		break;
 	case FS_CONTEXT_FOR_RECONFIGURE:
 		/* We don't pin any namespaces as the superblock's
 		 * subscriptions cannot be changed at this point.
@@ -106,6 +109,13 @@ struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
 }
 EXPORT_SYMBOL(fs_context_for_reconfigure);
 
+struct fs_context *fs_context_for_submount(struct file_system_type *type,
+					   struct dentry *reference)
+{
+	return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_submount);
+
 void fc_drop_locked(struct fs_context *fc)
 {
 	struct super_block *sb = fc->root->d_sb;
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 98772f882a3e..7feb018c7a9e 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -25,6 +25,7 @@ struct user_namespace;
 
 enum fs_context_purpose {
 	FS_CONTEXT_FOR_MOUNT,		/* New superblock for explicit mount */
+	FS_CONTEXT_FOR_SUBMOUNT,	/* New superblock for automatic submount */
 	FS_CONTEXT_FOR_RECONFIGURE,	/* Superblock reconfiguration (remount) */
 };
 
@@ -61,6 +62,8 @@ extern struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
 extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
 						unsigned int sb_flags,
 						unsigned int sb_flags_mask);
+extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_type,
+						struct dentry *reference);
 
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
-- 
cgit v1.2.3


From f3a09c92018a91ad0981146a4ac59414f814d801 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Dec 2018 18:55:56 -0500
Subject: introduce fs_context methods

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_context.c            | 28 ++++++++++++++++++++++------
 fs/internal.h              |  2 --
 fs/super.c                 | 36 ++++++++++++++++++++++++++++--------
 include/linux/fs.h         |  2 ++
 include/linux/fs_context.h | 13 +++++++++++++
 5 files changed, 65 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/fs_context.c b/fs/fs_context.c
index 2bd652b6e848..825d1b2c8807 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -51,6 +51,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
 				      unsigned int sb_flags_mask,
 				      enum fs_context_purpose purpose)
 {
+	int (*init_fs_context)(struct fs_context *);
 	struct fs_context *fc;
 	int ret = -ENOMEM;
 
@@ -81,7 +82,12 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
 		break;
 	}
 
-	ret = legacy_init_fs_context(fc);
+	/* TODO: Make all filesystems support this unconditionally */
+	init_fs_context = fc->fs_type->init_fs_context;
+	if (!init_fs_context)
+		init_fs_context = legacy_init_fs_context;
+
+	ret = init_fs_context(fc);
 	if (ret < 0)
 		goto err_fc;
 	fc->need_free = true;
@@ -141,8 +147,8 @@ void put_fs_context(struct fs_context *fc)
 		deactivate_super(sb);
 	}
 
-	if (fc->need_free)
-		legacy_fs_context_free(fc);
+	if (fc->need_free && fc->ops && fc->ops->free)
+		fc->ops->free(fc);
 
 	security_free_mnt_opts(&fc->security);
 	put_net(fc->net_ns);
@@ -180,7 +186,7 @@ static int legacy_parse_monolithic(struct fs_context *fc, void *data)
 /*
  * Get a mountable root with the legacy mount command.
  */
-int legacy_get_tree(struct fs_context *fc)
+static int legacy_get_tree(struct fs_context *fc)
 {
 	struct legacy_fs_context *ctx = fc->fs_private;
 	struct super_block *sb;
@@ -201,7 +207,7 @@ int legacy_get_tree(struct fs_context *fc)
 /*
  * Handle remount.
  */
-int legacy_reconfigure(struct fs_context *fc)
+static int legacy_reconfigure(struct fs_context *fc)
 {
 	struct legacy_fs_context *ctx = fc->fs_private;
 	struct super_block *sb = fc->root->d_sb;
@@ -213,6 +219,13 @@ int legacy_reconfigure(struct fs_context *fc)
 				    ctx ? ctx->legacy_data : NULL);
 }
 
+const struct fs_context_operations legacy_fs_context_ops = {
+	.free			= legacy_fs_context_free,
+	.parse_monolithic	= legacy_parse_monolithic,
+	.get_tree		= legacy_get_tree,
+	.reconfigure		= legacy_reconfigure,
+};
+
 /*
  * Initialise a legacy context for a filesystem that doesn't support
  * fs_context.
@@ -222,10 +235,13 @@ static int legacy_init_fs_context(struct fs_context *fc)
 	fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
 	if (!fc->fs_private)
 		return -ENOMEM;
+	fc->ops = &legacy_fs_context_ops;
 	return 0;
 }
 
 int parse_monolithic_mount_data(struct fs_context *fc, void *data)
 {
-	return legacy_parse_monolithic(fc, data);
+	int (*monolithic_mount_data)(struct fs_context *, void *);
+	monolithic_mount_data = fc->ops->parse_monolithic;
+	return monolithic_mount_data(fc, data);
 }
diff --git a/fs/internal.h b/fs/internal.h
index 016a5b8dd305..8f8d07cc433f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,8 +55,6 @@ extern void __init chrdev_init(void);
 /*
  * fs_context.c
  */
-extern int legacy_get_tree(struct fs_context *fc);
-extern int legacy_reconfigure(struct fs_context *fc);
 extern int parse_monolithic_mount_data(struct fs_context *, void *);
 extern void fc_drop_locked(struct fs_context *);
 
diff --git a/fs/super.c b/fs/super.c
index 50553233dd15..76b3181c782d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -894,13 +894,15 @@ int reconfigure_super(struct fs_context *fc)
 		}
 	}
 
-	retval = legacy_reconfigure(fc);
-	if (retval) {
-		if (!force)
-			goto cancel_readonly;
-		/* If forced remount, go ahead despite any errors */
-		WARN(1, "forced remount of a %s fs returned %i\n",
-		     sb->s_type->name, retval);
+	if (fc->ops->reconfigure) {
+		retval = fc->ops->reconfigure(fc);
+		if (retval) {
+			if (!force)
+				goto cancel_readonly;
+			/* If forced remount, go ahead despite any errors */
+			WARN(1, "forced remount of a %s fs returned %i\n",
+			     sb->s_type->name, retval);
+		}
 	}
 
 	WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
@@ -1294,10 +1296,28 @@ int vfs_get_tree(struct fs_context *fc)
 	struct super_block *sb;
 	int error;
 
-	error = legacy_get_tree(fc);
+	if (fc->fs_type->fs_flags & FS_REQUIRES_DEV && !fc->source)
+		return -ENOENT;
+
+	if (fc->root)
+		return -EBUSY;
+
+	/* Get the mountable root in fc->root, with a ref on the root and a ref
+	 * on the superblock.
+	 */
+	error = fc->ops->get_tree(fc);
 	if (error < 0)
 		return error;
 
+	if (!fc->root) {
+		pr_err("Filesystem %s get_tree() didn't set fc->root\n",
+		       fc->fs_type->name);
+		/* We don't know what the locking state of the superblock is -
+		 * if there is a superblock.
+		 */
+		BUG();
+	}
+
 	sb = fc->root->d_sb;
 	WARN_ON(!sb->s_bdi);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c65d02c5c512..8d578a9e1e8c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -61,6 +61,7 @@ struct workqueue_struct;
 struct iov_iter;
 struct fscrypt_info;
 struct fscrypt_operations;
+struct fs_context;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2173,6 +2174,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
+	int (*init_fs_context)(struct fs_context *);
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 7feb018c7a9e..087c12954360 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -20,8 +20,13 @@ struct cred;
 struct dentry;
 struct file_operations;
 struct file_system_type;
+struct mnt_namespace;
 struct net;
+struct pid_namespace;
+struct super_block;
 struct user_namespace;
+struct vfsmount;
+struct path;
 
 enum fs_context_purpose {
 	FS_CONTEXT_FOR_MOUNT,		/* New superblock for explicit mount */
@@ -39,6 +44,7 @@ enum fs_context_purpose {
  * See Documentation/filesystems/mounting.txt
  */
 struct fs_context {
+	const struct fs_context_operations *ops;
 	struct file_system_type	*fs_type;
 	void			*fs_private;	/* The filesystem's context */
 	struct dentry		*root;		/* The root and superblock */
@@ -54,6 +60,13 @@ struct fs_context {
 	bool			need_free:1;	/* Need to call ops->free() */
 };
 
+struct fs_context_operations {
+	void (*free)(struct fs_context *fc);
+	int (*parse_monolithic)(struct fs_context *fc, void *data);
+	int (*get_tree)(struct fs_context *fc);
+	int (*reconfigure)(struct fs_context *fc);
+};
+
 /*
  * fs_context manipulation functions.
  */
-- 
cgit v1.2.3


From c6b82263f9c6e745eb4c5dfc2578d147c4cd7604 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:23 +0000
Subject: vfs: Introduce logging functions

Introduce a set of logging functions through which informational messages,
warnings and error messages incurred by the mount procedure can be logged
and, in a future patch, passed to userspace instead by way of the
filesystem configuration context file descriptor.

There are four functions:

 (1) infof(const char *fmt, ...);

     Logs an informational message.

 (2) warnf(const char *fmt, ...);

     Logs a warning message.

 (3) errorf(const char *fmt, ...);

     Logs an error message.

 (4) invalf(const char *fmt, ...);

     As errof(), but returns -EINVAL so can be used on a return statement.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs_context.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 087c12954360..d208cc40b868 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -81,4 +81,46 @@ extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_ty
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
 
+#define logfc(FC, FMT, ...) pr_notice(FMT, ## __VA_ARGS__)
+
+/**
+ * infof - Store supplementary informational message
+ * @fc: The context in which to log the informational message
+ * @fmt: The format string
+ *
+ * Store the supplementary informational message for the process if the process
+ * has enabled the facility.
+ */
+#define infof(fc, fmt, ...) ({ logfc(fc, fmt, ## __VA_ARGS__); })
+
+/**
+ * warnf - Store supplementary warning message
+ * @fc: The context in which to log the error message
+ * @fmt: The format string
+ *
+ * Store the supplementary warning message for the process if the process has
+ * enabled the facility.
+ */
+#define warnf(fc, fmt, ...) ({ logfc(fc, fmt, ## __VA_ARGS__); })
+
+/**
+ * errorf - Store supplementary error message
+ * @fc: The context in which to log the error message
+ * @fmt: The format string
+ *
+ * Store the supplementary error message for the process if the process has
+ * enabled the facility.
+ */
+#define errorf(fc, fmt, ...) ({ logfc(fc, fmt, ## __VA_ARGS__); })
+
+/**
+ * invalf - Store supplementary invalid argument error message
+ * @fc: The context in which to log the error message
+ * @fmt: The format string
+ *
+ * Store the supplementary error message for the process if the process has
+ * enabled the facility and return -EINVAL.
+ */
+#define invalf(fc, fmt, ...) ({	errorf(fc, fmt, ## __VA_ARGS__); -EINVAL; })
+
 #endif /* _LINUX_FS_CONTEXT_H */
-- 
cgit v1.2.3


From 459cc69fa4c17caf21de596693d8a07170820a58 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Wed, 30 Jan 2019 12:49:11 +0200
Subject: RDMA: Provide safe ib_alloc_device() function

All callers to ib_alloc_device() provide a larger size than struct
ib_device and rely on the fact that struct ib_device is embedded in their
driver specific structure as the first member.

Provide a safer variant of ib_alloc_device() that checks and enforces this
approach to make sure the drivers are using it right.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c               | 6 +++---
 drivers/infiniband/hw/bnxt_re/main.c           | 2 +-
 drivers/infiniband/hw/cxgb3/iwch.c             | 2 +-
 drivers/infiniband/hw/cxgb4/device.c           | 2 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c     | 2 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c     | 2 +-
 drivers/infiniband/hw/i40iw/i40iw_verbs.c      | 2 +-
 drivers/infiniband/hw/mlx4/main.c              | 2 +-
 drivers/infiniband/hw/mlx5/ib_rep.c            | 2 +-
 drivers/infiniband/hw/mlx5/main.c              | 2 +-
 drivers/infiniband/hw/mthca/mthca_main.c       | 2 +-
 drivers/infiniband/hw/nes/nes_verbs.c          | 2 +-
 drivers/infiniband/hw/ocrdma/ocrdma_main.c     | 2 +-
 drivers/infiniband/hw/qedr/main.c              | 2 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c    | 2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +-
 drivers/infiniband/sw/rdmavt/vt.c              | 2 +-
 drivers/infiniband/sw/rxe/rxe_net.c            | 2 +-
 include/rdma/ib_verbs.h                        | 8 +++++++-
 19 files changed, 27 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 200431c540f2..b511cfa00bdb 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -268,7 +268,7 @@ static struct class ib_class = {
 };
 
 /**
- * ib_alloc_device - allocate an IB device struct
+ * _ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
  *
  * Low-level drivers should use ib_alloc_device() to allocate &struct
@@ -277,7 +277,7 @@ static struct class ib_class = {
  * ib_dealloc_device() must be used to free structures allocated with
  * ib_alloc_device().
  */
-struct ib_device *ib_alloc_device(size_t size)
+struct ib_device *_ib_alloc_device(size_t size)
 {
 	struct ib_device *device;
 
@@ -303,7 +303,7 @@ struct ib_device *ib_alloc_device(size_t size)
 
 	return device;
 }
-EXPORT_SYMBOL(ib_alloc_device);
+EXPORT_SYMBOL(_ib_alloc_device);
 
 /**
  * ib_dealloc_device - free an IB device struct
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 16eecfa5882c..08777506d0b1 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -688,7 +688,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
 	struct bnxt_re_dev *rdev;
 
 	/* Allocate bnxt_re_dev instance here */
-	rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev));
+	rdev = ib_alloc_device(bnxt_re_dev, ibdev);
 	if (!rdev) {
 		dev_err(NULL, "%s: bnxt_re_dev allocation failure!",
 			ROCE_DRV_MODULE_NAME);
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
index 591de319c178..fb03bc492ef7 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -146,7 +146,7 @@ static void open_rnic_dev(struct t3cdev *tdev)
 
 	pr_debug("%s t3cdev %p\n", __func__,  tdev);
 	pr_info_once("Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION);
-	rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
+	rnicp = ib_alloc_device(iwch_dev, ibdev);
 	if (!rnicp) {
 		pr_err("Cannot allocate ib device\n");
 		return;
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 9c10fff6dcfb..4b4e2464b705 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -966,7 +966,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
 		pr_info("%s: On-Chip Queues not supported on this device\n",
 			pci_name(infop->pdev));
 
-	devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));
+	devp = ib_alloc_device(c4iw_dev, ibdev);
 	if (!devp) {
 		pr_err("Cannot allocate ib device\n");
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index b74c742b000c..fa08c22aad66 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -5002,7 +5002,7 @@ static int hns_roce_probe(struct platform_device *pdev)
 	struct hns_roce_dev *hr_dev;
 	struct device *dev = &pdev->dev;
 
-	hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev));
+	hr_dev = ib_alloc_device(hns_roce_dev, ib_dev);
 	if (!hr_dev)
 		return -ENOMEM;
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 5c483b437bdd..48a5d6548cd4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -6059,7 +6059,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
 	struct hns_roce_dev *hr_dev;
 	int ret;
 
-	hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev));
+	hr_dev = ib_alloc_device(hns_roce_dev, ib_dev);
 	if (!hr_dev)
 		return -ENOMEM;
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 12b31a8440be..d4ab46dd9e6c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -2762,7 +2762,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
 	struct net_device *netdev = iwdev->netdev;
 	struct pci_dev *pcidev = (struct pci_dev *)iwdev->hw.dev_context;
 
-	iwibdev = (struct i40iw_ib_device *)ib_alloc_device(sizeof(*iwibdev));
+	iwibdev = ib_alloc_device(i40iw_ib_device, ibdev);
 	if (!iwibdev) {
 		i40iw_pr_err("iwdev == NULL\n");
 		return NULL;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index dc2ffd293a11..d66002a31000 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2635,7 +2635,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	if (num_ports == 0)
 		return NULL;
 
-	ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+	ibdev = ib_alloc_device(mlx4_ib_dev, ib_dev);
 	if (!ibdev) {
 		dev_err(&dev->persist->pdev->dev,
 			"Device struct alloc failed\n");
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 46a9ddc8ca56..6d7b8bad4b61 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -70,7 +70,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
 	struct mlx5_ib_dev *ibdev;
 
-	ibdev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*ibdev));
+	ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
 	if (!ibdev)
 		return -ENOMEM;
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 96bf1b2f9dd7..8161acda64e6 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6508,7 +6508,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
 		return mlx5_ib_add_slave_port(mdev);
 
-	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
+	dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
 	if (!dev)
 		return NULL;
 
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 92c49bff22bc..fe9654a7af71 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -961,7 +961,7 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
 	/* We can handle large RDMA requests, so allow larger segments. */
 	dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024);
 
-	mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+	mdev = ib_alloc_device(mthca_dev, ib_dev);
 	if (!mdev) {
 		dev_err(&pdev->dev, "Device struct alloc failed, "
 			"aborting.\n");
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 034156f7e9ed..6eb991d40035 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -3669,7 +3669,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
 	struct nes_vnic *nesvnic = netdev_priv(netdev);
 	struct nes_device *nesdev = nesvnic->nesdev;
 
-	nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device));
+	nesibdev = ib_alloc_device(nes_ib_device, ibdev);
 	if (nesibdev == NULL) {
 		return NULL;
 	}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index b0491b9ecfe4..88970a6bb555 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -297,7 +297,7 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
 	u8 lstate = 0;
 	struct ocrdma_dev *dev;
 
-	dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev));
+	dev = ib_alloc_device(ocrdma_dev, ibdev);
 	if (!dev) {
 		pr_err("Unable to allocate ib device\n");
 		return NULL;
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index f85e72b65a10..878e9e23652b 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -853,7 +853,7 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev,
 	struct qedr_dev *dev;
 	int rc = 0;
 
-	dev = (struct qedr_dev *)ib_alloc_device(sizeof(*dev));
+	dev = ib_alloc_device(qedr_dev, ibdev);
 	if (!dev) {
 		pr_err("Unable to allocate ib device\n");
 		return NULL;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index 3201dd1899c7..0c0a288cb585 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -372,7 +372,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 	usnic_dbg("\n");
 	netdev = pci_get_drvdata(dev);
 
-	us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev));
+	us_ibdev = ib_alloc_device(usnic_ib_dev, ib_dev);
 	if (!us_ibdev) {
 		usnic_err("Device %s context alloc failed\n",
 				netdev_name(pci_get_drvdata(dev)));
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index a5f02276d903..e582beaf9430 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -795,7 +795,7 @@ static int pvrdma_pci_probe(struct pci_dev *pdev,
 	dev_dbg(&pdev->dev, "initializing driver %s\n", pci_name(pdev));
 
 	/* Allocate zero-out device */
-	dev = (struct pvrdma_dev *)ib_alloc_device(sizeof(*dev));
+	dev = ib_alloc_device(pvrdma_dev, ib_dev);
 	if (!dev) {
 		dev_err(&pdev->dev, "failed to allocate IB device\n");
 		return -ENOMEM;
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 7de7389d0235..b3f0c5578925 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -91,7 +91,7 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports)
 {
 	struct rvt_dev_info *rdi;
 
-	rdi = (struct rvt_dev_info *)ib_alloc_device(size);
+	rdi = container_of(_ib_alloc_device(size), struct rvt_dev_info, ibdev);
 	if (!rdi)
 		return rdi;
 
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 8fd03ae20efc..19f3c69916b1 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -555,7 +555,7 @@ struct rxe_dev *rxe_net_add(struct net_device *ndev)
 	int err;
 	struct rxe_dev *rxe = NULL;
 
-	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
+	rxe = ib_alloc_device(rxe_dev, ib_dev);
 	if (!rxe)
 		return NULL;
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 71ea144ec823..a1a1e710642c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2621,7 +2621,13 @@ struct ib_client {
 	struct list_head list;
 };
 
-struct ib_device *ib_alloc_device(size_t size);
+struct ib_device *_ib_alloc_device(size_t size);
+#define ib_alloc_device(drv_struct, member)                                    \
+	container_of(_ib_alloc_device(sizeof(struct drv_struct) +              \
+				      BUILD_BUG_ON_ZERO(offsetof(              \
+					      struct drv_struct, member))),    \
+		     struct drv_struct, member)
+
 void ib_dealloc_device(struct ib_device *device);
 
 void ib_get_device_fw_str(struct ib_device *device, char *str);
-- 
cgit v1.2.3


From 6780c4fa9d6e091b2f206ac429a40e2e8d2e45f3 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galpress@amazon.com>
Date: Tue, 22 Jan 2019 10:08:22 +0200
Subject: RDMA: Add indication for in kernel API support to IB device

Drivers that do not provide kernel verbs support should not be used by ib
kernel clients at all.

In case a device does not implement all mandatory verbs for kverbs usage
mark it as a non kverbs provider and prevent its usage for all clients
except for uverbs.

The device is marked as a non kverbs provider using the 'kverbs_provider'
flag which should only be set by the core code.  The clients can choose
whether kverbs are requested for its usage using the 'no_kverbs_req' flag
which is currently set for uverbs only.

This patch allows drivers to remove mandatory verbs stubs and simply set
the callbacks to NULL. The IB device will be registered as a non-kverbs
provider. Note that verbs that are required for the device registration
process must be implemented.

Signed-off-by: Gal Pressman <galpress@amazon.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c      | 10 ++++++----
 drivers/infiniband/core/uverbs_main.c |  1 +
 include/rdma/ib_verbs.h               |  5 +++++
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b511cfa00bdb..9d2e108235e9 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -121,13 +121,12 @@ static int ib_device_check_mandatory(struct ib_device *device)
 	};
 	int i;
 
+	device->kverbs_provider = true;
 	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
 		if (!*(void **) ((void *) &device->ops +
 				 mandatory_table[i].offset)) {
-			dev_warn(&device->dev,
-				 "Device is missing mandatory function %s\n",
-				 mandatory_table[i].name);
-			return -EINVAL;
+			device->kverbs_provider = false;
+			break;
 		}
 	}
 
@@ -325,6 +324,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
 {
 	struct ib_client_data *context;
 
+	if (!device->kverbs_provider && !client->no_kverbs_req)
+		return -EOPNOTSUPP;
+
 	context = kmalloc(sizeof(*context), GFP_KERNEL);
 	if (!context)
 		return -ENOMEM;
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 996f167d1436..d628747e058c 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -1151,6 +1151,7 @@ static const struct file_operations uverbs_mmap_fops = {
 
 static struct ib_client uverbs_client = {
 	.name   = "uverbs",
+	.no_kverbs_req = true,
 	.add    = ib_uverbs_add_one,
 	.remove = ib_uverbs_remove_one
 };
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index a1a1e710642c..4183a03b46b5 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2565,6 +2565,8 @@ struct ib_device {
 	__be64			     node_guid;
 	u32			     local_dma_lkey;
 	u16                          is_switch:1;
+	/* Indicates kernel verbs support, should not be used in drivers */
+	u16                          kverbs_provider:1;
 	u8                           node_type;
 	u8                           phys_port_cnt;
 	struct ib_device_attr        attrs;
@@ -2619,6 +2621,9 @@ struct ib_client {
 			const struct sockaddr *addr,
 			void *client_data);
 	struct list_head list;
+
+	/* kverbs are not required by the client */
+	u8 no_kverbs_req:1;
 };
 
 struct ib_device *_ib_alloc_device(size_t size);
-- 
cgit v1.2.3


From 0ad699c0edc97a864177679dd67f2ccd73b07cb7 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Wed, 30 Jan 2019 12:48:58 +0200
Subject: RDMA/core: Simplify restrack interface

In the current implementation, we have one restrack root per-device and
all users are simply providing it directly. Let's simplify the interface
and have callers provide the ib_device and internally access the
restrack_root.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c   |  4 ++--
 drivers/infiniband/core/nldev.c    |  4 ++--
 drivers/infiniband/core/restrack.c | 26 +++++++++++++++++++++-----
 include/rdma/restrack.h            | 23 ++++-------------------
 4 files changed, 29 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9d2e108235e9..919e94ff4b25 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -287,7 +287,7 @@ struct ib_device *_ib_alloc_device(size_t size)
 	if (!device)
 		return NULL;
 
-	rdma_restrack_init(&device->res);
+	rdma_restrack_init(device);
 
 	device->dev.class = &ib_class;
 	device_initialize(&device->dev);
@@ -315,7 +315,7 @@ void ib_dealloc_device(struct ib_device *device)
 	WARN_ON(!list_empty(&device->client_data_list));
 	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
 		device->reg_state != IB_DEV_UNINITIALIZED);
-	rdma_restrack_clean(&device->res);
+	rdma_restrack_clean(device);
 	put_device(&device->dev);
 }
 EXPORT_SYMBOL(ib_dealloc_device);
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 1742ff4fbf79..ee98fc9058b1 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -314,7 +314,6 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
 		[RDMA_RESTRACK_CTX] = "ctx",
 	};
 
-	struct rdma_restrack_root *res = &device->res;
 	struct nlattr *table_attr;
 	int ret, i, curr;
 
@@ -328,7 +327,8 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
 	for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
 		if (!names[i])
 			continue;
-		curr = rdma_restrack_count(res, i, task_active_pid_ns(current));
+		curr = rdma_restrack_count(device, i,
+					   task_active_pid_ns(current));
 		ret = fill_res_info_entry(msg, names[i], curr);
 		if (ret)
 			goto err;
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 46a5c553c624..0ade3da0a5c7 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -18,8 +18,14 @@ static int fill_res_noop(struct sk_buff *msg,
 	return 0;
 }
 
-void rdma_restrack_init(struct rdma_restrack_root *res)
+/**
+ * rdma_restrack_init() - initialize resource tracking
+ * @dev:  IB device
+ */
+void rdma_restrack_init(struct ib_device *dev)
 {
+	struct rdma_restrack_root *res = &dev->res;
+
 	init_rwsem(&res->rwsem);
 	res->fill_res_entry = fill_res_noop;
 }
@@ -38,11 +44,15 @@ static const char *type2str(enum rdma_restrack_type type)
 	return names[type];
 };
 
-void rdma_restrack_clean(struct rdma_restrack_root *res)
+/**
+ * rdma_restrack_clean() - clean resource tracking
+ * @dev:  IB device
+ */
+void rdma_restrack_clean(struct ib_device *dev)
 {
+	struct rdma_restrack_root *res = &dev->res;
 	struct rdma_restrack_entry *e;
 	char buf[TASK_COMM_LEN];
-	struct ib_device *dev;
 	const char *owner;
 	int bkt;
 
@@ -72,10 +82,16 @@ void rdma_restrack_clean(struct rdma_restrack_root *res)
 	pr_err("restrack: %s", CUT_HERE);
 }
 
-int rdma_restrack_count(struct rdma_restrack_root *res,
-			enum rdma_restrack_type type,
+/**
+ * rdma_restrack_count() - the current usage of specific object
+ * @dev:  IB device
+ * @type: actual type of object to operate
+ * @ns:   PID namespace
+ */
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
 			struct pid_namespace *ns)
 {
+	struct rdma_restrack_root *res = &dev->res;
 	struct rdma_restrack_entry *e;
 	u32 cnt = 0;
 
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index 8f179be9d9a9..f756fc48eee5 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -49,6 +49,7 @@ enum rdma_restrack_type {
 };
 
 #define RDMA_RESTRACK_HASH_BITS	8
+struct ib_device;
 struct rdma_restrack_entry;
 
 /**
@@ -122,25 +123,9 @@ struct rdma_restrack_entry {
 	bool			user;
 };
 
-/**
- * rdma_restrack_init() - initialize resource tracking
- * @res:  resource tracking root
- */
-void rdma_restrack_init(struct rdma_restrack_root *res);
-
-/**
- * rdma_restrack_clean() - clean resource tracking
- * @res:  resource tracking root
- */
-void rdma_restrack_clean(struct rdma_restrack_root *res);
-
-/**
- * rdma_restrack_count() - the current usage of specific object
- * @res:  resource entry
- * @type: actual type of object to operate
- * @ns:   PID namespace
- */
-int rdma_restrack_count(struct rdma_restrack_root *res,
+void rdma_restrack_init(struct ib_device *dev);
+void rdma_restrack_clean(struct ib_device *dev);
+int rdma_restrack_count(struct ib_device *dev,
 			enum rdma_restrack_type type,
 			struct pid_namespace *ns);
 
-- 
cgit v1.2.3


From 02da37509705d3ba6a58fe4799a0caf6b4baecb0 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Wed, 30 Jan 2019 12:49:02 +0200
Subject: RDMA/core: Use the ops infrastructure to keep all callbacks in one
 place

As preparation to hide rdma_restrack_root, refactor the code to use the
ops structure instead of a special callback which is hidden in
rdma_restrack_root.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c       |  1 +
 drivers/infiniband/core/nldev.c        | 28 ++++++++++++++++++----------
 drivers/infiniband/core/restrack.c     |  7 -------
 drivers/infiniband/hw/cxgb4/provider.c |  2 +-
 include/rdma/ib_verbs.h                |  5 +++++
 include/rdma/restrack.h                |  7 -------
 6 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 919e94ff4b25..b9f725df4195 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1263,6 +1263,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, disassociate_ucontext);
 	SET_DEVICE_OP(dev_ops, drain_rq);
 	SET_DEVICE_OP(dev_ops, drain_sq);
+	SET_DEVICE_OP(dev_ops, fill_res_entry);
 	SET_DEVICE_OP(dev_ops, get_dev_fw_str);
 	SET_DEVICE_OP(dev_ops, get_dma_mr);
 	SET_DEVICE_OP(dev_ops, get_hw_stats);
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index ee98fc9058b1..25a248847575 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -361,11 +361,19 @@ static int fill_res_name_pid(struct sk_buff *msg,
 	return 0;
 }
 
+static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg,
+			   struct rdma_restrack_entry *res)
+{
+	if (!dev->ops.fill_res_entry)
+		return false;
+	return dev->ops.fill_res_entry(msg, res);
+}
+
 static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_qp *qp = container_of(res, struct ib_qp, res);
-	struct rdma_restrack_root *resroot = &qp->device->res;
+	struct ib_device *dev = qp->device;
 	struct ib_qp_init_attr qp_init_attr;
 	struct nlattr *entry_attr;
 	struct ib_qp_attr qp_attr;
@@ -415,7 +423,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
 	nla_nest_end(msg, entry_attr);
@@ -432,7 +440,7 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
 {
 	struct rdma_id_private *id_priv =
 				container_of(res, struct rdma_id_private, res);
-	struct rdma_restrack_root *resroot = &id_priv->id.device->res;
+	struct ib_device *dev = id_priv->id.device;
 	struct rdma_cm_id *cm_id = &id_priv->id;
 	struct nlattr *entry_attr;
 
@@ -474,7 +482,7 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
 	nla_nest_end(msg, entry_attr);
@@ -490,7 +498,7 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_cq *cq = container_of(res, struct ib_cq, res);
-	struct rdma_restrack_root *resroot = &cq->device->res;
+	struct ib_device *dev = cq->device;
 	struct nlattr *entry_attr;
 
 	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY);
@@ -511,7 +519,7 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
 	nla_nest_end(msg, entry_attr);
@@ -527,7 +535,7 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_mr *mr = container_of(res, struct ib_mr, res);
-	struct rdma_restrack_root *resroot = &mr->pd->device->res;
+	struct ib_device *dev = mr->pd->device;
 	struct nlattr *entry_attr;
 
 	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY);
@@ -548,7 +556,7 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
 	nla_nest_end(msg, entry_attr);
@@ -564,7 +572,7 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_pd *pd = container_of(res, struct ib_pd, res);
-	struct rdma_restrack_root *resroot = &pd->device->res;
+	struct ib_device *dev = pd->device;
 	struct nlattr *entry_attr;
 
 	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY);
@@ -591,7 +599,7 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
 	nla_nest_end(msg, entry_attr);
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index bd7770ed4174..f80b37d437ac 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -12,12 +12,6 @@
 
 #include "cma_priv.h"
 
-static int fill_res_noop(struct sk_buff *msg,
-			 struct rdma_restrack_entry *entry)
-{
-	return 0;
-}
-
 /**
  * rdma_restrack_init() - initialize resource tracking
  * @dev:  IB device
@@ -27,7 +21,6 @@ void rdma_restrack_init(struct ib_device *dev)
 	struct rdma_restrack_root *res = &dev->res;
 
 	init_rwsem(&res->rwsem);
-	res->fill_res_entry = fill_res_noop;
 }
 
 static const char *type2str(enum rdma_restrack_type type)
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index f977f8e7e162..cb5b713bbf39 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -549,6 +549,7 @@ static const struct ib_device_ops c4iw_dev_ops = {
 	.destroy_cq = c4iw_destroy_cq,
 	.destroy_qp = c4iw_destroy_qp,
 	.destroy_srq = c4iw_destroy_srq,
+	.fill_res_entry = fill_res_entry,
 	.get_dev_fw_str = get_dev_fw_str,
 	.get_dma_mr = c4iw_get_dma_mr,
 	.get_hw_stats = c4iw_get_mib,
@@ -629,7 +630,6 @@ void c4iw_register_device(struct work_struct *work)
 	dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref;
 	dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref;
 	dev->ibdev.iwcm->get_qp = c4iw_get_qp;
-	dev->ibdev.res.fill_res_entry = fill_res_entry;
 	memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name,
 	       sizeof(dev->ibdev.iwcm->ifname));
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 4183a03b46b5..5fc3be884444 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2510,6 +2510,11 @@ struct ib_device_ops {
 	 */
 	int (*init_port)(struct ib_device *device, u8 port_num,
 			 struct kobject *port_sysfs);
+	/**
+	 * Allows rdma drivers to add their own restrack attributes.
+	 */
+	int (*fill_res_entry)(struct sk_buff *msg,
+			      struct rdma_restrack_entry *entry);
 };
 
 struct ib_device {
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index f756fc48eee5..cc66cc7a11d3 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -65,13 +65,6 @@ struct rdma_restrack_root {
 	 * @hash: global database for all resources per-device
 	 */
 	DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS);
-	/**
-	 * @fill_res_entry: driver-specific fill function
-	 *
-	 * Allows rdma drivers to add their own restrack attributes.
-	 */
-	int (*fill_res_entry)(struct sk_buff *msg,
-			      struct rdma_restrack_entry *entry);
 };
 
 /**
-- 
cgit v1.2.3


From 03535e7a3a9937da99ee18304309e0574d2504fc Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Thu, 24 Jan 2019 06:47:49 -0200
Subject: media: vb2: vb2_find_timestamp: drop restriction on buffer state

There really is no reason why vb2_find_timestamp can't just find
buffers in any state. Drop that part of the test.

This also means that vb->timestamp should only be set to 0 when
the driver doesn't copy timestamps.

This change allows for more efficient pipelining (i.e. you can use
a buffer for a reference frame even when it is queued).

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Reviewed-by: Alexandre Courbot <acourbot@chromium.org>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/common/videobuf2/videobuf2-v4l2.c | 11 +++--------
 include/media/videobuf2-v4l2.h                  |  3 +--
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index 75ea90e795d8..2f3b3ca5bde6 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -567,7 +567,7 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, struct vb2_plane *planes)
 	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
 	unsigned int plane;
 
-	if (!vb->vb2_queue->is_output || !vb->vb2_queue->copy_timestamp)
+	if (!vb->vb2_queue->copy_timestamp)
 		vb->timestamp = 0;
 
 	for (plane = 0; plane < vb->num_planes; ++plane) {
@@ -594,14 +594,9 @@ int vb2_find_timestamp(const struct vb2_queue *q, u64 timestamp,
 {
 	unsigned int i;
 
-	for (i = start_idx; i < q->num_buffers; i++) {
-		struct vb2_buffer *vb = q->bufs[i];
-
-		if ((vb->state == VB2_BUF_STATE_DEQUEUED ||
-		     vb->state == VB2_BUF_STATE_DONE) &&
-		    vb->timestamp == timestamp)
+	for (i = start_idx; i < q->num_buffers; i++)
+		if (q->bufs[i]->timestamp == timestamp)
 			return i;
-	}
 	return -1;
 }
 EXPORT_SYMBOL_GPL(vb2_find_timestamp);
diff --git a/include/media/videobuf2-v4l2.h b/include/media/videobuf2-v4l2.h
index a9961bc776dc..8a10889dc2fd 100644
--- a/include/media/videobuf2-v4l2.h
+++ b/include/media/videobuf2-v4l2.h
@@ -59,8 +59,7 @@ struct vb2_v4l2_buffer {
  * vb2_find_timestamp() - Find buffer with given timestamp in the queue
  *
  * @q:		pointer to &struct vb2_queue with videobuf2 queue.
- * @timestamp:	the timestamp to find. Only buffers in state DEQUEUED or DONE
- *		are considered.
+ * @timestamp:	the timestamp to find.
  * @start_idx:	the start index (usually 0) in the buffer array to start
  *		searching from. Note that there may be multiple buffers
  *		with the same timestamp value, so you can restart the search
-- 
cgit v1.2.3


From 28d77c21cbeb2c6039d48ef88401b87a56a7a07f Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Wed, 16 Jan 2019 10:01:13 -0200
Subject: media: vb2: add buf_out_validate callback

When queueing a buffer to a request the 'field' value is not validated.
That field is only validated when the _buf_prepare() is called,
which happens when the request is queued.

However, this validation should happen at QBUF time, since you want
to know about this as soon as possible. Also, the spec requires that
the 'field' value is validated at QBUF time.

This patch adds a new buf_out_validate callback to validate the
output buffer at buf_prepare time or when QBUF queues an unprepared
buffer to a request. This callback is mandatory for output queues
that support requests.

This issue was found by v4l2-compliance since it failed to replace
V4L2_FIELD_ANY by a proper field value when testing the vivid video
output in combination with requests.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/common/videobuf2/videobuf2-core.c | 22 +++++++++++++++++++---
 include/media/videobuf2-core.h                  |  5 +++++
 2 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index ce9294a635cc..e07b6bdb6982 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -499,9 +499,9 @@ static int __vb2_queue_free(struct vb2_queue *q, unsigned int buffers)
 			pr_info("     buf_init: %u buf_cleanup: %u buf_prepare: %u buf_finish: %u\n",
 				vb->cnt_buf_init, vb->cnt_buf_cleanup,
 				vb->cnt_buf_prepare, vb->cnt_buf_finish);
-			pr_info("     buf_queue: %u buf_done: %u buf_request_complete: %u\n",
-				vb->cnt_buf_queue, vb->cnt_buf_done,
-				vb->cnt_buf_request_complete);
+			pr_info("     buf_out_validate: %u buf_queue: %u buf_done: %u buf_request_complete: %u\n",
+				vb->cnt_buf_out_validate, vb->cnt_buf_queue,
+				vb->cnt_buf_done, vb->cnt_buf_request_complete);
 			pr_info("     alloc: %u put: %u prepare: %u finish: %u mmap: %u\n",
 				vb->cnt_mem_alloc, vb->cnt_mem_put,
 				vb->cnt_mem_prepare, vb->cnt_mem_finish,
@@ -1277,6 +1277,14 @@ static int __buf_prepare(struct vb2_buffer *vb)
 		return 0;
 	WARN_ON(vb->synced);
 
+	if (q->is_output) {
+		ret = call_vb_qop(vb, buf_out_validate, vb);
+		if (ret) {
+			dprintk(1, "buffer validation failed\n");
+			return ret;
+		}
+	}
+
 	vb->state = VB2_BUF_STATE_PREPARING;
 
 	switch (q->memory) {
@@ -1523,6 +1531,14 @@ int vb2_core_qbuf(struct vb2_queue *q, unsigned int index, void *pb,
 			return -EINVAL;
 		}
 
+		if (q->is_output && !vb->prepared) {
+			ret = call_vb_qop(vb, buf_out_validate, vb);
+			if (ret) {
+				dprintk(1, "buffer validation failed\n");
+				return ret;
+			}
+		}
+
 		media_request_object_init(&vb->req_obj);
 
 		/* Make sure the request is in a safe state for updating. */
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 4a737b2c610b..4849b865b908 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -296,6 +296,7 @@ struct vb2_buffer {
 	u32		cnt_mem_num_users;
 	u32		cnt_mem_mmap;
 
+	u32		cnt_buf_out_validate;
 	u32		cnt_buf_init;
 	u32		cnt_buf_prepare;
 	u32		cnt_buf_finish;
@@ -342,6 +343,9 @@ struct vb2_buffer {
  * @wait_finish:	reacquire all locks released in the previous callback;
  *			required to continue operation after sleeping while
  *			waiting for a new buffer to arrive.
+ * @buf_out_validate:	called when the output buffer is prepared or queued
+ *			to a request; drivers can use this to validate
+ *			userspace-provided information; optional.
  * @buf_init:		called once after allocating a buffer (in MMAP case)
  *			or after acquiring a new USERPTR buffer; drivers may
  *			perform additional buffer-related initialization;
@@ -409,6 +413,7 @@ struct vb2_ops {
 	void (*wait_prepare)(struct vb2_queue *q);
 	void (*wait_finish)(struct vb2_queue *q);
 
+	int (*buf_out_validate)(struct vb2_buffer *vb);
 	int (*buf_init)(struct vb2_buffer *vb);
 	int (*buf_prepare)(struct vb2_buffer *vb);
 	void (*buf_finish)(struct vb2_buffer *vb);
-- 
cgit v1.2.3


From ddf922c31fedd19c5b89a269c35e5c8b68c64327 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 23 Jan 2019 19:21:01 -0800
Subject: IB/hfi1, IB/rdmavt: Allow for extending of QP's s_ack_queue

The OPFN protocol uses the COMPARE_SWAP request to exchange data
between the requester and the responder and therefore needs to
be stored in the QP's s_ack_queue when the request is received
on the responder side. However, because the user does not know
anything about the OPFN protocol, this extra entry in the
queue cannot be advertised to the user. This patch adds an extra
entry in a QP's s_ack_queue.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/rc.c    | 13 +++++++------
 drivers/infiniband/hw/hfi1/verbs.c |  1 +
 include/rdma/rdma_vt.h             | 10 +++++++++-
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 940e9236c328..8970fc7ffd4b 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -122,7 +122,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 		 * response has been sent instead of only being
 		 * constructed.
 		 */
-		if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+		if (++qp->s_tail_ack_queue >
+		    rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 			qp->s_tail_ack_queue = 0;
 		/* FALLTHROUGH */
 	case OP(SEND_ONLY):
@@ -1818,7 +1819,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		if (i)
 			prev = i - 1;
 		else
-			prev = HFI1_MAX_RDMA_ATOMIC;
+			prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
 		if (prev == qp->r_head_ack_queue) {
 			e = NULL;
 			break;
@@ -1942,7 +1943,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
 	unsigned next;
 
 	next = n + 1;
-	if (next > HFI1_MAX_RDMA_ATOMIC)
+	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 		next = 0;
 	qp->s_tail_ack_queue = next;
 	qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -2298,8 +2299,8 @@ send_last:
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
 			goto nack_inv;
 		next = qp->r_head_ack_queue + 1;
-		/* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
-		if (next > HFI1_MAX_RDMA_ATOMIC)
+		/* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
+		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 			next = 0;
 		spin_lock_irqsave(&qp->s_lock, flags);
 		if (unlikely(next == qp->s_tail_ack_queue)) {
@@ -2373,7 +2374,7 @@ send_last:
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
 			goto nack_inv;
 		next = qp->r_head_ack_queue + 1;
-		if (next > HFI1_MAX_RDMA_ATOMIC)
+		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 			next = 0;
 		spin_lock_irqsave(&qp->s_lock, flags);
 		if (unlikely(next == qp->s_tail_ack_queue)) {
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index c980345cf1e1..ec3899c0874c 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1735,6 +1735,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
 	dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
 	dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
+	dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1;
 
 	/* post send table */
 	dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index acb3bc96dfa7..168e40be183c 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -182,6 +182,7 @@ struct rvt_driver_params {
 	u32 max_mad_size;
 	u8 qos_shift;
 	u8 max_rdma_atomic;
+	u8 extra_rdma_atomic;
 	u8 reserved_operations;
 };
 
@@ -519,7 +520,14 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi)
  */
 static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi)
 {
-	return rdi->dparms.max_rdma_atomic + 1;
+	return rdi->dparms.max_rdma_atomic +
+		rdi->dparms.extra_rdma_atomic + 1;
+}
+
+static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi)
+{
+	return rdi->dparms.max_rdma_atomic +
+		rdi->dparms.extra_rdma_atomic;
 }
 
 /*
-- 
cgit v1.2.3


From a0ce2f0aa6ad97c3d4927bf2ca54bcebdf062d55 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 23 Jan 2019 15:19:17 +0100
Subject: splice: don't merge into linked buffers

Before this patch, it was possible for two pipes to affect each other after
data had been transferred between them with tee():

============
$ cat tee_test.c

int main(void) {
  int pipe_a[2];
  if (pipe(pipe_a)) err(1, "pipe");
  int pipe_b[2];
  if (pipe(pipe_b)) err(1, "pipe");
  if (write(pipe_a[1], "abcd", 4) != 4) err(1, "write");
  if (tee(pipe_a[0], pipe_b[1], 2, 0) != 2) err(1, "tee");
  if (write(pipe_b[1], "xx", 2) != 2) err(1, "write");

  char buf[5];
  if (read(pipe_a[0], buf, 4) != 4) err(1, "read");
  buf[4] = 0;
  printf("got back: '%s'\n", buf);
}
$ gcc -o tee_test tee_test.c
$ ./tee_test
got back: 'abxx'
$
============

As suggested by Al Viro, fix it by creating a separate type for
non-mergeable pipe buffers, then changing the types of buffers in
splice_pipe_to_pipe() and link_pipe().

Cc: <stable@vger.kernel.org>
Fixes: 7c77f0b3f920 ("splice: implement pipe to pipe splicing")
Fixes: 70524490ee2e ("[PATCH] splice: add support for sys_tee()")
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c                 | 14 ++++++++++++++
 fs/splice.c               |  4 ++++
 include/linux/pipe_fs_i.h |  1 +
 3 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..c51750ed4011 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -234,6 +234,14 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
+static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
+	.can_merge = 0,
+	.confirm = generic_pipe_buf_confirm,
+	.release = anon_pipe_buf_release,
+	.steal = anon_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
 static const struct pipe_buf_operations packet_pipe_buf_ops = {
 	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
@@ -242,6 +250,12 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
+void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
+{
+	if (buf->ops == &anon_pipe_buf_ops)
+		buf->ops = &anon_pipe_buf_nomerge_ops;
+}
+
 static ssize_t
 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 {
diff --git a/fs/splice.c b/fs/splice.c
index de2ede048473..90c29675d573 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1597,6 +1597,8 @@ retry:
 			 */
 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 
+			pipe_buf_mark_unmergeable(obuf);
+
 			obuf->len = len;
 			opipe->nrbufs++;
 			ibuf->offset += obuf->len;
@@ -1671,6 +1673,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 */
 		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 
+		pipe_buf_mark_unmergeable(obuf);
+
 		if (obuf->len > len)
 			obuf->len = len;
 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 5a3bb3b7c9ad..3ecd7ea212ae 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -182,6 +182,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
 void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
+void pipe_buf_mark_unmergeable(struct pipe_buffer *buf);
 
 extern const struct pipe_buf_operations nosteal_pipe_buf_ops;
 
-- 
cgit v1.2.3


From 01e7187b41191376cee8bea8de9f907b001e87b4 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 23 Jan 2019 15:19:18 +0100
Subject: pipe: stop using ->can_merge

Al Viro pointed out that since there is only one pipe buffer type to which
new data can be appended, it isn't necessary to have a ->can_merge field in
struct pipe_buf_operations, we can just check for a magic type.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c                 | 20 ++++++++++++++++----
 fs/splice.c               |  4 ----
 include/linux/pipe_fs_i.h |  7 -------
 kernel/relay.c            |  1 -
 kernel/trace/trace.c      |  2 --
 net/smc/smc_rx.c          |  1 -
 6 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/fs/pipe.c b/fs/pipe.c
index c51750ed4011..0ff09b490ddf 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -226,8 +226,8 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 }
 EXPORT_SYMBOL(generic_pipe_buf_release);
 
+/* New data written to a pipe may be appended to a buffer with this type. */
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
-	.can_merge = 1,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
 	.steal = anon_pipe_buf_steal,
@@ -235,7 +235,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
 };
 
 static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
 	.steal = anon_pipe_buf_steal,
@@ -243,19 +242,32 @@ static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
 };
 
 static const struct pipe_buf_operations packet_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
 	.steal = anon_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
+/**
+ * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
+ * @buf:	the buffer to mark
+ *
+ * Description:
+ *	This function ensures that no future writes will be merged into the
+ *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
+ *	share the same backing page.
+ */
 void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
 {
 	if (buf->ops == &anon_pipe_buf_ops)
 		buf->ops = &anon_pipe_buf_nomerge_ops;
 }
 
+static bool pipe_buf_can_merge(struct pipe_buffer *buf)
+{
+	return buf->ops == &anon_pipe_buf_ops;
+}
+
 static ssize_t
 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 {
@@ -393,7 +405,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
 		int offset = buf->offset + buf->len;
 
-		if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) {
+		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
 			ret = pipe_buf_confirm(pipe, buf);
 			if (ret)
 				goto out;
diff --git a/fs/splice.c b/fs/splice.c
index 90c29675d573..fc71e9733f7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -138,7 +138,6 @@ error:
 }
 
 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = page_cache_pipe_buf_confirm,
 	.release = page_cache_pipe_buf_release,
 	.steal = page_cache_pipe_buf_steal,
@@ -156,7 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 }
 
 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = page_cache_pipe_buf_release,
 	.steal = user_page_pipe_buf_steal,
@@ -326,7 +324,6 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 EXPORT_SYMBOL(generic_file_splice_read);
 
 const struct pipe_buf_operations default_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = generic_pipe_buf_release,
 	.steal = generic_pipe_buf_steal,
@@ -341,7 +338,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
 
 /* Pipe buffer operations for a socket and similar. */
 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = generic_pipe_buf_release,
 	.steal = generic_pipe_buf_nosteal,
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 3ecd7ea212ae..787d224ff43e 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -73,13 +73,6 @@ struct pipe_inode_info {
  * in fs/pipe.c for the pipe and generic variants of these hooks.
  */
 struct pipe_buf_operations {
-	/*
-	 * This is set to 1, if the generic pipe read/write may coalesce
-	 * data into an existing buffer. If this is set to 0, a new pipe
-	 * page segment is always used for new data.
-	 */
-	int can_merge;
-
 	/*
 	 * ->confirm() verifies that the data in the pipe buffer is there
 	 * and that the contents are good. If the pages in the pipe belong
diff --git a/kernel/relay.c b/kernel/relay.c
index 04f248644e06..db3e419c25a6 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1175,7 +1175,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
 }
 
 static const struct pipe_buf_operations relay_pipe_buf_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = relay_pipe_buf_release,
 	.steal = generic_pipe_buf_steal,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c521b7347482..f380139e972c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5823,7 +5823,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
 }
 
 static const struct pipe_buf_operations tracing_pipe_buf_ops = {
-	.can_merge		= 0,
 	.confirm		= generic_pipe_buf_confirm,
 	.release		= generic_pipe_buf_release,
 	.steal			= generic_pipe_buf_steal,
@@ -6843,7 +6842,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
 
 /* Pipe buffer operations for a buffer. */
 static const struct pipe_buf_operations buffer_pipe_buf_ops = {
-	.can_merge		= 0,
 	.confirm		= generic_pipe_buf_confirm,
 	.release		= buffer_pipe_buf_release,
 	.steal			= generic_pipe_buf_steal,
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index bbcf0fe4ae10..413a6abf227e 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -136,7 +136,6 @@ static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe,
 }
 
 static const struct pipe_buf_operations smc_pipe_ops = {
-	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
 	.release = smc_rx_pipe_buf_release,
 	.steal = smc_rx_pipe_buf_nosteal,
-- 
cgit v1.2.3


From 4bc59c2f7e306775f3d2e1bbafaa854dd1e09335 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 12 Dec 2018 18:33:56 +0100
Subject: mfd / platform: cros_ec: Use devm_mfd_add_devices

Use devm_mfd_add_devices() for adding cros-ec core MFD child devices. This
reduces the need of remove callback from platform/chrome for removing the
MFD child devices.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec.c                 | 14 +++-----------
 drivers/platform/chrome/cros_ec_i2c.c | 10 ----------
 drivers/platform/chrome/cros_ec_lpc.c |  4 ----
 drivers/platform/chrome/cros_ec_spi.c | 11 -----------
 include/linux/mfd/cros_ec.h           | 10 ----------
 5 files changed, 3 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c
index fe6f83766144..6acfe036d522 100644
--- a/drivers/mfd/cros_ec.c
+++ b/drivers/mfd/cros_ec.c
@@ -129,8 +129,8 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 		}
 	}
 
-	err = mfd_add_devices(ec_dev->dev, PLATFORM_DEVID_AUTO, &ec_cell, 1,
-			      NULL, ec_dev->irq, NULL);
+	err = devm_mfd_add_devices(ec_dev->dev, PLATFORM_DEVID_AUTO, &ec_cell,
+				   1, NULL, ec_dev->irq, NULL);
 	if (err) {
 		dev_err(dev,
 			"Failed to register Embedded Controller subdevice %d\n",
@@ -147,7 +147,7 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 		 * - the EC is responsive at init time (it is not true for a
 		 *   sensor hub.
 		 */
-		err = mfd_add_devices(ec_dev->dev, PLATFORM_DEVID_AUTO,
+		err = devm_mfd_add_devices(ec_dev->dev, PLATFORM_DEVID_AUTO,
 				      &ec_pd_cell, 1, NULL, ec_dev->irq, NULL);
 		if (err) {
 			dev_err(dev,
@@ -181,14 +181,6 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 }
 EXPORT_SYMBOL(cros_ec_register);
 
-int cros_ec_remove(struct cros_ec_device *ec_dev)
-{
-	mfd_remove_devices(ec_dev->dev);
-
-	return 0;
-}
-EXPORT_SYMBOL(cros_ec_remove);
-
 #ifdef CONFIG_PM_SLEEP
 int cros_ec_suspend(struct cros_ec_device *ec_dev)
 {
diff --git a/drivers/platform/chrome/cros_ec_i2c.c b/drivers/platform/chrome/cros_ec_i2c.c
index ef9b4763356f..9a009eaa4ada 100644
--- a/drivers/platform/chrome/cros_ec_i2c.c
+++ b/drivers/platform/chrome/cros_ec_i2c.c
@@ -317,15 +317,6 @@ static int cros_ec_i2c_probe(struct i2c_client *client,
 	return 0;
 }
 
-static int cros_ec_i2c_remove(struct i2c_client *client)
-{
-	struct cros_ec_device *ec_dev = i2c_get_clientdata(client);
-
-	cros_ec_remove(ec_dev);
-
-	return 0;
-}
-
 #ifdef CONFIG_PM_SLEEP
 static int cros_ec_i2c_suspend(struct device *dev)
 {
@@ -376,7 +367,6 @@ static struct i2c_driver cros_ec_driver = {
 		.pm	= &cros_ec_i2c_pm_ops,
 	},
 	.probe		= cros_ec_i2c_probe,
-	.remove		= cros_ec_i2c_remove,
 	.id_table	= cros_ec_i2c_id,
 };
 
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index e1b75775cd4a..14684a56e40f 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -327,7 +327,6 @@ static int cros_ec_lpc_probe(struct platform_device *pdev)
 
 static int cros_ec_lpc_remove(struct platform_device *pdev)
 {
-	struct cros_ec_device *ec_dev;
 	struct acpi_device *adev;
 
 	adev = ACPI_COMPANION(&pdev->dev);
@@ -335,9 +334,6 @@ static int cros_ec_lpc_remove(struct platform_device *pdev)
 		acpi_remove_notify_handler(adev->handle, ACPI_ALL_NOTIFY,
 					   cros_ec_lpc_acpi_notify);
 
-	ec_dev = platform_get_drvdata(pdev);
-	cros_ec_remove(ec_dev);
-
 	return 0;
 }
 
diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c
index 2060d1483043..6cfbc2835beb 100644
--- a/drivers/platform/chrome/cros_ec_spi.c
+++ b/drivers/platform/chrome/cros_ec_spi.c
@@ -685,16 +685,6 @@ static int cros_ec_spi_probe(struct spi_device *spi)
 	return 0;
 }
 
-static int cros_ec_spi_remove(struct spi_device *spi)
-{
-	struct cros_ec_device *ec_dev;
-
-	ec_dev = spi_get_drvdata(spi);
-	cros_ec_remove(ec_dev);
-
-	return 0;
-}
-
 #ifdef CONFIG_PM_SLEEP
 static int cros_ec_spi_suspend(struct device *dev)
 {
@@ -733,7 +723,6 @@ static struct spi_driver cros_ec_driver_spi = {
 		.pm	= &cros_ec_spi_pm_ops,
 	},
 	.probe		= cros_ec_spi_probe,
-	.remove		= cros_ec_spi_remove,
 	.id_table	= cros_ec_spi_id,
 };
 
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index de8b588c8776..977ebaa78e99 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -281,16 +281,6 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev,
 int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev,
 			    struct cros_ec_command *msg);
 
-/**
- * cros_ec_remove() - Remove a ChromeOS EC.
- * @ec_dev: Device to register.
- *
- * Call this to deregister a ChromeOS EC, then clean up any private data.
- *
- * Return: 0 on success or negative error code.
- */
-int cros_ec_remove(struct cros_ec_device *ec_dev);
-
 /**
  * cros_ec_register() - Register a new ChromeOS EC, using the provided info.
  * @ec_dev: Device to register.
-- 
cgit v1.2.3


From ecf8a6cd949ef236ce435ae488ceb6b3354e677e Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 12 Dec 2018 18:33:57 +0100
Subject: mfd / platform: cros_ec: Move lightbar attributes to its own driver

The entire way how cros sysfs attibutes are created is broken.
cros_ec_lightbar should be its own driver and its attributes should be
associated with a lightbar driver not the mfd driver. In order to retain
the path, the lightbar attributes are attached to the cros_class.

The patch also adds the sysfs documentation.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 .../sysfs-class-chromeos-driver-cros-ec-lightbar   | 74 +++++++++++++++++
 drivers/mfd/cros_ec_dev.c                          | 24 +++---
 drivers/mfd/cros_ec_dev.h                          |  6 --
 drivers/platform/chrome/Kconfig                    | 11 +++
 drivers/platform/chrome/Makefile                   |  3 +-
 drivers/platform/chrome/cros_ec_lightbar.c         | 95 +++++++++++++++++-----
 include/linux/mfd/cros_ec.h                        |  1 -
 7 files changed, 173 insertions(+), 41 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-lightbar

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-lightbar b/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-lightbar
new file mode 100644
index 000000000000..57a037791403
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-lightbar
@@ -0,0 +1,74 @@
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/brightness
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Writing to this file adjusts the overall brightness of
+		the lightbar, separate from any color intensity. The
+		valid range is 0 (off) to 255 (maximum brightness).
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/interval_msec
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		The lightbar is controlled by an embedded controller (EC),
+		which also manages the keyboard, battery charging, fans,
+		and other system hardware. To prevent unprivileged users
+		from interfering with the other EC functions, the rate at
+		which the lightbar control files can be read or written is
+		limited.
+
+		Reading this file will return the number of milliseconds
+		that must elapse between accessing any of the lightbar
+		functions through this interface. Going faster will simply
+		block until the necessary interval has lapsed. The interval
+		applies uniformly to all accesses of any kind by any user.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/led_rgb
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		This allows you to control each LED segment. If the
+		lightbar is already running one of the automatic
+		sequences, you probably won’t see anything change because
+		your color setting will be almost immediately replaced.
+		To get useful results, you should stop the lightbar
+		sequence first.
+
+		The values written to this file are sets of four integers,
+		indicating LED, RED, GREEN, BLUE. The LED number is 0 to 3
+		to select a single segment, or 4 to set all four segments
+		to the same value at once. The RED, GREEN, and BLUE
+		numbers should be in the range 0 (off) to 255 (maximum).
+		You can update more than one segment at a time by writing
+		more than one set of four integers.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/program
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		This allows you to upload and run custom lightbar sequences.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/sequence
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		The Pixel lightbar has a number of built-in sequences
+		that it displays under various conditions, such as at
+		power on, shut down, or while running. Reading from this
+		file displays the current sequence that the lightbar is
+		displaying. Writing to this file allows you to change the
+		sequence.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/userspace_control
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		This allows you to take the control of the lightbar. This
+		prevents the kernel from going through its normal
+		sequences.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/version
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Show the information about the lightbar version.
diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 2d0fee488c5a..b227718e0ec2 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -36,7 +36,6 @@ static int ec_major;
 
 static const struct attribute_group *cros_ec_groups[] = {
 	&cros_ec_attr_group,
-	&cros_ec_lightbar_attr_group,
 	&cros_ec_vbc_attr_group,
 	NULL,
 };
@@ -395,6 +394,10 @@ static const struct mfd_cell cros_usbpd_charger_cells[] = {
 	{ .name = "cros-usbpd-charger" }
 };
 
+static const struct mfd_cell cros_ec_platform_cells[] = {
+	{ .name = "cros-ec-lightbar" },
+};
+
 static int ec_device_probe(struct platform_device *pdev)
 {
 	int retval = -ENOMEM;
@@ -470,9 +473,6 @@ static int ec_device_probe(struct platform_device *pdev)
 				retval);
 	}
 
-	/* Take control of the lightbar from the EC. */
-	lb_manual_suspend_ctrl(ec, 1);
-
 	/* We can now add the sysfs class, we know which parameter to show */
 	retval = cdev_device_add(&ec->cdev, &ec->class_dev);
 	if (retval) {
@@ -480,6 +480,15 @@ static int ec_device_probe(struct platform_device *pdev)
 		goto failed;
 	}
 
+	retval = mfd_add_devices(ec->dev, PLATFORM_DEVID_AUTO,
+				 cros_ec_platform_cells,
+				 ARRAY_SIZE(cros_ec_platform_cells),
+				 NULL, 0, NULL);
+	if (retval)
+		dev_warn(ec->dev,
+			 "failed to add cros-ec platform devices: %d\n",
+			 retval);
+
 	if (cros_ec_debugfs_init(ec))
 		dev_warn(dev, "failed to create debugfs directory\n");
 
@@ -494,9 +503,6 @@ static int ec_device_remove(struct platform_device *pdev)
 {
 	struct cros_ec_dev *ec = dev_get_drvdata(&pdev->dev);
 
-	/* Let the EC take over the lightbar again. */
-	lb_manual_suspend_ctrl(ec, 0);
-
 	cros_ec_debugfs_remove(ec);
 
 	mfd_remove_devices(ec->dev);
@@ -525,8 +531,6 @@ static __maybe_unused int ec_device_suspend(struct device *dev)
 
 	cros_ec_debugfs_suspend(ec);
 
-	lb_suspend(ec);
-
 	return 0;
 }
 
@@ -536,8 +540,6 @@ static __maybe_unused int ec_device_resume(struct device *dev)
 
 	cros_ec_debugfs_resume(ec);
 
-	lb_resume(ec);
-
 	return 0;
 }
 
diff --git a/drivers/mfd/cros_ec_dev.h b/drivers/mfd/cros_ec_dev.h
index 978d836a0248..ec750433455a 100644
--- a/drivers/mfd/cros_ec_dev.h
+++ b/drivers/mfd/cros_ec_dev.h
@@ -44,10 +44,4 @@ struct cros_ec_readmem {
 #define CROS_EC_DEV_IOCXCMD   _IOWR(CROS_EC_DEV_IOC, 0, struct cros_ec_command)
 #define CROS_EC_DEV_IOCRDMEM  _IOWR(CROS_EC_DEV_IOC, 1, struct cros_ec_readmem)
 
-/* Lightbar utilities */
-extern bool ec_has_lightbar(struct cros_ec_dev *ec);
-extern int lb_manual_suspend_ctrl(struct cros_ec_dev *ec, uint8_t enable);
-extern int lb_suspend(struct cros_ec_dev *ec);
-extern int lb_resume(struct cros_ec_dev *ec);
-
 #endif /* _CROS_EC_DEV_H_ */
diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig
index 16b1615958aa..6c05752a3334 100644
--- a/drivers/platform/chrome/Kconfig
+++ b/drivers/platform/chrome/Kconfig
@@ -111,4 +111,15 @@ config CROS_KBD_LED_BACKLIGHT
 	  To compile this driver as a module, choose M here: the
 	  module will be called cros_kbd_led_backlight.
 
+config CROS_EC_LIGHTBAR
+	tristate "Chromebook Pixel's lightbar support"
+	depends on MFD_CROS_EC_CHARDEV
+	default MFD_CROS_EC_CHARDEV
+	help
+	  This option exposes the Chromebook Pixel's lightbar to
+	  userspace.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called cros_ec_lightbar.
+
 endif # CHROMEOS_PLATFORMS
diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index cd591bf872bb..3c29a4b405d5 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -3,7 +3,7 @@
 obj-$(CONFIG_CHROMEOS_LAPTOP)		+= chromeos_laptop.o
 obj-$(CONFIG_CHROMEOS_PSTORE)		+= chromeos_pstore.o
 obj-$(CONFIG_CHROMEOS_TBMC)		+= chromeos_tbmc.o
-cros_ec_ctl-objs			:= cros_ec_sysfs.o cros_ec_lightbar.o \
+cros_ec_ctl-objs			:= cros_ec_sysfs.o \
 					   cros_ec_vbc.o cros_ec_debugfs.o
 obj-$(CONFIG_CROS_EC_CTL)		+= cros_ec_ctl.o
 obj-$(CONFIG_CROS_EC_I2C)		+= cros_ec_i2c.o
@@ -13,3 +13,4 @@ cros_ec_lpcs-$(CONFIG_CROS_EC_LPC_MEC)	+= cros_ec_lpc_mec.o
 obj-$(CONFIG_CROS_EC_LPC)		+= cros_ec_lpcs.o
 obj-$(CONFIG_CROS_EC_PROTO)		+= cros_ec_proto.o
 obj-$(CONFIG_CROS_KBD_LED_BACKLIGHT)	+= cros_kbd_led_backlight.o
+obj-$(CONFIG_CROS_EC_LIGHTBAR)		+= cros_ec_lightbar.o
diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index 68193bb53383..80eed6317570 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -33,6 +33,8 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 
+#define DRV_NAME "cros-ec-lightbar"
+
 /* Rate-limit the lightbar interface to prevent DoS. */
 static unsigned long lb_interval_jiffies = 50 * HZ / 1000;
 
@@ -373,7 +375,7 @@ error:
 	return ret;
 }
 
-int lb_manual_suspend_ctrl(struct cros_ec_dev *ec, uint8_t enable)
+static int lb_manual_suspend_ctrl(struct cros_ec_dev *ec, uint8_t enable)
 {
 	struct ec_params_lightbar *param;
 	struct cros_ec_command *msg;
@@ -408,25 +410,6 @@ error:
 
 	return ret;
 }
-EXPORT_SYMBOL(lb_manual_suspend_ctrl);
-
-int lb_suspend(struct cros_ec_dev *ec)
-{
-	if (userspace_control || ec != ec_with_lightbar)
-		return 0;
-
-	return lb_send_empty_cmd(ec, LIGHTBAR_CMD_SUSPEND);
-}
-EXPORT_SYMBOL(lb_suspend);
-
-int lb_resume(struct cros_ec_dev *ec)
-{
-	if (userspace_control || ec != ec_with_lightbar)
-		return 0;
-
-	return lb_send_empty_cmd(ec, LIGHTBAR_CMD_RESUME);
-}
-EXPORT_SYMBOL(lb_resume);
 
 static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
 			      const char *buf, size_t count)
@@ -584,7 +567,7 @@ static struct attribute *__lb_cmds_attrs[] = {
 	NULL,
 };
 
-bool ec_has_lightbar(struct cros_ec_dev *ec)
+static bool ec_has_lightbar(struct cros_ec_dev *ec)
 {
 	return !!get_lightbar_version(ec, NULL, NULL);
 }
@@ -616,4 +599,72 @@ struct attribute_group cros_ec_lightbar_attr_group = {
 	.attrs = __lb_cmds_attrs,
 	.is_visible = cros_ec_lightbar_attrs_are_visible,
 };
-EXPORT_SYMBOL(cros_ec_lightbar_attr_group);
+
+static int cros_ec_lightbar_probe(struct platform_device *pd)
+{
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(pd->dev.parent);
+	struct device *dev = &pd->dev;
+	int ret;
+
+	/* Take control of the lightbar from the EC. */
+	lb_manual_suspend_ctrl(ec_dev, 1);
+
+	ret = sysfs_create_group(&ec_dev->class_dev.kobj,
+				 &cros_ec_lightbar_attr_group);
+	if (ret < 0)
+		dev_err(dev, "failed to create %s attributes. err=%d\n",
+			cros_ec_lightbar_attr_group.name, ret);
+
+	return ret;
+}
+
+static int cros_ec_lightbar_remove(struct platform_device *pd)
+{
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(pd->dev.parent);
+
+	sysfs_remove_group(&ec_dev->class_dev.kobj,
+			   &cros_ec_lightbar_attr_group);
+
+	/* Let the EC take over the lightbar again. */
+	lb_manual_suspend_ctrl(ec_dev, 0);
+
+	return 0;
+}
+
+static int __maybe_unused cros_ec_lightbar_resume(struct device *dev)
+{
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(dev);
+
+	if (userspace_control || ec_dev != ec_with_lightbar)
+		return 0;
+
+	return lb_send_empty_cmd(ec_dev, LIGHTBAR_CMD_RESUME);
+}
+
+static int __maybe_unused cros_ec_lightbar_suspend(struct device *dev)
+{
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(dev);
+
+	if (userspace_control || ec_dev != ec_with_lightbar)
+		return 0;
+
+	return lb_send_empty_cmd(ec_dev, LIGHTBAR_CMD_SUSPEND);
+}
+
+static SIMPLE_DEV_PM_OPS(cros_ec_lightbar_pm_ops,
+			 cros_ec_lightbar_suspend, cros_ec_lightbar_resume);
+
+static struct platform_driver cros_ec_lightbar_driver = {
+	.driver = {
+		.name = DRV_NAME,
+		.pm = &cros_ec_lightbar_pm_ops,
+	},
+	.probe = cros_ec_lightbar_probe,
+	.remove = cros_ec_lightbar_remove,
+};
+
+module_platform_driver(cros_ec_lightbar_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Expose the Chromebook Pixel's lightbar to userspace");
+MODULE_ALIAS("platform:" DRV_NAME);
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 977ebaa78e99..1e9b569564ea 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -327,7 +327,6 @@ u32 cros_ec_get_host_event(struct cros_ec_device *ec_dev);
 
 /* sysfs stuff */
 extern struct attribute_group cros_ec_attr_group;
-extern struct attribute_group cros_ec_lightbar_attr_group;
 extern struct attribute_group cros_ec_vbc_attr_group;
 
 /* debugfs stuff */
-- 
cgit v1.2.3


From acb9900f9e8074858738f48bee9a705138961258 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 12 Dec 2018 18:33:58 +0100
Subject: mfd / platform: cros_ec: Move vbc attributes to its own driver

The entire way how cros sysfs attibutes are created is broken.
cros_ec_vbc should be its own driver and its attributes should be
associated with a vbc driver not the mfd driver. In order to retain
the path, the vbc attributes are attached to the cros_class.

The patch also adds the sysfs documentation.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 .../sysfs-class-chromeos-driver-cros-ec-vbc        |  6 +++
 drivers/mfd/cros_ec_dev.c                          |  2 +-
 drivers/platform/chrome/Kconfig                    | 11 ++++++
 drivers/platform/chrome/Makefile                   |  3 +-
 drivers/platform/chrome/cros_ec_vbc.c              | 43 +++++++++++++++++++++-
 include/linux/mfd/cros_ec.h                        |  1 -
 6 files changed, 62 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-vbc

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-vbc b/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-vbc
new file mode 100644
index 000000000000..38c5aaaaa89a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-vbc
@@ -0,0 +1,6 @@
+What:		/sys/class/chromeos/<ec-device-name>/vbc/vboot_context
+Date:		October 2015
+KernelVersion:	4.4
+Description:
+		Read/write the verified boot context data included on a
+		small nvram space on some EC implementations.
diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index b227718e0ec2..40c98808fa1c 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -36,7 +36,6 @@ static int ec_major;
 
 static const struct attribute_group *cros_ec_groups[] = {
 	&cros_ec_attr_group,
-	&cros_ec_vbc_attr_group,
 	NULL,
 };
 
@@ -396,6 +395,7 @@ static const struct mfd_cell cros_usbpd_charger_cells[] = {
 
 static const struct mfd_cell cros_ec_platform_cells[] = {
 	{ .name = "cros-ec-lightbar" },
+	{ .name = "cros-ec-vbc" },
 };
 
 static int ec_device_probe(struct platform_device *pdev)
diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig
index 6c05752a3334..1e9dc9626e84 100644
--- a/drivers/platform/chrome/Kconfig
+++ b/drivers/platform/chrome/Kconfig
@@ -122,4 +122,15 @@ config CROS_EC_LIGHTBAR
 	  To compile this driver as a module, choose M here: the
 	  module will be called cros_ec_lightbar.
 
+config CROS_EC_VBC
+	tristate "ChromeOS EC vboot context support"
+	depends on MFD_CROS_EC_CHARDEV && OF
+	default MFD_CROS_EC_CHARDEV
+	help
+	  This option exposes the ChromeOS EC vboot context nvram to
+	  userspace.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called cros_ec_vbc.
+
 endif # CHROMEOS_PLATFORMS
diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index 3c29a4b405d5..4081b7179df7 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_CHROMEOS_LAPTOP)		+= chromeos_laptop.o
 obj-$(CONFIG_CHROMEOS_PSTORE)		+= chromeos_pstore.o
 obj-$(CONFIG_CHROMEOS_TBMC)		+= chromeos_tbmc.o
 cros_ec_ctl-objs			:= cros_ec_sysfs.o \
-					   cros_ec_vbc.o cros_ec_debugfs.o
+					   cros_ec_debugfs.o
 obj-$(CONFIG_CROS_EC_CTL)		+= cros_ec_ctl.o
 obj-$(CONFIG_CROS_EC_I2C)		+= cros_ec_i2c.o
 obj-$(CONFIG_CROS_EC_SPI)		+= cros_ec_spi.o
@@ -14,3 +14,4 @@ obj-$(CONFIG_CROS_EC_LPC)		+= cros_ec_lpcs.o
 obj-$(CONFIG_CROS_EC_PROTO)		+= cros_ec_proto.o
 obj-$(CONFIG_CROS_KBD_LED_BACKLIGHT)	+= cros_kbd_led_backlight.o
 obj-$(CONFIG_CROS_EC_LIGHTBAR)		+= cros_ec_lightbar.o
+obj-$(CONFIG_CROS_EC_VBC)		+= cros_ec_vbc.o
diff --git a/drivers/platform/chrome/cros_ec_vbc.c b/drivers/platform/chrome/cros_ec_vbc.c
index 5356f26bc022..da3bbf05e86f 100644
--- a/drivers/platform/chrome/cros_ec_vbc.c
+++ b/drivers/platform/chrome/cros_ec_vbc.c
@@ -22,8 +22,11 @@
 #include <linux/platform_device.h>
 #include <linux/mfd/cros_ec.h>
 #include <linux/mfd/cros_ec_commands.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 
+#define DRV_NAME "cros-ec-vbc"
+
 static ssize_t vboot_context_read(struct file *filp, struct kobject *kobj,
 				  struct bin_attribute *att, char *buf,
 				  loff_t pos, size_t count)
@@ -132,4 +135,42 @@ struct attribute_group cros_ec_vbc_attr_group = {
 	.bin_attrs = cros_ec_vbc_bin_attrs,
 	.is_bin_visible = cros_ec_vbc_is_visible,
 };
-EXPORT_SYMBOL(cros_ec_vbc_attr_group);
+
+static int cros_ec_vbc_probe(struct platform_device *pd)
+{
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(pd->dev.parent);
+	struct device *dev = &pd->dev;
+	int ret;
+
+	ret = sysfs_create_group(&ec_dev->class_dev.kobj,
+				 &cros_ec_vbc_attr_group);
+	if (ret < 0)
+		dev_err(dev, "failed to create %s attributes. err=%d\n",
+			cros_ec_vbc_attr_group.name, ret);
+
+	return ret;
+}
+
+static int cros_ec_vbc_remove(struct platform_device *pd)
+{
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(pd->dev.parent);
+
+	sysfs_remove_group(&ec_dev->class_dev.kobj,
+			   &cros_ec_vbc_attr_group);
+
+	return 0;
+}
+
+static struct platform_driver cros_ec_vbc_driver = {
+	.driver = {
+		.name = DRV_NAME,
+	},
+	.probe = cros_ec_vbc_probe,
+	.remove = cros_ec_vbc_remove,
+};
+
+module_platform_driver(cros_ec_vbc_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Expose the vboot context nvram to userspace");
+MODULE_ALIAS("platform:" DRV_NAME);
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 1e9b569564ea..fdc3152cca1d 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -327,7 +327,6 @@ u32 cros_ec_get_host_event(struct cros_ec_device *ec_dev);
 
 /* sysfs stuff */
 extern struct attribute_group cros_ec_attr_group;
-extern struct attribute_group cros_ec_vbc_attr_group;
 
 /* debugfs stuff */
 int cros_ec_debugfs_init(struct cros_ec_dev *ec);
-- 
cgit v1.2.3


From 6fce0a2cf5a050e8a3326556d7d293e69be303be Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 12 Dec 2018 18:33:59 +0100
Subject: mfd / platform: cros_ec: Move debugfs attributes to its own driver

The entire way how cros debugfs attibutes are created is broken.
cros_ec_debugfs should be its own driver and its attributes should be
associated with a debugfs driver not the mfd driver.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec_dev.c                 | 41 +-------------------
 drivers/platform/chrome/Kconfig           | 11 ++++++
 drivers/platform/chrome/Makefile          |  4 +-
 drivers/platform/chrome/cros_ec_debugfs.c | 62 +++++++++++++++++++++----------
 include/linux/mfd/cros_ec.h               |  6 ---
 5 files changed, 56 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 40c98808fa1c..9955937b821d 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -394,6 +394,7 @@ static const struct mfd_cell cros_usbpd_charger_cells[] = {
 };
 
 static const struct mfd_cell cros_ec_platform_cells[] = {
+	{ .name = "cros-ec-debugfs" },
 	{ .name = "cros-ec-lightbar" },
 	{ .name = "cros-ec-vbc" },
 };
@@ -489,9 +490,6 @@ static int ec_device_probe(struct platform_device *pdev)
 			 "failed to add cros-ec platform devices: %d\n",
 			 retval);
 
-	if (cros_ec_debugfs_init(ec))
-		dev_warn(dev, "failed to create debugfs directory\n");
-
 	return 0;
 
 failed:
@@ -503,62 +501,25 @@ static int ec_device_remove(struct platform_device *pdev)
 {
 	struct cros_ec_dev *ec = dev_get_drvdata(&pdev->dev);
 
-	cros_ec_debugfs_remove(ec);
-
 	mfd_remove_devices(ec->dev);
 	cdev_del(&ec->cdev);
 	device_unregister(&ec->class_dev);
 	return 0;
 }
 
-static void ec_device_shutdown(struct platform_device *pdev)
-{
-	struct cros_ec_dev *ec = dev_get_drvdata(&pdev->dev);
-
-	/* Be sure to clear up debugfs delayed works */
-	cros_ec_debugfs_remove(ec);
-}
-
 static const struct platform_device_id cros_ec_id[] = {
 	{ DRV_NAME, 0 },
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(platform, cros_ec_id);
 
-static __maybe_unused int ec_device_suspend(struct device *dev)
-{
-	struct cros_ec_dev *ec = dev_get_drvdata(dev);
-
-	cros_ec_debugfs_suspend(ec);
-
-	return 0;
-}
-
-static __maybe_unused int ec_device_resume(struct device *dev)
-{
-	struct cros_ec_dev *ec = dev_get_drvdata(dev);
-
-	cros_ec_debugfs_resume(ec);
-
-	return 0;
-}
-
-static const struct dev_pm_ops cros_ec_dev_pm_ops = {
-#ifdef CONFIG_PM_SLEEP
-	.suspend = ec_device_suspend,
-	.resume = ec_device_resume,
-#endif
-};
-
 static struct platform_driver cros_ec_dev_driver = {
 	.driver = {
 		.name = DRV_NAME,
-		.pm = &cros_ec_dev_pm_ops,
 	},
 	.id_table = cros_ec_id,
 	.probe = ec_device_probe,
 	.remove = ec_device_remove,
-	.shutdown = ec_device_shutdown,
 };
 
 static int __init cros_ec_dev_init(void)
diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig
index 1e9dc9626e84..6cbf5b69d156 100644
--- a/drivers/platform/chrome/Kconfig
+++ b/drivers/platform/chrome/Kconfig
@@ -133,4 +133,15 @@ config CROS_EC_VBC
 	  To compile this driver as a module, choose M here: the
 	  module will be called cros_ec_vbc.
 
+config CROS_EC_DEBUGFS
+	tristate "Export ChromeOS EC internals in DebugFS"
+	depends on MFD_CROS_EC_CHARDEV && DEBUG_FS
+	default MFD_CROS_EC_CHARDEV
+	help
+	  This option exposes the ChromeOS EC device internals to
+	  userspace.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called cros_ec_debugfs.
+
 endif # CHROMEOS_PLATFORMS
diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index 4081b7179df7..12a5c4d18c17 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -3,8 +3,7 @@
 obj-$(CONFIG_CHROMEOS_LAPTOP)		+= chromeos_laptop.o
 obj-$(CONFIG_CHROMEOS_PSTORE)		+= chromeos_pstore.o
 obj-$(CONFIG_CHROMEOS_TBMC)		+= chromeos_tbmc.o
-cros_ec_ctl-objs			:= cros_ec_sysfs.o \
-					   cros_ec_debugfs.o
+cros_ec_ctl-objs			:= cros_ec_sysfs.o
 obj-$(CONFIG_CROS_EC_CTL)		+= cros_ec_ctl.o
 obj-$(CONFIG_CROS_EC_I2C)		+= cros_ec_i2c.o
 obj-$(CONFIG_CROS_EC_SPI)		+= cros_ec_spi.o
@@ -15,3 +14,4 @@ obj-$(CONFIG_CROS_EC_PROTO)		+= cros_ec_proto.o
 obj-$(CONFIG_CROS_KBD_LED_BACKLIGHT)	+= cros_kbd_led_backlight.o
 obj-$(CONFIG_CROS_EC_LIGHTBAR)		+= cros_ec_lightbar.o
 obj-$(CONFIG_CROS_EC_VBC)		+= cros_ec_vbc.o
+obj-$(CONFIG_CROS_EC_DEBUGFS)		+= cros_ec_debugfs.o
diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
index c62ee8e610a0..6cacac53dfce 100644
--- a/drivers/platform/chrome/cros_ec_debugfs.c
+++ b/drivers/platform/chrome/cros_ec_debugfs.c
@@ -23,12 +23,16 @@
 #include <linux/fs.h>
 #include <linux/mfd/cros_ec.h>
 #include <linux/mfd/cros_ec_commands.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/platform_device.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
 
+#define DRV_NAME "cros-ec-debugfs"
+
 #define LOG_SHIFT		14
 #define LOG_SIZE		(1 << LOG_SHIFT)
 #define LOG_POLL_SEC		10
@@ -423,8 +427,9 @@ static int cros_ec_create_pdinfo(struct cros_ec_debugfs *debug_info)
 	return 0;
 }
 
-int cros_ec_debugfs_init(struct cros_ec_dev *ec)
+static int cros_ec_debugfs_probe(struct platform_device *pd)
 {
+	struct cros_ec_dev *ec = dev_get_drvdata(pd->dev.parent);
 	struct cros_ec_platform *ec_platform = dev_get_platdata(ec->dev);
 	const char *name = ec_platform->ec_name;
 	struct cros_ec_debugfs *debug_info;
@@ -453,40 +458,57 @@ int cros_ec_debugfs_init(struct cros_ec_dev *ec)
 
 	ec->debug_info = debug_info;
 
+	dev_set_drvdata(&pd->dev, ec);
+
 	return 0;
 
 remove_debugfs:
 	debugfs_remove_recursive(debug_info->dir);
 	return ret;
 }
-EXPORT_SYMBOL(cros_ec_debugfs_init);
 
-void cros_ec_debugfs_remove(struct cros_ec_dev *ec)
+static int cros_ec_debugfs_remove(struct platform_device *pd)
 {
-	if (!ec->debug_info)
-		return;
+	struct cros_ec_dev *ec = dev_get_drvdata(pd->dev.parent);
 
 	debugfs_remove_recursive(ec->debug_info->dir);
 	cros_ec_cleanup_console_log(ec->debug_info);
+
+	return 0;
 }
-EXPORT_SYMBOL(cros_ec_debugfs_remove);
 
-void cros_ec_debugfs_suspend(struct cros_ec_dev *ec)
+static int __maybe_unused cros_ec_debugfs_suspend(struct device *dev)
 {
-	/*
-	 * cros_ec_debugfs_init() failures are non-fatal; it's also possible
-	 * that we initted things but decided that console log wasn't supported.
-	 * We'll use the same set of checks that cros_ec_debugfs_remove() +
-	 * cros_ec_cleanup_console_log() end up using to handle those cases.
-	 */
-	if (ec->debug_info && ec->debug_info->log_buffer.buf)
-		cancel_delayed_work_sync(&ec->debug_info->log_poll_work);
+	struct cros_ec_dev *ec = dev_get_drvdata(dev);
+
+	cancel_delayed_work_sync(&ec->debug_info->log_poll_work);
+
+	return 0;
 }
-EXPORT_SYMBOL(cros_ec_debugfs_suspend);
 
-void cros_ec_debugfs_resume(struct cros_ec_dev *ec)
+static int __maybe_unused cros_ec_debugfs_resume(struct device *dev)
 {
-	if (ec->debug_info && ec->debug_info->log_buffer.buf)
-		schedule_delayed_work(&ec->debug_info->log_poll_work, 0);
+	struct cros_ec_dev *ec = dev_get_drvdata(dev);
+
+	schedule_delayed_work(&ec->debug_info->log_poll_work, 0);
+
+	return 0;
 }
-EXPORT_SYMBOL(cros_ec_debugfs_resume);
+
+static SIMPLE_DEV_PM_OPS(cros_ec_debugfs_pm_ops,
+			 cros_ec_debugfs_suspend, cros_ec_debugfs_resume);
+
+static struct platform_driver cros_ec_debugfs_driver = {
+	.driver = {
+		.name = DRV_NAME,
+		.pm = &cros_ec_debugfs_pm_ops,
+	},
+	.probe = cros_ec_debugfs_probe,
+	.remove = cros_ec_debugfs_remove,
+};
+
+module_platform_driver(cros_ec_debugfs_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Debug logs for ChromeOS EC");
+MODULE_ALIAS("platform:" DRV_NAME);
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index fdc3152cca1d..e50860d190db 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -328,10 +328,4 @@ u32 cros_ec_get_host_event(struct cros_ec_device *ec_dev);
 /* sysfs stuff */
 extern struct attribute_group cros_ec_attr_group;
 
-/* debugfs stuff */
-int cros_ec_debugfs_init(struct cros_ec_dev *ec);
-void cros_ec_debugfs_remove(struct cros_ec_dev *ec);
-void cros_ec_debugfs_suspend(struct cros_ec_dev *ec);
-void cros_ec_debugfs_resume(struct cros_ec_dev *ec);
-
 #endif /* __LINUX_MFD_CROS_EC_H */
-- 
cgit v1.2.3


From 6fd7f2bbd4422e7635bc771cd1ec440378158cb1 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 12 Dec 2018 18:34:00 +0100
Subject: mfd / platform: cros_ec: Move device sysfs attributes to its own
 driver

The entire way how cros debugfs attibutes are created is broken.
cros_ec_sysfs should be its own driver and its attributes should be
associated with the sysfs driver not the mfd driver.

The patch also adds the sysfs documentation.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 Documentation/ABI/testing/sysfs-class-chromeos | 32 +++++++++++++++++++++++
 drivers/mfd/Kconfig                            |  1 -
 drivers/mfd/cros_ec_dev.c                      |  7 +----
 drivers/platform/chrome/Kconfig                | 14 +++++++---
 drivers/platform/chrome/Makefile               |  3 +--
 drivers/platform/chrome/cros_ec_lightbar.c     |  2 +-
 drivers/platform/chrome/cros_ec_sysfs.c        | 36 +++++++++++++++++++++++++-
 include/linux/mfd/cros_ec.h                    |  3 ---
 8 files changed, 81 insertions(+), 17 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-chromeos

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-chromeos b/Documentation/ABI/testing/sysfs-class-chromeos
new file mode 100644
index 000000000000..5819699d66ec
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-chromeos
@@ -0,0 +1,32 @@
+What:		/sys/class/chromeos/<ec-device-name>/flashinfo
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Show the EC flash information.
+
+What:		/sys/class/chromeos/<ec-device-name>/kb_wake_angle
+Date:		March 2018
+KernelVersion:	4.17
+Description:
+		Control the keyboard wake lid angle. Values are between
+		0 and 360. This file will also show the keyboard wake lid
+		angle by querying the hardware.
+
+What:		/sys/class/chromeos/<ec-device-name>/reboot
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Tell the EC to reboot in various ways. Options are:
+		"cancel": Cancel a pending reboot.
+		"ro": Jump to RO without rebooting.
+		"rw": Jump to RW without rebooting.
+		"cold": Cold reboot.
+		"disable-jump": Disable jump until next reboot.
+		"hibernate": Hibernate the EC.
+		"at-shutdown": Reboot after an AP shutdown.
+
+What:		/sys/class/chromeos/<ec-device-name>/version
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Show the information about the EC software and hardware.
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index f461460a2aeb..2acc105e43cc 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -215,7 +215,6 @@ config MFD_CROS_EC
 config MFD_CROS_EC_CHARDEV
         tristate "Chrome OS Embedded Controller userspace device interface"
         depends on MFD_CROS_EC
-        select CROS_EC_CTL
         ---help---
           This driver adds support to talk with the ChromeOS EC from userspace.
 
diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 9955937b821d..b9ec2a798dbb 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -34,15 +34,9 @@
 #define CROS_MAX_DEV 128
 static int ec_major;
 
-static const struct attribute_group *cros_ec_groups[] = {
-	&cros_ec_attr_group,
-	NULL,
-};
-
 static struct class cros_class = {
 	.owner          = THIS_MODULE,
 	.name           = "chromeos",
-	.dev_groups     = cros_ec_groups,
 };
 
 /* Basic communication */
@@ -396,6 +390,7 @@ static const struct mfd_cell cros_usbpd_charger_cells[] = {
 static const struct mfd_cell cros_ec_platform_cells[] = {
 	{ .name = "cros-ec-debugfs" },
 	{ .name = "cros-ec-lightbar" },
+	{ .name = "cros-ec-sysfs" },
 	{ .name = "cros-ec-vbc" },
 };
 
diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig
index 6cbf5b69d156..5e2fde5ff63d 100644
--- a/drivers/platform/chrome/Kconfig
+++ b/drivers/platform/chrome/Kconfig
@@ -49,9 +49,6 @@ config CHROMEOS_TBMC
 	  To compile this driver as a module, choose M here: the
 	  module will be called chromeos_tbmc.
 
-config CROS_EC_CTL
-        tristate
-
 config CROS_EC_I2C
 	tristate "ChromeOS Embedded Controller (I2C)"
 	depends on MFD_CROS_EC && I2C
@@ -144,4 +141,15 @@ config CROS_EC_DEBUGFS
 	  To compile this driver as a module, choose M here: the
 	  module will be called cros_ec_debugfs.
 
+config CROS_EC_SYSFS
+	tristate "ChromeOS EC control and information through sysfs"
+	depends on MFD_CROS_EC_CHARDEV && SYSFS
+	default MFD_CROS_EC_CHARDEV
+	help
+	  This option exposes some sysfs attributes to control and get
+	  information from ChromeOS EC.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called cros_ec_sysfs.
+
 endif # CHROMEOS_PLATFORMS
diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index 12a5c4d18c17..fdbee501931b 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -3,8 +3,6 @@
 obj-$(CONFIG_CHROMEOS_LAPTOP)		+= chromeos_laptop.o
 obj-$(CONFIG_CHROMEOS_PSTORE)		+= chromeos_pstore.o
 obj-$(CONFIG_CHROMEOS_TBMC)		+= chromeos_tbmc.o
-cros_ec_ctl-objs			:= cros_ec_sysfs.o
-obj-$(CONFIG_CROS_EC_CTL)		+= cros_ec_ctl.o
 obj-$(CONFIG_CROS_EC_I2C)		+= cros_ec_i2c.o
 obj-$(CONFIG_CROS_EC_SPI)		+= cros_ec_spi.o
 cros_ec_lpcs-objs			:= cros_ec_lpc.o cros_ec_lpc_reg.o
@@ -15,3 +13,4 @@ obj-$(CONFIG_CROS_KBD_LED_BACKLIGHT)	+= cros_kbd_led_backlight.o
 obj-$(CONFIG_CROS_EC_LIGHTBAR)		+= cros_ec_lightbar.o
 obj-$(CONFIG_CROS_EC_VBC)		+= cros_ec_vbc.o
 obj-$(CONFIG_CROS_EC_DEBUGFS)		+= cros_ec_debugfs.o
+obj-$(CONFIG_CROS_EC_SYSFS)		+= cros_ec_sysfs.o
diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index 80eed6317570..c22318ba93aa 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -594,7 +594,7 @@ static umode_t cros_ec_lightbar_attrs_are_visible(struct kobject *kobj,
 	return 0;
 }
 
-struct attribute_group cros_ec_lightbar_attr_group = {
+static struct attribute_group cros_ec_lightbar_attr_group = {
 	.name = "lightbar",
 	.attrs = __lb_cmds_attrs,
 	.is_visible = cros_ec_lightbar_attrs_are_visible,
diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c
index f34a50121064..0ff5aa30c070 100644
--- a/drivers/platform/chrome/cros_ec_sysfs.c
+++ b/drivers/platform/chrome/cros_ec_sysfs.c
@@ -34,6 +34,8 @@
 #include <linux/types.h>
 #include <linux/uaccess.h>
 
+#define DRV_NAME "cros-ec-sysfs"
+
 /* Accessor functions */
 
 static ssize_t reboot_show(struct device *dev,
@@ -353,7 +355,39 @@ struct attribute_group cros_ec_attr_group = {
 	.attrs = __ec_attrs,
 	.is_visible = cros_ec_ctrl_visible,
 };
-EXPORT_SYMBOL(cros_ec_attr_group);
+
+static int cros_ec_sysfs_probe(struct platform_device *pd)
+{
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(pd->dev.parent);
+	struct device *dev = &pd->dev;
+	int ret;
+
+	ret = sysfs_create_group(&ec_dev->class_dev.kobj, &cros_ec_attr_group);
+	if (ret < 0)
+		dev_err(dev, "failed to create attributes. err=%d\n", ret);
+
+	return ret;
+}
+
+static int cros_ec_sysfs_remove(struct platform_device *pd)
+{
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(pd->dev.parent);
+
+	sysfs_remove_group(&ec_dev->class_dev.kobj, &cros_ec_attr_group);
+
+	return 0;
+}
+
+static struct platform_driver cros_ec_sysfs_driver = {
+	.driver = {
+		.name = DRV_NAME,
+	},
+	.probe = cros_ec_sysfs_probe,
+	.remove = cros_ec_sysfs_remove,
+};
+
+module_platform_driver(cros_ec_sysfs_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("ChromeOS EC control driver");
+MODULE_ALIAS("platform:" DRV_NAME);
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index e50860d190db..8f2a8918bfa3 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -325,7 +325,4 @@ int cros_ec_get_next_event(struct cros_ec_device *ec_dev, bool *wake_event);
  */
 u32 cros_ec_get_host_event(struct cros_ec_device *ec_dev);
 
-/* sysfs stuff */
-extern struct attribute_group cros_ec_attr_group;
-
 #endif /* __LINUX_MFD_CROS_EC_H */
-- 
cgit v1.2.3


From efb5a790dfc33b36bc64dd5a41ffc3ae5a709770 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sun, 13 Jan 2019 13:36:46 -0500
Subject: mfd: wm831x-core: Drop unused module infrastructure from non-modular
 code

The Kconfig currently controlling compilation of this code is:

drivers/mfd/Kconfig:config MFD_WM831X
drivers/mfd/Kconfig:    bool

...meaning that it currently is not being built as a module by anyone.

Lets remove the couple traces of modular infrastructure use, so that
when reading the driver there is no doubt it is builtin-only.

We delete the MODULE_LICENSE tag etc. since all that information
is already contained at the top of the file in the comments.

Previous demodularizaion work has made wm831x_device_exit() no longer
used, so it is also removed from the 831x core code.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/wm831x-core.c       | 15 ++-------------
 include/linux/mfd/wm831x/core.h |  1 -
 2 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index e70d35ef5c6d..25fbbaf39cb9 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -13,7 +13,8 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
 #include <linux/bcd.h>
 #include <linux/delay.h>
 #include <linux/mfd/core.h>
@@ -1892,14 +1893,6 @@ err:
 	return ret;
 }
 
-void wm831x_device_exit(struct wm831x *wm831x)
-{
-	wm831x_otp_exit(wm831x);
-	mfd_remove_devices(wm831x->dev);
-	free_irq(wm831x_irq(wm831x, WM831X_IRQ_AUXADC_DATA), wm831x);
-	wm831x_irq_exit(wm831x);
-}
-
 int wm831x_device_suspend(struct wm831x *wm831x)
 {
 	int reg, mask;
@@ -1944,7 +1937,3 @@ void wm831x_device_shutdown(struct wm831x *wm831x)
 	}
 }
 EXPORT_SYMBOL_GPL(wm831x_device_shutdown);
-
-MODULE_DESCRIPTION("Core support for the WM831X AudioPlus PMIC");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Mark Brown");
diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h
index b49fa67612f1..6fcb8eb00282 100644
--- a/include/linux/mfd/wm831x/core.h
+++ b/include/linux/mfd/wm831x/core.h
@@ -418,7 +418,6 @@ int wm831x_bulk_read(struct wm831x *wm831x, unsigned short reg,
 		     int count, u16 *buf);
 
 int wm831x_device_init(struct wm831x *wm831x, int irq);
-void wm831x_device_exit(struct wm831x *wm831x);
 int wm831x_device_suspend(struct wm831x *wm831x);
 void wm831x_device_shutdown(struct wm831x *wm831x);
 int wm831x_irq_init(struct wm831x *wm831x, int irq);
-- 
cgit v1.2.3


From 0db88688e1bb0180d6348742bdba8927cd0e5670 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sun, 13 Jan 2019 13:36:48 -0500
Subject: mfd: wm8350-core: Drop unused module infrastructure from non-modular
 code

The Kconfig currently controlling compilation of this code is:

drivers/mfd/Kconfig:config MFD_WM8350
drivers/mfd/Kconfig:    bool

...meaning that it currently is not being built as a module by anyone.

Lets remove the couple traces of modular infrastructure use, so that
when reading the driver there is no doubt it is builtin-only.

We delete the MODULE_LICENSE tag etc. since all that information
is already contained at the top of the file in the comments.

We replace module.h with init.h and export.h ; the latter since the
file does export some symbols.

Previous demodularizaion work has made wm8350_device_exit() no longer
used, so it is also removed from the 8350 core code.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/wm8350-core.c       | 30 ++----------------------------
 include/linux/mfd/wm8350/core.h |  1 -
 2 files changed, 2 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c
index 8a07c5634aee..9e1070f26b11 100644
--- a/drivers/mfd/wm8350-core.c
+++ b/drivers/mfd/wm8350-core.c
@@ -13,7 +13,8 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/bug.h>
 #include <linux/device.h>
@@ -442,30 +443,3 @@ err:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(wm8350_device_init);
-
-void wm8350_device_exit(struct wm8350 *wm8350)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(wm8350->pmic.led); i++)
-		platform_device_unregister(wm8350->pmic.led[i].pdev);
-
-	for (i = 0; i < ARRAY_SIZE(wm8350->pmic.pdev); i++)
-		platform_device_unregister(wm8350->pmic.pdev[i]);
-
-	platform_device_unregister(wm8350->wdt.pdev);
-	platform_device_unregister(wm8350->rtc.pdev);
-	platform_device_unregister(wm8350->power.pdev);
-	platform_device_unregister(wm8350->hwmon.pdev);
-	platform_device_unregister(wm8350->gpio.pdev);
-	platform_device_unregister(wm8350->codec.pdev);
-
-	if (wm8350->irq_base)
-		free_irq(wm8350->irq_base + WM8350_IRQ_AUXADC_DATARDY, wm8350);
-
-	wm8350_irq_exit(wm8350);
-}
-EXPORT_SYMBOL_GPL(wm8350_device_exit);
-
-MODULE_DESCRIPTION("WM8350 AudioPlus PMIC core driver");
-MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/wm8350/core.h b/include/linux/mfd/wm8350/core.h
index 509481d9cf19..202d9bde2c7c 100644
--- a/include/linux/mfd/wm8350/core.h
+++ b/include/linux/mfd/wm8350/core.h
@@ -643,7 +643,6 @@ struct wm8350_platform_data {
  */
 int wm8350_device_init(struct wm8350 *wm8350, int irq,
 		       struct wm8350_platform_data *pdata);
-void wm8350_device_exit(struct wm8350 *wm8350);
 
 /*
  * WM8350 device IO
-- 
cgit v1.2.3


From d57f72875eed3f26afaca176c0f425f209bc99d7 Mon Sep 17 00:00:00 2001
From: Christian Hohnstaedt <Christian.Hohnstaedt@wago.com>
Date: Mon, 14 Jan 2019 09:16:34 +0100
Subject: mfd: tps65218.c: Add input voltage options

These options apply to all regulators in this chip.

ti,strict-supply-voltage-supervision:
  Set STRICT flag in CONFIG1
ti,under-voltage-limit-microvolt:
  Select 2.75, 2.95, 3.25 or 3.35 V UVLO in CONFIG1
ti,under-voltage-hyst-microvolt:
  Select 200mV or 400mV UVLOHYS in CONFIG2

Signed-off-by: Christian Hohnstaedt <Christian.Hohnstaedt@wago.com>
Tested-by: Keerthy <j-keerthy@ti.com>
Reviewed-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/tps65218.c       | 89 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/tps65218.h |  4 ++
 2 files changed, 93 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/tps65218.c b/drivers/mfd/tps65218.c
index 8bcdecf494d0..a62ea4cb8be7 100644
--- a/drivers/mfd/tps65218.c
+++ b/drivers/mfd/tps65218.c
@@ -211,6 +211,83 @@ static const struct of_device_id of_tps65218_match_table[] = {
 };
 MODULE_DEVICE_TABLE(of, of_tps65218_match_table);
 
+static int tps65218_voltage_set_strict(struct tps65218 *tps)
+{
+	u32 strict;
+
+	if (of_property_read_u32(tps->dev->of_node,
+				 "ti,strict-supply-voltage-supervision",
+				 &strict))
+		return 0;
+
+	if (strict != 0 && strict != 1) {
+		dev_err(tps->dev,
+			"Invalid ti,strict-supply-voltage-supervision value\n");
+		return -EINVAL;
+	}
+
+	tps65218_update_bits(tps, TPS65218_REG_CONFIG1,
+			     TPS65218_CONFIG1_STRICT,
+			     strict ? TPS65218_CONFIG1_STRICT : 0,
+			     TPS65218_PROTECT_L1);
+	return 0;
+}
+
+static int tps65218_voltage_set_uv_hyst(struct tps65218 *tps)
+{
+	u32 hyst;
+
+	if (of_property_read_u32(tps->dev->of_node,
+				 "ti,under-voltage-hyst-microvolt", &hyst))
+		return 0;
+
+	if (hyst != 400000 && hyst != 200000) {
+		dev_err(tps->dev,
+			"Invalid ti,under-voltage-hyst-microvolt value\n");
+		return -EINVAL;
+	}
+
+	tps65218_update_bits(tps, TPS65218_REG_CONFIG2,
+			     TPS65218_CONFIG2_UVLOHYS,
+			     hyst == 400000 ? TPS65218_CONFIG2_UVLOHYS : 0,
+			     TPS65218_PROTECT_L1);
+	return 0;
+}
+
+static int tps65218_voltage_set_uvlo(struct tps65218 *tps)
+{
+	u32 uvlo;
+	int uvloval;
+
+	if (of_property_read_u32(tps->dev->of_node,
+				 "ti,under-voltage-limit-microvolt", &uvlo))
+		return 0;
+
+	switch (uvlo) {
+	case 2750000:
+		uvloval = TPS65218_CONFIG1_UVLO_2750000;
+		break;
+	case 2950000:
+		uvloval = TPS65218_CONFIG1_UVLO_2950000;
+		break;
+	case 3250000:
+		uvloval = TPS65218_CONFIG1_UVLO_3250000;
+		break;
+	case 3350000:
+		uvloval = TPS65218_CONFIG1_UVLO_3350000;
+		break;
+	default:
+		dev_err(tps->dev,
+			"Invalid ti,under-voltage-limit-microvolt value\n");
+		return -EINVAL;
+	}
+
+	tps65218_update_bits(tps, TPS65218_REG_CONFIG1,
+			     TPS65218_CONFIG1_UVLO_MASK, uvloval,
+			     TPS65218_PROTECT_L1);
+	return 0;
+}
+
 static int tps65218_probe(struct i2c_client *client,
 				const struct i2c_device_id *ids)
 {
@@ -249,6 +326,18 @@ static int tps65218_probe(struct i2c_client *client,
 
 	tps->rev = chipid & TPS65218_CHIPID_REV_MASK;
 
+	ret = tps65218_voltage_set_strict(tps);
+	if (ret)
+		return ret;
+
+	ret = tps65218_voltage_set_uvlo(tps);
+	if (ret)
+		return ret;
+
+	ret = tps65218_voltage_set_uv_hyst(tps);
+	if (ret)
+		return ret;
+
 	ret = mfd_add_devices(tps->dev, PLATFORM_DEVID_AUTO, tps65218_cells,
 			      ARRAY_SIZE(tps65218_cells), NULL, 0,
 			      regmap_irq_get_domain(tps->irq_data));
diff --git a/include/linux/mfd/tps65218.h b/include/linux/mfd/tps65218.h
index c204d9a79436..3cbe103495ab 100644
--- a/include/linux/mfd/tps65218.h
+++ b/include/linux/mfd/tps65218.h
@@ -137,6 +137,10 @@
 #define TPS65218_CONFIG1_PGDLY_MASK	0x18
 #define TPS65218_CONFIG1_STRICT		BIT(2)
 #define TPS65218_CONFIG1_UVLO_MASK	0x3
+#define TPS65218_CONFIG1_UVLO_2750000	0x0
+#define TPS65218_CONFIG1_UVLO_2950000	0x1
+#define TPS65218_CONFIG1_UVLO_3250000	0x2
+#define TPS65218_CONFIG1_UVLO_3350000	0x3
 
 #define TPS65218_CONFIG2_DC12_RST	BIT(7)
 #define TPS65218_CONFIG2_UVLOHYS	BIT(6)
-- 
cgit v1.2.3


From cfced786969c2a3e1bca45d7055a00311d93ae6c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jan 2019 18:20:05 +0100
Subject: dma-mapping: remove the default map_resource implementation

Instead provide a proper implementation in the direct mapping code, and
also wire it up for arm and powerpc, leaving an error return for all the
IOMMU or virtual mapping instances for which we'd have to wire up an
actual implementation

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 arch/arm/mm/dma-mapping.c         |  2 ++
 arch/powerpc/kernel/dma-swiotlb.c |  1 +
 arch/powerpc/kernel/dma.c         |  1 +
 include/linux/dma-mapping.h       | 12 +++++++-----
 kernel/dma/direct.c               | 14 ++++++++++++++
 5 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index f1e2922e447c..3c8534904209 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -188,6 +188,7 @@ const struct dma_map_ops arm_dma_ops = {
 	.unmap_page		= arm_dma_unmap_page,
 	.map_sg			= arm_dma_map_sg,
 	.unmap_sg		= arm_dma_unmap_sg,
+	.map_resource		= dma_direct_map_resource,
 	.sync_single_for_cpu	= arm_dma_sync_single_for_cpu,
 	.sync_single_for_device	= arm_dma_sync_single_for_device,
 	.sync_sg_for_cpu	= arm_dma_sync_sg_for_cpu,
@@ -211,6 +212,7 @@ const struct dma_map_ops arm_coherent_dma_ops = {
 	.get_sgtable		= arm_dma_get_sgtable,
 	.map_page		= arm_coherent_dma_map_page,
 	.map_sg			= arm_dma_map_sg,
+	.map_resource		= dma_direct_map_resource,
 	.dma_supported		= arm_dma_supported,
 };
 EXPORT_SYMBOL(arm_coherent_dma_ops);
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 7d5fc9751622..fbb2506a414e 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -55,6 +55,7 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = {
 	.dma_supported = swiotlb_dma_supported,
 	.map_page = dma_direct_map_page,
 	.unmap_page = dma_direct_unmap_page,
+	.map_resource = dma_direct_map_resource,
 	.sync_single_for_cpu = dma_direct_sync_single_for_cpu,
 	.sync_single_for_device = dma_direct_sync_single_for_device,
 	.sync_sg_for_cpu = dma_direct_sync_sg_for_cpu,
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index b1903ebb2e9c..258b9e8ebb99 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -273,6 +273,7 @@ const struct dma_map_ops dma_nommu_ops = {
 	.dma_supported			= dma_nommu_dma_supported,
 	.map_page			= dma_nommu_map_page,
 	.unmap_page			= dma_nommu_unmap_page,
+	.map_resource			= dma_direct_map_resource,
 	.get_required_mask		= dma_nommu_get_required_mask,
 #ifdef CONFIG_NOT_COHERENT_CACHE
 	.sync_single_for_cpu 		= dma_nommu_sync_single,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f6ded992c183..9842085e6774 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -208,6 +208,8 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		unsigned long attrs);
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs);
+dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
     defined(CONFIG_SWIOTLB)
@@ -346,19 +348,19 @@ static inline dma_addr_t dma_map_resource(struct device *dev,
 					  unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-	dma_addr_t addr;
+	dma_addr_t addr = DMA_MAPPING_ERROR;
 
 	BUG_ON(!valid_dma_direction(dir));
 
 	/* Don't allow RAM to be mapped */
 	BUG_ON(pfn_valid(PHYS_PFN(phys_addr)));
 
-	addr = phys_addr;
-	if (ops && ops->map_resource)
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
+	else if (ops->map_resource)
 		addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
 
 	debug_dma_map_resource(dev, phys_addr, size, dir, addr);
-
 	return addr;
 }
 
@@ -369,7 +371,7 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops && ops->unmap_resource)
+	if (!dma_is_direct(ops) && ops->unmap_resource)
 		ops->unmap_resource(dev, addr, size, dir, attrs);
 	debug_dma_unmap_resource(dev, addr, size, dir);
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 355d16acee6d..25bd19974223 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -356,6 +356,20 @@ out_unmap:
 }
 EXPORT_SYMBOL(dma_direct_map_sg);
 
+dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	dma_addr_t dma_addr = paddr;
+
+	if (unlikely(!dma_direct_possible(dev, dma_addr, size))) {
+		report_addr(dev, dma_addr, size);
+		return DMA_MAPPING_ERROR;
+	}
+
+	return dma_addr;
+}
+EXPORT_SYMBOL(dma_direct_map_resource);
+
 /*
  * Because 32-bit DMA masks are so common we expect every architecture to be
  * able to satisfy them - either by not supporting more physical memory, or by
-- 
cgit v1.2.3


From 645386dfe6307dbb28f10a4513792a59beda0efa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jan 2019 17:17:53 +0100
Subject: dma-mapping: don't BUG when calling dma_map_resource on RAM

Use WARN_ON_ONCE to print a stack trace and return a proper error
code instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 include/linux/dma-mapping.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 9842085e6774..b904d55247ab 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -353,7 +353,8 @@ static inline dma_addr_t dma_map_resource(struct device *dev,
 	BUG_ON(!valid_dma_direction(dir));
 
 	/* Don't allow RAM to be mapped */
-	BUG_ON(pfn_valid(PHYS_PFN(phys_addr)));
+	if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
+		return DMA_MAPPING_ERROR;
 
 	if (dma_is_direct(ops))
 		addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
-- 
cgit v1.2.3


From 7403e48d7abd9a0114e9056e7615cfbbb5123b15 Mon Sep 17 00:00:00 2001
From: Kamil Konieczny <k.konieczny@partner.samsung.com>
Date: Tue, 22 Jan 2019 10:02:31 +0100
Subject: clk: samsung: dt-bindings: Add Exynos5433 IMEM CMU clock IDs

Add DT bindings to describe the IMEM CMU clocks for the SlimSSS
(Slim Security SubSystem).

Signed-off-by: Kamil Konieczny <k.konieczny@partner.samsung.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by:  Chanwoo Choi <cw00.choi@samsung.com>
[s.nawrocki@samsung.com: edited commit description]
Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
---
 include/dt-bindings/clock/exynos5433.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/exynos5433.h b/include/dt-bindings/clock/exynos5433.h
index 98bd85ce1e45..7cfce3a62050 100644
--- a/include/dt-bindings/clock/exynos5433.h
+++ b/include/dt-bindings/clock/exynos5433.h
@@ -1406,4 +1406,10 @@
 
 #define CAM1_NR_CLK					113
 
+/* CMU_IMEM */
+#define CLK_ACLK_SLIMSSS		2
+#define CLK_PCLK_SLIMSSS		35
+
+#define IMEM_NR_CLK			36
+
 #endif /* _DT_BINDINGS_CLOCK_EXYNOS5433_H */
-- 
cgit v1.2.3


From b80a40c659ba38e1295ab7e55b2c781abee64739 Mon Sep 17 00:00:00 2001
From: Kamil Konieczny <k.konieczny@partner.samsung.com>
Date: Tue, 22 Jan 2019 10:02:29 +0100
Subject: clk: samsung: exynos5433: Fix name typo in sssx

Fix typo in sssx name, there should be three letters 's'.

Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Kamil Konieczny <k.konieczny@partner.samsung.com>
Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
---
 drivers/clk/samsung/clk-exynos5433.c   | 2 +-
 include/dt-bindings/clock/exynos5433.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/samsung/clk-exynos5433.c b/drivers/clk/samsung/clk-exynos5433.c
index ea47f49abc7f..24c3360db65b 100644
--- a/drivers/clk/samsung/clk-exynos5433.c
+++ b/drivers/clk/samsung/clk-exynos5433.c
@@ -559,7 +559,7 @@ static const struct samsung_gate_clock top_gate_clks[] __initconst = {
 	/* ENABLE_ACLK_TOP */
 	GATE(CLK_ACLK_G3D_400, "aclk_g3d_400", "div_aclk_g3d_400",
 			ENABLE_ACLK_TOP, 30, CLK_IS_CRITICAL, 0),
-	GATE(CLK_ACLK_IMEM_SSX_266, "aclk_imem_ssx_266",
+	GATE(CLK_ACLK_IMEM_SSSX_266, "aclk_imem_sssx_266",
 			"div_aclk_imem_sssx_266", ENABLE_ACLK_TOP,
 			29, CLK_IGNORE_UNUSED, 0),
 	GATE(CLK_ACLK_BUS0_400, "aclk_bus0_400", "div_aclk_bus0_400",
diff --git a/include/dt-bindings/clock/exynos5433.h b/include/dt-bindings/clock/exynos5433.h
index 7cfce3a62050..25ffa53573a5 100644
--- a/include/dt-bindings/clock/exynos5433.h
+++ b/include/dt-bindings/clock/exynos5433.h
@@ -156,7 +156,7 @@
 #define CLK_ACLK_G2D_266		220
 #define CLK_ACLK_G2D_400		221
 #define CLK_ACLK_G3D_400		222
-#define CLK_ACLK_IMEM_SSX_266		223
+#define CLK_ACLK_IMEM_SSSX_266		223
 #define CLK_ACLK_BUS0_400		224
 #define CLK_ACLK_BUS1_400		225
 #define CLK_ACLK_IMEM_200		226
-- 
cgit v1.2.3


From d0a060be573bfbf8753a15dca35497db5e968bb0 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 30 Jan 2019 12:02:44 +0000
Subject: arm64: add ptrace regsets for ptrauth key management

Add two new ptrace regsets, which can be used to request and change the
pointer authentication keys of a thread. NT_ARM_PACA_KEYS gives access
to the instruction/data address keys, and NT_ARM_PACG_KEYS to the
generic authentication key. The keys are also part of the core dump file
of the process.

The regsets are only exposed if the kernel is compiled with
CONFIG_CHECKPOINT_RESTORE=y, as the only intended use case is
checkpointing and restoring processes that are using pointer
authentication. (This can be changed later if there are other use
cases.)

Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/pointer-authentication.txt |   5 +
 arch/arm64/include/uapi/asm/ptrace.h           |  13 +++
 arch/arm64/kernel/ptrace.c                     | 147 +++++++++++++++++++++++++
 include/uapi/linux/elf.h                       |   2 +
 4 files changed, 167 insertions(+)

(limited to 'include')

diff --git a/Documentation/arm64/pointer-authentication.txt b/Documentation/arm64/pointer-authentication.txt
index a25cd21290e9..5baca42ba146 100644
--- a/Documentation/arm64/pointer-authentication.txt
+++ b/Documentation/arm64/pointer-authentication.txt
@@ -78,6 +78,11 @@ bits can vary between the two. Note that the masks apply to TTBR0
 addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel
 pointers).
 
+Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel
+will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct
+user_pac_address_keys and struct user_pac_generic_keys). These can be
+used to get and set the keys for a thread.
+
 
 Virtualization
 --------------
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 28d77c9ed531..d78623acb649 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -233,6 +233,19 @@ struct user_pac_mask {
 	__u64		insn_mask;
 };
 
+/* pointer authentication keys (NT_ARM_PACA_KEYS, NT_ARM_PACG_KEYS) */
+
+struct user_pac_address_keys {
+	__uint128_t	apiakey;
+	__uint128_t	apibkey;
+	__uint128_t	apdakey;
+	__uint128_t	apdbkey;
+};
+
+struct user_pac_generic_keys {
+	__uint128_t	apgakey;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI__ASM_PTRACE_H */
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 9dce33b0e260..a86413be5a2d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -979,6 +979,131 @@ static int pac_mask_get(struct task_struct *target,
 
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &uregs, 0, -1);
 }
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static __uint128_t pac_key_to_user(const struct ptrauth_key *key)
+{
+	return (__uint128_t)key->hi << 64 | key->lo;
+}
+
+static struct ptrauth_key pac_key_from_user(__uint128_t ukey)
+{
+	struct ptrauth_key key = {
+		.lo = (unsigned long)ukey,
+		.hi = (unsigned long)(ukey >> 64),
+	};
+
+	return key;
+}
+
+static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys,
+				     const struct ptrauth_keys *keys)
+{
+	ukeys->apiakey = pac_key_to_user(&keys->apia);
+	ukeys->apibkey = pac_key_to_user(&keys->apib);
+	ukeys->apdakey = pac_key_to_user(&keys->apda);
+	ukeys->apdbkey = pac_key_to_user(&keys->apdb);
+}
+
+static void pac_address_keys_from_user(struct ptrauth_keys *keys,
+				       const struct user_pac_address_keys *ukeys)
+{
+	keys->apia = pac_key_from_user(ukeys->apiakey);
+	keys->apib = pac_key_from_user(ukeys->apibkey);
+	keys->apda = pac_key_from_user(ukeys->apdakey);
+	keys->apdb = pac_key_from_user(ukeys->apdbkey);
+}
+
+static int pac_address_keys_get(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				void *kbuf, void __user *ubuf)
+{
+	struct ptrauth_keys *keys = &target->thread.keys_user;
+	struct user_pac_address_keys user_keys;
+
+	if (!system_supports_address_auth())
+		return -EINVAL;
+
+	pac_address_keys_to_user(&user_keys, keys);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &user_keys, 0, -1);
+}
+
+static int pac_address_keys_set(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				const void *kbuf, const void __user *ubuf)
+{
+	struct ptrauth_keys *keys = &target->thread.keys_user;
+	struct user_pac_address_keys user_keys;
+	int ret;
+
+	if (!system_supports_address_auth())
+		return -EINVAL;
+
+	pac_address_keys_to_user(&user_keys, keys);
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &user_keys, 0, -1);
+	if (ret)
+		return ret;
+	pac_address_keys_from_user(keys, &user_keys);
+
+	return 0;
+}
+
+static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys,
+				     const struct ptrauth_keys *keys)
+{
+	ukeys->apgakey = pac_key_to_user(&keys->apga);
+}
+
+static void pac_generic_keys_from_user(struct ptrauth_keys *keys,
+				       const struct user_pac_generic_keys *ukeys)
+{
+	keys->apga = pac_key_from_user(ukeys->apgakey);
+}
+
+static int pac_generic_keys_get(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				void *kbuf, void __user *ubuf)
+{
+	struct ptrauth_keys *keys = &target->thread.keys_user;
+	struct user_pac_generic_keys user_keys;
+
+	if (!system_supports_generic_auth())
+		return -EINVAL;
+
+	pac_generic_keys_to_user(&user_keys, keys);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &user_keys, 0, -1);
+}
+
+static int pac_generic_keys_set(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				const void *kbuf, const void __user *ubuf)
+{
+	struct ptrauth_keys *keys = &target->thread.keys_user;
+	struct user_pac_generic_keys user_keys;
+	int ret;
+
+	if (!system_supports_generic_auth())
+		return -EINVAL;
+
+	pac_generic_keys_to_user(&user_keys, keys);
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &user_keys, 0, -1);
+	if (ret)
+		return ret;
+	pac_generic_keys_from_user(keys, &user_keys);
+
+	return 0;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
 #endif /* CONFIG_ARM64_PTR_AUTH */
 
 enum aarch64_regset {
@@ -995,6 +1120,10 @@ enum aarch64_regset {
 #endif
 #ifdef CONFIG_ARM64_PTR_AUTH
 	REGSET_PAC_MASK,
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	REGSET_PACA_KEYS,
+	REGSET_PACG_KEYS,
+#endif
 #endif
 };
 
@@ -1074,6 +1203,24 @@ static const struct user_regset aarch64_regsets[] = {
 		.get = pac_mask_get,
 		/* this cannot be set dynamically */
 	},
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	[REGSET_PACA_KEYS] = {
+		.core_note_type = NT_ARM_PACA_KEYS,
+		.n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t),
+		.size = sizeof(__uint128_t),
+		.align = sizeof(__uint128_t),
+		.get = pac_address_keys_get,
+		.set = pac_address_keys_set,
+	},
+	[REGSET_PACG_KEYS] = {
+		.core_note_type = NT_ARM_PACG_KEYS,
+		.n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t),
+		.size = sizeof(__uint128_t),
+		.align = sizeof(__uint128_t),
+		.get = pac_generic_keys_get,
+		.set = pac_generic_keys_set,
+	},
+#endif
 #endif
 };
 
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index e4d6ddd93567..34c02e4290fe 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -421,6 +421,8 @@ typedef struct elf64_shdr {
 #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
 #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
 #define NT_ARM_PAC_MASK		0x406	/* ARM pointer authentication code masks */
+#define NT_ARM_PACA_KEYS	0x407	/* ARM pointer authentication address keys */
+#define NT_ARM_PACG_KEYS	0x408	/* ARM pointer authentication generic key */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
 #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 #define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
-- 
cgit v1.2.3


From b8580e9de48bf32b884910d22330ef2fa027cf01 Mon Sep 17 00:00:00 2001
From: Shunyong Yang <shunyong.yang@hxt-semitech.com>
Date: Fri, 1 Feb 2019 17:11:14 -0600
Subject: PCI: Add HXT vendor ID

Add the HXT vendor ID to pci_ids.h.

Signed-off-by: Shunyong Yang <shunyong.yang@hxt-semitech.com>
[bhelgaas: split to separate patch]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Sinan Kaya <okaya@kernel.org>
CC: Joey Zheng <yu.zheng@hxt-semitech.com>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5eaf39dbc388..26420e619dd7 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2573,6 +2573,8 @@
 
 #define PCI_VENDOR_ID_HYGON		0x1d94
 
+#define PCI_VENDOR_ID_HXT		0x1dbf
+
 #define PCI_VENDOR_ID_TEKRAM		0x1de1
 #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
 
-- 
cgit v1.2.3


From 0ce26a1c31ca928df4dfc7504c8898b71ff9f5d5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 1 Feb 2019 17:24:52 -0600
Subject: PCI: Move Rohm Vendor ID to generic list

Move the Rohm Vendor ID to pci_ids.h instead of defining it in several
drivers.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/dma/pch_dma.c                                | 1 -
 drivers/gpio/gpio-ml-ioh.c                           | 2 --
 drivers/gpio/gpio-pch.c                              | 1 -
 drivers/i2c/busses/i2c-eg20t.c                       | 1 -
 drivers/misc/pch_phub.c                              | 1 -
 drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 7 ++-----
 drivers/spi/spi-topcliff-pch.c                       | 1 -
 drivers/tty/serial/pch_uart.c                        | 2 --
 drivers/usb/gadget/udc/pch_udc.c                     | 1 -
 include/linux/pci_ids.h                              | 2 ++
 10 files changed, 4 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
index afd8f27bda96..538b6e0e17bb 100644
--- a/drivers/dma/pch_dma.c
+++ b/drivers/dma/pch_dma.c
@@ -972,7 +972,6 @@ static void pch_dma_remove(struct pci_dev *pdev)
 }
 
 /* PCI Device ID of DMA device */
-#define PCI_VENDOR_ID_ROHM             0x10DB
 #define PCI_DEVICE_ID_EG20T_PCH_DMA_8CH        0x8810
 #define PCI_DEVICE_ID_EG20T_PCH_DMA_4CH        0x8815
 #define PCI_DEVICE_ID_ML7213_DMA1_8CH	0x8026
diff --git a/drivers/gpio/gpio-ml-ioh.c b/drivers/gpio/gpio-ml-ioh.c
index 51c7d1b84c2e..0c076dce9e17 100644
--- a/drivers/gpio/gpio-ml-ioh.c
+++ b/drivers/gpio/gpio-ml-ioh.c
@@ -31,8 +31,6 @@
 
 #define IOH_IRQ_BASE		0
 
-#define PCI_VENDOR_ID_ROHM             0x10DB
-
 struct ioh_reg_comn {
 	u32	ien;
 	u32	istatus;
diff --git a/drivers/gpio/gpio-pch.c b/drivers/gpio/gpio-pch.c
index ee79e5f88b5a..1d99293096f2 100644
--- a/drivers/gpio/gpio-pch.c
+++ b/drivers/gpio/gpio-pch.c
@@ -437,7 +437,6 @@ static int __maybe_unused pch_gpio_resume(struct device *dev)
 
 static SIMPLE_DEV_PM_OPS(pch_gpio_pm_ops, pch_gpio_suspend, pch_gpio_resume);
 
-#define PCI_VENDOR_ID_ROHM             0x10DB
 static const struct pci_device_id pch_gpio_pcidev_id[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x8803) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_ROHM, 0x8014) },
diff --git a/drivers/i2c/busses/i2c-eg20t.c b/drivers/i2c/busses/i2c-eg20t.c
index 835d54ac2971..231675b10376 100644
--- a/drivers/i2c/busses/i2c-eg20t.c
+++ b/drivers/i2c/busses/i2c-eg20t.c
@@ -177,7 +177,6 @@ static wait_queue_head_t pch_event;
 static DEFINE_MUTEX(pch_mutex);
 
 /* Definition for ML7213 by LAPIS Semiconductor */
-#define PCI_VENDOR_ID_ROHM		0x10DB
 #define PCI_DEVICE_ID_ML7213_I2C	0x802D
 #define PCI_DEVICE_ID_ML7223_I2C	0x8010
 #define PCI_DEVICE_ID_ML7831_I2C	0x8817
diff --git a/drivers/misc/pch_phub.c b/drivers/misc/pch_phub.c
index 540845651b8c..309703e9c42e 100644
--- a/drivers/misc/pch_phub.c
+++ b/drivers/misc/pch_phub.c
@@ -64,7 +64,6 @@
 #define CLKCFG_UARTCLKSEL			(1 << 18)
 
 /* Macros for ML7213 */
-#define PCI_VENDOR_ID_ROHM			0x10db
 #define PCI_DEVICE_ID_ROHM_ML7213_PHUB		0x801A
 
 /* Macros for ML7223 */
diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index 43c0c10dfeb7..3a4225837049 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -27,7 +27,6 @@
 #define DRV_VERSION     "1.01"
 const char pch_driver_version[] = DRV_VERSION;
 
-#define PCI_DEVICE_ID_INTEL_IOH1_GBE	0x8802		/* Pci device ID */
 #define PCH_GBE_MAR_ENTRIES		16
 #define PCH_GBE_SHORT_PKT		64
 #define DSC_INIT16			0xC000
@@ -37,11 +36,9 @@ const char pch_driver_version[] = DRV_VERSION;
 #define PCH_GBE_PCI_BAR			1
 #define PCH_GBE_RESERVE_MEMORY		0x200000	/* 2MB */
 
-/* Macros for ML7223 */
-#define PCI_VENDOR_ID_ROHM			0x10db
-#define PCI_DEVICE_ID_ROHM_ML7223_GBE		0x8013
+#define PCI_DEVICE_ID_INTEL_IOH1_GBE		0x8802
 
-/* Macros for ML7831 */
+#define PCI_DEVICE_ID_ROHM_ML7223_GBE		0x8013
 #define PCI_DEVICE_ID_ROHM_ML7831_GBE		0x8802
 
 #define PCH_GBE_TX_WEIGHT         64
diff --git a/drivers/spi/spi-topcliff-pch.c b/drivers/spi/spi-topcliff-pch.c
index 97d137591b18..d794180e83dc 100644
--- a/drivers/spi/spi-topcliff-pch.c
+++ b/drivers/spi/spi-topcliff-pch.c
@@ -92,7 +92,6 @@
 #define PCH_MAX_SPBR		1023
 
 /* Definition for ML7213/ML7223/ML7831 by LAPIS Semiconductor */
-#define PCI_VENDOR_ID_ROHM		0x10DB
 #define PCI_DEVICE_ID_ML7213_SPI	0x802c
 #define PCI_DEVICE_ID_ML7223_SPI	0x800F
 #define PCI_DEVICE_ID_ML7831_SPI	0x8816
diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c
index 9ed121f08a54..6157213a8359 100644
--- a/drivers/tty/serial/pch_uart.c
+++ b/drivers/tty/serial/pch_uart.c
@@ -192,8 +192,6 @@ enum {
 #define PCH_UART_HAL_LOOP		(PCH_UART_MCR_LOOP)
 #define PCH_UART_HAL_AFE		(PCH_UART_MCR_AFE)
 
-#define PCI_VENDOR_ID_ROHM		0x10DB
-
 #define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
 
 #define DEFAULT_UARTCLK   1843200 /*   1.8432 MHz */
diff --git a/drivers/usb/gadget/udc/pch_udc.c b/drivers/usb/gadget/udc/pch_udc.c
index 55c8c8abeacd..cded51f36fc1 100644
--- a/drivers/usb/gadget/udc/pch_udc.c
+++ b/drivers/usb/gadget/udc/pch_udc.c
@@ -368,7 +368,6 @@ struct pch_udc_dev {
 #define PCI_DEVICE_ID_INTEL_QUARK_X1000_UDC	0x0939
 #define PCI_DEVICE_ID_INTEL_EG20T_UDC		0x8808
 
-#define PCI_VENDOR_ID_ROHM		0x10DB
 #define PCI_DEVICE_ID_ML7213_IOH_UDC	0x801D
 #define PCI_DEVICE_ID_ML7831_IOH_UDC	0x8808
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 26420e619dd7..70e86148cb1e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1140,6 +1140,8 @@
 #define PCI_VENDOR_ID_TCONRAD		0x10da
 #define PCI_DEVICE_ID_TCONRAD_TOKENRING	0x0508
 
+#define PCI_VENDOR_ID_ROHM		0x10db
+
 #define PCI_VENDOR_ID_NVIDIA			0x10de
 #define PCI_DEVICE_ID_NVIDIA_TNT		0x0020
 #define PCI_DEVICE_ID_NVIDIA_TNT2		0x0028
-- 
cgit v1.2.3


From d5d30d5a5c60628de5e77e3f292a8f9012d51350 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 2 Feb 2019 16:35:26 -0800
Subject: libnvdimm/dimm: Add a no-BLK quirk based on NVDIMM family

As Dexuan reports the NVDIMM_FAMILY_HYPERV platform is incompatible with
the existing Linux namespace implementation because it uses
NSLABEL_FLAG_LOCAL for x1-width PMEM interleave sets. Quirk it as an
platform / DIMM that does not provide BLK-aperture access. Allow the
libnvdimm core to assume no potential for aliasing. In case other
implementations make the same mistake, provide a "noblk" module
parameter to force-enable the quirk.

Link: https://lkml.kernel.org/r/PU1P153MB0169977604493B82B662A01CBF920@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM
Reported-by: Dexuan Cui <decui@microsoft.com>
Tested-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c        | 4 ++++
 drivers/nvdimm/dimm_devs.c      | 7 +++++++
 drivers/nvdimm/label.c          | 3 +++
 drivers/nvdimm/namespace_devs.c | 6 ++++++
 drivers/nvdimm/region_devs.c    | 7 +++++++
 include/linux/libnvdimm.h       | 2 ++
 6 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 4a7e8b1fa43b..811c399a3a76 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2016,6 +2016,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 			cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
 		}
 
+		/* Quirk to ignore LOCAL for labels on HYPERV DIMMs */
+		if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
+			set_bit(NDD_NOBLK, &flags);
+
 		if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) {
 			set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
 			set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 4890310df874..553aa78abeee 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -11,6 +11,7 @@
  * General Public License for more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/moduleparam.h>
 #include <linux/vmalloc.h>
 #include <linux/device.h>
 #include <linux/ndctl.h>
@@ -25,6 +26,10 @@
 
 static DEFINE_IDA(dimm_ida);
 
+static bool noblk;
+module_param(noblk, bool, 0444);
+MODULE_PARM_DESC(noblk, "force disable BLK / local alias support");
+
 /*
  * Retrieve bus and dimm handle and return if this bus supports
  * get_config_data commands
@@ -551,6 +556,8 @@ struct nvdimm *__nvdimm_create(struct nvdimm_bus *nvdimm_bus,
 
 	nvdimm->dimm_id = dimm_id;
 	nvdimm->provider_data = provider_data;
+	if (noblk)
+		flags |= 1 << NDD_NOBLK;
 	nvdimm->flags = flags;
 	nvdimm->cmd_mask = cmd_mask;
 	nvdimm->num_flush = num_flush;
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 6d6e9a12150b..f3d753d3169c 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -392,6 +392,7 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
 		return 0; /* no label, nothing to reserve */
 
 	for_each_clear_bit_le(slot, free, nslot) {
+		struct nvdimm *nvdimm = to_nvdimm(ndd->dev);
 		struct nd_namespace_label *nd_label;
 		struct nd_region *nd_region = NULL;
 		u8 label_uuid[NSLABEL_UUID_LEN];
@@ -406,6 +407,8 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
 
 		memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
 		flags = __le32_to_cpu(nd_label->flags);
+		if (test_bit(NDD_NOBLK, &nvdimm->flags))
+			flags &= ~NSLABEL_FLAG_LOCAL;
 		nd_label_gen_id(&label_id, label_uuid, flags);
 		res = nvdimm_allocate_dpa(ndd, &label_id,
 				__le64_to_cpu(nd_label->dpa),
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 4b077555ac70..3677b0c4a33d 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -2492,6 +2492,12 @@ static int init_active_labels(struct nd_region *nd_region)
 			if (!label_ent)
 				break;
 			label = nd_label_active(ndd, j);
+			if (test_bit(NDD_NOBLK, &nvdimm->flags)) {
+				u32 flags = __le32_to_cpu(label->flags);
+
+				flags &= ~NSLABEL_FLAG_LOCAL;
+				label->flags = __cpu_to_le32(flags);
+			}
 			label_ent->label = label;
 
 			mutex_lock(&nd_mapping->lock);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index e2818f94f292..3b58baa44b5c 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1003,6 +1003,13 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 		if (test_bit(NDD_UNARMED, &nvdimm->flags))
 			ro = 1;
+
+		if (test_bit(NDD_NOBLK, &nvdimm->flags)
+				&& dev_type == &nd_blk_device_type) {
+			dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not BLK capable\n",
+					caller, dev_name(&nvdimm->dev), i);
+			return NULL;
+		}
 	}
 
 	if (dev_type == &nd_blk_device_type) {
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 5440f11b0907..7da406ae3a2b 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -42,6 +42,8 @@ enum {
 	NDD_SECURITY_OVERWRITE = 3,
 	/*  tracking whether or not there is a pending device reference */
 	NDD_WORK_PENDING = 4,
+	/* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */
+	NDD_NOBLK = 5,
 
 	/* need to set a limit somewhere, but yes, this is likely overkill */
 	ND_IOCTL_MAX_BUFLEN = SZ_4M,
-- 
cgit v1.2.3


From 25db146aa7262bc95c314b2df35db107f5eadd55 Mon Sep 17 00:00:00 2001
From: Jian Hu <jian.hu@amlogic.com>
Date: Fri, 1 Feb 2019 15:53:43 +0100
Subject: dt-bindings: clk: meson: add g12a periph clock controller bindings

Add new clock controller compatible and dt-bindings header for the
Everything-Else domain of the g12a SoC

Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Jian Hu <jian.hu@amlogic.com>
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Reviewed-by: Neil Armstrong <narmstrong@baylibre.com>
Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Link: https://lkml.kernel.org/r/20190201145345.6795-3-jbrunet@baylibre.com
---
 .../bindings/clock/amlogic,gxbb-clkc.txt           |   1 +
 include/dt-bindings/clock/g12a-clkc.h              | 135 +++++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 include/dt-bindings/clock/g12a-clkc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
index a6871953bf04..5c8b105be4d6 100644
--- a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
+++ b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
@@ -9,6 +9,7 @@ Required Properties:
 		"amlogic,gxbb-clkc" for GXBB SoC,
 		"amlogic,gxl-clkc" for GXL and GXM SoC,
 		"amlogic,axg-clkc" for AXG SoC.
+		"amlogic,g12a-clkc" for G12A SoC.
 - clocks : list of clock phandle, one for each entry clock-names.
 - clock-names : should contain the following:
   * "xtal": the platform xtal
diff --git a/include/dt-bindings/clock/g12a-clkc.h b/include/dt-bindings/clock/g12a-clkc.h
new file mode 100644
index 000000000000..83b657038d1e
--- /dev/null
+++ b/include/dt-bindings/clock/g12a-clkc.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR MIT */
+/*
+ * Meson-G12A clock tree IDs
+ *
+ * Copyright (c) 2018 Amlogic, Inc. All rights reserved.
+ */
+
+#ifndef __G12A_CLKC_H
+#define __G12A_CLKC_H
+
+#define CLKID_SYS_PLL				0
+#define CLKID_FIXED_PLL				1
+#define CLKID_FCLK_DIV2				2
+#define CLKID_FCLK_DIV3				3
+#define CLKID_FCLK_DIV4				4
+#define CLKID_FCLK_DIV5				5
+#define CLKID_FCLK_DIV7				6
+#define CLKID_GP0_PLL				7
+#define CLKID_CLK81				10
+#define CLKID_MPLL0				11
+#define CLKID_MPLL1				12
+#define CLKID_MPLL2				13
+#define CLKID_MPLL3				14
+#define CLKID_DDR				15
+#define CLKID_DOS				16
+#define CLKID_AUDIO_LOCKER			17
+#define CLKID_MIPI_DSI_HOST			18
+#define CLKID_ETH_PHY				19
+#define CLKID_ISA				20
+#define CLKID_PL301				21
+#define CLKID_PERIPHS				22
+#define CLKID_SPICC0				23
+#define CLKID_I2C				24
+#define CLKID_SANA				25
+#define CLKID_SD				26
+#define CLKID_RNG0				27
+#define CLKID_UART0				28
+#define CLKID_SPICC1				29
+#define CLKID_HIU_IFACE				30
+#define CLKID_MIPI_DSI_PHY			31
+#define CLKID_ASSIST_MISC			32
+#define CLKID_SD_EMMC_A				33
+#define CLKID_SD_EMMC_B				34
+#define CLKID_SD_EMMC_C				35
+#define CLKID_AUDIO_CODEC			36
+#define CLKID_AUDIO				37
+#define CLKID_ETH				38
+#define CLKID_DEMUX				39
+#define CLKID_AUDIO_IFIFO			40
+#define CLKID_ADC				41
+#define CLKID_UART1				42
+#define CLKID_G2D				43
+#define CLKID_RESET				44
+#define CLKID_PCIE_COMB				45
+#define CLKID_PARSER				46
+#define CLKID_USB				47
+#define CLKID_PCIE_PHY				48
+#define CLKID_AHB_ARB0				49
+#define CLKID_AHB_DATA_BUS			50
+#define CLKID_AHB_CTRL_BUS			51
+#define CLKID_HTX_HDCP22			52
+#define CLKID_HTX_PCLK				53
+#define CLKID_BT656				54
+#define CLKID_USB1_DDR_BRIDGE			55
+#define CLKID_MMC_PCLK				56
+#define CLKID_UART2				57
+#define CLKID_VPU_INTR				58
+#define CLKID_GIC				59
+#define CLKID_SD_EMMC_A_CLK0			60
+#define CLKID_SD_EMMC_B_CLK0			61
+#define CLKID_SD_EMMC_C_CLK0			62
+#define CLKID_HIFI_PLL				74
+#define CLKID_VCLK2_VENCI0			80
+#define CLKID_VCLK2_VENCI1			81
+#define CLKID_VCLK2_VENCP0			82
+#define CLKID_VCLK2_VENCP1			83
+#define CLKID_VCLK2_VENCT0			84
+#define CLKID_VCLK2_VENCT1			85
+#define CLKID_VCLK2_OTHER			86
+#define CLKID_VCLK2_ENCI			87
+#define CLKID_VCLK2_ENCP			88
+#define CLKID_DAC_CLK				89
+#define CLKID_AOCLK				90
+#define CLKID_IEC958				91
+#define CLKID_ENC480P				92
+#define CLKID_RNG1				93
+#define CLKID_VCLK2_ENCT			94
+#define CLKID_VCLK2_ENCL			95
+#define CLKID_VCLK2_VENCLMMC			96
+#define CLKID_VCLK2_VENCL			97
+#define CLKID_VCLK2_OTHER1			98
+#define CLKID_FCLK_DIV2P5			99
+#define CLKID_DMA				105
+#define CLKID_EFUSE				106
+#define CLKID_ROM_BOOT				107
+#define CLKID_RESET_SEC				108
+#define CLKID_SEC_AHB_APB3			109
+#define CLKID_VPU_0_SEL				110
+#define CLKID_VPU_0				112
+#define CLKID_VPU_1_SEL				113
+#define CLKID_VPU_1				115
+#define CLKID_VPU				116
+#define CLKID_VAPB_0_SEL			117
+#define CLKID_VAPB_0				119
+#define CLKID_VAPB_1_SEL			120
+#define CLKID_VAPB_1				122
+#define CLKID_VAPB_SEL				123
+#define CLKID_VAPB				124
+#define CLKID_HDMI_PLL				128
+#define CLKID_VID_PLL				129
+#define CLKID_VCLK				138
+#define CLKID_VCLK2				139
+#define CLKID_VCLK_DIV1				148
+#define CLKID_VCLK_DIV2				149
+#define CLKID_VCLK_DIV4				150
+#define CLKID_VCLK_DIV6				151
+#define CLKID_VCLK_DIV12			152
+#define CLKID_VCLK2_DIV1			153
+#define CLKID_VCLK2_DIV2			154
+#define CLKID_VCLK2_DIV4			155
+#define CLKID_VCLK2_DIV6			156
+#define CLKID_VCLK2_DIV12			157
+#define CLKID_CTS_ENCI				162
+#define CLKID_CTS_ENCP				163
+#define CLKID_CTS_VDAC				164
+#define CLKID_HDMI_TX				165
+#define CLKID_HDMI				168
+#define CLKID_MALI_0_SEL			169
+#define CLKID_MALI_0				171
+#define CLKID_MALI_1_SEL			172
+#define CLKID_MALI_1				174
+#define CLKID_MALI				175
+#define CLKID_MPLL_5OM				177
+
+#endif /* __G12A_CLKC_H */
-- 
cgit v1.2.3


From 77000bc43da17d5d6bc4ebfaf44d52d43bb69492 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 4 Feb 2019 16:31:04 +0100
Subject: uio: remove the unused iov_for_each macro

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 .clang-format       | 1 -
 include/linux/uio.h | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index e6080f5834a3..c144d9c24d5d 100644
--- a/.clang-format
+++ b/.clang-format
@@ -259,7 +259,6 @@ ForEachMacros:
   - 'idr_for_each_entry_ul'
   - 'inet_bind_bucket_for_each'
   - 'inet_lhash2_for_each_icsk_rcu'
-  - 'iov_for_each'
   - 'key_for_each'
   - 'key_for_each_safe'
   - 'klp_for_each_func'
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ecf584f6b82d..87477e1640f9 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -110,14 +110,6 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
 	};
 }
 
-#define iov_for_each(iov, iter, start)				\
-	if (iov_iter_type(start) == ITER_IOVEC ||		\
-	    iov_iter_type(start) == ITER_KVEC)			\
-	for (iter = (start);					\
-	     (iter).count &&					\
-	     ((iov = iov_iter_iovec(&(iter))), 1);		\
-	     iov_iter_advance(&(iter), (iov).iov_len))
-
 size_t iov_iter_copy_from_user_atomic(struct page *page,
 		struct iov_iter *i, unsigned long offset, size_t bytes);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
-- 
cgit v1.2.3


From da82334219bc386ef7ea5b4b185a339a973dd513 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Tue, 22 Jan 2019 08:48:41 +0200
Subject: IB/core: Allocate a bit for SRQ ODP support

The ODP support matrix is per operation and per transport. The support for
each transport (RC, UD, etc.) is described with a bit field.

ODP for SRQ WQEs is considered a different kind of support from ODP for RQ
WQs and therefore needs a different capability bit.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/rdma/ib_verbs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 5fc3be884444..5eefdea62831 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -268,6 +268,7 @@ enum ib_odp_transport_cap_bits {
 	IB_ODP_SUPPORT_WRITE	= 1 << 2,
 	IB_ODP_SUPPORT_READ	= 1 << 3,
 	IB_ODP_SUPPORT_ATOMIC	= 1 << 4,
+	IB_ODP_SUPPORT_SRQ_RECV	= 1 << 5,
 };
 
 struct ib_odp_caps {
-- 
cgit v1.2.3


From 52a72e2a395fa3c5ab5df41058a8511e87215730 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Tue, 22 Jan 2019 08:48:42 +0200
Subject: IB/uverbs: Expose XRC ODP device capabilities

Expose XRC ODP capabilities as part of the extended device capabilities.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_cmd.c | 1 +
 include/rdma/ib_verbs.h              | 1 +
 include/uapi/rdma/ib_user_verbs.h    | 2 ++
 3 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index d4f1a2ef5015..68c4ea514faf 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3613,6 +3613,7 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs)
 		attr.odp_caps.per_transport_caps.uc_odp_caps;
 	resp.odp_caps.per_transport_caps.ud_odp_caps =
 		attr.odp_caps.per_transport_caps.ud_odp_caps;
+	resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps;
 
 	resp.timestamp_mask = attr.timestamp_mask;
 	resp.hca_core_clock = attr.hca_core_clock;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 5eefdea62831..8219c07340a9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -277,6 +277,7 @@ struct ib_odp_caps {
 		uint32_t  rc_odp_caps;
 		uint32_t  uc_odp_caps;
 		uint32_t  ud_odp_caps;
+		uint32_t  xrc_odp_caps;
 	} per_transport_caps;
 };
 
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 480d9a60b68e..0474c7400268 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -270,6 +270,8 @@ struct ib_uverbs_ex_query_device_resp {
 	struct ib_uverbs_tm_caps tm_caps;
 	struct ib_uverbs_cq_moderation_caps cq_moderation_caps;
 	__aligned_u64 max_dm_size;
+	__u32 xrc_odp_caps;
+	__u32 reserved;
 };
 
 struct ib_uverbs_query_port {
-- 
cgit v1.2.3


From a163afc88556e099271a7b423295bc5176fcecce Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 31 Jan 2019 08:30:34 -0800
Subject: IB/core: Remove ib_sg_dma_address() and ib_sg_dma_len()

Keeping single line wrapper functions is not useful. Hence remove the
ib_sg_dma_address() and ib_sg_dma_len() functions. This patch does not
change any functionality.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/rw.c              | 12 +++++-------
 drivers/infiniband/ulp/iser/iser_memory.c |  9 ++++-----
 drivers/infiniband/ulp/srp/ib_srp.c       | 17 +++++++----------
 include/rdma/ib_verbs.h                   | 27 ---------------------------
 net/rds/ib.h                              | 12 ++++--------
 net/rds/ib_fmr.c                          |  8 ++++----
 net/rds/ib_frmr.c                         |  4 ++--
 net/rds/ib_recv.c                         |  8 +++-----
 net/rds/ib_send.c                         | 15 +++++++--------
 9 files changed, 36 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index d22c4a2ebac6..89a5be3a2f97 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -179,7 +179,6 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		struct scatterlist *sg, u32 sg_cnt, u32 offset,
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
-	struct ib_device *dev = qp->pd->device;
 	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
 		      qp->max_read_sge;
 	struct ib_sge *sge;
@@ -209,8 +208,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		rdma_wr->wr.sg_list = sge;
 
 		for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
-			sge->addr = ib_sg_dma_address(dev, sg) + offset;
-			sge->length = ib_sg_dma_len(dev, sg) - offset;
+			sge->addr = sg_dma_address(sg) + offset;
+			sge->length = sg_dma_len(sg) - offset;
 			sge->lkey = qp->pd->local_dma_lkey;
 
 			total_len += sge->length;
@@ -236,14 +235,13 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
 		enum dma_data_direction dir)
 {
-	struct ib_device *dev = qp->pd->device;
 	struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
 
 	ctx->nr_ops = 1;
 
 	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
-	ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
-	ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
+	ctx->single.sge.addr = sg_dma_address(sg) + offset;
+	ctx->single.sge.length = sg_dma_len(sg) - offset;
 
 	memset(rdma_wr, 0, sizeof(*rdma_wr));
 	if (dir == DMA_TO_DEVICE)
@@ -294,7 +292,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
 	 * Skip to the S/G entry that sg_offset falls into:
 	 */
 	for (;;) {
-		u32 len = ib_sg_dma_len(dev, sg);
+		u32 len = sg_dma_len(sg);
 
 		if (sg_offset < len)
 			break;
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 394d1b9c2ff7..2ba70729d7b0 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -145,9 +145,8 @@ static void iser_data_buf_dump(struct iser_data_buf *data,
 	for_each_sg(data->sg, sg, data->dma_nents, i)
 		iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
 			 "off:0x%x sz:0x%x dma_len:0x%x\n",
-			 i, (unsigned long)ib_sg_dma_address(ibdev, sg),
-			 sg_page(sg), sg->offset,
-			 sg->length, ib_sg_dma_len(ibdev, sg));
+			 i, (unsigned long)sg_dma_address(sg),
+			 sg_page(sg), sg->offset, sg->length, sg_dma_len(sg));
 }
 
 static void iser_dump_page_vec(struct iser_page_vec *page_vec)
@@ -204,8 +203,8 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
 		reg->rkey = device->pd->unsafe_global_rkey;
 	else
 		reg->rkey = 0;
-	reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
-	reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
+	reg->sge.addr = sg_dma_address(&sg[0]);
+	reg->sge.length = sg_dma_len(&sg[0]);
 
 	iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
 		 " length=0x%x\n", reg->sge.lkey, reg->rkey,
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 29fe46dbdbf1..84184910f038 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1600,9 +1600,8 @@ static int srp_map_sg_entry(struct srp_map_state *state,
 {
 	struct srp_target_port *target = ch->target;
 	struct srp_device *dev = target->srp_host->srp_dev;
-	struct ib_device *ibdev = dev->dev;
-	dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
-	unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
+	dma_addr_t dma_addr = sg_dma_address(sg);
+	unsigned int dma_len = sg_dma_len(sg);
 	unsigned int len = 0;
 	int ret;
 
@@ -1696,13 +1695,11 @@ static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch,
 			  int count)
 {
 	struct srp_target_port *target = ch->target;
-	struct srp_device *dev = target->srp_host->srp_dev;
 	struct scatterlist *sg;
 	int i;
 
 	for_each_sg(scat, sg, count, i) {
-		srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
-			     ib_sg_dma_len(dev->dev, sg),
+		srp_map_desc(state, sg_dma_address(sg), sg_dma_len(sg),
 			     target->global_rkey);
 	}
 
@@ -1852,8 +1849,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 		buf->len = cpu_to_be32(data_len);
 		WARN_ON_ONCE((void *)(buf + 1) > (void *)cmd + len);
 		for_each_sg(scat, sg, count, i) {
-			sge[i].addr   = ib_sg_dma_address(ibdev, sg);
-			sge[i].length = ib_sg_dma_len(ibdev, sg);
+			sge[i].addr   = sg_dma_address(sg);
+			sge[i].length = sg_dma_len(sg);
 			sge[i].lkey   = target->lkey;
 		}
 		req->cmd->num_sge += count;
@@ -1874,9 +1871,9 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 		struct srp_direct_buf *buf;
 
 		buf = (void *)cmd->add_data + cmd->add_cdb_len;
-		buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
+		buf->va  = cpu_to_be64(sg_dma_address(scat));
 		buf->key = cpu_to_be32(target->global_rkey);
-		buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
+		buf->len = cpu_to_be32(sg_dma_len(scat));
 
 		req->nmdesc = 0;
 		goto map_complete;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 8219c07340a9..f7e8709e48cd 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3705,33 +3705,6 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
 {
 	dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs);
 }
-/**
- * ib_sg_dma_address - Return the DMA address from a scatter/gather entry
- * @dev: The device for which the DMA addresses were created
- * @sg: The scatter/gather entry
- *
- * Note: this function is obsolete. To do: change all occurrences of
- * ib_sg_dma_address() into sg_dma_address().
- */
-static inline u64 ib_sg_dma_address(struct ib_device *dev,
-				    struct scatterlist *sg)
-{
-	return sg_dma_address(sg);
-}
-
-/**
- * ib_sg_dma_len - Return the DMA length from a scatter/gather entry
- * @dev: The device for which the DMA addresses were created
- * @sg: The scatter/gather entry
- *
- * Note: this function is obsolete. To do: change all occurrences of
- * ib_sg_dma_len() into sg_dma_len().
- */
-static inline unsigned int ib_sg_dma_len(struct ib_device *dev,
-					 struct scatterlist *sg)
-{
-	return sg_dma_len(sg);
-}
 
 /**
  * ib_dma_max_seg_size - Return the size limit of a single DMA transfer
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 71ff356ee702..1fd1cac85da2 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -331,10 +331,8 @@ static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
 	unsigned int i;
 
 	for_each_sg(sglist, sg, sg_dma_len, i) {
-		ib_dma_sync_single_for_cpu(dev,
-				ib_sg_dma_address(dev, sg),
-				ib_sg_dma_len(dev, sg),
-				direction);
+		ib_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
+					   sg_dma_len(sg), direction);
 	}
 }
 #define ib_dma_sync_sg_for_cpu	rds_ib_dma_sync_sg_for_cpu
@@ -348,10 +346,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 	unsigned int i;
 
 	for_each_sg(sglist, sg, sg_dma_len, i) {
-		ib_dma_sync_single_for_device(dev,
-				ib_sg_dma_address(dev, sg),
-				ib_sg_dma_len(dev, sg),
-				direction);
+		ib_dma_sync_single_for_device(dev, sg_dma_address(sg),
+					      sg_dma_len(sg), direction);
 	}
 }
 #define ib_dma_sync_sg_for_device	rds_ib_dma_sync_sg_for_device
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
index e0f70c4051b6..31cf37da4510 100644
--- a/net/rds/ib_fmr.c
+++ b/net/rds/ib_fmr.c
@@ -108,8 +108,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev,
 	page_cnt = 0;
 
 	for (i = 0; i < sg_dma_len; ++i) {
-		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
-		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+		unsigned int dma_len = sg_dma_len(&scat[i]);
+		u64 dma_addr = sg_dma_address(&scat[i]);
 
 		if (dma_addr & ~PAGE_MASK) {
 			if (i > 0) {
@@ -148,8 +148,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev,
 
 	page_cnt = 0;
 	for (i = 0; i < sg_dma_len; ++i) {
-		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
-		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+		unsigned int dma_len = sg_dma_len(&scat[i]);
+		u64 dma_addr = sg_dma_address(&scat[i]);
 
 		for (j = 0; j < dma_len; j += PAGE_SIZE)
 			dma_pages[page_cnt++] =
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 6431a023ac89..688dcd68d4ea 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -181,8 +181,8 @@ static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev,
 
 	ret = -EINVAL;
 	for (i = 0; i < ibmr->sg_dma_len; ++i) {
-		unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]);
-		u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]);
+		unsigned int dma_len = sg_dma_len(&ibmr->sg[i]);
+		u64 dma_addr = sg_dma_address(&ibmr->sg[i]);
 
 		frmr->sg_byte_len += dma_len;
 		if (dma_addr & ~PAGE_MASK) {
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 2f16146e4ec9..672b91a9e207 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -346,8 +346,8 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 	sge->length = sizeof(struct rds_header);
 
 	sge = &recv->r_sge[1];
-	sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg);
-	sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg);
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
 
 	ret = 0;
 out:
@@ -409,9 +409,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
 
 		rdsdebug("recv %p ibinc %p page %p addr %lu\n", recv,
 			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
-			 (long) ib_sg_dma_address(
-				ic->i_cm_id->device,
-				&recv->r_frag->f_sg));
+			 (long)sg_dma_address(&recv->r_frag->f_sg));
 
 		/* XXX when can this fail? */
 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL);
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 2dcb555e6350..dc5897a3a958 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -645,16 +645,16 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		if (i < work_alloc
 		    && scat != &rm->data.op_sg[rm->data.op_count]) {
 			len = min(RDS_FRAG_SIZE,
-				ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff);
+				  sg_dma_len(scat) - rm->data.op_dmaoff);
 			send->s_wr.num_sge = 2;
 
-			send->s_sge[1].addr = ib_sg_dma_address(dev, scat);
+			send->s_sge[1].addr = sg_dma_address(scat);
 			send->s_sge[1].addr += rm->data.op_dmaoff;
 			send->s_sge[1].length = len;
 
 			bytes_sent += len;
 			rm->data.op_dmaoff += len;
-			if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
+			if (rm->data.op_dmaoff == sg_dma_len(scat)) {
 				scat++;
 				rm->data.op_dmasg++;
 				rm->data.op_dmaoff = 0;
@@ -808,8 +808,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 	}
 
 	/* Convert our struct scatterlist to struct ib_sge */
-	send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
-	send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].addr = sg_dma_address(op->op_sg);
+	send->s_sge[0].length = sg_dma_len(op->op_sg);
 	send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
 
 	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
@@ -921,9 +921,8 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 
 		for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
 		     scat != &op->op_sg[op->op_count]; j++) {
-			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
-			send->s_sge[j].addr =
-				 ib_sg_dma_address(ic->i_cm_id->device, scat);
+			len = sg_dma_len(scat);
+			send->s_sge[j].addr = sg_dma_address(scat);
 			send->s_sge[j].length = len;
 			send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
 
-- 
cgit v1.2.3


From 219a3e8676f3132d27b530c7d2d6bcab89536b57 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@redhat.com>
Date: Mon, 21 Jan 2019 17:59:28 +0800
Subject: integrity, KEYS: add a reference to platform keyring

commit 9dc92c45177a ("integrity: Define a trusted platform keyring")
introduced a .platform keyring for storing preboot keys, used for
verifying kernel image signatures. Currently only IMA-appraisal is able
to use the keyring to verify kernel images that have their signature
stored in xattr.

This patch exposes the .platform keyring, making it accessible for
verifying PE signed kernel images as well.

Suggested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Kairui Song <kasong@redhat.com>
Cc: David Howells <dhowells@redhat.com>
[zohar@linux.ibm.com: fixed checkpatch errors, squashed with patch fix]
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 certs/system_keyring.c        | 10 ++++++++++
 include/keys/system_keyring.h |  8 ++++++++
 security/integrity/digsig.c   |  3 +++
 3 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 81728717523d..da055e901df4 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -24,6 +24,9 @@ static struct key *builtin_trusted_keys;
 #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
 static struct key *secondary_trusted_keys;
 #endif
+#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
+static struct key *platform_trusted_keys;
+#endif
 
 extern __initconst const u8 system_certificate_list[];
 extern __initconst const unsigned long system_certificate_list_size;
@@ -266,3 +269,10 @@ error:
 EXPORT_SYMBOL_GPL(verify_pkcs7_signature);
 
 #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+
+#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
+void __init set_platform_trusted_keys(struct key *keyring)
+{
+	platform_trusted_keys = keyring;
+}
+#endif
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index 359c2f936004..42a93eda331c 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -61,5 +61,13 @@ static inline struct key *get_ima_blacklist_keyring(void)
 }
 #endif /* CONFIG_IMA_BLACKLIST_KEYRING */
 
+#if defined(CONFIG_INTEGRITY_PLATFORM_KEYRING) && \
+	defined(CONFIG_SYSTEM_TRUSTED_KEYRING)
+extern void __init set_platform_trusted_keys(struct key *keyring);
+#else
+static inline void set_platform_trusted_keys(struct key *keyring)
+{
+}
+#endif
 
 #endif /* _KEYS_SYSTEM_KEYRING_H */
diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c
index f45d6edecf99..e19c2eb72c51 100644
--- a/security/integrity/digsig.c
+++ b/security/integrity/digsig.c
@@ -87,6 +87,9 @@ static int __integrity_init_keyring(const unsigned int id, key_perm_t perm,
 		pr_info("Can't allocate %s keyring (%d)\n",
 			keyring_name[id], err);
 		keyring[id] = NULL;
+	} else {
+		if (id == INTEGRITY_KEYRING_PLATFORM)
+			set_platform_trusted_keys(keyring[id]);
 	}
 
 	return err;
-- 
cgit v1.2.3


From 278311e417be60f7caef6fcb12bda4da2711ceff Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@redhat.com>
Date: Mon, 21 Jan 2019 17:59:29 +0800
Subject: kexec, KEYS: Make use of platform keyring for signature verify

This patch allows the kexec_file_load syscall to verify the PE signed
kernel image signature based on the preboot keys stored in the .platform
keyring, as fall back, if the signature verification failed due to not
finding the public key in the secondary or builtin keyrings.

This commit adds a VERIFY_USE_PLATFORM_KEYRING similar to previous
VERIFY_USE_SECONDARY_KEYRING indicating that verify_pkcs7_signature
should verify the signature using platform keyring.  Also, decrease
the error message log level when verification failed with -ENOKEY,
so that if called tried multiple time with different keyring it
won't generate extra noises.

Signed-off-by: Kairui Song <kasong@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Acked-by: Dave Young <dyoung@redhat.com> (for kexec_file_load part)
[zohar@linux.ibm.com: tweaked the first paragraph of the patch description,
 and fixed checkpatch warning.]
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 arch/x86/kernel/kexec-bzimage64.c | 14 +++++++++++---
 certs/system_keyring.c            | 13 ++++++++++++-
 include/linux/verification.h      |  1 +
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 278cd07228dd..e1215a600064 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -531,9 +531,17 @@ static int bzImage64_cleanup(void *loader_data)
 #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
 static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
 {
-	return verify_pefile_signature(kernel, kernel_len,
-				       VERIFY_USE_SECONDARY_KEYRING,
-				       VERIFYING_KEXEC_PE_SIGNATURE);
+	int ret;
+
+	ret = verify_pefile_signature(kernel, kernel_len,
+				      VERIFY_USE_SECONDARY_KEYRING,
+				      VERIFYING_KEXEC_PE_SIGNATURE);
+	if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+		ret = verify_pefile_signature(kernel, kernel_len,
+					      VERIFY_USE_PLATFORM_KEYRING,
+					      VERIFYING_KEXEC_PE_SIGNATURE);
+	}
+	return ret;
 }
 #endif
 
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index da055e901df4..c05c29ae4d5d 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -240,11 +240,22 @@ int verify_pkcs7_signature(const void *data, size_t len,
 #else
 		trusted_keys = builtin_trusted_keys;
 #endif
+	} else if (trusted_keys == VERIFY_USE_PLATFORM_KEYRING) {
+#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
+		trusted_keys = platform_trusted_keys;
+#else
+		trusted_keys = NULL;
+#endif
+		if (!trusted_keys) {
+			ret = -ENOKEY;
+			pr_devel("PKCS#7 platform keyring is not available\n");
+			goto error;
+		}
 	}
 	ret = pkcs7_validate_trust(pkcs7, trusted_keys);
 	if (ret < 0) {
 		if (ret == -ENOKEY)
-			pr_err("PKCS#7 signature not signed with a trusted key\n");
+			pr_devel("PKCS#7 signature not signed with a trusted key\n");
 		goto error;
 	}
 
diff --git a/include/linux/verification.h b/include/linux/verification.h
index cfa4730d607a..018fb5f13d44 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -17,6 +17,7 @@
  * should be used.
  */
 #define VERIFY_USE_SECONDARY_KEYRING ((struct key *)1UL)
+#define VERIFY_USE_PLATFORM_KEYRING  ((struct key *)2UL)
 
 /*
  * The use to which an asymmetric key is being put.
-- 
cgit v1.2.3


From fdb2410f7702f25f82804a261f90ad03422bd2c3 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Tue, 22 Jan 2019 14:06:49 -0600
Subject: ima: define ima_post_create_tmpfile() hook and add missing call

If tmpfiles can be made persistent, then newly created tmpfiles need to
be treated like any other new files in policy.

This patch indicates which newly created tmpfiles are in policy, causing
the file hash to be calculated on __fput().

Reported-by: Ignaz Forster <ignaz.forster@gmx.de>
[rgoldwyn@suse.com: Call ima_post_create_tmpfile() in vfs_tmpfile() as
opposed to do_tmpfile(). This will help the case for overlayfs where
copy_up is denied while overwriting a file.]
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 fs/namei.c                        |  1 +
 include/linux/ima.h               |  5 +++++
 security/integrity/ima/ima_main.c | 35 +++++++++++++++++++++++++++++++++--
 3 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/namei.c b/fs/namei.c
index 914178cdbe94..373a7ec4b09d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3462,6 +3462,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
 		inode->i_state |= I_LINKABLE;
 		spin_unlock(&inode->i_lock);
 	}
+	ima_post_create_tmpfile(inode);
 	return child;
 
 out_err:
diff --git a/include/linux/ima.h b/include/linux/ima.h
index b5e16b8c50b7..dc12fbcf484c 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -18,6 +18,7 @@ struct linux_binprm;
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_file_check(struct file *file, int mask);
+extern void ima_post_create_tmpfile(struct inode *inode);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
 extern int ima_load_data(enum kernel_load_data_id id);
@@ -56,6 +57,10 @@ static inline int ima_file_check(struct file *file, int mask)
 	return 0;
 }
 
+static inline void ima_post_create_tmpfile(struct inode *inode)
+{
+}
+
 static inline void ima_file_free(struct file *file)
 {
 	return;
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 4ffac4f5c647..357edd140c09 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -396,6 +396,33 @@ int ima_file_check(struct file *file, int mask)
 }
 EXPORT_SYMBOL_GPL(ima_file_check);
 
+/**
+ * ima_post_create_tmpfile - mark newly created tmpfile as new
+ * @file : newly created tmpfile
+ *
+ * No measuring, appraising or auditing of newly created tmpfiles is needed.
+ * Skip calling process_measurement(), but indicate which newly, created
+ * tmpfiles are in policy.
+ */
+void ima_post_create_tmpfile(struct inode *inode)
+{
+	struct integrity_iint_cache *iint;
+	int must_appraise;
+
+	must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+	if (!must_appraise)
+		return;
+
+	/* Nothing to do if we can't allocate memory */
+	iint = integrity_inode_get(inode);
+	if (!iint)
+		return;
+
+	/* needed for writing the security xattrs */
+	set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
+	iint->ima_file_status = INTEGRITY_PASS;
+}
+
 /**
  * ima_post_path_mknod - mark as a new inode
  * @dentry: newly created dentry
@@ -413,9 +440,13 @@ void ima_post_path_mknod(struct dentry *dentry)
 	if (!must_appraise)
 		return;
 
+	/* Nothing to do if we can't allocate memory */
 	iint = integrity_inode_get(inode);
-	if (iint)
-		iint->flags |= IMA_NEW_FILE;
+	if (!iint)
+		return;
+
+	/* needed for re-opening empty files */
+	iint->flags |= IMA_NEW_FILE;
 }
 
 /**
-- 
cgit v1.2.3


From 668aa15b5bf87f156ec805cb7348c785c56b82ab Mon Sep 17 00:00:00 2001
From: Kamal Heib <kamalheib1@gmail.com>
Date: Tue, 29 Jan 2019 12:08:50 +0200
Subject: RDMA/rxe: Improve loopback marking

Currently a packet is marked for loopback only if the source and
destination addresses equals. This is not enough when multiple gids are
present in rxe device's gid table and the traffic is from one gid to
another. Fix it by marking the packet for loopback if the destination MAC
address is equal to the source MAC address.

Signed-off-by: Kamal Heib <kamalheib1@gmail.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
Tested-by: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/rxe/rxe_av.c  | 1 +
 drivers/infiniband/sw/rxe/rxe_net.c | 9 +++------
 include/uapi/rdma/rdma_user_rxe.h   | 3 +--
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
index 27a7dec18874..81ee756c19b8 100644
--- a/drivers/infiniband/sw/rxe/rxe_av.c
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -38,6 +38,7 @@ void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av)
 {
 	rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr);
 	rxe_av_fill_ip_info(av, attr);
+	memcpy(av->dmac, attr->roce.dmac, ETH_ALEN);
 }
 
 int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr)
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 19f3c69916b1..3b162e92e8e8 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -384,9 +384,6 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 		return -EHOSTUNREACH;
 	}
 
-	if (!memcmp(saddr, daddr, sizeof(*daddr)))
-		pkt->mask |= RXE_LOOPBACK_MASK;
-
 	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
 			cpu_to_be16(ROCE_V2_UDP_DPORT));
 
@@ -411,9 +408,6 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 		return -EHOSTUNREACH;
 	}
 
-	if (!memcmp(saddr, daddr, sizeof(*daddr)))
-		pkt->mask |= RXE_LOOPBACK_MASK;
-
 	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
 			cpu_to_be16(ROCE_V2_UDP_DPORT));
 
@@ -437,6 +431,9 @@ int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc)
 
 	*crc = rxe_icrc_hdr(pkt, skb);
 
+	if (ether_addr_equal(skb->dev->dev_addr, av->dmac))
+		pkt->mask |= RXE_LOOPBACK_MASK;
+
 	return err;
 }
 
diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h
index 44ef6a3b7afc..aae2e696bb38 100644
--- a/include/uapi/rdma/rdma_user_rxe.h
+++ b/include/uapi/rdma/rdma_user_rxe.h
@@ -58,8 +58,7 @@ struct rxe_global_route {
 struct rxe_av {
 	__u8			port_num;
 	__u8			network_type;
-	__u16			reserved1;
-	__u32			reserved2;
+	__u8			dmac[6];
 	struct rxe_global_route	grh;
 	union {
 		struct sockaddr_in	_sockaddr_in;
-- 
cgit v1.2.3


From f76903d574b26bc596951a5c5e757eb02c67abbd Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Tue, 29 Jan 2019 13:33:11 -0800
Subject: RDMA/IWPM: refactor the IWPM message attribute names

In order to add new IWPM_NL attributes, the enums for the IWPM commands
attributes are refactored such that a new attribute can be added without
breaking ABI version 3. Instead of sharing nl attribute enums for both
request and response messages, we create separate enums for each IWPM
message request and reply.  This allows us to extend any given IWPM
message by adding new attributes for just that message.  These new enums
are created, though, in a way to avoid breaking ABI version 3.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Tatyana Nikolova <Tatyana.E.Nikolova@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/iwpm_msg.c | 40 ++++++++++++++++++++++----------------
 include/uapi/rdma/rdma_netlink.h   | 21 +++++++++++++++++---
 2 files changed, 41 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 8861c052155a..3a8753595c8f 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -403,10 +403,12 @@ register_pid_response_exit:
 
 /* netlink attribute policy for the received response to add mapping request */
 static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
-	[IWPM_NLA_MANAGE_MAPPING_SEQ]     = { .type = NLA_U32 },
-	[IWPM_NLA_MANAGE_ADDR]            = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RMANAGE_MAPPING_ERR]	  = { .type = NLA_U16 }
+	[IWPM_NLA_RMANAGE_MAPPING_SEQ]     = { .type = NLA_U32 },
+	[IWPM_NLA_RMANAGE_ADDR]            = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RMANAGE_MAPPING_ERR]	   = { .type = NLA_U16 }
 };
 
 /*
@@ -430,7 +432,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
 
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
-	msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]);
 	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
 	if (!nlmsg_request) {
 		pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -439,9 +441,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 	pm_msg = nlmsg_request->req_buffer;
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]);
 	mapped_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+			nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]);
 
 	if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
 		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
@@ -472,11 +474,15 @@ add_mapping_response_exit:
 /* netlink attribute policy for the response to add and query mapping request
  * and response with remote address info */
 static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
-	[IWPM_NLA_QUERY_MAPPING_SEQ]      = { .type = NLA_U32 },
-	[IWPM_NLA_QUERY_LOCAL_ADDR]       = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_QUERY_REMOTE_ADDR]      = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPING_SEQ]     = { .type = NLA_U32 },
+	[IWPM_NLA_RQUERY_LOCAL_ADDR]      = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_REMOTE_ADDR]     = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
 	[IWPM_NLA_RQUERY_MAPPING_ERR]	  = { .type = NLA_U16 }
 };
 
@@ -502,7 +508,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
 		return -EINVAL;
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
-	msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]);
 	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
 	if (!nlmsg_request) {
 		pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -511,9 +517,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
 	}
 	pm_msg = nlmsg_request->req_buffer;
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
 	remote_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
 	mapped_loc_sockaddr = (struct sockaddr_storage *)
 			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
 	mapped_rem_sockaddr = (struct sockaddr_storage *)
@@ -588,9 +594,9 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
 	remote_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
 	mapped_loc_sockaddr = (struct sockaddr_storage *)
 			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
 	mapped_rem_sockaddr = (struct sockaddr_storage *)
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 2e18b77a817f..42d53e182d5f 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -83,13 +83,20 @@ enum {
 	IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0,
 	IWPM_NLA_MANAGE_MAPPING_SEQ,
 	IWPM_NLA_MANAGE_ADDR,
-	IWPM_NLA_MANAGE_MAPPED_LOC_ADDR,
+	IWPM_NLA_MANAGE_MAPPING_MAX
+};
+
+enum {
+	IWPM_NLA_RMANAGE_MAPPING_UNSPEC = 0,
+	IWPM_NLA_RMANAGE_MAPPING_SEQ,
+	IWPM_NLA_RMANAGE_ADDR,
+	IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR,
+	/* The following maintains bisectability of rdma-core */
+	IWPM_NLA_MANAGE_MAPPED_LOC_ADDR = IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR,
 	IWPM_NLA_RMANAGE_MAPPING_ERR,
 	IWPM_NLA_RMANAGE_MAPPING_MAX
 };
 
-#define IWPM_NLA_MANAGE_MAPPING_MAX 3
-#define IWPM_NLA_QUERY_MAPPING_MAX  4
 #define IWPM_NLA_MAPINFO_SEND_MAX   3
 
 enum {
@@ -97,6 +104,14 @@ enum {
 	IWPM_NLA_QUERY_MAPPING_SEQ,
 	IWPM_NLA_QUERY_LOCAL_ADDR,
 	IWPM_NLA_QUERY_REMOTE_ADDR,
+	IWPM_NLA_QUERY_MAPPING_MAX,
+};
+
+enum {
+	IWPM_NLA_RQUERY_MAPPING_UNSPEC = 0,
+	IWPM_NLA_RQUERY_MAPPING_SEQ,
+	IWPM_NLA_RQUERY_LOCAL_ADDR,
+	IWPM_NLA_RQUERY_REMOTE_ADDR,
 	IWPM_NLA_RQUERY_MAPPED_LOC_ADDR,
 	IWPM_NLA_RQUERY_MAPPED_REM_ADDR,
 	IWPM_NLA_RQUERY_MAPPING_ERR,
-- 
cgit v1.2.3


From b0bad9ad514fc1dd8890f1749f5d2425a73270e3 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Tue, 29 Jan 2019 13:33:16 -0800
Subject: RDMA/IWPM: Support no port mapping requirements

A soft iwarp driver that uses the host TCP stack via a kernel mode socket
does not need port mapping.  In fact, if the port map daemon, iwpmd, is
running, then iwpmd must not try and create/bind a socket to the actual
port for a soft iwarp connection, since the driver already has that socket
bound.

Yet if the soft iwarp driver wants to interoperate with hard iwarp devices
that -are- using port mapping, then the soft iwarp driver's mappings still
need to be maintained and advertised by the iwpm protocol.

This patch enhances the rdma driver<->iwcm interface to allow an iwarp
driver to specify that it does not want port mapping.  The iwpm
kernel<->iwpmd interface is also enhanced to pass up this information on
map requests.

Care is taken to interoperate with the current iwpmd version (ABI version
3) and only use the new NL attributes if iwpmd supports ABI version 4.

The ABI version define has also been created in rdma_netlink.h so both
kernel and user code can share it.  The iwcm and iwpmd negotiate the ABI
version to use with a new HELLO netlink message.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Tatyana Nikolova <Tatyana.E.Nikolova@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/iwcm.c      |  7 +++-
 drivers/infiniband/core/iwpm_msg.c  | 80 +++++++++++++++++++++++++++++++++++--
 drivers/infiniband/core/iwpm_util.c | 48 +++++++++++++++++++++-
 drivers/infiniband/core/iwpm_util.h | 12 ++++++
 include/rdma/iw_cm.h                | 13 ++++++
 include/rdma/iw_portmap.h           | 15 ++++++-
 include/uapi/rdma/rdma_netlink.h    | 24 +++++++++++
 7 files changed, 192 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 476abc74178e..350ea2bab78a 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = {
 	[RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
 	[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
 	[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
-	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb},
+	[RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb}
 };
 
 static struct workqueue_struct *iwcm_wq;
@@ -525,6 +526,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
 	cm_id->mapped = true;
 	pm_msg.loc_addr = cm_id->local_addr;
 	pm_msg.rem_addr = cm_id->remote_addr;
+	pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ?
+		       IWPM_FLAGS_NO_PORT_MAP : 0;
 	if (active)
 		status = iwpm_add_and_query_mapping(&pm_msg,
 						    RDMA_NL_IWCM);
@@ -543,7 +546,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
 
 	return iwpm_create_mapinfo(&cm_id->local_addr,
 				   &cm_id->m_local_addr,
-				   RDMA_NL_IWCM);
+				   RDMA_NL_IWCM, pm_msg.flags);
 }
 
 /*
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 3a8753595c8f..2e30e65b0816 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -34,7 +34,7 @@
 #include "iwpm_util.h"
 
 static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
-static int iwpm_ulib_version = 3;
+u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN;
 static int iwpm_user_pid = IWPM_PID_UNDEFINED;
 static atomic_t echo_nlmsg_seq;
 
@@ -130,6 +130,7 @@ pid_query_error:
  * nlmsg attributes:
  *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
  *	[IWPM_NLA_MANAGE_ADDR]
+ *	[IWPM_NLA_MANAGE_FLAGS]
  */
 int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 {
@@ -173,6 +174,18 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 	if (ret)
 		goto add_mapping_error;
 
+	/* If flags are required and we're not V4, then return a quiet error */
+	if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+		ret = -EINVAL;
+		goto add_mapping_error_nowarn;
+	}
+	if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+		ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+				IWPM_NLA_MANAGE_FLAGS);
+		if (ret)
+			goto add_mapping_error;
+	}
+
 	nlmsg_end(skb, nlh);
 	nlmsg_request->req_buffer = pm_msg;
 
@@ -187,6 +200,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 	return ret;
 add_mapping_error:
 	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+add_mapping_error_nowarn:
 	if (skb)
 		dev_kfree_skb(skb);
 	if (nlmsg_request)
@@ -201,6 +215,7 @@ add_mapping_error:
  *	[IWPM_NLA_QUERY_MAPPING_SEQ]
  *	[IWPM_NLA_QUERY_LOCAL_ADDR]
  *	[IWPM_NLA_QUERY_REMOTE_ADDR]
+ *	[IWPM_NLA_QUERY_FLAGS]
  */
 int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 {
@@ -251,6 +266,18 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 	if (ret)
 		goto query_mapping_error;
 
+	/* If flags are required and we're not V4, then return a quite error */
+	if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+		ret = -EINVAL;
+		goto query_mapping_error_nowarn;
+	}
+	if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+		ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+				IWPM_NLA_QUERY_FLAGS);
+		if (ret)
+			goto query_mapping_error;
+	}
+
 	nlmsg_end(skb, nlh);
 	nlmsg_request->req_buffer = pm_msg;
 
@@ -264,6 +291,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 	return ret;
 query_mapping_error:
 	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+query_mapping_error_nowarn:
 	if (skb)
 		dev_kfree_skb(skb);
 	if (nlmsg_request)
@@ -379,7 +407,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	/* check device name, ulib name and version */
 	if (strcmp(pm_msg->dev_name, dev_name) ||
 			strcmp(iwpm_ulib_name, iwpm_name) ||
-			iwpm_version != iwpm_ulib_version) {
+			iwpm_version < IWPM_UABI_VERSION_MIN) {
 
 		pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
 				__func__, dev_name, iwpm_name, iwpm_version);
@@ -387,6 +415,10 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
 		goto register_pid_response_exit;
 	}
 	iwpm_user_pid = cb->nlh->nlmsg_pid;
+	iwpm_ulib_version = iwpm_version;
+	if (iwpm_ulib_version < IWPM_UABI_VERSION)
+		pr_warn_once("%s: Down level iwpmd/pid %u.  Continuing...",
+			__func__, iwpm_user_pid);
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
 			__func__, iwpm_user_pid);
@@ -661,7 +693,7 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
 	iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
 	if (strcmp(iwpm_ulib_name, iwpm_name) ||
-			iwpm_version != iwpm_ulib_version) {
+			iwpm_version < IWPM_UABI_VERSION_MIN) {
 		pr_info("%s: Invalid port mapper name = %s version = %d\n",
 				__func__, iwpm_name, iwpm_version);
 		return ret;
@@ -675,6 +707,11 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 	iwpm_user_pid = cb->nlh->nlmsg_pid;
+
+	if (iwpm_ulib_version < IWPM_UABI_VERSION)
+		pr_warn_once("%s: Down level iwpmd/pid %u.  Continuing...",
+			__func__, iwpm_user_pid);
+
 	if (!iwpm_mapinfo_available())
 		return 0;
 	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
@@ -754,3 +791,40 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	up(&nlmsg_request->sem);
 	return 0;
 }
+
+/* netlink attribute policy for the received hello request */
+static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = {
+	[IWPM_NLA_HELLO_ABI_VERSION]     = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_hello_cb - Process a port mapper hello request
+ */
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *nltb[IWPM_NLA_HELLO_MAX];
+	const char *msg_type = "Hello request";
+	u8 nl_client;
+	u16 abi_version;
+	int ret = -EINVAL;
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb,
+			     msg_type)) {
+		pr_info("%s: Unable to parse nlmsg\n", __func__);
+		return ret;
+	}
+	abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]);
+	nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+	if (!iwpm_valid_client(nl_client)) {
+		pr_info("%s: Invalid port mapper client = %d\n",
+				__func__, nl_client);
+		return ret;
+	}
+	iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version);
+	pr_debug("Using ABI version %u\n", iwpm_ulib_version);
+	iwpm_user_pid = cb->nlh->nlmsg_pid;
+	ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version);
+	return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index cdb63f3f4de7..363938435476 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -114,7 +114,7 @@ static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
 
 int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
 			struct sockaddr_storage *mapped_sockaddr,
-			u8 nl_client)
+			u8 nl_client, u32 map_flags)
 {
 	struct hlist_head *hash_bucket_head = NULL;
 	struct iwpm_mapping_info *map_info;
@@ -132,6 +132,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
 	memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
 	       sizeof(struct sockaddr_storage));
 	map_info->nl_client = nl_client;
+	map_info->map_flags = map_flags;
 
 	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
 	if (iwpm_hash_bucket) {
@@ -686,6 +687,14 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
 			if (ret)
 				goto send_mapping_info_unlock;
 
+			if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+				ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+						&map_info->map_flags,
+						IWPM_NLA_MAPINFO_FLAGS);
+				if (ret)
+					goto send_mapping_info_unlock;
+			}
+
 			nlmsg_end(skb, nlh);
 
 			iwpm_print_sockaddr(&map_info->local_sockaddr,
@@ -754,3 +763,40 @@ int iwpm_mapinfo_available(void)
 	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
 	return full_bucket;
 }
+
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto hello_num_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	msg_seq = 0;
+	err_str = "Unable to put attribute of abi_version into nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version,
+			    IWPM_NLA_HELLO_ABI_VERSION);
+	if (ret)
+		goto hello_num_error;
+	nlmsg_end(skb, nlh);
+
+	ret = rdma_nl_unicast(skb, iwpm_pid);
+	if (ret) {
+		skb = NULL;
+		err_str = "Unable to send a nlmsg";
+		goto hello_num_error;
+	}
+	pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version);
+	return 0;
+hello_num_error:
+	pr_info("%s: %s\n", __func__, err_str);
+	if (skb)
+		dev_kfree_skb(skb);
+	return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
index af1fc14a0d3d..7e2bcc72f66c 100644
--- a/drivers/infiniband/core/iwpm_util.h
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -78,6 +78,7 @@ struct iwpm_mapping_info {
 	struct sockaddr_storage local_sockaddr;
 	struct sockaddr_storage mapped_sockaddr;
 	u8     nl_client;
+	u32    map_flags;
 };
 
 struct iwpm_remote_info {
@@ -266,4 +267,15 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
  * @msg: Message to print
  */
 void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+
+/**
+ * iwpm_send_hello - Send hello response to iwpmd
+ *
+ * @nl_client: The index of the netlink client
+ * @abi_version: The kernel's abi_version
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version);
+extern u16 iwpm_ulib_version;
 #endif
diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h
index 5cd7701db148..48512abd3162 100644
--- a/include/rdma/iw_cm.h
+++ b/include/rdma/iw_cm.h
@@ -105,6 +105,18 @@ struct iw_cm_conn_param {
 	u32 qpn;
 };
 
+enum iw_flags {
+
+	/*
+	 * This flag allows the iwcm and iwpmd to still advertise
+	 * mappings but the real and mapped port numbers are the
+	 * same.  Further, iwpmd will not bind any user socket to
+	 * reserve the port.  This is required for soft iwarp
+	 * to play in the port mapped iwarp space.
+	 */
+	IW_F_NO_PORT_MAP = (1 << 0),
+};
+
 struct iw_cm_verbs {
 	void		(*add_ref)(struct ib_qp *qp);
 
@@ -127,6 +139,7 @@ struct iw_cm_verbs {
 
 	int		(*destroy_listen)(struct iw_cm_id *cm_id);
 	char		ifname[IFNAMSIZ];
+	enum iw_flags	driver_flags;
 };
 
 /**
diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h
index fda31673a562..84fac196ef80 100644
--- a/include/rdma/iw_portmap.h
+++ b/include/rdma/iw_portmap.h
@@ -58,6 +58,7 @@ struct iwpm_sa_data {
 	struct sockaddr_storage mapped_loc_addr;
 	struct sockaddr_storage rem_addr;
 	struct sockaddr_storage mapped_rem_addr;
+	u32 flags;
 };
 
 /**
@@ -205,9 +206,11 @@ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
  * @local_addr: Local ip/tcp address
  * @mapped_addr: Mapped local ip/tcp address
  * @nl_client: The index of the netlink client
+ * @map_flags: IWPM mapping flags
  */
 int iwpm_create_mapinfo(struct sockaddr_storage *local_addr,
-			struct sockaddr_storage *mapped_addr, u8 nl_client);
+			struct sockaddr_storage *mapped_addr, u8 nl_client,
+			u32 map_flags);
 
 /**
  * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
@@ -221,4 +224,14 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr,
 int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr,
 			struct sockaddr_storage *mapped_addr);
 
+/**
+ * iwpm_hello_cb - Process a hello message from iwpmd
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send the kernel's abi_version
+ * after adjusting it to support the iwpmd version.
+ */
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb);
 #endif /* _IW_PORTMAP_H */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 42d53e182d5f..0f5263767fb4 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -35,6 +35,19 @@ enum {
 	RDMA_NL_RDMA_CM_NUM_ATTR,
 };
 
+/* The minimum version that the iwpm kernel supports */
+#define IWPM_UABI_VERSION_MIN	3
+
+/* The latest version that the iwpm kernel supports */
+#define IWPM_UABI_VERSION	4
+
+/* iwarp port mapper message flags */
+enum {
+
+	/* Do not map the port for this IWPM request */
+	IWPM_FLAGS_NO_PORT_MAP = (1 << 0),
+};
+
 /* iwarp port mapper op-codes */
 enum {
 	RDMA_NL_IWPM_REG_PID = 0,
@@ -45,6 +58,7 @@ enum {
 	RDMA_NL_IWPM_HANDLE_ERR,
 	RDMA_NL_IWPM_MAPINFO,
 	RDMA_NL_IWPM_MAPINFO_NUM,
+	RDMA_NL_IWPM_HELLO,
 	RDMA_NL_IWPM_NUM_OPS
 };
 
@@ -83,6 +97,7 @@ enum {
 	IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0,
 	IWPM_NLA_MANAGE_MAPPING_SEQ,
 	IWPM_NLA_MANAGE_ADDR,
+	IWPM_NLA_MANAGE_FLAGS,
 	IWPM_NLA_MANAGE_MAPPING_MAX
 };
 
@@ -98,12 +113,14 @@ enum {
 };
 
 #define IWPM_NLA_MAPINFO_SEND_MAX   3
+#define IWPM_NLA_REMOVE_MAPPING_MAX 3
 
 enum {
 	IWPM_NLA_QUERY_MAPPING_UNSPEC = 0,
 	IWPM_NLA_QUERY_MAPPING_SEQ,
 	IWPM_NLA_QUERY_LOCAL_ADDR,
 	IWPM_NLA_QUERY_REMOTE_ADDR,
+	IWPM_NLA_QUERY_FLAGS,
 	IWPM_NLA_QUERY_MAPPING_MAX,
 };
 
@@ -129,6 +146,7 @@ enum {
 	IWPM_NLA_MAPINFO_UNSPEC = 0,
 	IWPM_NLA_MAPINFO_LOCAL_ADDR,
 	IWPM_NLA_MAPINFO_MAPPED_ADDR,
+	IWPM_NLA_MAPINFO_FLAGS,
 	IWPM_NLA_MAPINFO_MAX
 };
 
@@ -147,6 +165,12 @@ enum {
 	IWPM_NLA_ERR_MAX
 };
 
+enum {
+	IWPM_NLA_HELLO_UNSPEC = 0,
+	IWPM_NLA_HELLO_ABI_VERSION,
+	IWPM_NLA_HELLO_MAX
+};
+
 /*
  * Local service operations:
  *   RESOLVE - The client requests the local service to resolve a path.
-- 
cgit v1.2.3


From f80d2f0846b7b9ceb1f2a5951229ee4391edaebd Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 25 Jan 2019 10:34:43 -0800
Subject: scsi: target/core: Remove the write_pending_status() callback
 function

Due to the patch that makes TMF handling synchronous the
write_pending_status() callback function is no longer called.  Hence remove
it.

Acked-by: Felipe Balbi <balbi@ti.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Andy Grover <agrover@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bryant G. Ly <bryantly@linux.vnet.ibm.com>
Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Himanshu Madhani <himanshu.madhani@qlogic.com>
Cc: Quinn Tran <quinn.tran@qlogic.com>
Cc: Saurav Kashyap <saurav.kashyap@qlogic.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 Documentation/target/tcm_mod_builder.py      |  8 -------
 drivers/infiniband/ulp/srpt/ib_srpt.c        |  9 --------
 drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c     |  6 ------
 drivers/scsi/qla2xxx/tcm_qla2xxx.c           | 31 ----------------------------
 drivers/target/iscsi/iscsi_target_configfs.c | 13 ------------
 drivers/target/loopback/tcm_loop.c           |  6 ------
 drivers/target/sbp/sbp_target.c              |  6 ------
 drivers/target/target_core_configfs.c        |  4 ----
 drivers/target/target_core_xcopy.c           |  6 ------
 drivers/target/tcm_fc/tcm_fc.h               |  1 -
 drivers/target/tcm_fc/tfc_cmd.c              |  7 -------
 drivers/target/tcm_fc/tfc_conf.c             |  1 -
 drivers/usb/gadget/function/f_tcm.c          |  9 --------
 drivers/vhost/scsi.c                         |  6 ------
 drivers/xen/xen-scsiback.c                   |  6 ------
 include/target/target_core_fabric.h          |  1 -
 16 files changed, 120 deletions(-)

(limited to 'include')

diff --git a/Documentation/target/tcm_mod_builder.py b/Documentation/target/tcm_mod_builder.py
index 94bf6944bb1e..95d6e31f1e3a 100755
--- a/Documentation/target/tcm_mod_builder.py
+++ b/Documentation/target/tcm_mod_builder.py
@@ -297,7 +297,6 @@ def tcm_mod_build_configfs(proto_ident, fabric_mod_dir_var, fabric_mod_name):
 	buf += "	.sess_get_index			= " + fabric_mod_name + "_sess_get_index,\n"
 	buf += "	.sess_get_initiator_sid		= NULL,\n"
 	buf += "	.write_pending			= " + fabric_mod_name + "_write_pending,\n"
-	buf += "	.write_pending_status		= " + fabric_mod_name + "_write_pending_status,\n"
 	buf += "	.set_default_node_attributes	= " + fabric_mod_name + "_set_default_node_attrs,\n"
 	buf += "	.get_cmd_state			= " + fabric_mod_name + "_get_cmd_state,\n"
 	buf += "	.queue_data_in			= " + fabric_mod_name + "_queue_data_in,\n"
@@ -479,13 +478,6 @@ def tcm_mod_dump_fabric_ops(proto_ident, fabric_mod_dir_var, fabric_mod_name):
 			buf += "}\n\n"
 			bufi += "int " + fabric_mod_name + "_write_pending(struct se_cmd *);\n"
 
-		if re.search('write_pending_status\)\(', fo):
-			buf += "int " + fabric_mod_name + "_write_pending_status(struct se_cmd *se_cmd)\n"
-			buf += "{\n"
-			buf += "	return 0;\n"
-			buf += "}\n\n"
-			bufi += "int " + fabric_mod_name + "_write_pending_status(struct se_cmd *);\n"
-
 		if re.search('set_default_node_attributes\)\(', fo):
 			buf += "void " + fabric_mod_name + "_set_default_node_attrs(struct se_node_acl *nacl)\n"
 			buf += "{\n"
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index e9c336cff8f5..8cee8c6c6be9 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -2703,14 +2703,6 @@ static int srpt_rdma_cm_handler(struct rdma_cm_id *cm_id,
 	return ret;
 }
 
-static int srpt_write_pending_status(struct se_cmd *se_cmd)
-{
-	struct srpt_send_ioctx *ioctx;
-
-	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
-	return ioctx->state == SRPT_STATE_NEED_DATA;
-}
-
 /*
  * srpt_write_pending - Start data transfer from initiator to target (write).
  */
@@ -3806,7 +3798,6 @@ static const struct target_core_fabric_ops srpt_template = {
 	.sess_get_index			= srpt_sess_get_index,
 	.sess_get_initiator_sid		= NULL,
 	.write_pending			= srpt_write_pending,
-	.write_pending_status		= srpt_write_pending_status,
 	.set_default_node_attributes	= srpt_set_default_node_attrs,
 	.get_cmd_state			= srpt_get_tcm_cmd_state,
 	.queue_data_in			= srpt_queue_data_in,
diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
index cc9cae469c4b..7ca277e28d63 100644
--- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
+++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
@@ -3788,11 +3788,6 @@ static int ibmvscsis_write_pending(struct se_cmd *se_cmd)
 	return 0;
 }
 
-static int ibmvscsis_write_pending_status(struct se_cmd *se_cmd)
-{
-	return 0;
-}
-
 static void ibmvscsis_set_default_node_attrs(struct se_node_acl *nacl)
 {
 }
@@ -4053,7 +4048,6 @@ static const struct target_core_fabric_ops ibmvscsis_ops = {
 	.release_cmd			= ibmvscsis_release_cmd,
 	.sess_get_index			= ibmvscsis_sess_get_index,
 	.write_pending			= ibmvscsis_write_pending,
-	.write_pending_status		= ibmvscsis_write_pending_status,
 	.set_default_node_attributes	= ibmvscsis_set_default_node_attrs,
 	.get_cmd_state			= ibmvscsis_get_cmd_state,
 	.queue_data_in			= ibmvscsis_queue_data_in,
diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
index 283e6b80abb5..8a3075d17c63 100644
--- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
+++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
@@ -420,26 +420,6 @@ static int tcm_qla2xxx_write_pending(struct se_cmd *se_cmd)
 	return qlt_rdy_to_xfer(cmd);
 }
 
-static int tcm_qla2xxx_write_pending_status(struct se_cmd *se_cmd)
-{
-	unsigned long flags;
-	/*
-	 * Check for WRITE_PENDING status to determine if we need to wait for
-	 * CTIO aborts to be posted via hardware in tcm_qla2xxx_handle_data().
-	 */
-	spin_lock_irqsave(&se_cmd->t_state_lock, flags);
-	if (se_cmd->t_state == TRANSPORT_WRITE_PENDING ||
-	    se_cmd->t_state == TRANSPORT_COMPLETE_QF_WP) {
-		spin_unlock_irqrestore(&se_cmd->t_state_lock, flags);
-		wait_for_completion_timeout(&se_cmd->t_transport_stop_comp,
-						50);
-		return 0;
-	}
-	spin_unlock_irqrestore(&se_cmd->t_state_lock, flags);
-
-	return 0;
-}
-
 static void tcm_qla2xxx_set_default_node_attrs(struct se_node_acl *nacl)
 {
 	return;
@@ -537,15 +517,6 @@ static void tcm_qla2xxx_handle_data_work(struct work_struct *work)
 
 	cmd->qpair->tgt_counters.qla_core_ret_ctio++;
 	if (!cmd->write_data_transferred) {
-		/*
-		 * Check if se_cmd has already been aborted via LUN_RESET, and
-		 * waiting upon completion in tcm_qla2xxx_write_pending_status()
-		 */
-		if (cmd->se_cmd.transport_state & CMD_T_ABORTED) {
-			complete(&cmd->se_cmd.t_transport_stop_comp);
-			return;
-		}
-
 		switch (cmd->dif_err_code) {
 		case DIF_ERR_GRD:
 			cmd->se_cmd.pi_err =
@@ -1902,7 +1873,6 @@ static const struct target_core_fabric_ops tcm_qla2xxx_ops = {
 	.sess_get_index			= tcm_qla2xxx_sess_get_index,
 	.sess_get_initiator_sid		= NULL,
 	.write_pending			= tcm_qla2xxx_write_pending,
-	.write_pending_status		= tcm_qla2xxx_write_pending_status,
 	.set_default_node_attributes	= tcm_qla2xxx_set_default_node_attrs,
 	.get_cmd_state			= tcm_qla2xxx_get_cmd_state,
 	.queue_data_in			= tcm_qla2xxx_queue_data_in,
@@ -1943,7 +1913,6 @@ static const struct target_core_fabric_ops tcm_qla2xxx_npiv_ops = {
 	.sess_get_index			= tcm_qla2xxx_sess_get_index,
 	.sess_get_initiator_sid		= NULL,
 	.write_pending			= tcm_qla2xxx_write_pending,
-	.write_pending_status		= tcm_qla2xxx_write_pending_status,
 	.set_default_node_attributes	= tcm_qla2xxx_set_default_node_attrs,
 	.get_cmd_state			= tcm_qla2xxx_get_cmd_state,
 	.queue_data_in			= tcm_qla2xxx_queue_data_in,
diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index a5481dfeae8d..cac94c94ef5d 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -1389,18 +1389,6 @@ static int lio_write_pending(struct se_cmd *se_cmd)
 	return 0;
 }
 
-static int lio_write_pending_status(struct se_cmd *se_cmd)
-{
-	struct iscsi_cmd *cmd = container_of(se_cmd, struct iscsi_cmd, se_cmd);
-	int ret;
-
-	spin_lock_bh(&cmd->istate_lock);
-	ret = !(cmd->cmd_flags & ICF_GOT_LAST_DATAOUT);
-	spin_unlock_bh(&cmd->istate_lock);
-
-	return ret;
-}
-
 static int lio_queue_status(struct se_cmd *se_cmd)
 {
 	struct iscsi_cmd *cmd = container_of(se_cmd, struct iscsi_cmd, se_cmd);
@@ -1564,7 +1552,6 @@ const struct target_core_fabric_ops iscsi_ops = {
 	.sess_get_index			= lio_sess_get_index,
 	.sess_get_initiator_sid		= lio_sess_get_initiator_sid,
 	.write_pending			= lio_write_pending,
-	.write_pending_status		= lio_write_pending_status,
 	.set_default_node_attributes	= lio_set_default_node_attributes,
 	.get_cmd_state			= iscsi_get_cmd_state,
 	.queue_data_in			= lio_queue_data_in,
diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index 7bd7c0c0db6f..89c6e3719d2b 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -560,11 +560,6 @@ static int tcm_loop_write_pending(struct se_cmd *se_cmd)
 	return 0;
 }
 
-static int tcm_loop_write_pending_status(struct se_cmd *se_cmd)
-{
-	return 0;
-}
-
 static int tcm_loop_queue_data_in(struct se_cmd *se_cmd)
 {
 	struct tcm_loop_cmd *tl_cmd = container_of(se_cmd,
@@ -1159,7 +1154,6 @@ static const struct target_core_fabric_ops loop_ops = {
 	.release_cmd			= tcm_loop_release_cmd,
 	.sess_get_index			= tcm_loop_sess_get_index,
 	.write_pending			= tcm_loop_write_pending,
-	.write_pending_status		= tcm_loop_write_pending_status,
 	.set_default_node_attributes	= tcm_loop_set_default_node_attributes,
 	.get_cmd_state			= tcm_loop_get_cmd_state,
 	.queue_data_in			= tcm_loop_queue_data_in,
diff --git a/drivers/target/sbp/sbp_target.c b/drivers/target/sbp/sbp_target.c
index 08cee13dfb9a..b0d3583998f0 100644
--- a/drivers/target/sbp/sbp_target.c
+++ b/drivers/target/sbp/sbp_target.c
@@ -1749,11 +1749,6 @@ static int sbp_write_pending(struct se_cmd *se_cmd)
 	return 0;
 }
 
-static int sbp_write_pending_status(struct se_cmd *se_cmd)
-{
-	return 0;
-}
-
 static void sbp_set_default_node_attrs(struct se_node_acl *nacl)
 {
 	return;
@@ -2329,7 +2324,6 @@ static const struct target_core_fabric_ops sbp_ops = {
 	.release_cmd			= sbp_release_cmd,
 	.sess_get_index			= sbp_sess_get_index,
 	.write_pending			= sbp_write_pending,
-	.write_pending_status		= sbp_write_pending_status,
 	.set_default_node_attributes	= sbp_set_default_node_attrs,
 	.get_cmd_state			= sbp_get_cmd_state,
 	.queue_data_in			= sbp_queue_data_in,
diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c
index 72016d0dfca5..1124c84a3b7f 100644
--- a/drivers/target/target_core_configfs.c
+++ b/drivers/target/target_core_configfs.c
@@ -401,10 +401,6 @@ static int target_fabric_tf_ops_check(const struct target_core_fabric_ops *tfo)
 		pr_err("Missing tfo->write_pending()\n");
 		return -EINVAL;
 	}
-	if (!tfo->write_pending_status) {
-		pr_err("Missing tfo->write_pending_status()\n");
-		return -EINVAL;
-	}
 	if (!tfo->set_default_node_attributes) {
 		pr_err("Missing tfo->set_default_node_attributes()\n");
 		return -EINVAL;
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c
index c2e1fc927fdf..9be1418e919f 100644
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -442,11 +442,6 @@ static int xcopy_pt_write_pending(struct se_cmd *se_cmd)
 	return 0;
 }
 
-static int xcopy_pt_write_pending_status(struct se_cmd *se_cmd)
-{
-	return 0;
-}
-
 static int xcopy_pt_queue_data_in(struct se_cmd *se_cmd)
 {
 	return 0;
@@ -463,7 +458,6 @@ static const struct target_core_fabric_ops xcopy_pt_tfo = {
 	.release_cmd		= xcopy_pt_release_cmd,
 	.check_stop_free	= xcopy_pt_check_stop_free,
 	.write_pending		= xcopy_pt_write_pending,
-	.write_pending_status	= xcopy_pt_write_pending_status,
 	.queue_data_in		= xcopy_pt_queue_data_in,
 	.queue_status		= xcopy_pt_queue_status,
 };
diff --git a/drivers/target/tcm_fc/tcm_fc.h b/drivers/target/tcm_fc/tcm_fc.h
index 11d27b93b413..b8ced4458118 100644
--- a/drivers/target/tcm_fc/tcm_fc.h
+++ b/drivers/target/tcm_fc/tcm_fc.h
@@ -158,7 +158,6 @@ void ft_release_cmd(struct se_cmd *);
 int ft_queue_status(struct se_cmd *);
 int ft_queue_data_in(struct se_cmd *);
 int ft_write_pending(struct se_cmd *);
-int ft_write_pending_status(struct se_cmd *);
 int ft_get_cmd_state(struct se_cmd *);
 void ft_queue_tm_resp(struct se_cmd *);
 void ft_aborted_task(struct se_cmd *);
diff --git a/drivers/target/tcm_fc/tfc_cmd.c b/drivers/target/tcm_fc/tfc_cmd.c
index a183d4da7db2..f0529ba58f4c 100644
--- a/drivers/target/tcm_fc/tfc_cmd.c
+++ b/drivers/target/tcm_fc/tfc_cmd.c
@@ -184,13 +184,6 @@ int ft_queue_status(struct se_cmd *se_cmd)
 	return 0;
 }
 
-int ft_write_pending_status(struct se_cmd *se_cmd)
-{
-	struct ft_cmd *cmd = container_of(se_cmd, struct ft_cmd, se_cmd);
-
-	return cmd->write_data_len != se_cmd->data_length;
-}
-
 /*
  * Send TX_RDY (transfer ready).
  */
diff --git a/drivers/target/tcm_fc/tfc_conf.c b/drivers/target/tcm_fc/tfc_conf.c
index 1ce49518d440..c873a052fcb0 100644
--- a/drivers/target/tcm_fc/tfc_conf.c
+++ b/drivers/target/tcm_fc/tfc_conf.c
@@ -437,7 +437,6 @@ static const struct target_core_fabric_ops ft_fabric_ops = {
 	.sess_get_index =		ft_sess_get_index,
 	.sess_get_initiator_sid =	NULL,
 	.write_pending =		ft_write_pending,
-	.write_pending_status =		ft_write_pending_status,
 	.set_default_node_attributes =	ft_set_default_node_attr,
 	.get_cmd_state =		ft_get_cmd_state,
 	.queue_data_in =		ft_queue_data_in,
diff --git a/drivers/usb/gadget/function/f_tcm.c b/drivers/usb/gadget/function/f_tcm.c
index 34f5982cab78..7f01f78b1d23 100644
--- a/drivers/usb/gadget/function/f_tcm.c
+++ b/drivers/usb/gadget/function/f_tcm.c
@@ -1292,14 +1292,6 @@ static u32 usbg_sess_get_index(struct se_session *se_sess)
 	return 0;
 }
 
-/*
- * XXX Error recovery: return != 0 if we expect writes. Dunno when that could be
- */
-static int usbg_write_pending_status(struct se_cmd *se_cmd)
-{
-	return 0;
-}
-
 static void usbg_set_default_node_attrs(struct se_node_acl *nacl)
 {
 }
@@ -1725,7 +1717,6 @@ static const struct target_core_fabric_ops usbg_ops = {
 	.sess_get_index			= usbg_sess_get_index,
 	.sess_get_initiator_sid		= NULL,
 	.write_pending			= usbg_send_write_request,
-	.write_pending_status		= usbg_write_pending_status,
 	.set_default_node_attributes	= usbg_set_default_node_attrs,
 	.get_cmd_state			= usbg_get_cmd_state,
 	.queue_data_in			= usbg_send_read_response,
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 8e10ab436d1f..ad7899f99b97 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -346,11 +346,6 @@ static int vhost_scsi_write_pending(struct se_cmd *se_cmd)
 	return 0;
 }
 
-static int vhost_scsi_write_pending_status(struct se_cmd *se_cmd)
-{
-	return 0;
-}
-
 static void vhost_scsi_set_default_node_attrs(struct se_node_acl *nacl)
 {
 	return;
@@ -2298,7 +2293,6 @@ static const struct target_core_fabric_ops vhost_scsi_ops = {
 	.sess_get_index			= vhost_scsi_sess_get_index,
 	.sess_get_initiator_sid		= NULL,
 	.write_pending			= vhost_scsi_write_pending,
-	.write_pending_status		= vhost_scsi_write_pending_status,
 	.set_default_node_attributes	= vhost_scsi_set_default_node_attrs,
 	.get_cmd_state			= vhost_scsi_get_cmd_state,
 	.queue_data_in			= vhost_scsi_queue_data_in,
diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c
index c9e23a126218..e59937293a32 100644
--- a/drivers/xen/xen-scsiback.c
+++ b/drivers/xen/xen-scsiback.c
@@ -1404,11 +1404,6 @@ static int scsiback_write_pending(struct se_cmd *se_cmd)
 	return 0;
 }
 
-static int scsiback_write_pending_status(struct se_cmd *se_cmd)
-{
-	return 0;
-}
-
 static void scsiback_set_default_node_attrs(struct se_node_acl *nacl)
 {
 }
@@ -1818,7 +1813,6 @@ static const struct target_core_fabric_ops scsiback_ops = {
 	.sess_get_index			= scsiback_sess_get_index,
 	.sess_get_initiator_sid		= NULL,
 	.write_pending			= scsiback_write_pending,
-	.write_pending_status		= scsiback_write_pending_status,
 	.set_default_node_attributes	= scsiback_set_default_node_attrs,
 	.get_cmd_state			= scsiback_get_cmd_state,
 	.queue_data_in			= scsiback_queue_data_in,
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index ee5ddd81cd8d..691c6b5e6520 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -74,7 +74,6 @@ struct target_core_fabric_ops {
 	u32 (*sess_get_initiator_sid)(struct se_session *,
 				      unsigned char *, u32);
 	int (*write_pending)(struct se_cmd *);
-	int (*write_pending_status)(struct se_cmd *);
 	void (*set_default_node_attributes)(struct se_node_acl *);
 	int (*get_cmd_state)(struct se_cmd *);
 	int (*queue_data_in)(struct se_cmd *);
-- 
cgit v1.2.3


From 3f0661a492a199b7de2fb63eceba8c79039bff83 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 25 Jan 2019 10:34:44 -0800
Subject: scsi: target/core: Remove several state tests from the TMF code

Whether or not a session is being torn down does not affect whether or not
SCSI commands are in the task set. Hence remove the "tearing down" checks
from the TMF code. The TRANSPORT_ISTATE_PROCESSING check is left out
because it is now safe to wait for a command that is in that state. The
CMD_T_PRE_EXECUTE is left out because abort processing is postponed until
after commands have left the pre-execute state since the patch that makes
TMF processing synchronous.

See also commit 1c21a48055a6 ("target: Avoid early CMD_T_PRE_EXECUTE
failures during ABORT_TASK").

Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_tmr.c       | 25 -------------------------
 drivers/target/target_core_transport.c |  2 --
 include/target/target_core_base.h      |  1 -
 3 files changed, 28 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
index ad0061e09d4c..df1ca827de68 100644
--- a/drivers/target/target_core_tmr.c
+++ b/drivers/target/target_core_tmr.c
@@ -114,21 +114,6 @@ static bool __target_check_io_state(struct se_cmd *se_cmd,
 		spin_unlock(&se_cmd->t_state_lock);
 		return false;
 	}
-	if (se_cmd->transport_state & CMD_T_PRE_EXECUTE) {
-		if (se_cmd->scsi_status) {
-			pr_debug("Attempted to abort io tag: %llu early failure"
-				 " status: 0x%02x\n", se_cmd->tag,
-				 se_cmd->scsi_status);
-			spin_unlock(&se_cmd->t_state_lock);
-			return false;
-		}
-	}
-	if (sess->sess_tearing_down) {
-		pr_debug("Attempted to abort io tag: %llu already shutdown,"
-			" skipping\n", se_cmd->tag);
-		spin_unlock(&se_cmd->t_state_lock);
-		return false;
-	}
 	se_cmd->transport_state |= CMD_T_ABORTED;
 
 	if ((tmr_sess != se_cmd->se_sess) && tas)
@@ -239,16 +224,6 @@ static void core_tmr_drain_tmr_list(
 			spin_unlock(&sess->sess_cmd_lock);
 			continue;
 		}
-		if (cmd->t_state == TRANSPORT_ISTATE_PROCESSING) {
-			spin_unlock(&cmd->t_state_lock);
-			spin_unlock(&sess->sess_cmd_lock);
-			continue;
-		}
-		if (sess->sess_tearing_down) {
-			spin_unlock(&cmd->t_state_lock);
-			spin_unlock(&sess->sess_cmd_lock);
-			continue;
-		}
 		cmd->transport_state |= CMD_T_ABORTED;
 		spin_unlock(&cmd->t_state_lock);
 
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index ef9e75b359d4..b25d27e8ca20 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2056,7 +2056,6 @@ void target_execute_cmd(struct se_cmd *cmd)
 
 	spin_lock_irq(&cmd->t_state_lock);
 	cmd->t_state = TRANSPORT_PROCESSING;
-	cmd->transport_state &= ~CMD_T_PRE_EXECUTE;
 	cmd->transport_state |= CMD_T_ACTIVE | CMD_T_SENT;
 	spin_unlock_irq(&cmd->t_state_lock);
 
@@ -2765,7 +2764,6 @@ int target_get_sess_cmd(struct se_cmd *se_cmd, bool ack_kref)
 		ret = -ESHUTDOWN;
 		goto out;
 	}
-	se_cmd->transport_state |= CMD_T_PRE_EXECUTE;
 	list_add_tail(&se_cmd->se_cmd_list, &se_sess->sess_cmd_list);
 	percpu_ref_get(&se_sess->cmd_count);
 out:
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 69b7b955902c..19a5bf4214fc 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -502,7 +502,6 @@ struct se_cmd {
 #define CMD_T_STOP		(1 << 5)
 #define CMD_T_TAS		(1 << 10)
 #define CMD_T_FABRIC_STOP	(1 << 11)
-#define CMD_T_PRE_EXECUTE	(1 << 12)
 	spinlock_t		t_state_lock;
 	struct kref		cmd_kref;
 	struct completion	t_transport_stop_comp;
-- 
cgit v1.2.3


From 94ebb47160085343ecaaeab90fa7ffbb86030613 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 25 Jan 2019 10:34:47 -0800
Subject: scsi: target/core: Add target_send_busy()

Introduce a function that sends the SCSI status "BUSY" back to the
initiator. The next patch will add a call to this function in the srpt
target driver.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_transport.c | 16 ++++++++++++++++
 include/target/target_core_fabric.h    |  1 +
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 6471ede00336..e3f7e21e6614 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -3246,6 +3246,22 @@ transport_send_check_condition_and_sense(struct se_cmd *cmd,
 }
 EXPORT_SYMBOL(transport_send_check_condition_and_sense);
 
+/**
+ * target_send_busy - Send SCSI BUSY status back to the initiator
+ * @cmd: SCSI command for which to send a BUSY reply.
+ *
+ * Note: Only call this function if target_submit_cmd*() failed.
+ */
+int target_send_busy(struct se_cmd *cmd)
+{
+	WARN_ON_ONCE(cmd->se_cmd_flags & SCF_SCSI_TMR_CDB);
+
+	cmd->scsi_status = SAM_STAT_BUSY;
+	trace_target_cmd_complete(cmd);
+	return cmd->se_tfo->queue_status(cmd);
+}
+EXPORT_SYMBOL(target_send_busy);
+
 static void target_tmr_work(struct work_struct *work)
 {
 	struct se_cmd *cmd = container_of(work, struct se_cmd, work);
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index 691c6b5e6520..8ed90407f062 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -173,6 +173,7 @@ int	transport_generic_free_cmd(struct se_cmd *, int);
 bool	transport_wait_for_tasks(struct se_cmd *);
 int	transport_send_check_condition_and_sense(struct se_cmd *,
 		sense_reason_t, int);
+int	target_send_busy(struct se_cmd *cmd);
 int	target_get_sess_cmd(struct se_cmd *, bool);
 int	target_put_sess_cmd(struct se_cmd *);
 void	target_sess_cmd_list_set_waiting(struct se_session *);
-- 
cgit v1.2.3


From 0300b1147e528fe6f6b0632b0ef353a87810dabf Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 25 Jan 2019 10:34:54 -0800
Subject: scsi: target/iscsi: Fix spelling of "unsolicited"

Change "unsoliticed" into "unsolicited".

Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/infiniband/ulp/isert/ib_isert.c     | 2 +-
 drivers/target/iscsi/cxgbit/cxgbit_target.c | 2 +-
 drivers/target/iscsi/iscsi_target.c         | 8 ++++----
 drivers/target/iscsi/iscsi_target.h         | 2 +-
 drivers/target/iscsi/iscsi_target_erl1.c    | 2 +-
 include/target/iscsi/iscsi_transport.h      | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index e3dd13798d79..989f1ac4245c 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -1186,7 +1186,7 @@ sequence_cmd:
 	rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn);
 
 	if (!rc && dump_payload == false && unsol_data)
-		iscsit_set_unsoliticed_dataout(cmd);
+		iscsit_set_unsolicited_dataout(cmd);
 	else if (dump_payload && imm_data)
 		target_put_sess_cmd(&cmd->se_cmd);
 
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_target.c b/drivers/target/iscsi/cxgbit/cxgbit_target.c
index 25eb3891e34b..29b350a0b58f 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit_target.c
+++ b/drivers/target/iscsi/cxgbit/cxgbit_target.c
@@ -960,7 +960,7 @@ after_immediate_data:
 			target_put_sess_cmd(&cmd->se_cmd);
 			return 0;
 		} else if (cmd->unsolicited_data) {
-			iscsit_set_unsoliticed_dataout(cmd);
+			iscsit_set_unsolicited_dataout(cmd);
 		}
 
 	} else if (immed_ret == IMMEDIATE_DATA_ERL1_CRC_FAILURE) {
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 98984c3adfba..7059ffdb0c20 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -1194,7 +1194,7 @@ attach_cmd:
 }
 EXPORT_SYMBOL(iscsit_setup_scsi_cmd);
 
-void iscsit_set_unsoliticed_dataout(struct iscsi_cmd *cmd)
+void iscsit_set_unsolicited_dataout(struct iscsi_cmd *cmd)
 {
 	iscsit_set_dataout_sequence_values(cmd);
 
@@ -1202,7 +1202,7 @@ void iscsit_set_unsoliticed_dataout(struct iscsi_cmd *cmd)
 	iscsit_start_dataout_timer(cmd, cmd->conn);
 	spin_unlock_bh(&cmd->dataout_timeout_lock);
 }
-EXPORT_SYMBOL(iscsit_set_unsoliticed_dataout);
+EXPORT_SYMBOL(iscsit_set_unsolicited_dataout);
 
 int iscsit_process_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
 			    struct iscsi_scsi_req *hdr)
@@ -1236,7 +1236,7 @@ int iscsit_process_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
 	 */
 	if (!cmd->immediate_data) {
 		if (!cmd->sense_reason && cmd->unsolicited_data)
-			iscsit_set_unsoliticed_dataout(cmd);
+			iscsit_set_unsolicited_dataout(cmd);
 		if (!cmd->sense_reason)
 			return 0;
 
@@ -1308,7 +1308,7 @@ after_immediate_data:
 			target_put_sess_cmd(&cmd->se_cmd);
 			return rc;
 		} else if (cmd->unsolicited_data)
-			iscsit_set_unsoliticed_dataout(cmd);
+			iscsit_set_unsolicited_dataout(cmd);
 
 	} else if (immed_ret == IMMEDIATE_DATA_ERL1_CRC_FAILURE) {
 		/*
diff --git a/drivers/target/iscsi/iscsi_target.h b/drivers/target/iscsi/iscsi_target.h
index 48bac0acf8c7..c95f56a3ce31 100644
--- a/drivers/target/iscsi/iscsi_target.h
+++ b/drivers/target/iscsi/iscsi_target.h
@@ -31,7 +31,7 @@ extern int iscsit_reset_np_thread(struct iscsi_np *, struct iscsi_tpg_np *,
 				struct iscsi_portal_group *, bool);
 extern int iscsit_del_np(struct iscsi_np *);
 extern int iscsit_reject_cmd(struct iscsi_cmd *cmd, u8, unsigned char *);
-extern void iscsit_set_unsoliticed_dataout(struct iscsi_cmd *);
+extern void iscsit_set_unsolicited_dataout(struct iscsi_cmd *);
 extern int iscsit_logout_closesession(struct iscsi_cmd *, struct iscsi_conn *);
 extern int iscsit_logout_closeconnection(struct iscsi_cmd *, struct iscsi_conn *);
 extern int iscsit_logout_removeconnforrecovery(struct iscsi_cmd *, struct iscsi_conn *);
diff --git a/drivers/target/iscsi/iscsi_target_erl1.c b/drivers/target/iscsi/iscsi_target_erl1.c
index 38210ab09887..c27335c29640 100644
--- a/drivers/target/iscsi/iscsi_target_erl1.c
+++ b/drivers/target/iscsi/iscsi_target_erl1.c
@@ -979,7 +979,7 @@ int iscsit_execute_cmd(struct iscsi_cmd *cmd, int ooo)
 			if (cmd->se_cmd.transport_state & CMD_T_ABORTED)
 				return 0;
 
-			iscsit_set_unsoliticed_dataout(cmd);
+			iscsit_set_unsolicited_dataout(cmd);
 		}
 		return transport_handle_cdb_direct(&cmd->se_cmd);
 
diff --git a/include/target/iscsi/iscsi_transport.h b/include/target/iscsi/iscsi_transport.h
index 91948bc5b185..8b4b8ce2c1ae 100644
--- a/include/target/iscsi/iscsi_transport.h
+++ b/include/target/iscsi/iscsi_transport.h
@@ -53,7 +53,7 @@ extern void iscsit_put_transport(struct iscsit_transport *);
  */
 extern int iscsit_setup_scsi_cmd(struct iscsi_conn *, struct iscsi_cmd *,
 				unsigned char *);
-extern void iscsit_set_unsoliticed_dataout(struct iscsi_cmd *);
+extern void iscsit_set_unsolicited_dataout(struct iscsi_cmd *);
 extern int iscsit_process_scsi_cmd(struct iscsi_conn *, struct iscsi_cmd *,
 				struct iscsi_scsi_req *);
 extern int
-- 
cgit v1.2.3


From 1e65cc1631c187c4607e9a2067769791be529ce6 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 25 Jan 2019 10:34:55 -0800
Subject: scsi: target/iscsi: Rename a function and a function pointer

Having both a function and a function pointer member with the same
name (iscsit_release_cmd) is confusing. Hence rename the function pointer
member.

Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/iscsi/cxgbit/cxgbit.h      | 2 +-
 drivers/target/iscsi/cxgbit/cxgbit_ddp.c  | 2 +-
 drivers/target/iscsi/cxgbit/cxgbit_main.c | 2 +-
 drivers/target/iscsi/iscsi_target_util.c  | 4 ++--
 include/target/iscsi/iscsi_transport.h    | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/target/iscsi/cxgbit/cxgbit.h b/drivers/target/iscsi/cxgbit/cxgbit.h
index 417b9e66b0cd..3cca22e19964 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit.h
+++ b/drivers/target/iscsi/cxgbit/cxgbit.h
@@ -345,7 +345,7 @@ struct cxgbit_device *cxgbit_find_device(struct net_device *, u8 *);
 int cxgbit_ddp_init(struct cxgbit_device *);
 int cxgbit_setup_conn_pgidx(struct cxgbit_sock *, u32);
 int cxgbit_reserve_ttt(struct cxgbit_sock *, struct iscsi_cmd *);
-void cxgbit_release_cmd(struct iscsi_conn *, struct iscsi_cmd *);
+void cxgbit_unmap_cmd(struct iscsi_conn *, struct iscsi_cmd *);
 
 static inline
 struct cxgbi_ppm *cdev2ppm(struct cxgbit_device *cdev)
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_ddp.c b/drivers/target/iscsi/cxgbit/cxgbit_ddp.c
index 76a262674c8d..d57fd3ed3fa5 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit_ddp.c
+++ b/drivers/target/iscsi/cxgbit/cxgbit_ddp.c
@@ -263,7 +263,7 @@ out:
 	r2t->targ_xfer_tag = ttinfo->tag;
 }
 
-void cxgbit_release_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+void cxgbit_unmap_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
 {
 	struct cxgbit_cmd *ccmd = iscsit_priv_cmd(cmd);
 
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_main.c b/drivers/target/iscsi/cxgbit/cxgbit_main.c
index c011c826fc26..4a7bb0b49d17 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit_main.c
+++ b/drivers/target/iscsi/cxgbit/cxgbit_main.c
@@ -678,7 +678,7 @@ static struct iscsit_transport cxgbit_transport = {
 	.iscsit_get_r2t_ttt	= cxgbit_get_r2t_ttt,
 	.iscsit_get_rx_pdu	= cxgbit_get_rx_pdu,
 	.iscsit_validate_params	= cxgbit_validate_params,
-	.iscsit_release_cmd	= cxgbit_release_cmd,
+	.iscsit_unmap_cmd	= cxgbit_unmap_cmd,
 	.iscsit_aborted_task	= iscsit_aborted_task,
 	.iscsit_get_sup_prot_ops = cxgbit_get_sup_prot_ops,
 };
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 58886d4b33cb..3ac494f63a0b 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -760,8 +760,8 @@ void __iscsit_free_cmd(struct iscsi_cmd *cmd, bool check_queues)
 		iscsit_remove_cmd_from_response_queue(cmd, conn);
 	}
 
-	if (conn && conn->conn_transport->iscsit_release_cmd)
-		conn->conn_transport->iscsit_release_cmd(conn, cmd);
+	if (conn && conn->conn_transport->iscsit_unmap_cmd)
+		conn->conn_transport->iscsit_unmap_cmd(conn, cmd);
 }
 
 void iscsit_free_cmd(struct iscsi_cmd *cmd, bool shutdown)
diff --git a/include/target/iscsi/iscsi_transport.h b/include/target/iscsi/iscsi_transport.h
index 8b4b8ce2c1ae..75bee29fd7dd 100644
--- a/include/target/iscsi/iscsi_transport.h
+++ b/include/target/iscsi/iscsi_transport.h
@@ -26,7 +26,7 @@ struct iscsit_transport {
 	void (*iscsit_aborted_task)(struct iscsi_conn *, struct iscsi_cmd *);
 	int (*iscsit_xmit_pdu)(struct iscsi_conn *, struct iscsi_cmd *,
 			       struct iscsi_datain_req *, const void *, u32);
-	void (*iscsit_release_cmd)(struct iscsi_conn *, struct iscsi_cmd *);
+	void (*iscsit_unmap_cmd)(struct iscsi_conn *, struct iscsi_cmd *);
 	void (*iscsit_get_rx_pdu)(struct iscsi_conn *);
 	int (*iscsit_validate_params)(struct iscsi_conn *);
 	void (*iscsit_get_r2t_ttt)(struct iscsi_conn *, struct iscsi_cmd *,
-- 
cgit v1.2.3


From 809ab9371ca0a96b44d9866ad82849410759a45b Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Sat, 26 Jan 2019 00:52:26 -0500
Subject: XArray: Update xa_erase family descriptions

xa_erase does not allocate memory and doesn't have a gfp parameter.
Update the descriptions of all four variants to be more useful.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 include/linux/xarray.h | 12 ++++++------
 lib/xarray.c           | 17 ++++++++---------
 2 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 5d9d318bcf7a..e11841537631 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -526,9 +526,9 @@ static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
  * @xa: XArray.
  * @index: Index of entry.
  *
- * This function is the equivalent of calling xa_store() with %NULL as
- * the third argument.  The XArray does not need to allocate memory, so
- * the user does not need to provide GFP flags.
+ * After this function returns, loading from @index will return %NULL.
+ * If the index is part of a multi-index entry, all indices will be erased
+ * and none of the entries will be part of a multi-index entry.
  *
  * Context: Any context.  Takes and releases the xa_lock while
  * disabling softirqs.
@@ -550,9 +550,9 @@ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
  * @xa: XArray.
  * @index: Index of entry.
  *
- * This function is the equivalent of calling xa_store() with %NULL as
- * the third argument.  The XArray does not need to allocate memory, so
- * the user does not need to provide GFP flags.
+ * After this function returns, loading from @index will return %NULL.
+ * If the index is part of a multi-index entry, all indices will be erased
+ * and none of the entries will be part of a multi-index entry.
  *
  * Context: Process context.  Takes and releases the xa_lock while
  * disabling interrupts.
diff --git a/lib/xarray.c b/lib/xarray.c
index 81c3171ddde9..fb783bf2a441 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1294,13 +1294,12 @@ static void *xas_result(struct xa_state *xas, void *curr)
  * @xa: XArray.
  * @index: Index into array.
  *
- * If the entry at this index is a multi-index entry then all indices will
- * be erased, and the entry will no longer be a multi-index entry.
- * This function expects the xa_lock to be held on entry.
+ * After this function returns, loading from @index will return %NULL.
+ * If the index is part of a multi-index entry, all indices will be erased
+ * and none of the entries will be part of a multi-index entry.
  *
- * Context: Any context.  Expects xa_lock to be held on entry.  May
- * release and reacquire xa_lock if @gfp flags permit.
- * Return: The old entry at this index.
+ * Context: Any context.  Expects xa_lock to be held on entry.
+ * Return: The entry which used to be at this index.
  */
 void *__xa_erase(struct xarray *xa, unsigned long index)
 {
@@ -1314,9 +1313,9 @@ EXPORT_SYMBOL(__xa_erase);
  * @xa: XArray.
  * @index: Index of entry.
  *
- * This function is the equivalent of calling xa_store() with %NULL as
- * the third argument.  The XArray does not need to allocate memory, so
- * the user does not need to provide GFP flags.
+ * After this function returns, loading from @index will return %NULL.
+ * If the index is part of a multi-index entry, all indices will be erased
+ * and none of the entries will be part of a multi-index entry.
  *
  * Context: Any context.  Takes and releases the xa_lock.
  * Return: The entry which used to be at this index.
-- 
cgit v1.2.3


From 2292822e1576c89191a65c3d0da584d75d3c033f Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Sat, 19 Jan 2019 13:16:53 +0100
Subject: i2c: algo-bit: include main i2c header

We are using symbols from it, so we should include it directly. Found
after sorting includes in a driver.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c-algo-bit.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/i2c-algo-bit.h b/include/linux/i2c-algo-bit.h
index 63904ba6887e..d64cebc6e65a 100644
--- a/include/linux/i2c-algo-bit.h
+++ b/include/linux/i2c-algo-bit.h
@@ -25,6 +25,8 @@
 #ifndef _LINUX_I2C_ALGO_BIT_H
 #define _LINUX_I2C_ALGO_BIT_H
 
+#include <linux/i2c.h>
+
 /* --- Defines for bit-adapters ---------------------------------------	*/
 /*
  * This struct contains the hw-dependent functions of bit-style adapters to
-- 
cgit v1.2.3


From 738ac0679b969776a638daf2cfb5011049d467da Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Sat, 19 Jan 2019 13:16:54 +0100
Subject: i2c: algo-bit: convert to SPDX header

And use kernel style for the remaining comments in the header.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/algos/i2c-algo-bit.c | 25 ++++++++-----------------
 include/linux/i2c-algo-bit.h     | 31 ++++++++-----------------------
 2 files changed, 16 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/algos/i2c-algo-bit.c b/drivers/i2c/algos/i2c-algo-bit.c
index c33dcfb87993..5e5990a83da5 100644
--- a/drivers/i2c/algos/i2c-algo-bit.c
+++ b/drivers/i2c/algos/i2c-algo-bit.c
@@ -1,21 +1,12 @@
-/* -------------------------------------------------------------------------
- * i2c-algo-bit.c i2c driver algorithms for bit-shift adapters
- * -------------------------------------------------------------------------
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * i2c-algo-bit.c: i2c driver algorithms for bit-shift adapters
+ *
  *   Copyright (C) 1995-2000 Simon G. Vogl
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
- * ------------------------------------------------------------------------- */
-
-/* With some changes from Frodo Looijaard <frodol@dds.nl>, Kyösti Mälkki
-   <kmalkki@cc.hut.fi> and Jean Delvare <jdelvare@suse.de> */
+ *
+ * With some changes from Frodo Looijaard <frodol@dds.nl>, Kyösti Mälkki
+ * <kmalkki@cc.hut.fi> and Jean Delvare <jdelvare@suse.de>
+ */
 
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/include/linux/i2c-algo-bit.h b/include/linux/i2c-algo-bit.h
index d64cebc6e65a..69045df78e2d 100644
--- a/include/linux/i2c-algo-bit.h
+++ b/include/linux/i2c-algo-bit.h
@@ -1,26 +1,11 @@
-/* ------------------------------------------------------------------------- */
-/* i2c-algo-bit.h i2c driver algorithms for bit-shift adapters               */
-/* ------------------------------------------------------------------------- */
-/*   Copyright (C) 1995-99 Simon G. Vogl
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
-    MA 02110-1301 USA.							     */
-/* ------------------------------------------------------------------------- */
-
-/* With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and even
-   Frodo Looijaard <frodol@dds.nl> */
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * i2c-algo-bit.h: i2c driver algorithms for bit-shift adapters
+ *
+ *   Copyright (C) 1995-99 Simon G. Vogl
+ * With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and even
+ * Frodo Looijaard <frodol@dds.nl>
+ */
 
 #ifndef _LINUX_I2C_ALGO_BIT_H
 #define _LINUX_I2C_ALGO_BIT_H
-- 
cgit v1.2.3


From c3424f59a0cb994a098c3701bf14617580a18290 Mon Sep 17 00:00:00 2001
From: Weiyi Lu <weiyi.lu@mediatek.com>
Date: Fri, 14 Dec 2018 10:04:16 +0800
Subject: dt-bindings: clock: add clock for MT2712

Add new clock according to 3rd ECO design change.
It's the parent clock of audio clock mux.

Signed-off-by: Weiyi Lu <weiyi.lu@mediatek.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/mt2712-clk.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/mt2712-clk.h b/include/dt-bindings/clock/mt2712-clk.h
index 76265836a1e1..c3b29dff9c0e 100644
--- a/include/dt-bindings/clock/mt2712-clk.h
+++ b/include/dt-bindings/clock/mt2712-clk.h
@@ -228,7 +228,8 @@
 #define CLK_TOP_NFI2X_EN		189
 #define CLK_TOP_NFIECC_EN		190
 #define CLK_TOP_NFI1X_CK_EN		191
-#define CLK_TOP_NR_CLK			192
+#define CLK_TOP_APLL2_D3		192
+#define CLK_TOP_NR_CLK			193
 
 /* INFRACFG */
 
-- 
cgit v1.2.3


From a78e8723a50530d15faa25cc0b6f009bcd251c20 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Wed, 16 Jan 2019 09:55:41 +0200
Subject: RDMA/cma: Remove CM_ID statistics provided by rdma-cm module

Netlink statistics exported by rdma-cm never had any working user space
component published to the mailing list or to any open source
project. Canvassing various proprietary users, and the original requester,
we find that there are no real users of this interface.

This patch simply removes all occurrences of RDMA CM netlink in favour of
modern nldev implementation, which provides the same information and
accompanied by widely used user space component.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c     | 83 ---------------------------------------
 drivers/infiniband/core/netlink.c |  4 +-
 include/uapi/rdma/rdma_netlink.h  | 17 +-------
 3 files changed, 3 insertions(+), 101 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 81bded0d37d1..e15546ae4d0f 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4616,85 +4616,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data)
 	kfree(cma_dev);
 }
 
-static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	struct nlmsghdr *nlh;
-	struct rdma_cm_id_stats *id_stats;
-	struct rdma_id_private *id_priv;
-	struct rdma_cm_id *id = NULL;
-	struct cma_device *cma_dev;
-	int i_dev = 0, i_id = 0;
-
-	/*
-	 * We export all of the IDs as a sequence of messages.  Each
-	 * ID gets its own netlink message.
-	 */
-	mutex_lock(&lock);
-
-	list_for_each_entry(cma_dev, &dev_list, list) {
-		if (i_dev < cb->args[0]) {
-			i_dev++;
-			continue;
-		}
-
-		i_id = 0;
-		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
-			if (i_id < cb->args[1]) {
-				i_id++;
-				continue;
-			}
-
-			id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
-						sizeof *id_stats, RDMA_NL_RDMA_CM,
-						RDMA_NL_RDMA_CM_ID_STATS,
-						NLM_F_MULTI);
-			if (!id_stats)
-				goto out;
-
-			memset(id_stats, 0, sizeof *id_stats);
-			id = &id_priv->id;
-			id_stats->node_type = id->route.addr.dev_addr.dev_type;
-			id_stats->port_num = id->port_num;
-			id_stats->bound_dev_if =
-				id->route.addr.dev_addr.bound_dev_if;
-
-			if (ibnl_put_attr(skb, nlh,
-					  rdma_addr_size(cma_src_addr(id_priv)),
-					  cma_src_addr(id_priv),
-					  RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
-				goto out;
-			if (ibnl_put_attr(skb, nlh,
-					  rdma_addr_size(cma_dst_addr(id_priv)),
-					  cma_dst_addr(id_priv),
-					  RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
-				goto out;
-
-			id_stats->pid	= task_pid_vnr(id_priv->res.task);
-			id_stats->port_space	= id->ps;
-			id_stats->cm_state	= id_priv->state;
-			id_stats->qp_num	= id_priv->qp_num;
-			id_stats->qp_type	= id->qp_type;
-
-			i_id++;
-			nlmsg_end(skb, nlh);
-		}
-
-		cb->args[1] = 0;
-		i_dev++;
-	}
-
-out:
-	mutex_unlock(&lock);
-	cb->args[0] = i_dev;
-	cb->args[1] = i_id;
-
-	return skb->len;
-}
-
-static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = {
-	[RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats},
-};
-
 static int cma_init_net(struct net *net)
 {
 	struct cma_pernet *pernet = cma_pernet(net);
@@ -4743,7 +4664,6 @@ static int __init cma_init(void)
 	if (ret)
 		goto err;
 
-	rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table);
 	cma_configfs_init();
 
 	return 0;
@@ -4759,7 +4679,6 @@ err_wq:
 static void __exit cma_cleanup(void)
 {
 	cma_configfs_exit();
-	rdma_nl_unregister(RDMA_NL_RDMA_CM);
 	ib_unregister_client(&cma_client);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
@@ -4767,7 +4686,5 @@ static void __exit cma_cleanup(void)
 	destroy_workqueue(cma_wq);
 }
 
-MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1);
-
 module_init(cma_init);
 module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 724f5a62e82f..eecfc0b377c9 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -56,7 +56,6 @@ EXPORT_SYMBOL(rdma_nl_chk_listeners);
 static bool is_nl_msg_valid(unsigned int type, unsigned int op)
 {
 	static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = {
-		[RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS,
 		[RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS,
 		[RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS,
 		[RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS,
@@ -181,8 +180,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 	/* FIXME: Convert IWCM to properly handle doit callbacks */
-	if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM ||
-	    index == RDMA_NL_IWCM) {
+	if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
 		struct netlink_dump_control c = {
 			.dump = cb_table[op].dump,
 		};
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 0f5263767fb4..3a9e681e4257 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -5,8 +5,7 @@
 #include <linux/types.h>
 
 enum {
-	RDMA_NL_RDMA_CM = 1,
-	RDMA_NL_IWCM,
+	RDMA_NL_IWCM = 2,
 	RDMA_NL_RSVD,
 	RDMA_NL_LS,	/* RDMA Local Services */
 	RDMA_NL_NLDEV,	/* RDMA device interface */
@@ -14,8 +13,7 @@ enum {
 };
 
 enum {
-	RDMA_NL_GROUP_CM = 1,
-	RDMA_NL_GROUP_IWPM,
+	RDMA_NL_GROUP_IWPM = 2,
 	RDMA_NL_GROUP_LS,
 	RDMA_NL_NUM_GROUPS
 };
@@ -24,17 +22,6 @@ enum {
 #define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1))
 #define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op)
 
-enum {
-	RDMA_NL_RDMA_CM_ID_STATS = 0,
-	RDMA_NL_RDMA_CM_NUM_OPS
-};
-
-enum {
-	RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1,
-	RDMA_NL_RDMA_CM_ATTR_DST_ADDR,
-	RDMA_NL_RDMA_CM_NUM_ATTR,
-};
-
 /* The minimum version that the iwpm kernel supports */
 #define IWPM_UABI_VERSION_MIN	3
 
-- 
cgit v1.2.3


From a2bfd708b17adb6e597e70d4eca824667f2d4e3c Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Tue, 5 Feb 2019 11:33:22 -0800
Subject: RDMA/iwpm: move kdoc comments to functions

Move the iwpm kdoc comments from the prototype declarations to above
the function bodies.  There are no functional changes in this patch.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/iwpm_msg.c  | 116 ++++++++++++++++++++--------
 drivers/infiniband/core/iwpm_util.c |  40 ++++++++++
 include/rdma/iw_portmap.h           | 149 ------------------------------------
 3 files changed, 123 insertions(+), 182 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 2e30e65b0816..2452b0ddcf0d 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -38,14 +38,21 @@ u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN;
 static int iwpm_user_pid = IWPM_PID_UNDEFINED;
 static atomic_t echo_nlmsg_seq;
 
+/**
+ * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid
+ *
+ * Returns true if the pid is greater than zero, otherwise returns false
+ */
 int iwpm_valid_pid(void)
 {
 	return iwpm_user_pid > 0;
 }
 
-/*
- * iwpm_register_pid - Send a netlink query to user space
- *                     for the iwarp port mapper pid
+/**
+ * iwpm_register_pid - Send a netlink query to userspace
+ *                     to get the iwarp port mapper pid
+ * @pm_msg: Contains driver info to send to the userspace port mapper
+ * @nl_client: The index of the netlink client
  *
  * nlmsg attributes:
  *	[IWPM_NLA_REG_PID_SEQ]
@@ -124,13 +131,19 @@ pid_query_error:
 	return ret;
 }
 
-/*
- * iwpm_add_mapping - Send a netlink add mapping message
- *                    to the port mapper
+/**
+ * iwpm_add_mapping - Send a netlink add mapping request to
+ *                    the userspace port mapper
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
  * nlmsg attributes:
  *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
  *	[IWPM_NLA_MANAGE_ADDR]
  *	[IWPM_NLA_MANAGE_FLAGS]
+ *
+ * If the request is successful, the pm_msg stores
+ * the port mapper response (mapped address info)
  */
 int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 {
@@ -208,9 +221,12 @@ add_mapping_error_nowarn:
 	return ret;
 }
 
-/*
- * iwpm_add_and_query_mapping - Send a netlink add and query
- *                              mapping message to the port mapper
+/**
+ * iwpm_add_and_query_mapping - Process the port mapper response to
+ *                              iwpm_add_and_query_mapping request
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
  * nlmsg attributes:
  *	[IWPM_NLA_QUERY_MAPPING_SEQ]
  *	[IWPM_NLA_QUERY_LOCAL_ADDR]
@@ -299,9 +315,13 @@ query_mapping_error_nowarn:
 	return ret;
 }
 
-/*
- * iwpm_remove_mapping - Send a netlink remove mapping message
- *                       to the port mapper
+/**
+ * iwpm_remove_mapping - Send a netlink remove mapping request
+ *                       to the userspace port mapper
+ *
+ * @local_addr: Local ip/tcp address to remove
+ * @nl_client: The index of the netlink client
+ *
  * nlmsg attributes:
  *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
  *	[IWPM_NLA_MANAGE_ADDR]
@@ -372,9 +392,14 @@ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
 	[IWPM_NLA_RREG_PID_ERR]     = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_register_pid_cb - Process a port mapper response to
- *                        iwpm_register_pid()
+/**
+ * iwpm_register_pid_cb - Process the port mapper response to
+ *                        iwpm_register_pid query
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * If successful, the function receives the userspace port mapper pid
+ * which is used in future communication with the port mapper
  */
 int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -443,9 +468,11 @@ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
 	[IWPM_NLA_RMANAGE_MAPPING_ERR]	   = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_add_mapping_cb - Process a port mapper response to
- *                       iwpm_add_mapping()
+/**
+ * iwpm_add_mapping_cb - Process the port mapper response to
+ *                       iwpm_add_mapping request
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
  */
 int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -518,9 +545,11 @@ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] =
 	[IWPM_NLA_RQUERY_MAPPING_ERR]	  = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_add_and_query_mapping_cb - Process a port mapper response to
- *                                 iwpm_add_and_query_mapping()
+/**
+ * iwpm_add_and_query_mapping_cb - Process the port mapper response to
+ *                                 iwpm_add_and_query_mapping request
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
  */
 int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
 				struct netlink_callback *cb)
@@ -598,9 +627,13 @@ query_mapping_response_exit:
 	return 0;
 }
 
-/*
- * iwpm_remote_info_cb - Process a port mapper message, containing
- *			  the remote connecting peer address info
+/**
+ * iwpm_remote_info_cb - Process remote connecting peer address info, which
+ *                       the port mapper has received from the connecting peer
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Stores the IPv4/IPv6 address info in a hash table
  */
 int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -673,8 +706,14 @@ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
 	[IWPM_NLA_MAPINFO_ULIB_VER]  = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_mapping_info_cb - Process a port mapper request for mapping info
+/**
+ * iwpm_mapping_info_cb - Process a notification that the userspace
+ *                        port mapper daemon is started
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send all the local mapping
+ * info records to the userspace port mapper
  */
 int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -727,9 +766,11 @@ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
 	[IWPM_NLA_MAPINFO_ACK_NUM] =  { .type = NLA_U32 }
 };
 
-/*
- * iwpm_ack_mapping_info_cb - Process a port mapper ack for
- *                            the provided mapping info records
+/**
+ * iwpm_ack_mapping_info_cb - Process the port mapper ack for
+ *                            the provided local mapping info records
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
  */
 int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -755,8 +796,11 @@ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
 	[IWPM_NLA_ERR_CODE]       = { .type = NLA_U16 },
 };
 
-/*
- * iwpm_mapping_error_cb - Process a port mapper error message
+/**
+ * iwpm_mapping_error_cb - Process port mapper notification for error
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
  */
 int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -797,8 +841,14 @@ static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = {
 	[IWPM_NLA_HELLO_ABI_VERSION]     = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_hello_cb - Process a port mapper hello request
+/**
+ * iwpm_hello_cb - Process a hello message from iwpmd
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send the kernel's abi_version
+ * after adjusting it to support the iwpmd version.
  */
 int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index 363938435476..7b97f6e2075f 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -51,6 +51,12 @@ static DEFINE_SPINLOCK(iwpm_reminfo_lock);
 static DEFINE_MUTEX(iwpm_admin_lock);
 static struct iwpm_admin_data iwpm_admin;
 
+/**
+ * iwpm_init - Allocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes up.
+ */
 int iwpm_init(u8 nl_client)
 {
 	int ret = 0;
@@ -87,6 +93,12 @@ init_exit:
 static void free_hash_bucket(void);
 static void free_reminfo_bucket(void);
 
+/**
+ * iwpm_exit - Deallocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes down.
+ */
 int iwpm_exit(u8 nl_client)
 {
 
@@ -112,6 +124,14 @@ int iwpm_exit(u8 nl_client)
 static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
 					       struct sockaddr_storage *);
 
+/**
+ * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address
+ *                       info in a hash table
+ * @local_addr: Local ip/tcp address
+ * @mapped_addr: Mapped local ip/tcp address
+ * @nl_client: The index of the netlink client
+ * @map_flags: IWPM mapping flags
+ */
 int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
 			struct sockaddr_storage *mapped_sockaddr,
 			u8 nl_client, u32 map_flags)
@@ -151,6 +171,15 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
 	return ret;
 }
 
+/**
+ * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
+ *                       info from the hash table
+ * @local_addr: Local ip/tcp address
+ * @mapped_local_addr: Mapped local ip/tcp address
+ *
+ * Returns err code if mapping info is not found in the hash table,
+ * otherwise returns 0
+ */
 int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
 			struct sockaddr_storage *mapped_local_addr)
 {
@@ -251,6 +280,17 @@ void iwpm_add_remote_info(struct iwpm_remote_info *rem_info)
 	spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
 }
 
+/**
+ * iwpm_get_remote_info - Get the remote connecting peer address info
+ *
+ * @mapped_loc_addr: Mapped local address of the listening peer
+ * @mapped_rem_addr: Mapped remote address of the connecting peer
+ * @remote_addr: To store the remote address of the connecting peer
+ * @nl_client: The index of the netlink client
+ *
+ * The remote address info is retrieved and provided to the client in
+ * the remote_addr. After that it is removed from the hash table
+ */
 int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
 			 struct sockaddr_storage *mapped_rem_addr,
 			 struct sockaddr_storage *remote_addr,
diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h
index 84fac196ef80..b9fee7feeeb5 100644
--- a/include/rdma/iw_portmap.h
+++ b/include/rdma/iw_portmap.h
@@ -61,177 +61,28 @@ struct iwpm_sa_data {
 	u32 flags;
 };
 
-/**
- * iwpm_init - Allocate resources for the iwarp port mapper
- *
- * Should be called when network interface goes up.
- */
 int iwpm_init(u8);
-
-/**
- * iwpm_exit - Deallocate resources for the iwarp port mapper
- *
- * Should be called when network interface goes down.
- */
 int iwpm_exit(u8);
-
-/**
- * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid
- *
- * Returns true if the pid is greater than zero, otherwise returns false
- */
 int iwpm_valid_pid(void);
-
-/**
- * iwpm_register_pid - Send a netlink query to userspace
- *                     to get the iwarp port mapper pid
- * @pm_msg: Contains driver info to send to the userspace port mapper
- * @nl_client: The index of the netlink client
- */
 int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client);
-
-/**
- * iwpm_add_mapping - Send a netlink add mapping request to
- *                    the userspace port mapper
- * @pm_msg: Contains the local ip/tcp address info to send
- * @nl_client: The index of the netlink client
- *
- * If the request is successful, the pm_msg stores
- * the port mapper response (mapped address info)
- */
 int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client);
-
-/**
- * iwpm_add_and_query_mapping - Send a netlink add and query mapping request
- *				 to the userspace port mapper
- * @pm_msg: Contains the local and remote ip/tcp address info to send
- * @nl_client: The index of the netlink client
- *
- * If the request is successful, the pm_msg stores the
- * port mapper response (mapped local and remote address info)
- */
 int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client);
-
-/**
- * iwpm_remove_mapping - Send a netlink remove mapping request
- *                       to the userspace port mapper
- *
- * @local_addr: Local ip/tcp address to remove
- * @nl_client: The index of the netlink client
- */
 int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client);
-
-/**
- * iwpm_register_pid_cb - Process the port mapper response to
- *                        iwpm_register_pid query
- * @skb:
- * @cb: Contains the received message (payload and netlink header)
- *
- * If successful, the function receives the userspace port mapper pid
- * which is used in future communication with the port mapper
- */
 int iwpm_register_pid_cb(struct sk_buff *, struct netlink_callback *);
-
-/**
- * iwpm_add_mapping_cb - Process the port mapper response to
- *                       iwpm_add_mapping request
- * @skb:
- * @cb: Contains the received message (payload and netlink header)
- */
 int iwpm_add_mapping_cb(struct sk_buff *, struct netlink_callback *);
-
-/**
- * iwpm_add_and_query_mapping_cb - Process the port mapper response to
- *                                 iwpm_add_and_query_mapping request
- * @skb:
- * @cb: Contains the received message (payload and netlink header)
- */
 int iwpm_add_and_query_mapping_cb(struct sk_buff *, struct netlink_callback *);
-
-/**
- * iwpm_remote_info_cb - Process remote connecting peer address info, which
- *                       the port mapper has received from the connecting peer
- *
- * @cb: Contains the received message (payload and netlink header)
- *
- * Stores the IPv4/IPv6 address info in a hash table
- */
 int iwpm_remote_info_cb(struct sk_buff *, struct netlink_callback *);
-
-/**
- * iwpm_mapping_error_cb - Process port mapper notification for error
- *
- * @skb:
- * @cb: Contains the received message (payload and netlink header)
- */
 int iwpm_mapping_error_cb(struct sk_buff *, struct netlink_callback *);
-
-/**
- * iwpm_mapping_info_cb - Process a notification that the userspace
- *                        port mapper daemon is started
- * @skb:
- * @cb: Contains the received message (payload and netlink header)
- *
- * Using the received port mapper pid, send all the local mapping
- * info records to the userspace port mapper
- */
 int iwpm_mapping_info_cb(struct sk_buff *, struct netlink_callback *);
-
-/**
- * iwpm_ack_mapping_info_cb - Process the port mapper ack for
- *                            the provided local mapping info records
- * @skb:
- * @cb: Contains the received message (payload and netlink header)
- */
 int iwpm_ack_mapping_info_cb(struct sk_buff *, struct netlink_callback *);
-
-/**
- * iwpm_get_remote_info - Get the remote connecting peer address info
- *
- * @mapped_loc_addr: Mapped local address of the listening peer
- * @mapped_rem_addr: Mapped remote address of the connecting peer
- * @remote_addr: To store the remote address of the connecting peer
- * @nl_client: The index of the netlink client
- *
- * The remote address info is retrieved and provided to the client in
- * the remote_addr. After that it is removed from the hash table
- */
 int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
 			struct sockaddr_storage *mapped_rem_addr,
 			struct sockaddr_storage *remote_addr, u8 nl_client);
-
-/**
- * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address
- *                       info in a hash table
- * @local_addr: Local ip/tcp address
- * @mapped_addr: Mapped local ip/tcp address
- * @nl_client: The index of the netlink client
- * @map_flags: IWPM mapping flags
- */
 int iwpm_create_mapinfo(struct sockaddr_storage *local_addr,
 			struct sockaddr_storage *mapped_addr, u8 nl_client,
 			u32 map_flags);
-
-/**
- * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
- *                       info from the hash table
- * @local_addr: Local ip/tcp address
- * @mapped_addr: Mapped local ip/tcp address
- *
- * Returns err code if mapping info is not found in the hash table,
- * otherwise returns 0
- */
 int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr,
 			struct sockaddr_storage *mapped_addr);
 
-/**
- * iwpm_hello_cb - Process a hello message from iwpmd
- *
- * @skb:
- * @cb: Contains the received message (payload and netlink header)
- *
- * Using the received port mapper pid, send the kernel's abi_version
- * after adjusting it to support the iwpmd version.
- */
 int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb);
 #endif /* _IW_PORTMAP_H */
-- 
cgit v1.2.3


From 385156c5f2a61834666f079ee66338f177c65c28 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 23 Jan 2019 19:29:44 -0800
Subject: IB/hfi: Move RC functions into a header file

This patch moves some RC helper functions into a header file so that
they can be called from both RC and  TID RDMA functions. In addition,
a common function for rewinding a request is created in rdmavt so that
it can be shared between qib and hfi1 driver.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/rc.c    | 119 +++++++++++++++----------------------
 drivers/infiniband/hw/hfi1/rc.h    |  50 ++++++++++++++++
 drivers/infiniband/hw/qib/qib_rc.c |   7 +--
 drivers/infiniband/sw/rdmavt/rc.c  |  13 ++++
 include/rdma/rdmavt_qp.h           |  10 ++++
 5 files changed, 123 insertions(+), 76 deletions(-)
 create mode 100644 drivers/infiniband/hw/hfi1/rc.h

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 092d5eba980f..6e74cd3814b8 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -51,28 +51,48 @@
 
 #include "hfi.h"
 #include "qp.h"
+#include "rc.h"
 #include "verbs_txreq.h"
 #include "trace.h"
 
-/* cut down ridiculously long IB macro names */
-#define OP(x) RC_OP(x)
-
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
-					 struct rvt_swqe *wqe,
-					 struct hfi1_ibport *ibp);
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
-		       u32 psn, u32 pmtu)
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+				      u8 *prev_ack, bool *scheduled)
+	__must_hold(&qp->s_lock)
 {
-	u32 len;
+	struct rvt_ack_entry *e = NULL;
+	u8 i, p;
+	bool s = true;
 
-	len = delta_psn(psn, wqe->psn) * pmtu;
-	ss->sge = wqe->sg_list[0];
-	ss->sg_list = wqe->sg_list + 1;
-	ss->num_sge = wqe->wr.num_sge;
-	ss->total_len = wqe->length;
-	rvt_skip_sge(ss, len, false);
-	return wqe->length - len;
+	for (i = qp->r_head_ack_queue; ; i = p) {
+		if (i == qp->s_tail_ack_queue)
+			s = false;
+		if (i)
+			p = i - 1;
+		else
+			p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+		if (p == qp->r_head_ack_queue) {
+			e = NULL;
+			break;
+		}
+		e = &qp->s_ack_queue[p];
+		if (!e->opcode) {
+			e = NULL;
+			break;
+		}
+		if (cmp_psn(psn, e->psn) >= 0) {
+			if (p == qp->s_tail_ack_queue &&
+			    cmp_psn(psn, e->lpsn) <= 0)
+				s = false;
+			break;
+		}
+	}
+	if (prev)
+		*prev = p;
+	if (prev_ack)
+		*prev_ack = i;
+	if (scheduled)
+		*scheduled = s;
+	return e;
 }
 
 /**
@@ -1229,9 +1249,9 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
  * This is similar to hfi1_send_complete but has to check to be sure
  * that the SGEs are not being referenced if the SWQE is being resent.
  */
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
-					 struct rvt_swqe *wqe,
-					 struct hfi1_ibport *ibp)
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+				  struct rvt_swqe *wqe,
+				  struct hfi1_ibport *ibp)
 {
 	lockdep_assert_held(&qp->s_lock);
 	/*
@@ -1314,8 +1334,8 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
  * May be called at interrupt level, with the QP s_lock held.
  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
  */
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
-		     u64 val, struct hfi1_ctxtdata *rcd)
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+	      u64 val, struct hfi1_ctxtdata *rcd)
 {
 	struct hfi1_ibport *ibp;
 	enum ib_wc_status status;
@@ -1754,16 +1774,6 @@ bail:
 	return;
 }
 
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
-				  struct rvt_qp *qp)
-{
-	if (list_empty(&qp->rspwait)) {
-		qp->r_flags |= RVT_R_RSP_NAK;
-		rvt_get_qp(qp);
-		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
-	}
-}
-
 static inline void rc_cancel_ack(struct rvt_qp *qp)
 {
 	qp->r_adefered = 0;
@@ -1796,8 +1806,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct rvt_ack_entry *e;
 	unsigned long flags;
-	u8 i, prev;
-	int old_req;
+	u8 prev;
+	u8 mra; /* most recent ACK */
+	bool old_req;
 
 	trace_hfi1_rcv_error(qp, psn);
 	if (diff > 0) {
@@ -1843,29 +1854,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 
 	spin_lock_irqsave(&qp->s_lock, flags);
 
-	for (i = qp->r_head_ack_queue; ; i = prev) {
-		if (i == qp->s_tail_ack_queue)
-			old_req = 0;
-		if (i)
-			prev = i - 1;
-		else
-			prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
-		if (prev == qp->r_head_ack_queue) {
-			e = NULL;
-			break;
-		}
-		e = &qp->s_ack_queue[prev];
-		if (!e->opcode) {
-			e = NULL;
-			break;
-		}
-		if (cmp_psn(psn, e->psn) >= 0) {
-			if (prev == qp->s_tail_ack_queue &&
-			    cmp_psn(psn, e->lpsn) <= 0)
-				old_req = 0;
-			break;
-		}
-	}
+	e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
+
 	switch (opcode) {
 	case OP(RDMA_READ_REQUEST): {
 		struct ib_reth *reth;
@@ -1940,7 +1930,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		 * Resend the most recent ACK if this request is
 		 * after all the previous RDMA reads and atomics.
 		 */
-		if (i == qp->r_head_ack_queue) {
+		if (mra == qp->r_head_ack_queue) {
 			spin_unlock_irqrestore(&qp->s_lock, flags);
 			qp->r_nak_state = 0;
 			qp->r_ack_psn = qp->r_psn - 1;
@@ -1951,7 +1941,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		 * Resend the RDMA read or atomic op which
 		 * ACKs this duplicate request.
 		 */
-		qp->s_tail_ack_queue = i;
+		qp->s_tail_ack_queue = mra;
 		break;
 	}
 	qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -1968,17 +1958,6 @@ send_ack:
 	return 0;
 }
 
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
-	unsigned next;
-
-	next = n + 1;
-	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
-		next = 0;
-	qp->s_tail_ack_queue = next;
-	qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
 			  u32 lqpn, u32 rqpn, u8 svc_type)
 {
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
new file mode 100644
index 000000000000..4329eadcb3df
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef HFI1_RC_H
+#define HFI1_RC_H
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
+{
+	unsigned int next;
+
+	next = n + 1;
+	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+		next = 0;
+	qp->s_tail_ack_queue = next;
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+				  struct rvt_qp *qp)
+{
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= RVT_R_RSP_NAK;
+		rvt_get_qp(qp);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+}
+
+static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+			      u32 psn, u32 pmtu)
+{
+	u32 len;
+
+	len = delta_psn(psn, wqe->psn) * pmtu;
+	return rvt_restart_sge(ss, wqe, len);
+}
+
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+				      u8 *prev_ack, bool *scheduled);
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
+	      struct hfi1_ctxtdata *rcd);
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				  struct hfi1_ibport *ibp);
+
+#endif /* HFI1_RC_H */
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index 6fa002940451..50dd9811b088 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -45,12 +45,7 @@ static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
 	u32 len;
 
 	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
-	ss->sge = wqe->sg_list[0];
-	ss->sg_list = wqe->sg_list + 1;
-	ss->num_sge = wqe->wr.num_sge;
-	ss->total_len = wqe->length;
-	rvt_skip_sge(ss, len, false);
-	return wqe->length - len;
+	return rvt_restart_sge(ss, wqe, len);
 }
 
 /**
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
index 6131cc558bdb..8d71647820a8 100644
--- a/drivers/infiniband/sw/rdmavt/rc.c
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -187,3 +187,16 @@ void rvt_get_credit(struct rvt_qp *qp, u32 aeth)
 	}
 }
 EXPORT_SYMBOL(rvt_get_credit);
+
+/* rvt_restart_sge - rewind the sge state for a wqe */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len)
+{
+	ss->sge = wqe->sg_list[0];
+	ss->sg_list = wqe->sg_list + 1;
+	ss->num_sge = wqe->wr.num_sge;
+	ss->total_len = wqe->length;
+	rvt_skip_sge(ss, len, false);
+	return wqe->length - len;
+}
+EXPORT_SYMBOL(rvt_restart_sge);
+
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index cbafb1878669..56a9221378d9 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -628,6 +628,16 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp);
  */
 void rvt_get_credit(struct rvt_qp *qp, u32 aeth);
 
+/**
+ * rvt_restart_sge - rewind the sge state for a wqe
+ * @ss: the sge state pointer
+ * @wqe: the wqe to rewind
+ * @len: the data length from the start of the wqe in bytes
+ *
+ * Returns the remaining data length.
+ */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len);
+
 /**
  * @qp - the qp pair
  * @len - the length
-- 
cgit v1.2.3


From 838b6fd2d9ca29998869e4d1ecf4566efe807666 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 23 Jan 2019 19:30:07 -0800
Subject: IB/hfi1: TID RDMA RcvArray programming and TID allocation

TID entries are used by hfi1 hardware to receive data payload from
incoming packets directly into a user buffer and thus avoid data copying
by software. This patch implements the functions for TID allocation,
freeing, and programming TID RcvArray entries in hardware for kernel
clients. TID entries are managed via lists of TID groups similar to PSM.
Furthermore, to track TID resource allocation for each request, software
flows are also allocated and freed as needed. Since software flows
consume large amount of memory for tracking TID allocation and freeing,
it is generally desirable to allocate them dynamically in the send queue
and only for TID RDMA requests, but pre-allocate them for receive queue
because the send queue could have thousands of entries while the receive
queue has only a limited number of entries.

Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/hfi.h          |   2 +
 drivers/infiniband/hw/hfi1/init.c         |   3 +-
 drivers/infiniband/hw/hfi1/tid_rdma.c     | 877 +++++++++++++++++++++++++++++-
 drivers/infiniband/hw/hfi1/tid_rdma.h     | 101 ++++
 drivers/infiniband/hw/hfi1/user_exp_rcv.h |   1 -
 drivers/infiniband/hw/hfi1/verbs.c        |  29 +-
 drivers/infiniband/hw/hfi1/verbs.h        |  34 ++
 drivers/infiniband/sw/rdmavt/qp.c         |   2 +-
 include/rdma/rdmavt_qp.h                  |   2 +
 9 files changed, 1033 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 78aa344c7403..1412ed157c98 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -303,6 +303,8 @@ struct hfi1_ctxtdata {
 	spinlock_t exp_lock;
 	/* Queue for QP's waiting for HW TID flows */
 	struct tid_queue flow_queue;
+	/* Queue for QP's waiting for HW receive array entries */
+	struct tid_queue rarr_queue;
 	/* when waiting for rcv or pioavail */
 	wait_queue_head_t wait;
 	/* uuid from PSM */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 56830a514b92..d13304f7340d 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -372,6 +372,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
 		mutex_init(&rcd->exp_mutex);
 		spin_lock_init(&rcd->exp_lock);
 		INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+		INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
 
 		hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
 
@@ -1596,7 +1597,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
 		struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
 
 		if (rcd) {
-			hfi1_clear_tids(rcd);
+			hfi1_free_ctxt_rcv_groups(rcd);
 			hfi1_free_ctxt(rcd);
 		}
 	}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 70671212808f..1d02b12590f6 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -8,6 +8,7 @@
 #include "qp.h"
 #include "verbs.h"
 #include "tid_rdma.h"
+#include "exp_rcv.h"
 #include "trace.h"
 
 #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
@@ -35,8 +36,14 @@ static u32 mask_generation(u32 a)
 #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
 #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
 
+/* Maximum number of segments in flight per QP request. */
 #define TID_RDMA_MAX_READ_SEGS_PER_REQ  6
 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
+#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
+			TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
+#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
+
+#define MAX_EXPECTED_PAGES     (MAX_EXPECTED_BUFFER / PAGE_SIZE)
 
 #define TID_OPFN_QP_CTXT_MASK 0xff
 #define TID_OPFN_QP_CTXT_SHIFT 56
@@ -79,6 +86,11 @@ static u32 mask_generation(u32 a)
  */
 
 static void tid_rdma_trigger_resume(struct work_struct *work);
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+					 gfp_t gfp);
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+				struct tid_rdma_request *req);
 
 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
@@ -230,7 +242,7 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
 	BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
 	rcd->jkey = TID_RDMA_JKEY;
 	hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
-	return 0;
+	return hfi1_alloc_ctxt_rcv_groups(rcd);
 }
 
 /**
@@ -266,6 +278,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 		      struct ib_qp_init_attr *init_attr)
 {
 	struct hfi1_qp_priv *qpriv = qp->priv;
+	int i, ret;
 
 	qpriv->rcd = qp_to_rcd(rdi, qp);
 
@@ -278,15 +291,75 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 	qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
 	INIT_LIST_HEAD(&qpriv->tid_wait);
 
+	if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		struct hfi1_devdata *dd = qpriv->rcd->dd;
+
+		qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
+						sizeof(*qpriv->pages),
+					    GFP_KERNEL, dd->node);
+		if (!qpriv->pages)
+			return -ENOMEM;
+		for (i = 0; i < qp->s_size; i++) {
+			struct hfi1_swqe_priv *priv;
+			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+					    dd->node);
+			if (!priv)
+				return -ENOMEM;
+
+			hfi1_init_trdma_req(qp, &priv->tid_req);
+			priv->tid_req.e.swqe = wqe;
+			wqe->priv = priv;
+		}
+		for (i = 0; i < rvt_max_atomic(rdi); i++) {
+			struct hfi1_ack_priv *priv;
+
+			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+					    dd->node);
+			if (!priv)
+				return -ENOMEM;
+
+			hfi1_init_trdma_req(qp, &priv->tid_req);
+			priv->tid_req.e.ack = &qp->s_ack_queue[i];
+
+			ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
+							    GFP_KERNEL);
+			if (ret) {
+				kfree(priv);
+				return ret;
+			}
+			qp->s_ack_queue[i].priv = priv;
+		}
+	}
+
 	return 0;
 }
 
 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
-
-	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA))
-		cancel_work_sync(&priv->opfn.opfn_work);
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct rvt_swqe *wqe;
+	u32 i;
+
+	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		for (i = 0; i < qp->s_size; i++) {
+			wqe = rvt_get_swqe_ptr(qp, i);
+			kfree(wqe->priv);
+			wqe->priv = NULL;
+		}
+		for (i = 0; i < rvt_max_atomic(rdi); i++) {
+			struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
+
+			if (priv)
+				hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
+			kfree(priv);
+			qp->s_ack_queue[i].priv = NULL;
+		}
+		cancel_work_sync(&qpriv->opfn.opfn_work);
+		kfree(qpriv->pages);
+		qpriv->pages = NULL;
+	}
 }
 
 /* Flow and tid waiter functions */
@@ -540,6 +613,7 @@ void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
 	struct hfi1_qp_priv *priv = qp->priv;
 
 	_tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
+	_tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
 }
 
 /* Flow functions */
@@ -702,3 +776,796 @@ void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
 		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
 	}
 }
+
+/* TID allocation functions */
+static u8 trdma_pset_order(struct tid_rdma_pageset *s)
+{
+	u8 count = s->count;
+
+	return ilog2(count) + 1;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_4k - get groups base on mr info
+ * @npages - number of pages
+ * @pages - pointer to an array of page structs
+ * @list - page set array to return
+ *
+ * This routine returns the number of groups associated with
+ * the current sge information.  This implementation is based
+ * on the expected receive find_phys_blocks() adjusted to
+ * use the MR information vs. the pfn.
+ *
+ * Return:
+ * the number of RcvArray entries
+ */
+static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
+					struct page **pages,
+					u32 npages,
+					struct tid_rdma_pageset *list)
+{
+	u32 pagecount, pageidx, setcount = 0, i;
+	void *vaddr, *this_vaddr;
+
+	if (!npages)
+		return 0;
+
+	/*
+	 * Look for sets of physically contiguous pages in the user buffer.
+	 * This will allow us to optimize Expected RcvArray entry usage by
+	 * using the bigger supported sizes.
+	 */
+	vaddr = page_address(pages[0]);
+	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
+		/*
+		 * If the vaddr's are not sequential, pages are not physically
+		 * contiguous.
+		 */
+		if (this_vaddr != (vaddr + PAGE_SIZE)) {
+			/*
+			 * At this point we have to loop over the set of
+			 * physically contiguous pages and break them down it
+			 * sizes supported by the HW.
+			 * There are two main constraints:
+			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
+			 *        If the total set size is bigger than that
+			 *        program only a MAX_EXPECTED_BUFFER chunk.
+			 *     2. The buffer size has to be a power of two. If
+			 *        it is not, round down to the closes power of
+			 *        2 and program that size.
+			 */
+			while (pagecount) {
+				int maxpages = pagecount;
+				u32 bufsize = pagecount * PAGE_SIZE;
+
+				if (bufsize > MAX_EXPECTED_BUFFER)
+					maxpages =
+						MAX_EXPECTED_BUFFER >>
+						PAGE_SHIFT;
+				else if (!is_power_of_2(bufsize))
+					maxpages =
+						rounddown_pow_of_two(bufsize) >>
+						PAGE_SHIFT;
+
+				list[setcount].idx = pageidx;
+				list[setcount].count = maxpages;
+				pagecount -= maxpages;
+				pageidx += maxpages;
+				setcount++;
+			}
+			pageidx = i;
+			pagecount = 1;
+			vaddr = this_vaddr;
+		} else {
+			vaddr += PAGE_SIZE;
+			pagecount++;
+		}
+	}
+	/* insure we always return an even number of sets */
+	if (setcount & 1)
+		list[setcount++].count = 0;
+	return setcount;
+}
+
+/**
+ * tid_flush_pages - dump out pages into pagesets
+ * @list - list of pagesets
+ * @idx - pointer to current page index
+ * @pages - number of pages to dump
+ * @sets - current number of pagesset
+ *
+ * This routine flushes out accumuated pages.
+ *
+ * To insure an even number of sets the
+ * code may add a filler.
+ *
+ * This can happen with when pages is not
+ * a power of 2 or pages is a power of 2
+ * less than the maximum pages.
+ *
+ * Return:
+ * The new number of sets
+ */
+
+static u32 tid_flush_pages(struct tid_rdma_pageset *list,
+			   u32 *idx, u32 pages, u32 sets)
+{
+	while (pages) {
+		u32 maxpages = pages;
+
+		if (maxpages > MAX_EXPECTED_PAGES)
+			maxpages = MAX_EXPECTED_PAGES;
+		else if (!is_power_of_2(maxpages))
+			maxpages = rounddown_pow_of_two(maxpages);
+		list[sets].idx = *idx;
+		list[sets++].count = maxpages;
+		*idx += maxpages;
+		pages -= maxpages;
+	}
+	/* might need a filler */
+	if (sets & 1)
+		list[sets++].count = 0;
+	return sets;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_8k - get groups base on mr info
+ * @pages - pointer to an array of page structs
+ * @npages - number of pages
+ * @list - page set array to return
+ *
+ * This routine parses an array of pages to compute pagesets
+ * in an 8k compatible way.
+ *
+ * pages are tested two at a time, i, i + 1 for contiguous
+ * pages and i - 1 and i contiguous pages.
+ *
+ * If any condition is false, any accumlated pages are flushed and
+ * v0,v1 are emitted as separate PAGE_SIZE pagesets
+ *
+ * Otherwise, the current 8k is totaled for a future flush.
+ *
+ * Return:
+ * The number of pagesets
+ * list set with the returned number of pagesets
+ *
+ */
+static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
+					struct page **pages,
+					u32 npages,
+					struct tid_rdma_pageset *list)
+{
+	u32 idx, sets = 0, i;
+	u32 pagecnt = 0;
+	void *v0, *v1, *vm1;
+
+	if (!npages)
+		return 0;
+	for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
+		/* get a new v0 */
+		v0 = page_address(pages[i]);
+		v1 = i + 1 < npages ?
+				page_address(pages[i + 1]) : NULL;
+		/* compare i, i + 1 vaddr */
+		if (v1 != (v0 + PAGE_SIZE)) {
+			/* flush out pages */
+			sets = tid_flush_pages(list, &idx, pagecnt, sets);
+			/* output v0,v1 as two pagesets */
+			list[sets].idx = idx++;
+			list[sets++].count = 1;
+			if (v1) {
+				list[sets].count = 1;
+				list[sets++].idx = idx++;
+			} else {
+				list[sets++].count = 0;
+			}
+			vm1 = NULL;
+			pagecnt = 0;
+			continue;
+		}
+		/* i,i+1 consecutive, look at i-1,i */
+		if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
+			/* flush out pages */
+			sets = tid_flush_pages(list, &idx, pagecnt, sets);
+			pagecnt = 0;
+		}
+		/* pages will always be a multiple of 8k */
+		pagecnt += 2;
+		/* save i-1 */
+		vm1 = v1;
+		/* move to next pair */
+	}
+	/* dump residual pages at end */
+	sets = tid_flush_pages(list, &idx, npages - idx, sets);
+	/* by design cannot be odd sets */
+	WARN_ON(sets & 1);
+	return sets;
+}
+
+/**
+ * Find pages for one segment of a sge array represented by @ss. The function
+ * does not check the sge, the sge must have been checked for alignment with a
+ * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
+ * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
+ * copy maintained in @ss->sge, the original sge is not modified.
+ *
+ * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
+ * releasing the MR reference count at the same time. Otherwise, we'll "leak"
+ * references to the MR. This difference requires that we keep track of progress
+ * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
+ * structure.
+ */
+static u32 kern_find_pages(struct tid_rdma_flow *flow,
+			   struct page **pages,
+			   struct rvt_sge_state *ss, bool *last)
+{
+	struct tid_rdma_request *req = flow->req;
+	struct rvt_sge *sge = &ss->sge;
+	u32 length = flow->req->seg_len;
+	u32 len = PAGE_SIZE;
+	u32 i = 0;
+
+	while (length && req->isge < ss->num_sge) {
+		pages[i++] = virt_to_page(sge->vaddr);
+
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (!sge->sge_length) {
+			if (++req->isge < ss->num_sge)
+				*sge = ss->sg_list[req->isge - 1];
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= RVT_SEGSZ) {
+				++sge->m;
+				sge->n = 0;
+			}
+			sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+
+	flow->length = flow->req->seg_len - length;
+	*last = req->isge == ss->num_sge ? false : true;
+	return i;
+}
+
+static void dma_unmap_flow(struct tid_rdma_flow *flow)
+{
+	struct hfi1_devdata *dd;
+	int i;
+	struct tid_rdma_pageset *pset;
+
+	dd = flow->req->rcd->dd;
+	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+			i++, pset++) {
+		if (pset->count && pset->addr) {
+			dma_unmap_page(&dd->pcidev->dev,
+				       pset->addr,
+				       PAGE_SIZE * pset->count,
+				       DMA_FROM_DEVICE);
+			pset->mapped = 0;
+		}
+	}
+}
+
+static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
+{
+	int i;
+	struct hfi1_devdata *dd = flow->req->rcd->dd;
+	struct tid_rdma_pageset *pset;
+
+	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+			i++, pset++) {
+		if (pset->count) {
+			pset->addr = dma_map_page(&dd->pcidev->dev,
+						  pages[pset->idx],
+						  0,
+						  PAGE_SIZE * pset->count,
+						  DMA_FROM_DEVICE);
+
+			if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
+				dma_unmap_flow(flow);
+				return -ENOMEM;
+			}
+			pset->mapped = 1;
+		}
+	}
+	return 0;
+}
+
+static inline bool dma_mapped(struct tid_rdma_flow *flow)
+{
+	return !!flow->pagesets[0].mapped;
+}
+
+/*
+ * Get pages pointers and identify contiguous physical memory chunks for a
+ * segment. All segments are of length flow->req->seg_len.
+ */
+static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
+				struct page **pages,
+				struct rvt_sge_state *ss, bool *last)
+{
+	u8 npages;
+
+	/* Reuse previously computed pagesets, if any */
+	if (flow->npagesets) {
+		if (!dma_mapped(flow))
+			return dma_map_flow(flow, pages);
+		return 0;
+	}
+
+	npages = kern_find_pages(flow, pages, ss, last);
+
+	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
+		flow->npagesets =
+			tid_rdma_find_phys_blocks_4k(flow, pages, npages,
+						     flow->pagesets);
+	else
+		flow->npagesets =
+			tid_rdma_find_phys_blocks_8k(flow, pages, npages,
+						     flow->pagesets);
+
+	return dma_map_flow(flow, pages);
+}
+
+static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
+				     struct hfi1_ctxtdata *rcd, char *s,
+				     struct tid_group *grp, u8 cnt)
+{
+	struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
+
+	WARN_ON_ONCE(flow->tnode_cnt >=
+		     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
+	if (WARN_ON_ONCE(cnt & 1))
+		dd_dev_err(rcd->dd,
+			   "unexpected odd allocation cnt %u map 0x%x used %u",
+			   cnt, grp->map, grp->used);
+
+	node->grp = grp;
+	node->map = grp->map;
+	node->cnt = cnt;
+}
+
+/*
+ * Try to allocate pageset_count TID's from TID groups for a context
+ *
+ * This function allocates TID's without moving groups between lists or
+ * modifying grp->map. This is done as follows, being cogizant of the lists
+ * between which the TID groups will move:
+ * 1. First allocate complete groups of 8 TID's since this is more efficient,
+ *    these groups will move from group->full without affecting used
+ * 2. If more TID's are needed allocate from used (will move from used->full or
+ *    stay in used)
+ * 3. If we still don't have the required number of TID's go back and look again
+ *    at a complete group (will move from group->used)
+ */
+static int kern_alloc_tids(struct tid_rdma_flow *flow)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	u32 ngroups, pageidx = 0;
+	struct tid_group *group = NULL, *used;
+	u8 use;
+
+	flow->tnode_cnt = 0;
+	ngroups = flow->npagesets / dd->rcv_entries.group_size;
+	if (!ngroups)
+		goto used_list;
+
+	/* First look at complete groups */
+	list_for_each_entry(group,  &rcd->tid_group_list.list, list) {
+		kern_add_tid_node(flow, rcd, "complete groups", group,
+				  group->size);
+
+		pageidx += group->size;
+		if (!--ngroups)
+			break;
+	}
+
+	if (pageidx >= flow->npagesets)
+		goto ok;
+
+used_list:
+	/* Now look at partially used groups */
+	list_for_each_entry(used, &rcd->tid_used_list.list, list) {
+		use = min_t(u32, flow->npagesets - pageidx,
+			    used->size - used->used);
+		kern_add_tid_node(flow, rcd, "used groups", used, use);
+
+		pageidx += use;
+		if (pageidx >= flow->npagesets)
+			goto ok;
+	}
+
+	/*
+	 * Look again at a complete group, continuing from where we left.
+	 * However, if we are at the head, we have reached the end of the
+	 * complete groups list from the first loop above
+	 */
+	if (group && &group->list == &rcd->tid_group_list.list)
+		goto bail_eagain;
+	group = list_prepare_entry(group, &rcd->tid_group_list.list,
+				   list);
+	if (list_is_last(&group->list, &rcd->tid_group_list.list))
+		goto bail_eagain;
+	group = list_next_entry(group, list);
+	use = min_t(u32, flow->npagesets - pageidx, group->size);
+	kern_add_tid_node(flow, rcd, "complete continue", group, use);
+	pageidx += use;
+	if (pageidx >= flow->npagesets)
+		goto ok;
+bail_eagain:
+	return -EAGAIN;
+ok:
+	return 0;
+}
+
+static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
+				   u32 *pset_idx)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *node = &flow->tnode[grp_num];
+	struct tid_group *grp = node->grp;
+	struct tid_rdma_pageset *pset;
+	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
+	u32 rcventry, npages = 0, pair = 0, tidctrl;
+	u8 i, cnt = 0;
+
+	for (i = 0; i < grp->size; i++) {
+		rcventry = grp->base + i;
+
+		if (node->map & BIT(i) || cnt >= node->cnt) {
+			rcv_array_wc_fill(dd, rcventry);
+			continue;
+		}
+		pset = &flow->pagesets[(*pset_idx)++];
+		if (pset->count) {
+			hfi1_put_tid(dd, rcventry, PT_EXPECTED,
+				     pset->addr, trdma_pset_order(pset));
+		} else {
+			hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+		}
+		npages += pset->count;
+
+		rcventry -= rcd->expected_base;
+		tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
+		/*
+		 * A single TID entry will be used to use a rcvarr pair (with
+		 * tidctrl 0x3), if ALL these are true (a) the bit pos is even
+		 * (b) the group map shows current and the next bits as free
+		 * indicating two consecutive rcvarry entries are available (c)
+		 * we actually need 2 more entries
+		 */
+		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
+			node->cnt >= cnt + 2;
+		if (!pair) {
+			if (!pset->count)
+				tidctrl = 0x1;
+			flow->tid_entry[flow->tidcnt++] =
+				EXP_TID_SET(IDX, rcventry >> 1) |
+				EXP_TID_SET(CTRL, tidctrl) |
+				EXP_TID_SET(LEN, npages);
+			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
+			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
+			npages = 0;
+		}
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_full_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_group_list,
+				       &rcd->tid_used_list);
+
+		grp->used++;
+		grp->map |= BIT(i);
+		cnt++;
+	}
+}
+
+static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *node = &flow->tnode[grp_num];
+	struct tid_group *grp = node->grp;
+	u32 rcventry;
+	u8 i, cnt = 0;
+
+	for (i = 0; i < grp->size; i++) {
+		rcventry = grp->base + i;
+
+		if (node->map & BIT(i) || cnt >= node->cnt) {
+			rcv_array_wc_fill(dd, rcventry);
+			continue;
+		}
+
+		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+
+		grp->used--;
+		grp->map &= ~BIT(i);
+		cnt++;
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_full_list,
+				       &rcd->tid_used_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_group_list);
+	}
+	if (WARN_ON_ONCE(cnt & 1)) {
+		struct hfi1_ctxtdata *rcd = flow->req->rcd;
+		struct hfi1_devdata *dd = rcd->dd;
+
+		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
+			   cnt, grp->map, grp->used);
+	}
+}
+
+static void kern_program_rcvarray(struct tid_rdma_flow *flow)
+{
+	u32 pset_idx = 0;
+	int i;
+
+	flow->npkts = 0;
+	flow->tidcnt = 0;
+	for (i = 0; i < flow->tnode_cnt; i++)
+		kern_program_rcv_group(flow, i, &pset_idx);
+}
+
+/**
+ * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
+ * TID RDMA request
+ *
+ * @req: TID RDMA request for which the segment/flow is being set up
+ * @ss: sge state, maintains state across successive segments of a sge
+ * @last: set to true after the last sge segment has been processed
+ *
+ * This function
+ * (1) finds a free flow entry in the flow circular buffer
+ * (2) finds pages and continuous physical chunks constituing one segment
+ *     of an sge
+ * (3) allocates TID group entries for those chunks
+ * (4) programs rcvarray entries in the hardware corresponding to those
+ *     TID's
+ * (5) computes a tidarray with formatted TID entries which can be sent
+ *     to the sender
+ * (6) Reserves and programs HW flows.
+ * (7) It also manages queing the QP when TID/flow resources are not
+ *     available.
+ *
+ * @req points to struct tid_rdma_request of which the segments are a part. The
+ * function uses qp, rcd and seg_len members of @req. In the absence of errors,
+ * req->flow_idx is the index of the flow which has been prepared in this
+ * invocation of function call. With flow = &req->flows[req->flow_idx],
+ * flow->tid_entry contains the TID array which the sender can use for TID RDMA
+ * sends and flow->npkts contains number of packets required to send the
+ * segment.
+ *
+ * hfi1_check_sge_align should be called prior to calling this function and if
+ * it signals error TID RDMA cannot be used for this sge and this function
+ * should not be called.
+ *
+ * For the queuing, caller must hold the flow->req->qp s_lock from the send
+ * engine and the function will procure the exp_lock.
+ *
+ * Return:
+ * The function returns -EAGAIN if sufficient number of TID/flow resources to
+ * map the segment could not be allocated. In this case the function should be
+ * called again with previous arguments to retry the TID allocation. There are
+ * no other error returns. The function returns 0 on success.
+ */
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+			    struct rvt_sge_state *ss, bool *last)
+	__must_hold(&req->qp->s_lock)
+{
+	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
+	struct hfi1_ctxtdata *rcd = req->rcd;
+	struct hfi1_qp_priv *qpriv = req->qp->priv;
+	unsigned long flags;
+	struct rvt_qp *fqp;
+	u16 clear_tail = req->clear_tail;
+
+	lockdep_assert_held(&req->qp->s_lock);
+	/*
+	 * We return error if either (a) we don't have space in the flow
+	 * circular buffer, or (b) we already have max entries in the buffer.
+	 * Max entries depend on the type of request we are processing and the
+	 * negotiated TID RDMA parameters.
+	 */
+	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
+	    CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
+	    req->n_flows)
+		return -EINVAL;
+
+	/*
+	 * Get pages, identify contiguous physical memory chunks for the segment
+	 * If we can not determine a DMA address mapping we will treat it just
+	 * like if we ran out of space above.
+	 */
+	if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
+		hfi1_wait_kmem(flow->req->qp);
+		return -ENOMEM;
+	}
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+	if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
+		goto queue;
+
+	/*
+	 * At this point we know the number of pagesets and hence the number of
+	 * TID's to map the segment. Allocate the TID's from the TID groups. If
+	 * we cannot allocate the required number we exit and try again later
+	 */
+	if (kern_alloc_tids(flow))
+		goto queue;
+	/*
+	 * Finally program the TID entries with the pagesets, compute the
+	 * tidarray and enable the HW flow
+	 */
+	kern_program_rcvarray(flow);
+
+	/*
+	 * Setup the flow state with relevant information.
+	 * This information is used for tracking the sequence of data packets
+	 * for the segment.
+	 * The flow is setup here as this is the most accurate time and place
+	 * to do so. Doing at a later time runs the risk of the flow data in
+	 * qpriv getting out of sync.
+	 */
+	memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
+	flow->idx = qpriv->flow_state.index;
+	flow->flow_state.generation = qpriv->flow_state.generation;
+	flow->flow_state.spsn = qpriv->flow_state.psn;
+	flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
+	flow->flow_state.r_next_psn =
+		full_flow_psn(flow, flow->flow_state.spsn);
+	qpriv->flow_state.psn += flow->npkts;
+
+	dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	tid_rdma_schedule_tid_wakeup(fqp);
+
+	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+	return 0;
+queue:
+	queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	return -EAGAIN;
+}
+
+static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
+{
+	flow->npagesets = 0;
+}
+
+/*
+ * This function is called after one segment has been successfully sent to
+ * release the flow and TID HW/SW resources for that segment. The segments for a
+ * TID RDMA request are setup and cleared in FIFO order which is managed using a
+ * circular buffer.
+ */
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+	struct hfi1_ctxtdata *rcd = req->rcd;
+	unsigned long flags;
+	int i;
+	struct rvt_qp *fqp;
+
+	lockdep_assert_held(&req->qp->s_lock);
+	/* Exit if we have nothing in the flow circular buffer */
+	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+		return -EINVAL;
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+
+	for (i = 0; i < flow->tnode_cnt; i++)
+		kern_unprogram_rcv_group(flow, i);
+	/* To prevent double unprogramming */
+	flow->tnode_cnt = 0;
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+	dma_unmap_flow(flow);
+
+	hfi1_tid_rdma_reset_flow(flow);
+	req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
+
+	if (fqp == req->qp) {
+		__trigger_tid_waiter(fqp);
+		rvt_put_qp(fqp);
+	} else {
+		tid_rdma_schedule_tid_wakeup(fqp);
+	}
+
+	return 0;
+}
+
+/*
+ * This function is called to release all the tid entries for
+ * a request.
+ */
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	/* Use memory barrier for proper ordering */
+	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
+		if (hfi1_kern_exp_rcv_clear(req))
+			break;
+	}
+}
+
+/**
+ * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information
+ * @req - the tid rdma request to be cleaned
+ */
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
+{
+	kfree(req->flows);
+	req->flows = NULL;
+}
+
+/**
+ * __trdma_clean_swqe - clean up for large sized QPs
+ * @qp: the queue patch
+ * @wqe: the send wqe
+ */
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct hfi1_swqe_priv *p = wqe->priv;
+
+	hfi1_kern_exp_rcv_free_flows(&p->tid_req);
+}
+
+/*
+ * This can be called at QP create time or in the data path.
+ */
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+					 gfp_t gfp)
+{
+	struct tid_rdma_flow *flows;
+	int i;
+
+	if (likely(req->flows))
+		return 0;
+	flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
+			     req->rcd->numa_id);
+	if (!flows)
+		return -ENOMEM;
+	/* mini init */
+	for (i = 0; i < MAX_FLOWS; i++) {
+		flows[i].req = req;
+		flows[i].npagesets = 0;
+		flows[i].pagesets[0].mapped =  0;
+	}
+	req->flows = flows;
+	return 0;
+}
+
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+				struct tid_rdma_request *req)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	/*
+	 * Initialize various TID RDMA request variables.
+	 * These variables are "static", which is why they
+	 * can be pre-initialized here before the WRs has
+	 * even been submitted.
+	 * However, non-NULL values for these variables do not
+	 * imply that this WQE has been enabled for TID RDMA.
+	 * Drivers should check the WQE's opcode to determine
+	 * if a request is a TID RDMA one or not.
+	 */
+	req->qp = qp;
+	req->rcd = qpriv->rcd;
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index 3bc0aaf9568f..524baf8c8fac 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -6,7 +6,16 @@
 #ifndef HFI1_TID_RDMA_H
 #define HFI1_TID_RDMA_H
 
+#include <linux/circ_buf.h>
+#include "common.h"
+
+/* Add a convenience helper */
+#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
+#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size)
+#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size)
+
 #define TID_RDMA_MAX_SEGMENT_SIZE       BIT(18)   /* 256 KiB (for now) */
+#define TID_RDMA_MAX_PAGES              (BIT(18) >> PAGE_SHIFT)
 
 struct tid_rdma_params {
 	struct rcu_head rcu_head;
@@ -36,6 +45,81 @@ struct tid_flow_state {
 	u8 flags;
 };
 
+struct tid_rdma_request {
+	struct rvt_qp *qp;
+	struct hfi1_ctxtdata *rcd;
+	union {
+		struct rvt_swqe *swqe;
+		struct rvt_ack_entry *ack;
+	} e;
+
+	struct tid_rdma_flow *flows;	/* array of tid flows */
+	u16 n_flows;		/* size of the flow buffer window */
+	u16 setup_head;		/* flow index we are setting up */
+	u16 clear_tail;		/* flow index we are clearing */
+	u16 flow_idx;		/* flow index most recently set up */
+
+	u32 seg_len;
+
+	u32 isge;		/* index of "current" sge */
+};
+
+/*
+ * When header suppression is used, PSNs associated with a "flow" are
+ * relevant (and not the PSNs maintained by verbs). Track per-flow
+ * PSNs here for a TID RDMA segment.
+ *
+ */
+struct flow_state {
+	u32 flags;
+	u32 resp_ib_psn;     /* The IB PSN of the response for this flow */
+	u32 generation;      /* generation of flow */
+	u32 spsn;            /* starting PSN in TID space */
+	u32 lpsn;            /* last PSN in TID space */
+	u32 r_next_psn;      /* next PSN to be received (in TID space) */
+};
+
+struct tid_rdma_pageset {
+	dma_addr_t addr : 48; /* Only needed for the first page */
+	u8 idx: 8;
+	u8 count : 7;
+	u8 mapped: 1;
+};
+
+/**
+ * kern_tid_node - used for managing TID's in TID groups
+ *
+ * @grp_idx: rcd relative index to tid_group
+ * @map: grp->map captured prior to programming this TID group in HW
+ * @cnt: Only @cnt of available group entries are actually programmed
+ */
+struct kern_tid_node {
+	struct tid_group *grp;
+	u8 map;
+	u8 cnt;
+};
+
+/* Overall info for a TID RDMA segment */
+struct tid_rdma_flow {
+	/*
+	 * While a TID RDMA segment is being transferred, it uses a QP number
+	 * from the "KDETH section of QP numbers" (which is different from the
+	 * QP number that originated the request). Bits 11-15 of these QP
+	 * numbers identify the "TID flow" for the segment.
+	 */
+	struct flow_state flow_state;
+	struct tid_rdma_request *req;
+	u32 length;
+	u8 tnode_cnt;
+	u8 tidcnt;
+	u8 idx;
+	u8 npagesets;
+	u8 npkts;
+	struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
+	struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
+	u32 tid_entry[TID_RDMA_MAX_PAGES];
+};
+
 bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
 bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
 bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
@@ -43,6 +127,23 @@ void tid_rdma_conn_error(struct rvt_qp *qp);
 void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
 
 int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+			    struct rvt_sge_state *ss, bool *last);
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req);
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req);
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+/**
+ * trdma_clean_swqe - clean flows for swqe if large send queue
+ * @qp: the qp
+ * @wqe: the send wqe
+ */
+static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	if (!wqe->priv)
+		return;
+	__trdma_clean_swqe(qp, wqe);
+}
 
 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 		      struct ib_qp_init_attr *init_attr);
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index e383cc01a2bf..43b105de1d54 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -48,7 +48,6 @@
  */
 
 #include "hfi.h"
-
 #include "exp_rcv.h"
 
 struct tid_pageset {
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 571bfd549c2a..02c1873a976c 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -504,11 +504,28 @@ static void verbs_sdma_complete(
 	hfi1_put_txreq(tx);
 }
 
+void hfi1_wait_kmem(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct ib_qp *ibqp = &qp->ibqp;
+	struct ib_device *ibdev = ibqp->device;
+	struct hfi1_ibdev *dev = to_idev(ibdev);
+
+	if (list_empty(&priv->s_iowait.list)) {
+		if (list_empty(&dev->memwait))
+			mod_timer(&dev->mem_timer, jiffies + 1);
+		qp->s_flags |= RVT_S_WAIT_KMEM;
+		list_add_tail(&priv->s_iowait.list, &dev->memwait);
+		priv->s_iowait.lock = &dev->iowait_lock;
+		trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+		rvt_get_qp(qp);
+	}
+}
+
 static int wait_kmem(struct hfi1_ibdev *dev,
 		     struct rvt_qp *qp,
 		     struct hfi1_pkt_state *ps)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
 	unsigned long flags;
 	int ret = 0;
 
@@ -517,15 +534,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
 		write_seqlock(&dev->iowait_lock);
 		list_add_tail(&ps->s_txreq->txreq.list,
 			      &ps->wait->tx_head);
-		if (list_empty(&priv->s_iowait.list)) {
-			if (list_empty(&dev->memwait))
-				mod_timer(&dev->mem_timer, jiffies + 1);
-			qp->s_flags |= RVT_S_WAIT_KMEM;
-			list_add_tail(&priv->s_iowait.list, &dev->memwait);
-			priv->s_iowait.lock = &dev->iowait_lock;
-			trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
-			rvt_get_qp(qp);
-		}
+		hfi1_wait_kmem(qp);
 		write_sequnlock(&dev->iowait_lock);
 		hfi1_qp_unbusy(qp, ps->wait);
 		ret = -EBUSY;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 9065e470bebb..94f198b47239 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -159,6 +159,7 @@ struct hfi1_qp_priv {
 	struct sdma_engine *s_sde;                /* current sde */
 	struct send_context *s_sendcontext;       /* current sendcontext */
 	struct hfi1_ctxtdata *rcd;                /* QP's receive context */
+	struct page **pages;                      /* for TID page scan */
 	u32 tid_enqueue;                          /* saved when tid waited */
 	u8 s_sc;		                  /* SC[0..4] for next packet */
 	struct iowait s_iowait;
@@ -173,6 +174,14 @@ struct hfi1_qp_priv {
 	u8 timeout_shift;       /* account for number of packets per segment */
 };
 
+struct hfi1_swqe_priv {
+	struct tid_rdma_request tid_req;
+};
+
+struct hfi1_ack_priv {
+	struct tid_rdma_request tid_req;
+};
+
 /*
  * This structure is used to hold commonly lookedup and computed values during
  * the send engine progress.
@@ -321,6 +330,21 @@ static inline u32 delta_psn(u32 a, u32 b)
 	return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
 }
 
+/*
+ * Look through all the active flows for a TID RDMA request and find
+ * the one (if it exists) that contains the specified PSN.
+ */
+static inline u32 __full_flow_psn(struct flow_state *state, u32 psn)
+{
+	return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+			(psn & HFI1_KDETH_BTH_SEQ_MASK));
+}
+
+static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn)
+{
+	return __full_flow_psn(&flow->flow_state, psn);
+}
+
 struct verbs_txreq;
 void hfi1_put_txreq(struct verbs_txreq *tx);
 
@@ -403,6 +427,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
 	return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);
 }
 
+void hfi1_wait_kmem(struct rvt_qp *qp);
+
+static inline void hfi1_trdma_send_complete(struct rvt_qp *qp,
+					    struct rvt_swqe *wqe,
+					    enum ib_wc_status status)
+{
+	trdma_clean_swqe(qp, wqe);
+	rvt_send_complete(qp, wqe, status);
+}
+
 extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
 
 extern const u8 hdr_len_by_opcode[];
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 16247d2a671d..c8e70cf69a8a 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1642,11 +1642,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
 		kref_put(&qp->ip->ref, rvt_release_mmap_info);
 	else
 		vfree(qp->r_rq.wq);
-	vfree(qp->s_wq);
 	rdi->driver_f.qp_priv_free(rdi, qp);
 	kfree(qp->s_ack_queue);
 	rdma_destroy_ah_attr(&qp->remote_ah_attr);
 	rdma_destroy_ah_attr(&qp->alt_ah_attr);
+	vfree(qp->s_wq);
 	kfree(qp);
 	return 0;
 }
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 56a9221378d9..9095a0b71250 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -174,6 +174,7 @@ struct rvt_swqe {
 	u32 lpsn;               /* last packet sequence number */
 	u32 ssn;                /* send sequence number */
 	u32 length;             /* total length of data in sg_list */
+	void *priv;             /* driver dependent field */
 	struct rvt_sge sg_list[0];
 };
 
@@ -235,6 +236,7 @@ struct rvt_ack_entry {
 	u32 lpsn;
 	u8 opcode;
 	u8 sent;
+	void *priv;
 };
 
 #define	RC_QP_SCALING_INTERVAL	5
-- 
cgit v1.2.3


From 742a3826cf82395e304df99f6494d04b0dd03a84 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 23 Jan 2019 19:30:40 -0800
Subject: IB/hfi1: Add functions to build TID RDMA READ request

This patch adds the helper functions to build the TID RDMA READ request
on the requester side. The key is to allocate TID resources (TID flow
and TID entries) and send the resource information to the responder side
along with the read request. Since the TID resources are limited, each
TID RDMA READ request has to be split into segments with a default
segment size of 256K. A software flow is allocated to track the data
transaction for each segment. The work request opcode, packet opcode, and
packet formats for TID RDMA READ protocol are also defined in this patch.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/tid_rdma.c | 200 ++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/tid_rdma.h |  32 ++++++
 drivers/infiniband/hw/hfi1/verbs.h    |   9 ++
 include/rdma/ib_hdrs.h                |   9 +-
 include/rdma/tid_rdma_defs.h          |  52 +++++++++
 5 files changed, 301 insertions(+), 1 deletion(-)
 create mode 100644 include/rdma/tid_rdma_defs.h

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 506b5a59ded5..56c8c10b5a85 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -6,11 +6,27 @@
 
 #include "hfi.h"
 #include "qp.h"
+#include "rc.h"
 #include "verbs.h"
 #include "tid_rdma.h"
 #include "exp_rcv.h"
 #include "trace.h"
 
+/**
+ * DOC: TID RDMA READ protocol
+ *
+ * This is an end-to-end protocol at the hfi1 level between two nodes that
+ * improves performance by avoiding data copy on the requester side. It
+ * converts a qualified RDMA READ request into a TID RDMA READ request on
+ * the requester side and thereafter handles the request and response
+ * differently. To be qualified, the RDMA READ request should meet the
+ * following:
+ * -- The total data length should be greater than 256K;
+ * -- The total data length should be a multiple of 4K page size;
+ * -- Each local scatter-gather entry should be 4K page aligned;
+ * -- Each local scatter-gather entry should be a multiple of 4K page size;
+ */
+
 #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
 #define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
@@ -18,6 +34,9 @@
 #define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
 #define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
 
+/* Maximum number of packets within a flow generation. */
+#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
+
 #define GENERATION_MASK 0xFFFFF
 
 static u32 mask_generation(u32 a)
@@ -45,6 +64,9 @@ static u32 mask_generation(u32 a)
 
 #define MAX_EXPECTED_PAGES     (MAX_EXPECTED_BUFFER / PAGE_SIZE)
 
+#define TID_RDMA_DESTQP_FLOW_SHIFT      11
+#define TID_RDMA_DESTQP_FLOW_MASK       0x1f
+
 #define TID_OPFN_QP_CTXT_MASK 0xff
 #define TID_OPFN_QP_CTXT_SHIFT 56
 #define TID_OPFN_QP_KDETH_MASK 0xff
@@ -1597,3 +1619,181 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
 
 	return dd->verbs_dev.n_tidwait;
 }
+
+/* TID RDMA READ functions */
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+				    struct ib_other_headers *ohdr, u32 *bth1,
+				    u32 *bth2, u32 *len)
+{
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
+	struct rvt_qp *qp = req->qp;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct hfi1_swqe_priv *wpriv = wqe->priv;
+	struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
+	struct tid_rdma_params *remote;
+	u32 req_len = 0;
+	void *req_addr = NULL;
+
+	/* This is the IB psn used to send the request */
+	*bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
+
+	/* TID Entries for TID RDMA READ payload */
+	req_addr = &flow->tid_entry[flow->tid_idx];
+	req_len = sizeof(*flow->tid_entry) *
+			(flow->tidcnt - flow->tid_idx);
+
+	memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
+	wpriv->ss.sge.vaddr = req_addr;
+	wpriv->ss.sge.sge_length = req_len;
+	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
+	/*
+	 * We can safely zero these out. Since the first SGE covers the
+	 * entire packet, nothing else should even look at the MR.
+	 */
+	wpriv->ss.sge.mr = NULL;
+	wpriv->ss.sge.m = 0;
+	wpriv->ss.sge.n = 0;
+
+	wpriv->ss.sg_list = NULL;
+	wpriv->ss.total_len = wpriv->ss.sge.sge_length;
+	wpriv->ss.num_sge = 1;
+
+	/* Construct the TID RDMA READ REQ packet header */
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+	KDETH_RESET(rreq->kdeth0, KVER, 0x1);
+	KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
+	rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
+			   req->cur_seg * req->seg_len + flow->sent);
+	rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+	rreq->reth.length = cpu_to_be32(*len);
+	rreq->tid_flow_psn =
+		cpu_to_be32((flow->flow_state.generation <<
+			     HFI1_KDETH_BTH_SEQ_SHIFT) |
+			    ((flow->flow_state.spsn + flow->pkt) &
+			     HFI1_KDETH_BTH_SEQ_MASK));
+	rreq->tid_flow_qp =
+		cpu_to_be32(qpriv->tid_rdma.local.qp |
+			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+			     TID_RDMA_DESTQP_FLOW_SHIFT) |
+			    qpriv->rcd->ctxt);
+	rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
+	*bth1 &= ~RVT_QPN_MASK;
+	*bth1 |= remote->qp;
+	*bth2 |= IB_BTH_REQ_ACK;
+	rcu_read_unlock();
+
+	/* We are done with this segment */
+	flow->sent += *len;
+	req->cur_seg++;
+	qp->s_state = TID_OP(READ_REQ);
+	req->ack_pending++;
+	req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
+	qpriv->pending_tid_r_segs++;
+	qp->s_num_rd_atomic++;
+
+	/* Set the TID RDMA READ request payload size */
+	*len = req_len;
+
+	return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
+}
+
+/*
+ * @len: contains the data length to read upon entry and the read request
+ *       payload length upon exit.
+ */
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				 struct ib_other_headers *ohdr, u32 *bth1,
+				 u32 *bth2, u32 *len)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow = NULL;
+	u32 hdwords = 0;
+	bool last;
+	bool retry = true;
+	u32 npkts = rvt_div_round_up_mtu(qp, *len);
+
+	/*
+	 * Check sync conditions. Make sure that there are no pending
+	 * segments before freeing the flow.
+	 */
+sync_check:
+	if (req->state == TID_REQUEST_SYNC) {
+		if (qpriv->pending_tid_r_segs)
+			goto done;
+
+		hfi1_kern_clear_hw_flow(req->rcd, qp);
+		req->state = TID_REQUEST_ACTIVE;
+	}
+
+	/*
+	 * If the request for this segment is resent, the tid resources should
+	 * have been allocated before. In this case, req->flow_idx should
+	 * fall behind req->setup_head.
+	 */
+	if (req->flow_idx == req->setup_head) {
+		retry = false;
+		if (req->state == TID_REQUEST_RESEND) {
+			/*
+			 * This is the first new segment for a request whose
+			 * earlier segments have been re-sent. We need to
+			 * set up the sge pointer correctly.
+			 */
+			restart_sge(&qp->s_sge, wqe, req->s_next_psn,
+				    qp->pmtu);
+			req->isge = 0;
+			req->state = TID_REQUEST_ACTIVE;
+		}
+
+		/*
+		 * Check sync. The last PSN of each generation is reserved for
+		 * RESYNC.
+		 */
+		if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
+			req->state = TID_REQUEST_SYNC;
+			goto sync_check;
+		}
+
+		/* Allocate the flow if not yet */
+		if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
+			goto done;
+
+		/*
+		 * The following call will advance req->setup_head after
+		 * allocating the tid entries.
+		 */
+		if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
+			req->state = TID_REQUEST_QUEUED;
+
+			/*
+			 * We don't have resources for this segment. The QP has
+			 * already been queued.
+			 */
+			goto done;
+		}
+	}
+
+	/* req->flow_idx should only be one slot behind req->setup_head */
+	flow = &req->flows[req->flow_idx];
+	flow->pkt = 0;
+	flow->tid_idx = 0;
+	flow->sent = 0;
+	if (!retry) {
+		/* Set the first and last IB PSN for the flow in use.*/
+		flow->flow_state.ib_spsn = req->s_next_psn;
+		flow->flow_state.ib_lpsn =
+			flow->flow_state.ib_spsn + flow->npkts - 1;
+	}
+
+	/* Calculate the next segment start psn.*/
+	req->s_next_psn += flow->npkts;
+
+	/* Build the packet header */
+	hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
+done:
+	return hdwords;
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index 3dbeaa8cb5b3..f692f3ff9419 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -45,6 +45,19 @@ struct tid_flow_state {
 	u8 flags;
 };
 
+enum tid_rdma_req_state {
+	TID_REQUEST_INACTIVE = 0,
+	TID_REQUEST_INIT,
+	TID_REQUEST_INIT_RESEND,
+	TID_REQUEST_ACTIVE,
+	TID_REQUEST_RESEND,
+	TID_REQUEST_RESEND_ACTIVE,
+	TID_REQUEST_QUEUED,
+	TID_REQUEST_SYNC,
+	TID_REQUEST_RNR_NAK,
+	TID_REQUEST_COMPLETE,
+};
+
 struct tid_rdma_request {
 	struct rvt_qp *qp;
 	struct hfi1_ctxtdata *rcd;
@@ -60,8 +73,13 @@ struct tid_rdma_request {
 	u16 flow_idx;		/* flow index most recently set up */
 
 	u32 seg_len;
+	u32 s_next_psn;		/* IB PSN of next segment start for read */
 
+	u32 cur_seg;		/* index of current segment */
 	u32 isge;		/* index of "current" sge */
+	u32 ack_pending;        /* num acks pending for this request */
+
+	enum tid_rdma_req_state state;
 };
 
 /*
@@ -77,6 +95,10 @@ struct flow_state {
 	u32 spsn;            /* starting PSN in TID space */
 	u32 lpsn;            /* last PSN in TID space */
 	u32 r_next_psn;      /* next PSN to be received (in TID space) */
+
+	/* For tid rdma read */
+	u32 ib_spsn;         /* starting PSN in Verbs space */
+	u32 ib_lpsn;         /* last PSn in Verbs space */
 };
 
 struct tid_rdma_pageset {
@@ -110,11 +132,14 @@ struct tid_rdma_flow {
 	struct flow_state flow_state;
 	struct tid_rdma_request *req;
 	u32 length;
+	u32 sent;
 	u8 tnode_cnt;
 	u8 tidcnt;
+	u8 tid_idx;
 	u8 idx;
 	u8 npagesets;
 	u8 npkts;
+	u8 pkt;
 	struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
 	struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
 	u32 tid_entry[TID_RDMA_MAX_PAGES];
@@ -159,4 +184,11 @@ struct cntr_entry;
 u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
 			    void *context, int vl, int mode, u64 data);
 
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+				    struct ib_other_headers *ohdr,
+				    u32 *bth1, u32 *bth2, u32 *len);
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				 struct ib_other_headers *ohdr, u32 *bth1,
+				 u32 *bth2, u32 *len);
+
 #endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 20729454f181..2965b0957855 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -170,12 +170,16 @@ struct hfi1_qp_priv {
 	struct rvt_qp *owner;
 	u8 hdr_type; /* 9B or 16B */
 	unsigned long tid_timer_timeout_jiffies;
+
+	/* For TID RDMA READ */
+	u32 pending_tid_r_segs; /* Num of pending tid read segments */
 	u16 pkts_ps;            /* packets per segment */
 	u8 timeout_shift;       /* account for number of packets per segment */
 };
 
 struct hfi1_swqe_priv {
 	struct tid_rdma_request tid_req;
+	struct rvt_sge_state ss;  /* Used for TID RDMA READ Request */
 };
 
 struct hfi1_ack_priv {
@@ -331,6 +335,11 @@ static inline u32 delta_psn(u32 a, u32 b)
 	return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
 }
 
+static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe)
+{
+	return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req;
+}
+
 /*
  * Look through all the active flows for a TID RDMA request and find
  * the one (if it exists) that contains the specified PSN.
diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h
index 6e35416170a3..58a0a0f99e7f 100644
--- a/include/rdma/ib_hdrs.h
+++ b/include/rdma/ib_hdrs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -100,6 +100,8 @@ struct ib_atomic_eth {
 	__be64 compare_data; /* potentially unaligned */
 } __packed;
 
+#include <rdma/tid_rdma_defs.h>
+
 union ib_ehdrs {
 	struct {
 		__be32 deth[2];
@@ -117,6 +119,11 @@ union ib_ehdrs {
 	__be32 aeth;
 	__be32 ieth;
 	struct ib_atomic_eth atomic_eth;
+	/* TID RDMA headers */
+	union {
+		struct tid_rdma_read_req r_req;
+		struct tid_rdma_read_resp r_rsp;
+	} tid_rdma;
 }  __packed;
 
 struct ib_other_headers {
diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h
new file mode 100644
index 000000000000..1c431ea32b52
--- /dev/null
+++ b/include/rdma/tid_rdma_defs.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef TID_RDMA_DEFS_H
+#define TID_RDMA_DEFS_H
+
+#include <rdma/ib_pack.h>
+
+struct tid_rdma_read_req {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	struct ib_reth reth;
+	__be32 tid_flow_psn;
+	__be32 tid_flow_qp;
+	__be32 verbs_qp;
+};
+
+struct tid_rdma_read_resp {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	__be32 aeth;
+	__be32 reserved[4];
+	__be32 verbs_psn;
+	__be32 verbs_qp;
+};
+
+/*
+ * TID RDMA Opcodes
+ */
+#define IB_OPCODE_TID_RDMA 0xe0
+enum {
+	IB_OPCODE_READ_REQ        = 0x4,
+	IB_OPCODE_READ_RESP       = 0x5,
+
+	IB_OPCODE(TID_RDMA, READ_REQ),
+	IB_OPCODE(TID_RDMA, READ_RESP),
+};
+
+#define TID_OP(x) IB_OPCODE_TID_RDMA_##x
+
+/*
+ * Define TID RDMA specific WR opcodes. The ib_wr_opcode
+ * enum already provides some reserved values for use by
+ * low level drivers. Two of those are used but renamed
+ * to be more descriptive.
+ */
+#define IB_WR_TID_RDMA_READ  IB_WR_RESERVED2
+
+#endif /* TID_RDMA_DEFS_H */
-- 
cgit v1.2.3


From 039cd3daf19b9acbf080054d765cbceac842b6a0 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 23 Jan 2019 19:31:57 -0800
Subject: IB/hfi1: Increment the retry timeout value for TID RDMA READ request

The RC retry timeout value is based on the estimated time for the
response packet to come back. However, for TID RDMA READ request, due
to the use of header suppression, the driver is normally not notified
for each incoming response packet until the last TID RDMA READ response
packet. Consequently, the retry timeout value should be extended to
cover the transaction time for the entire length of a segment (default
256K) instead of that for a single packet. This patch addresses the
issue by introducing new retry timer functions to account for multiple
packets and wrapper functions for backward compatibility.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/sw/rdmavt/qp.c | 11 ++++++-----
 include/rdma/rdma_vt.h            | 12 +++++++++---
 include/rdma/rdmavt_qp.h          |  6 +++++-
 3 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index c8e70cf69a8a..2769ebdf89fb 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -2393,11 +2393,12 @@ static inline unsigned long rvt_aeth_to_usec(u32 aeth)
 }
 
 /*
- *  rvt_add_retry_timer - add/start a retry timer
+ *  rvt_add_retry_timer_ext - add/start a retry timer
  *  @qp - the QP
+ *  @shift - timeout shift to wait for multiple packets
  *  add a retry timer on the QP
  */
-void rvt_add_retry_timer(struct rvt_qp *qp)
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
 {
 	struct ib_qp *ibqp = &qp->ibqp;
 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -2405,11 +2406,11 @@ void rvt_add_retry_timer(struct rvt_qp *qp)
 	lockdep_assert_held(&qp->s_lock);
 	qp->s_flags |= RVT_S_TIMER;
        /* 4.096 usec. * (1 << qp->timeout) */
-	qp->s_timer.expires = jiffies + qp->timeout_jiffies +
-			     rdi->busy_jiffies;
+	qp->s_timer.expires = jiffies + rdi->busy_jiffies +
+			      (qp->timeout_jiffies << shift);
 	add_timer(&qp->s_timer);
 }
-EXPORT_SYMBOL(rvt_add_retry_timer);
+EXPORT_SYMBOL(rvt_add_retry_timer_ext);
 
 /**
  * rvt_add_rnr_timer - add/start an rnr timer
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index 168e40be183c..87d66c9630d7 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -574,9 +574,10 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
 /**
  * rvt_mod_retry_timer - mod a retry timer
  * @qp - the QP
+ * @shift - timeout shift to wait for multiple packets
  * Modify a potentially already running retry timer
  */
-static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
+static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
 {
 	struct ib_qp *ibqp = &qp->ibqp;
 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -584,8 +585,13 @@ static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
 	lockdep_assert_held(&qp->s_lock);
 	qp->s_flags |= RVT_S_TIMER;
 	/* 4.096 usec. * (1 << qp->timeout) */
-	mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
-		  rdi->busy_jiffies);
+	mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies +
+		  (qp->timeout_jiffies << shift));
+}
+
+static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
+{
+	return rvt_mod_retry_timer_ext(qp, 0);
 }
 
 struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 9095a0b71250..d8d88d023092 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -688,7 +688,11 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t);
 void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth);
 void rvt_del_timers_sync(struct rvt_qp *qp);
 void rvt_stop_rc_timers(struct rvt_qp *qp);
-void rvt_add_retry_timer(struct rvt_qp *qp);
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift);
+static inline void rvt_add_retry_timer(struct rvt_qp *qp)
+{
+	rvt_add_retry_timer_ext(qp, 0);
+}
 
 void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
 		  void *data, u32 length,
-- 
cgit v1.2.3


From c098bbb00cd1986cbb58ed1712643f80ed00fcc3 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 23 Jan 2019 21:48:28 -0800
Subject: IB/hfi1: Build TID RDMA WRITE request

This patch adds the functions to build TID RDMA WRITE request.
The work request opcode, packet opcode, and packet formats for TID
RDMA WRITE protocol are also defined in this patch.

Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/qp.h       |  2 ++
 drivers/infiniband/hw/hfi1/tid_rdma.c | 38 ++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/tid_rdma.h |  3 ++
 include/rdma/ib_hdrs.h                |  5 ++++
 include/rdma/tid_rdma_defs.h          | 56 +++++++++++++++++++++++++++++++++++
 5 files changed, 104 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index ce25a27aa4a1..f74e2509e8b9 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -64,12 +64,14 @@ extern const struct rvt_operation_params hfi1_post_parms[];
  * HFI1_S_AHG_CLEAR - have send engine clear ahg state
  * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
  * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
+ * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response
  * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
  */
 #define HFI1_S_AHG_VALID         0x80000000
 #define HFI1_S_AHG_CLEAR         0x40000000
 #define HFI1_S_WAIT_PIO_DRAIN    0x20000000
 #define HFI1_S_WAIT_TID_SPACE    0x10000000
+#define HFI1_S_WAIT_TID_RESP     0x08000000
 #define HFI1_S_MIN_BIT_MASK      0x01000000
 
 /*
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 0ee79403acaf..089e301d9bcd 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -2975,3 +2975,41 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
 exit:
 	rcu_read_unlock();
 }
+
+/* TID RDMA WRITE functions */
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				  struct ib_other_headers *ohdr,
+				  u32 *bth1, u32 *bth2, u32 *len)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_params *remote;
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	/*
+	 * Set the number of flow to be used based on negotiated
+	 * parameters.
+	 */
+	req->n_flows = remote->max_write;
+	req->state = TID_REQUEST_ACTIVE;
+
+	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
+	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
+	ohdr->u.tid_rdma.w_req.reth.vaddr =
+		cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
+	ohdr->u.tid_rdma.w_req.reth.rkey =
+		cpu_to_be32(wqe->rdma_wr.rkey);
+	ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
+	ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
+	*bth1 &= ~RVT_QPN_MASK;
+	*bth1 |= remote->qp;
+	qp->s_state = TID_OP(WRITE_REQ);
+	qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+	*bth2 |= IB_BTH_REQ_ACK;
+	*len = 0;
+
+	rcu_read_unlock();
+	return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index a53598ce45b2..baba539b2b80 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -233,4 +233,7 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
 		setup_tid_rdma_wqe(qp, wqe);
 }
 
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				  struct ib_other_headers *ohdr,
+				  u32 *bth1, u32 *bth2, u32 *len);
 #endif /* HFI1_TID_RDMA_H */
diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h
index 58a0a0f99e7f..9a90bd031e8c 100644
--- a/include/rdma/ib_hdrs.h
+++ b/include/rdma/ib_hdrs.h
@@ -123,6 +123,11 @@ union ib_ehdrs {
 	union {
 		struct tid_rdma_read_req r_req;
 		struct tid_rdma_read_resp r_rsp;
+		struct tid_rdma_write_req w_req;
+		struct tid_rdma_write_resp w_rsp;
+		struct tid_rdma_write_data w_data;
+		struct tid_rdma_resync resync;
+		struct tid_rdma_ack ack;
 	} tid_rdma;
 }  __packed;
 
diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h
index 1c431ea32b52..08fe47c7ad2c 100644
--- a/include/rdma/tid_rdma_defs.h
+++ b/include/rdma/tid_rdma_defs.h
@@ -27,16 +27,71 @@ struct tid_rdma_read_resp {
 	__be32 verbs_qp;
 };
 
+struct tid_rdma_write_req {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	struct ib_reth reth;
+	__be32 reserved[2];
+	__be32 verbs_qp;
+};
+
+struct tid_rdma_write_resp {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	__be32 aeth;
+	__be32 reserved[3];
+	__be32 tid_flow_psn;
+	__be32 tid_flow_qp;
+	__be32 verbs_qp;
+};
+
+struct tid_rdma_write_data {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	__be32 reserved[6];
+	__be32 verbs_qp;
+};
+
+struct tid_rdma_resync {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	__be32 reserved[6];
+	__be32 verbs_qp;
+};
+
+struct tid_rdma_ack {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	__be32 aeth;
+	__be32 reserved[2];
+	__be32 tid_flow_psn;
+	__be32 verbs_psn;
+	__be32 tid_flow_qp;
+	__be32 verbs_qp;
+};
+
 /*
  * TID RDMA Opcodes
  */
 #define IB_OPCODE_TID_RDMA 0xe0
 enum {
+	IB_OPCODE_WRITE_REQ       = 0x0,
+	IB_OPCODE_WRITE_RESP      = 0x1,
+	IB_OPCODE_WRITE_DATA      = 0x2,
+	IB_OPCODE_WRITE_DATA_LAST = 0x3,
 	IB_OPCODE_READ_REQ        = 0x4,
 	IB_OPCODE_READ_RESP       = 0x5,
+	IB_OPCODE_RESYNC          = 0x6,
+	IB_OPCODE_ACK             = 0x7,
 
+	IB_OPCODE(TID_RDMA, WRITE_REQ),
+	IB_OPCODE(TID_RDMA, WRITE_RESP),
+	IB_OPCODE(TID_RDMA, WRITE_DATA),
+	IB_OPCODE(TID_RDMA, WRITE_DATA_LAST),
 	IB_OPCODE(TID_RDMA, READ_REQ),
 	IB_OPCODE(TID_RDMA, READ_RESP),
+	IB_OPCODE(TID_RDMA, RESYNC),
+	IB_OPCODE(TID_RDMA, ACK),
 };
 
 #define TID_OP(x) IB_OPCODE_TID_RDMA_##x
@@ -47,6 +102,7 @@ enum {
  * low level drivers. Two of those are used but renamed
  * to be more descriptive.
  */
+#define IB_WR_TID_RDMA_WRITE IB_WR_RESERVED1
 #define IB_WR_TID_RDMA_READ  IB_WR_RESERVED2
 
 #endif /* TID_RDMA_DEFS_H */
-- 
cgit v1.2.3


From 4f9264d156dc6c154a8a6cfae780730bad45c6f8 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 23 Jan 2019 21:48:48 -0800
Subject: IB/hfi1: Add an s_acked_ack_queue pointer

The s_ack_queue is managed by two pointers into the ring:
r_head_ack_queue and s_tail_ack_queue. r_head_ack_queue is the index of
where the next received request is going to be placed and s_tail_ack_queue
is the entry of the request currently being processed. This works
perfectly fine for normal Verbs as the requests are processed one at a
time and the s_tail_ack_queue is not moved until the request that it
points to is fully completed.

In this fashion, s_tail_ack_queue constantly chases r_head_ack_queue and
the two pointers can easily be used to determine "queue full" and "queue
empty" conditions.

The detection of these two conditions are imported in determining when an
old entry can safely be overwritten with a new received request and the
resources associated with the old request be safely released.

When pipelined TID RDMA WRITE is introduced into this mix, things look
very different. r_head_ack_queue is still the point at which a newly
received request will be inserted, s_tail_ack_queue is still the
currently processed request. However, with pipelined TID RDMA WRITE
requests, s_tail_ack_queue moves to the next request once all TID RDMA
WRITE responses for that request have been sent. The rest of the protocol
for a particular request is managed by other pointers specific to TID RDMA
- r_tid_tail and r_tid_ack - which point to the entries for which the next
TID RDMA DATA packets are going to arrive and the request for which
the next TID RDMA ACK packets are to be generated, respectively.

What this means is that entries in the ring, which are "behind"
s_tail_ack_queue (entries which s_tail_ack_queue has gone past) are no
longer considered complete. This is where the problem is - a newly
received request could potentially overwrite a still active TID RDMA WRITE
request.

The reason why the TID RDMA pointers trail s_tail_ack_queue is that the
normal Verbs send engine uses s_tail_ack_queue as the pointer for the next
response. Since TID RDMA WRITE responses are processed by the normal Verbs
send engine, s_tail_ack_queue had to be moved to the next entry once all
TID RDMA WRITE response packets were sent to get the desired pipelining
between requests. Doing otherwise would mean that the normal Verbs send
engine would not be able to send the TID RDMA WRITE responses for the next
TID RDMA request until the current one is fully completed.

This patch introduces the s_acked_ack_queue index to point to the next
request to complete on the responder side. For requests other than TID
RDMA WRITE, s_acked_ack_queue should always be kept in sync with
s_tail_ack_queue. For TID RDMA WRITE request, it may fall behind
s_tail_ack_queue.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/rc.c        | 33 ++++++++++++++++++++++++++++-----
 drivers/infiniband/hw/hfi1/rc.h        |  1 +
 drivers/infiniband/hw/hfi1/tid_rdma.c  |  2 ++
 drivers/infiniband/hw/hfi1/trace_tid.h | 10 ++++++++--
 drivers/infiniband/sw/rdmavt/qp.c      |  1 +
 include/rdma/rdmavt_qp.h               |  1 +
 6 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 6c9ef572fc69..9dc8e524510e 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -120,6 +120,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 	struct hfi1_qp_priv *priv = qp->priv;
 	bool last_pkt;
 	u32 delta;
+	u8 next = qp->s_tail_ack_queue;
 
 	trace_hfi1_rsp_make_rc_ack(qp, 0);
 	lockdep_assert_held(&qp->s_lock);
@@ -149,9 +150,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 		 * response has been sent instead of only being
 		 * constructed.
 		 */
-		if (++qp->s_tail_ack_queue >
-		    rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
-			qp->s_tail_ack_queue = 0;
+		if (++next > rvt_size_atomic(&dev->rdi))
+			next = 0;
+		/*
+		 * Only advance the s_acked_ack_queue pointer if there
+		 * have been no TID RDMA requests.
+		 */
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->opcode != TID_OP(WRITE_REQ) &&
+		    qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+			qp->s_acked_ack_queue = next;
+		qp->s_tail_ack_queue = next;
 		/* FALLTHROUGH */
 	case OP(SEND_ONLY):
 	case OP(ACKNOWLEDGE):
@@ -172,6 +181,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 			 */
 			len = e->rdma_sge.sge_length;
 			if (len && !e->rdma_sge.mr) {
+				if (qp->s_acked_ack_queue ==
+				    qp->s_tail_ack_queue)
+					qp->s_acked_ack_queue =
+						qp->r_head_ack_queue;
 				qp->s_tail_ack_queue = qp->r_head_ack_queue;
 				goto bail;
 			}
@@ -202,6 +215,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 			 */
 			len = e->rdma_sge.sge_length;
 			if (len && !e->rdma_sge.mr) {
+				if (qp->s_acked_ack_queue ==
+				    qp->s_tail_ack_queue)
+					qp->s_acked_ack_queue =
+						qp->r_head_ack_queue;
 				qp->s_tail_ack_queue = qp->r_head_ack_queue;
 				goto bail;
 			}
@@ -2235,6 +2252,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		e->psn = psn;
 		if (old_req)
 			goto unlock_done;
+		if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+			qp->s_acked_ack_queue = prev;
 		qp->s_tail_ack_queue = prev;
 		break;
 	}
@@ -2248,6 +2267,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		 */
 		if (!e || e->opcode != (u8)opcode || old_req)
 			goto unlock_done;
+		if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+			qp->s_acked_ack_queue = prev;
 		qp->s_tail_ack_queue = prev;
 		break;
 	}
@@ -2274,6 +2295,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		 * Resend the RDMA read or atomic op which
 		 * ACKs this duplicate request.
 		 */
+		if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+			qp->s_acked_ack_queue = mra;
 		qp->s_tail_ack_queue = mra;
 		break;
 	}
@@ -2646,7 +2669,7 @@ send_last:
 		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 			next = 0;
 		spin_lock_irqsave(&qp->s_lock, flags);
-		if (unlikely(next == qp->s_tail_ack_queue)) {
+		if (unlikely(next == qp->s_acked_ack_queue)) {
 			if (!qp->s_ack_queue[next].sent)
 				goto nack_inv_unlck;
 			update_ack_queue(qp, next);
@@ -2723,7 +2746,7 @@ send_last:
 		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 			next = 0;
 		spin_lock_irqsave(&qp->s_lock, flags);
-		if (unlikely(next == qp->s_tail_ack_queue)) {
+		if (unlikely(next == qp->s_acked_ack_queue)) {
 			if (!qp->s_ack_queue[next].sent)
 				goto nack_inv_unlck;
 			update_ack_queue(qp, next);
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
index 4329eadcb3df..8e0935b9bf2a 100644
--- a/drivers/infiniband/hw/hfi1/rc.h
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -18,6 +18,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
 	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 		next = 0;
 	qp->s_tail_ack_queue = next;
+	qp->s_acked_ack_queue = next;
 	qp->s_ack_state = OP(ACKNOWLEDGE);
 }
 
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 089e301d9bcd..c320a99afb35 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -2044,6 +2044,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
 			goto unlock;
 	}
 	/* Re-process old requests.*/
+	if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+		qp->s_acked_ack_queue = prev;
 	qp->s_tail_ack_queue = prev;
 	/*
 	 * Since the qp->s_tail_ack_queue is modified, the
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
index b71638c22d4b..51f5b0e8da71 100644
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -40,7 +40,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
 #define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \
 		     "r_psn 0x%x r_state 0x%x r_flags 0x%x " \
 		     "r_head_ack_queue %u s_tail_ack_queue %u " \
-		     "s_ack_state 0x%x " \
+		     "s_acked_ack_queue %u s_ack_state 0x%x " \
 		     "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \
 		     "iow_flags 0x%lx"
 
@@ -62,7 +62,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
 		    "s_next_psn 0x%x"
 
 #define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \
-		    "s_tail_ack_queue %u " \
+		    "s_acked_ack_queue %u s_tail_ack_queue %u " \
 		    "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \
 		    " diff %d"
 
@@ -671,6 +671,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
 		__field(u8, r_flags)
 		__field(u8, r_head_ack_queue)
 		__field(u8, s_tail_ack_queue)
+		__field(u8, s_acked_ack_queue)
 		__field(u8, s_ack_state)
 		__field(u8, s_nak_state)
 		__field(u8, r_nak_state)
@@ -691,6 +692,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
 		__entry->r_flags = qp->r_flags;
 		__entry->r_head_ack_queue = qp->r_head_ack_queue;
 		__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+		__entry->s_acked_ack_queue = qp->s_acked_ack_queue;
 		__entry->s_ack_state = qp->s_ack_state;
 		__entry->s_nak_state = qp->s_nak_state;
 		__entry->s_flags = qp->s_flags;
@@ -709,6 +711,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
 		__entry->r_flags,
 		__entry->r_head_ack_queue,
 		__entry->s_tail_ack_queue,
+		__entry->s_acked_ack_queue,
 		__entry->s_ack_state,
 		__entry->s_nak_state,
 		__entry->s_flags,
@@ -1007,6 +1010,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
 		__field(u32, qpn)
 		__field(u32, s_flags)
 		__field(u8, state)
+		__field(u8, s_acked_ack_queue)
 		__field(u8, s_tail_ack_queue)
 		__field(u8, r_head_ack_queue)
 		__field(u32, opcode)
@@ -1019,6 +1023,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
 		__entry->qpn = qp->ibqp.qp_num;
 		__entry->s_flags = qp->s_flags;
 		__entry->state = qp->state;
+		__entry->s_acked_ack_queue = qp->s_acked_ack_queue;
 		__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
 		__entry->r_head_ack_queue = qp->r_head_ack_queue;
 		__entry->opcode = opcode;
@@ -1032,6 +1037,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
 		__entry->qpn,
 		__entry->s_flags,
 		__entry->state,
+		__entry->s_acked_ack_queue,
 		__entry->s_tail_ack_queue,
 		__entry->r_head_ack_queue,
 		__entry->opcode,
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 2769ebdf89fb..14ec2577bcaa 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -854,6 +854,7 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 	qp->s_mig_state = IB_MIG_MIGRATED;
 	qp->r_head_ack_queue = 0;
 	qp->s_tail_ack_queue = 0;
+	qp->s_acked_ack_queue = 0;
 	qp->s_num_rd_atomic = 0;
 	if (qp->r_rq.wq) {
 		qp->r_rq.wq->head = 0;
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index d8d88d023092..4ee612ab6cb4 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -375,6 +375,7 @@ struct rvt_qp {
 	u8 s_rnr_retry;         /* requester RNR retry counter */
 	u8 s_num_rd_atomic;     /* number of RDMA read/atomic pending */
 	u8 s_tail_ack_queue;    /* index into s_ack_queue[] */
+	u8 s_acked_ack_queue;   /* index into s_ack_queue[] */
 
 	struct rvt_sge_state s_ack_rdma_sge;
 	struct timer_list s_timer;
-- 
cgit v1.2.3


From 3c6cb20a0d17d7a75778fb0935d6fa427c8177af Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 23 Jan 2019 21:51:39 -0800
Subject: IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs

This patch integrates TID RDMA WRITE protocol into normal RDMA verbs
framework. The TID RDMA WRITE protocol is an end-to-end protocol
between the hfi1 drivers on two OPA nodes that converts a qualified
RDMA WRITE request into a TID RDMA WRITE request to avoid data copying
on the responder side.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/hfi1/qp.c        |   7 +
 drivers/infiniband/hw/hfi1/rc.c        | 487 +++++++++++++++++++++++++++++----
 drivers/infiniband/hw/hfi1/tid_rdma.c  |  14 +
 drivers/infiniband/hw/hfi1/user_sdma.c |   3 +-
 drivers/infiniband/hw/hfi1/verbs.c     |  17 +-
 drivers/infiniband/hw/hfi1/verbs.h     |   1 +
 include/rdma/rdmavt_qp.h               |   1 +
 7 files changed, 481 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 96632c77f36f..cfd598e4b303 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
 	.flags = RVT_OPERATION_USE_RESERVE,
 },
 
+[IB_WR_TID_RDMA_WRITE] = {
+	.length = sizeof(struct ib_rdma_wr),
+	.qpt_support = BIT(IB_QPT_RC),
+	.flags = RVT_OPERATION_IGN_RNR_CNT,
+},
+
 };
 
 static void flush_list_head(struct list_head *l)
@@ -780,6 +786,7 @@ void quiesce_qp(struct rvt_qp *qp)
 	struct hfi1_qp_priv *priv = qp->priv;
 
 	hfi1_del_tid_reap_timer(qp);
+	hfi1_del_tid_retry_timer(qp);
 	iowait_sdma_drain(&priv->s_iowait);
 	qp_pio_drain(qp);
 	flush_tx_list(qp);
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index fcb733ea8dfb..6d2abea896e5 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -111,16 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 		       struct hfi1_pkt_state *ps)
 {
 	struct rvt_ack_entry *e;
-	u32 hwords;
+	u32 hwords, hdrlen;
 	u32 len = 0;
 	u32 bth0 = 0, bth2 = 0;
 	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
 	int middle = 0;
 	u32 pmtu = qp->pmtu;
-	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_qp_priv *qpriv = qp->priv;
 	bool last_pkt;
 	u32 delta;
 	u8 next = qp->s_tail_ack_queue;
+	struct tid_rdma_request *req;
 
 	trace_hfi1_rsp_make_rc_ack(qp, 0);
 	lockdep_assert_held(&qp->s_lock);
@@ -128,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
 		goto bail;
 
-	if (priv->hdr_type == HFI1_PKT_TYPE_9B)
+	if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
 		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
 		hwords = 5;
 	else
@@ -206,6 +207,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 			hwords++;
 			qp->s_ack_rdma_psn = e->psn;
 			bth2 = mask_psn(qp->s_ack_rdma_psn++);
+		} else if (e->opcode == TID_OP(WRITE_REQ)) {
+			/*
+			 * If a TID RDMA WRITE RESP is being resent, we have to
+			 * wait for the actual request. All requests that are to
+			 * be resent will have their state set to
+			 * TID_REQUEST_RESEND. When the new request arrives, the
+			 * state will be changed to TID_REQUEST_RESEND_ACTIVE.
+			 */
+			req = ack_to_tid_req(e);
+			if (req->state == TID_REQUEST_RESEND ||
+			    req->state == TID_REQUEST_INIT_RESEND)
+				goto bail;
+			qp->s_ack_state = TID_OP(WRITE_RESP);
+			qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
+			goto write_resp;
 		} else if (e->opcode == TID_OP(READ_REQ)) {
 			/*
 			 * If a TID RDMA read response is being resent and
@@ -267,6 +283,59 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 		bth2 = mask_psn(qp->s_ack_rdma_psn++);
 		break;
 
+	case TID_OP(WRITE_RESP):
+write_resp:
+		/*
+		 * 1. Check if RVT_S_ACK_PENDING is set. If yes,
+		 *    goto normal.
+		 * 2. Attempt to allocate TID resources.
+		 * 3. Remove RVT_S_RESP_PENDING flags from s_flags
+		 * 4. If resources not available:
+		 *    4.1 Set RVT_S_WAIT_TID_SPACE
+		 *    4.2 Queue QP on RCD TID queue
+		 *    4.3 Put QP on iowait list.
+		 *    4.4 Build IB RNR NAK with appropriate timeout value
+		 *    4.5 Return indication progress made.
+		 * 5. If resources are available:
+		 *    5.1 Program HW flow CSRs
+		 *    5.2 Build TID RDMA WRITE RESP packet
+		 *    5.3 If more resources needed, do 2.1 - 2.3.
+		 *    5.4 Wake up next QP on RCD TID queue.
+		 *    5.5 Return indication progress made.
+		 */
+
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		req = ack_to_tid_req(e);
+
+		/*
+		 * Send scheduled RNR NAK's. RNR NAK's need to be sent at
+		 * segment boundaries, not at request boundaries. Don't change
+		 * s_ack_state because we are still in the middle of a request
+		 */
+		if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
+		    qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
+		    req->cur_seg == req->alloc_seg) {
+			qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
+			goto normal_no_state;
+		}
+
+		bth2 = mask_psn(qp->s_ack_rdma_psn);
+		hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
+							bth2, &len,
+							&ps->s_txreq->ss);
+		if (!hdrlen)
+			return 0;
+
+		hwords += hdrlen;
+		bth0 = qp->s_ack_state << 24;
+		qp->s_ack_rdma_psn++;
+		if (req->cur_seg != req->total_segs)
+			break;
+
+		e->sent = 1;
+		qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+		break;
+
 	case TID_OP(READ_RESP):
 read_resp:
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
@@ -298,8 +367,7 @@ normal:
 		 * (see above).
 		 */
 		qp->s_ack_state = OP(SEND_ONLY);
-		qp->s_flags &= ~RVT_S_ACK_PENDING;
-		ps->s_txreq->ss = NULL;
+normal_no_state:
 		if (qp->s_nak_state)
 			ohdr->u.aeth =
 				cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -311,9 +379,11 @@ normal:
 		len = 0;
 		bth0 = OP(ACKNOWLEDGE) << 24;
 		bth2 = mask_psn(qp->s_ack_psn);
+		qp->s_flags &= ~RVT_S_ACK_PENDING;
+		ps->s_txreq->ss = NULL;
 	}
 	qp->s_rdma_ack_cnt++;
-	ps->s_txreq->sde = priv->s_sde;
+	ps->s_txreq->sde = qpriv->s_sde;
 	ps->s_txreq->s_cur_size = len;
 	ps->s_txreq->hdr_dwords = hwords;
 	hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
@@ -366,6 +436,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	int middle = 0;
 	int delta;
 	struct tid_rdma_flow *flow = NULL;
+	struct tid_rdma_params *remote;
 
 	trace_hfi1_sender_make_rc_req(qp);
 	lockdep_assert_held(&qp->s_lock);
@@ -414,7 +485,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		goto done_free_tx;
 	}
 
-	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
 		goto bail;
 
 	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -586,6 +657,108 @@ no_flow_control:
 				qp->s_cur = 0;
 			break;
 
+		case IB_WR_TID_RDMA_WRITE:
+			if (newreq) {
+				/*
+				 * Limit the number of TID RDMA WRITE requests.
+				 */
+				if (atomic_read(&priv->n_tid_requests) >=
+				    HFI1_TID_RDMA_WRITE_CNT)
+					goto bail;
+
+				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+					qp->s_lsn++;
+			}
+
+			hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
+								&bth1, &bth2,
+								&len);
+			ss = NULL;
+			if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
+				priv->s_tid_cur = qp->s_cur;
+				if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
+					priv->s_tid_tail = qp->s_cur;
+					priv->s_state = TID_OP(WRITE_RESP);
+				}
+			} else if (priv->s_tid_cur == priv->s_tid_head) {
+				struct rvt_swqe *__w;
+				struct tid_rdma_request *__r;
+
+				__w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+				__r = wqe_to_tid_req(__w);
+
+				/*
+				 * The s_tid_cur pointer is advanced to s_cur if
+				 * any of the following conditions about the WQE
+				 * to which s_ti_cur currently points to are
+				 * satisfied:
+				 *   1. The request is not a TID RDMA WRITE
+				 *      request,
+				 *   2. The request is in the INACTIVE or
+				 *      COMPLETE states (TID RDMA READ requests
+				 *      stay at INACTIVE and TID RDMA WRITE
+				 *      transition to COMPLETE when done),
+				 *   3. The request is in the ACTIVE or SYNC
+				 *      state and the number of completed
+				 *      segments is equal to the total segment
+				 *      count.
+				 *      (If ACTIVE, the request is waiting for
+				 *       ACKs. If SYNC, the request has not
+				 *       received any responses because it's
+				 *       waiting on a sync point.)
+				 */
+				if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
+				    __r->state == TID_REQUEST_INACTIVE ||
+				    __r->state == TID_REQUEST_COMPLETE ||
+				    ((__r->state == TID_REQUEST_ACTIVE ||
+				      __r->state == TID_REQUEST_SYNC) &&
+				     __r->comp_seg == __r->total_segs)) {
+					if (priv->s_tid_tail ==
+					    priv->s_tid_cur &&
+					    priv->s_state ==
+					    TID_OP(WRITE_DATA_LAST)) {
+						priv->s_tid_tail = qp->s_cur;
+						priv->s_state =
+							TID_OP(WRITE_RESP);
+					}
+					priv->s_tid_cur = qp->s_cur;
+				}
+				/*
+				 * A corner case: when the last TID RDMA WRITE
+				 * request was completed, s_tid_head,
+				 * s_tid_cur, and s_tid_tail all point to the
+				 * same location. Other requests are posted and
+				 * s_cur wraps around to the same location,
+				 * where a new TID RDMA WRITE is posted. In
+				 * this case, none of the indices need to be
+				 * updated. However, the priv->s_state should.
+				 */
+				if (priv->s_tid_tail == qp->s_cur &&
+				    priv->s_state == TID_OP(WRITE_DATA_LAST))
+					priv->s_state = TID_OP(WRITE_RESP);
+			}
+			req = wqe_to_tid_req(wqe);
+			if (newreq) {
+				priv->s_tid_head = qp->s_cur;
+				priv->pending_tid_w_resp += req->total_segs;
+				atomic_inc(&priv->n_tid_requests);
+				atomic_dec(&priv->n_requests);
+			} else {
+				req->state = TID_REQUEST_RESEND;
+				req->comp_seg = delta_psn(bth2, wqe->psn);
+				/*
+				 * Pull back any segments since we are going
+				 * to re-receive them.
+				 */
+				req->setup_head = req->clear_tail;
+				priv->pending_tid_w_resp +=
+					delta_psn(wqe->lpsn, bth2) + 1;
+			}
+
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
 		case IB_WR_RDMA_READ:
 			/*
 			 * Don't allow more operations to be started
@@ -745,7 +918,8 @@ no_flow_control:
 			if (qp->s_tail >= qp->s_size)
 				qp->s_tail = 0;
 		}
-		if (wqe->wr.opcode == IB_WR_RDMA_READ)
+		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
 			qp->s_psn = wqe->lpsn + 1;
 		else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
 			qp->s_psn = req->s_next_psn;
@@ -865,6 +1039,33 @@ no_flow_control:
 		if (qp->s_cur == qp->s_size)
 			qp->s_cur = 0;
 		break;
+
+	case TID_OP(WRITE_RESP):
+		/*
+		 * This value for s_state is used for restarting a TID RDMA
+		 * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
+		 * for more).
+		 */
+		req = wqe_to_tid_req(wqe);
+		req->state = TID_REQUEST_RESEND;
+		rcu_read_lock();
+		remote = rcu_dereference(priv->tid_rdma.remote);
+		req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
+		len = wqe->length - (req->comp_seg * remote->max_len);
+		rcu_read_unlock();
+
+		bth2 = mask_psn(qp->s_psn);
+		hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
+							&bth2, &len);
+		qp->s_psn = wqe->lpsn + 1;
+		ss = NULL;
+		qp->s_state = TID_OP(WRITE_REQ);
+		priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
+		priv->s_tid_cur = qp->s_cur;
+		if (++qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		break;
+
 	case TID_OP(READ_RESP):
 		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
 			goto bail;
@@ -965,7 +1166,8 @@ no_flow_control:
 	}
 	qp->s_sending_hpsn = bth2;
 	delta = delta_psn(bth2, wqe->psn);
-	if (delta && delta % HFI1_PSN_CREDIT == 0)
+	if (delta && delta % HFI1_PSN_CREDIT == 0 &&
+	    wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
 		bth2 |= IB_BTH_REQ_ACK;
 	if (qp->s_flags & RVT_S_SEND_ONE) {
 		qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -998,6 +1200,12 @@ bail:
 bail_no_tx:
 	ps->s_txreq = NULL;
 	qp->s_flags &= ~RVT_S_BUSY;
+	/*
+	 * If we didn't get a txreq, the QP will be woken up later to try
+	 * again. Set the flags to indicate which work item to wake
+	 * up.
+	 */
+	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
 	return 0;
 }
 
@@ -1285,6 +1493,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
 	lockdep_assert_held(&qp->s_lock);
 	qp->s_cur = n;
 	priv->pending_tid_r_segs = 0;
+	priv->pending_tid_w_resp = 0;
 	qp->s_num_rd_atomic = 0;
 
 	/*
@@ -1342,6 +1551,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
 		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
 		break;
 
+	case IB_WR_TID_RDMA_WRITE:
+		qp->s_state = TID_OP(WRITE_RESP);
+		break;
+
 	case IB_WR_RDMA_READ:
 		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
 		break;
@@ -1435,7 +1648,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 
 	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
 			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
-			 RVT_S_WAIT_ACK);
+			 RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
 	if (wait)
 		qp->s_flags |= RVT_S_SEND_ONE;
 	reset_psn(qp, psn);
@@ -1443,7 +1656,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 
 /*
  * Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
+ * are present.
  */
 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
 {
@@ -1456,7 +1670,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
 		wqe = rvt_get_swqe_ptr(qp, n);
 		if (cmp_psn(psn, wqe->lpsn) <= 0) {
 			if (wqe->wr.opcode == IB_WR_RDMA_READ ||
-			    wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+			    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+			    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
 				qp->s_sending_psn = wqe->lpsn + 1;
 			else
 				qp->s_sending_psn = psn + 1;
@@ -1479,8 +1694,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
 	struct rvt_swqe *wqe;
 	struct ib_header *hdr = NULL;
 	struct hfi1_16b_header *hdr_16b = NULL;
-	u32 opcode;
+	u32 opcode, head, tail;
 	u32 psn;
+	struct tid_rdma_request *req;
 
 	lockdep_assert_held(&qp->s_lock);
 	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
@@ -1507,29 +1723,84 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
 	opcode = ib_bth_get_opcode(ohdr);
 	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
 	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
-	    opcode == TID_OP(READ_RESP)) {
+	    opcode == TID_OP(READ_RESP) ||
+	    opcode == TID_OP(WRITE_RESP)) {
 		WARN_ON(!qp->s_rdma_ack_cnt);
 		qp->s_rdma_ack_cnt--;
 		return;
 	}
 
 	psn = ib_bth_get_psn(ohdr);
-	reset_sending_psn(qp, psn);
+	/*
+	 * Don't attempt to reset the sending PSN for packets in the
+	 * KDETH PSN space since the PSN does not match anything.
+	 */
+	if (opcode != TID_OP(WRITE_DATA) &&
+	    opcode != TID_OP(WRITE_DATA_LAST) &&
+	    opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
+		reset_sending_psn(qp, psn);
+
+	/* Handle TID RDMA WRITE packets differently */
+	if (opcode >= TID_OP(WRITE_REQ) &&
+	    opcode <= TID_OP(WRITE_DATA_LAST)) {
+		head = priv->s_tid_head;
+		tail = priv->s_tid_cur;
+		/*
+		 * s_tid_cur is set to s_tid_head in the case, where
+		 * a new TID RDMA request is being started and all
+		 * previous ones have been completed.
+		 * Therefore, we need to do a secondary check in order
+		 * to properly determine whether we should start the
+		 * RC timer.
+		 */
+		wqe = rvt_get_swqe_ptr(qp, tail);
+		req = wqe_to_tid_req(wqe);
+		if (head == tail && req->comp_seg < req->total_segs) {
+			if (tail == 0)
+				tail = qp->s_size - 1;
+			else
+				tail -= 1;
+		}
+	} else {
+		head = qp->s_tail;
+		tail = qp->s_acked;
+	}
 
 	/*
 	 * Start timer after a packet requesting an ACK has been sent and
 	 * there are still requests that haven't been acked.
 	 */
-	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+	if ((psn & IB_BTH_REQ_ACK) && tail != head &&
+	    opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
+	    opcode != TID_OP(RESYNC) &&
 	    !(qp->s_flags &
-		(RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
-		(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+	      (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
 		if (opcode == TID_OP(READ_REQ))
 			rvt_add_retry_timer_ext(qp, priv->timeout_shift);
 		else
 			rvt_add_retry_timer(qp);
 	}
 
+	/* Start TID RDMA ACK timer */
+	if ((opcode == TID_OP(WRITE_DATA) ||
+	     opcode == TID_OP(WRITE_DATA_LAST) ||
+	     opcode == TID_OP(RESYNC)) &&
+	    (psn & IB_BTH_REQ_ACK) &&
+	    !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
+	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+		/*
+		 * The TID RDMA ACK packet could be received before this
+		 * function is called. Therefore, add the timer only if TID
+		 * RDMA ACK packets are actually pending.
+		 */
+		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+		req = wqe_to_tid_req(wqe);
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+		    req->ack_seg < req->cur_seg)
+			hfi1_add_tid_retry_timer(qp);
+	}
+
 	while (qp->s_last != qp->s_acked) {
 		u32 s_last;
 
@@ -1628,7 +1899,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 	}
 
 	qp->s_retry = qp->s_retry_cnt;
-	update_last_psn(qp, wqe->lpsn);
+	/*
+	 * Don't update the last PSN if the request being completed is
+	 * a TID RDMA WRITE request.
+	 * Completion of the TID RDMA WRITE requests are done by the
+	 * TID RDMA ACKs and as such could be for a request that has
+	 * already been ACKed as far as the IB state machine is
+	 * concerned.
+	 */
+	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+		update_last_psn(qp, wqe->lpsn);
 
 	/*
 	 * If we are completing a request which is in the process of
@@ -1658,6 +1938,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 	return wqe;
 }
 
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
+{
+	/* Retry this request. */
+	if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+		qp->r_flags |= RVT_R_RDMAR_SEQ;
+		hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
+		if (list_empty(&qp->rspwait)) {
+			qp->r_flags |= RVT_R_RSP_SEND;
+			rvt_get_qp(qp);
+			list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+		}
+	}
+}
+
+/**
+ * update_qp_retry_state - Update qp retry state.
+ * @qp: the QP
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
+ * @spsn:  The start psn for the given TID RDMA WRITE swqe.
+ * @lpsn:  The last psn for the given TID RDMA WRITE swqe.
+ *
+ * This function is called to update the qp retry state upon
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
+ * a request.
+ */
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
+				  u32 lpsn)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	qp->s_psn = psn + 1;
+	/*
+	 * If this is the first TID RDMA WRITE RESP packet for the current
+	 * request, change the s_state so that the retry will be processed
+	 * correctly. Similarly, if this is the last TID RDMA WRITE RESP
+	 * packet, change the s_state and advance the s_cur.
+	 */
+	if (cmp_psn(psn, lpsn) >= 0) {
+		qp->s_cur = qpriv->s_tid_cur + 1;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		qp->s_state = TID_OP(WRITE_REQ);
+	} else  if (!cmp_psn(psn, spsn)) {
+		qp->s_cur = qpriv->s_tid_cur;
+		qp->s_state = TID_OP(WRITE_RESP);
+	}
+}
+
 /**
  * do_rc_ack - process an incoming RC ACK
  * @qp: the QP the ACK came in on
@@ -1679,6 +2007,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 	int ret = 0;
 	u32 ack_psn;
 	int diff;
+	struct rvt_dev_info *rdi;
 
 	lockdep_assert_held(&qp->s_lock);
 	/*
@@ -1725,18 +2054,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 		     (opcode != TID_OP(READ_RESP) || diff != 0)) ||
 		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
-			/* Retry this request. */
-			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
-				qp->r_flags |= RVT_R_RDMAR_SEQ;
-				hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
-				if (list_empty(&qp->rspwait)) {
-					qp->r_flags |= RVT_R_RSP_SEND;
-					rvt_get_qp(qp);
-					list_add_tail(&qp->rspwait,
-						      &rcd->qp_wait_list);
-				}
-			}
+		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
+		    (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+		     (delta_psn(psn, qp->s_last_psn) != 1))) {
+			set_restart_qp(qp, rcd);
 			/*
 			 * No need to process the ACK/NAK since we are
 			 * restarting an earlier request.
@@ -1768,6 +2089,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 				hfi1_schedule_send(qp);
 			}
 		}
+
+		/*
+		 * TID RDMA WRITE requests will be completed by the TID RDMA
+		 * ACK packet handler (see tid_rdma.c).
+		 */
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+			break;
+
 		wqe = do_rc_completion(qp, wqe, ibp);
 		if (qp->s_acked == qp->s_tail)
 			break;
@@ -1785,17 +2114,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 			else
 				rvt_stop_rc_timers(qp);
 		} else if (qp->s_acked != qp->s_tail) {
+			struct rvt_swqe *__w = NULL;
+
+			if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
+				__w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+
 			/*
-			 * We are expecting more ACKs so
-			 * mod the retry timer.
-			 */
-			rvt_mod_retry_timer(qp);
-			/*
-			 * We can stop re-sending the earlier packets and
-			 * continue with the next packet the receiver wants.
+			 * Stop timers if we've received all of the TID RDMA
+			 * WRITE * responses.
 			 */
-			if (cmp_psn(qp->s_psn, psn) <= 0)
-				reset_psn(qp, psn + 1);
+			if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+			    opcode == TID_OP(WRITE_RESP)) {
+				/*
+				 * Normally, the loop above would correctly
+				 * process all WQEs from s_acked onward and
+				 * either complete them or check for correct
+				 * PSN sequencing.
+				 * However, for TID RDMA, due to pipelining,
+				 * the response may not be for the request at
+				 * s_acked so the above look would just be
+				 * skipped. This does not allow for checking
+				 * the PSN sequencing. It has to be done
+				 * separately.
+				 */
+				if (cmp_psn(psn, qp->s_last_psn + 1)) {
+					set_restart_qp(qp, rcd);
+					goto bail_stop;
+				}
+				/*
+				 * If the psn is being resent, stop the
+				 * resending.
+				 */
+				if (qp->s_cur != qp->s_tail &&
+				    cmp_psn(qp->s_psn, psn) <= 0)
+					update_qp_retry_state(qp, psn,
+							      __w->psn,
+							      __w->lpsn);
+				else if (--qpriv->pending_tid_w_resp)
+					rvt_mod_retry_timer(qp);
+				else
+					rvt_stop_rc_timers(qp);
+			} else {
+				/*
+				 * We are expecting more ACKs so
+				 * mod the retry timer.
+				 */
+				rvt_mod_retry_timer(qp);
+				/*
+				 * We can stop re-sending the earlier packets
+				 * and continue with the next packet the
+				 * receiver wants.
+				 */
+				if (cmp_psn(qp->s_psn, psn) <= 0)
+					reset_psn(qp, psn + 1);
+			}
 		} else {
 			/* No more acks - kill all timers */
 			rvt_stop_rc_timers(qp);
@@ -1811,6 +2183,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 		rvt_get_credit(qp, aeth);
 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
 		qp->s_retry = qp->s_retry_cnt;
+		/*
+		 * If the current request is a TID RDMA WRITE request and the
+		 * response is not a TID RDMA WRITE RESP packet, s_last_psn
+		 * can't be advanced.
+		 */
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+		    opcode != TID_OP(WRITE_RESP) &&
+		    cmp_psn(psn, wqe->psn) >= 0)
+			return 1;
 		update_last_psn(qp, psn);
 		return 1;
 
@@ -1820,20 +2201,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 			goto bail_stop;
 		if (qp->s_flags & RVT_S_WAIT_RNR)
 			goto bail_stop;
-		if (qp->s_rnr_retry == 0) {
+		rdi = ib_to_rvt(qp->ibqp.device);
+		if (qp->s_rnr_retry == 0 &&
+		    !((rdi->post_parms[wqe->wr.opcode].flags &
+		      RVT_OPERATION_IGN_RNR_CNT) &&
+		      qp->s_rnr_retry_cnt == 0)) {
 			status = IB_WC_RNR_RETRY_EXC_ERR;
 			goto class_b;
 		}
-		if (qp->s_rnr_retry_cnt < 7)
+		if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
 			qp->s_rnr_retry--;
 
-		/* The last valid PSN is the previous PSN. */
-		update_last_psn(qp, psn - 1);
+		/*
+		 * The last valid PSN is the previous PSN. For TID RDMA WRITE
+		 * request, s_last_psn should be incremented only when a TID
+		 * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
+		 * WRITE RESP packets.
+		 */
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+			reset_psn(qp, qp->s_last_psn + 1);
+		} else {
+			update_last_psn(qp, psn - 1);
+			reset_psn(qp, psn);
+		}
 
 		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
-		reset_psn(qp, psn);
-
 		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
 		rvt_stop_rc_timers(qp);
 		rvt_add_rnr_timer(qp, aeth);
@@ -1918,6 +2310,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
 	while (cmp_psn(psn, wqe->lpsn) > 0) {
 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
 		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
 			break;
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 80111dd1d876..490e47a0f68b 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -3205,6 +3205,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
 		do {
 			struct hfi1_swqe_priv *priv = wqe->priv;
 
+			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+		} while (!ret);
+	}
+	for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
+		struct rvt_ack_entry *e = &qp->s_ack_queue[i];
+
+		if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
+			i = 0;
+		/* Free only locally allocated TID entries */
+		if (e->opcode != TID_OP(WRITE_REQ))
+			continue;
+		do {
+			struct hfi1_ack_priv *priv = e->priv;
+
 			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
 		} while (!ret);
 	}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index e5e7fad09f32..6764114b886c 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -1126,7 +1126,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
 			0xffffffull),
 		psn = val & mask;
 	if (expct)
-		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
+			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
 	else
 		psn = psn + frags;
 	return psn & mask;
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 7b87b77582bd..ab97d71cdd92 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the
  */
 const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
 	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+	[IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
 	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
 	[IB_WR_SEND] = IB_WC_SEND,
 	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
@@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = {
 	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
 	[IB_OPCODE_TID_RDMA_READ_REQ]                 = 12 + 8 + 36,
 	[IB_OPCODE_TID_RDMA_READ_RESP]                = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_REQ]                = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_RESP]               = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA]               = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA_LAST]          = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_ACK]                      = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_RESYNC]                   = 12 + 8 + 36,
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
 	[IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
@@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = {
 	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
 
 	/* TID RDMA has separate handlers for different opcodes.*/
+	[IB_OPCODE_TID_RDMA_WRITE_REQ]       = &hfi1_rc_rcv_tid_rdma_write_req,
+	[IB_OPCODE_TID_RDMA_WRITE_RESP]      = &hfi1_rc_rcv_tid_rdma_write_resp,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA]      = &hfi1_rc_rcv_tid_rdma_write_data,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
 	[IB_OPCODE_TID_RDMA_READ_REQ]        = &hfi1_rc_rcv_tid_rdma_read_req,
 	[IB_OPCODE_TID_RDMA_READ_RESP]       = &hfi1_rc_rcv_tid_rdma_read_resp,
+	[IB_OPCODE_TID_RDMA_RESYNC]          = &hfi1_rc_rcv_tid_rdma_resync,
+	[IB_OPCODE_TID_RDMA_ACK]             = &hfi1_rc_rcv_tid_rdma_ack,
 
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
@@ -1332,7 +1345,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
 	rdi->dparms.props.max_mr_size = U64_MAX;
 	rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
 	rdi->dparms.props.max_qp = hfi1_max_qps;
-	rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+	rdi->dparms.props.max_qp_wr =
+		(hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
+		 HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
 	rdi->dparms.props.max_send_sge = hfi1_max_sges;
 	rdi->dparms.props.max_recv_sge = hfi1_max_sges;
 	rdi->dparms.props.max_sge_rd = hfi1_max_sges;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index bee3d21a548e..62ace0b2d17a 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -193,6 +193,7 @@ struct hfi1_qp_priv {
 	u32 r_tid_ack;      /* the TID RDMA request to be ACK'ed */
 	u32 r_tid_alloc;    /* Request for which we are allocating resources */
 	u32 pending_tid_w_segs; /* Num of pending tid write segments */
+	u32 pending_tid_w_resp; /* Num of pending tid write responses */
 	u32 alloc_w_segs;       /* Number of segments for which write */
 			       /* resources have been allocated for this QP */
 
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 4ee612ab6cb4..f0fbd4063fef 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -246,6 +246,7 @@ struct rvt_ack_entry {
 #define RVT_OPERATION_ATOMIC_SGE  0x00000004
 #define RVT_OPERATION_LOCAL       0x00000008
 #define RVT_OPERATION_USE_RESERVE 0x00000010
+#define RVT_OPERATION_IGN_RNR_CNT 0x00000020
 
 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)
 
-- 
cgit v1.2.3


From 972248e9111ee6fe9fb56c24ecfd7434f3d713ac Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 29 Jan 2019 09:32:03 +0100
Subject: scsi: bsg-lib: handle bidi requests without block layer help

We can just stash away the second request in struct bsg_job instead of
using the block layer req->next_rq field, allowing for the eventual removal
of the latter.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/bsg-lib.c                   | 44 +++++++++++++++++++++----
 block/bsg.c                       | 68 ++++++++-------------------------------
 drivers/scsi/scsi_transport_sas.c |  1 -
 include/linux/bsg-lib.h           |  4 +++
 4 files changed, 56 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 192129856342..005e2b75d775 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -51,11 +51,40 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
 		fmode_t mode)
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(rq);
+	int ret;
 
 	job->request_len = hdr->request_len;
 	job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
+	if (IS_ERR(job->request))
+		return PTR_ERR(job->request);
+
+	if (hdr->dout_xfer_len && hdr->din_xfer_len) {
+		job->bidi_rq = blk_get_request(rq->q, REQ_OP_SCSI_IN, 0);
+		if (IS_ERR(job->bidi_rq)) {
+			ret = PTR_ERR(job->bidi_rq);
+			goto out;
+		}
+
+		ret = blk_rq_map_user(rq->q, job->bidi_rq, NULL,
+				uptr64(hdr->din_xferp), hdr->din_xfer_len,
+				GFP_KERNEL);
+		if (ret)
+			goto out_free_bidi_rq;
+
+		job->bidi_bio = job->bidi_rq->bio;
+	} else {
+		job->bidi_rq = NULL;
+		job->bidi_bio = NULL;
+	}
 
-	return PTR_ERR_OR_ZERO(job->request);
+	return 0;
+
+out_free_bidi_rq:
+	if (job->bidi_rq)
+		blk_put_request(job->bidi_rq);
+out:
+	kfree(job->request);
+	return ret;
 }
 
 static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
@@ -93,7 +122,7 @@ static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
 	/* we assume all request payload was transferred, residual == 0 */
 	hdr->dout_resid = 0;
 
-	if (rq->next_rq) {
+	if (job->bidi_rq) {
 		unsigned int rsp_len = job->reply_payload.payload_len;
 
 		if (WARN_ON(job->reply_payload_rcv_len > rsp_len))
@@ -111,6 +140,11 @@ static void bsg_transport_free_rq(struct request *rq)
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(rq);
 
+	if (job->bidi_rq) {
+		blk_rq_unmap_user(job->bidi_bio);
+		blk_put_request(job->bidi_rq);
+	}
+
 	kfree(job->request);
 }
 
@@ -200,7 +234,6 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
  */
 static bool bsg_prepare_job(struct device *dev, struct request *req)
 {
-	struct request *rsp = req->next_rq;
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 	int ret;
 
@@ -211,8 +244,8 @@ static bool bsg_prepare_job(struct device *dev, struct request *req)
 		if (ret)
 			goto failjob_rls_job;
 	}
-	if (rsp && rsp->bio) {
-		ret = bsg_map_buffer(&job->reply_payload, rsp);
+	if (job->bidi_rq) {
+		ret = bsg_map_buffer(&job->reply_payload, job->bidi_rq);
 		if (ret)
 			goto failjob_rls_rqst_payload;
 	}
@@ -369,7 +402,6 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	}
 
 	q->queuedata = dev;
-	blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 
 	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
diff --git a/block/bsg.c b/block/bsg.c
index a799b0ace55c..f306853c6b08 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -74,6 +74,11 @@ static int bsg_scsi_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
 {
 	struct scsi_request *sreq = scsi_req(rq);
 
+	if (hdr->dout_xfer_len && hdr->din_xfer_len) {
+		pr_warn_once("BIDI support in bsg has been removed.\n");
+		return -EOPNOTSUPP;
+	}
+
 	sreq->cmd_len = hdr->request_len;
 	if (sreq->cmd_len > BLK_MAX_CDB) {
 		sreq->cmd = kzalloc(sreq->cmd_len, GFP_KERNEL);
@@ -114,14 +119,10 @@ static int bsg_scsi_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
 			hdr->response_len = len;
 	}
 
-	if (rq->next_rq) {
-		hdr->dout_resid = sreq->resid_len;
-		hdr->din_resid = scsi_req(rq->next_rq)->resid_len;
-	} else if (rq_data_dir(rq) == READ) {
+	if (rq_data_dir(rq) == READ)
 		hdr->din_resid = sreq->resid_len;
-	} else {
+	else
 		hdr->dout_resid = sreq->resid_len;
-	}
 
 	return ret;
 }
@@ -140,8 +141,8 @@ static const struct bsg_ops bsg_scsi_ops = {
 
 static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
 {
-	struct request *rq, *next_rq = NULL;
-	struct bio *bio, *bidi_bio = NULL;
+	struct request *rq;
+	struct bio *bio;
 	struct sg_io_v4 hdr;
 	int ret;
 
@@ -164,7 +165,7 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
 
 	ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode);
 	if (ret)
-		goto out;
+		return ret;
 
 	rq->timeout = msecs_to_jiffies(hdr.timeout);
 	if (!rq->timeout)
@@ -174,29 +175,6 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
 	if (rq->timeout < BLK_MIN_SG_TIMEOUT)
 		rq->timeout = BLK_MIN_SG_TIMEOUT;
 
-	if (hdr.dout_xfer_len && hdr.din_xfer_len) {
-		if (!test_bit(QUEUE_FLAG_BIDI, &q->queue_flags)) {
-			ret = -EOPNOTSUPP;
-			goto out;
-		}
-
-		pr_warn_once(
-			"BIDI support in bsg has been deprecated and might be removed. "
-			"Please report your use case to linux-scsi@vger.kernel.org\n");
-
-		next_rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
-		if (IS_ERR(next_rq)) {
-			ret = PTR_ERR(next_rq);
-			goto out;
-		}
-
-		rq->next_rq = next_rq;
-		ret = blk_rq_map_user(q, next_rq, NULL, uptr64(hdr.din_xferp),
-				       hdr.din_xfer_len, GFP_KERNEL);
-		if (ret)
-			goto out_free_nextrq;
-	}
-
 	if (hdr.dout_xfer_len) {
 		ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.dout_xferp),
 				hdr.dout_xfer_len, GFP_KERNEL);
@@ -206,38 +184,20 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
 	}
 
 	if (ret)
-		goto out_unmap_nextrq;
+		goto out_free_rq;
 
 	bio = rq->bio;
-	if (rq->next_rq)
-		bidi_bio = rq->next_rq->bio;
 
 	blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL));
 	ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr);
-
-	if (rq->next_rq) {
-		blk_rq_unmap_user(bidi_bio);
-		blk_put_request(rq->next_rq);
-	}
-
 	blk_rq_unmap_user(bio);
+
+out_free_rq:
 	rq->q->bsg_dev.ops->free_rq(rq);
 	blk_put_request(rq);
-
-	if (copy_to_user(uarg, &hdr, sizeof(hdr)))
+	if (!ret && copy_to_user(uarg, &hdr, sizeof(hdr)))
 		return -EFAULT;
 	return ret;
-
-out_unmap_nextrq:
-	if (rq->next_rq)
-		blk_rq_unmap_user(rq->next_rq->bio);
-out_free_nextrq:
-	if (rq->next_rq)
-		blk_put_request(rq->next_rq);
-out:
-	q->bsg_dev.ops->free_rq(rq);
-	blk_put_request(rq);
-	return ret;
 }
 
 static struct bsg_device *bsg_alloc_device(void)
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 692b46937e52..60f1a81d2034 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -213,7 +213,6 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 		to_sas_host_attrs(shost)->q = q;
 	}
 
-	blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
 	return 0;
 }
 
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index b356e0006731..7f14517a559b 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -69,6 +69,10 @@ struct bsg_job {
 	int result;
 	unsigned int reply_payload_rcv_len;
 
+	/* BIDI support */
+	struct request *bidi_rq;
+	struct bio *bidi_bio;
+
 	void *dd_data;		/* Used for driver-specific storage */
 };
 
-- 
cgit v1.2.3


From 19fcae3d4f2dd513d472055fb694d802e9a21b91 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 8 Nov 2018 09:28:20 +0100
Subject: scsi: remove the SCSI OSD library

Now that all the users are gone the SCSI OSD library can be removed as
well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 Documentation/scsi/osd.txt       |  192 ----
 MAINTAINERS                      |    6 -
 drivers/scsi/Kconfig             |    2 -
 drivers/scsi/Makefile            |    1 -
 drivers/scsi/osd/Kbuild          |   20 -
 drivers/scsi/osd/Kconfig         |   49 -
 drivers/scsi/osd/osd_debug.h     |   30 -
 drivers/scsi/osd/osd_initiator.c | 2076 --------------------------------------
 drivers/scsi/osd/osd_uld.c       |  571 -----------
 include/scsi/osd_initiator.h     |  511 ----------
 include/scsi/osd_ore.h           |  201 ----
 11 files changed, 3659 deletions(-)
 delete mode 100644 Documentation/scsi/osd.txt
 delete mode 100644 drivers/scsi/osd/Kbuild
 delete mode 100644 drivers/scsi/osd/Kconfig
 delete mode 100644 drivers/scsi/osd/osd_debug.h
 delete mode 100644 drivers/scsi/osd/osd_initiator.c
 delete mode 100644 drivers/scsi/osd/osd_uld.c
 delete mode 100644 include/scsi/osd_initiator.h
 delete mode 100644 include/scsi/osd_ore.h

(limited to 'include')

diff --git a/Documentation/scsi/osd.txt b/Documentation/scsi/osd.txt
deleted file mode 100644
index 2bc2ab06b0c0..000000000000
--- a/Documentation/scsi/osd.txt
+++ /dev/null
@@ -1,192 +0,0 @@
-The OSD Standard
-================
-OSD (Object-Based Storage Device) is a T10 SCSI command set that is designed
-to provide efficient operation of input/output logical units that manage the
-allocation, placement, and accessing of variable-size data-storage containers,
-called objects. Objects are intended to contain operating system and application
-constructs. Each object has associated attributes attached to it, which are
-integral part of the object and provide metadata about the object. The standard
-defines some common obligatory attributes, but user attributes can be added as
-needed.
-
-See: http://www.t10.org/ftp/t10/drafts/osd2/ for the latest draft for OSD 2
-or search the web for "OSD SCSI"
-
-OSD in the Linux Kernel
-=======================
-osd-initiator:
-  The main component of OSD in Kernel is the osd-initiator library. Its main
-user is intended to be the pNFS-over-objects layout driver, which uses objects
-as its back-end data storage. Other clients are the other osd parts listed below.
-
-osd-uld:
-  This is a SCSI ULD that registers for OSD type devices and provides a testing
-platform, both for the in-kernel initiator as well as connected targets. It
-currently has no useful user-mode API, though it could have if need be.
-
-osd target:
-  There are no current plans for an OSD target implementation in kernel. For all
-needs, a user-mode target that is based on the scsi tgt target framework is
-available from Ohio Supercomputer Center (OSC) at:
-http://www.open-osd.org/bin/view/Main/OscOsdProject
-There are several other target implementations. See http://open-osd.org for more
-links.
-
-Files and Folders
-=================
-This is the complete list of files included in this work:
-include/scsi/
-	osd_initiator.h   Main API for the initiator library
-	osd_types.h	  Common OSD types
-	osd_sec.h	  Security Manager API
-	osd_protocol.h	  Wire definitions of the OSD standard protocol
-	osd_attributes.h  Wire definitions of OSD attributes
-
-drivers/scsi/osd/
-	osd_initiator.c   OSD-Initiator library implementation
-	osd_uld.c	  The OSD scsi ULD
-	osd_ktest.{h,c}	  In-kernel test suite (called by osd_uld)
-	osd_debug.h	  Some printk macros
-	Makefile	  For both in-tree and out-of-tree compilation
-	Kconfig		  Enables inclusion of the different pieces
-	osd_test.c	  User-mode application to call the kernel tests
-
-The OSD-Initiator Library
-=========================
-osd_initiator is a low level implementation of an osd initiator encoder.
-But even though, it should be intuitive and easy to use. Perhaps over time an
-higher lever will form that automates some of the more common recipes.
-
-init/fini:
-- osd_dev_init() associates a scsi_device with an osd_dev structure
-  and initializes some global pools. This should be done once per scsi_device
-  (OSD LUN). The osd_dev structure is needed for calling osd_start_request().
-
-- osd_dev_fini() cleans up before a osd_dev/scsi_device destruction.
-
-OSD commands encoding, execution, and decoding of results:
-
-struct osd_request's is used to iteratively encode an OSD command and carry
-its state throughout execution. Each request goes through these stages:
-
-a. osd_start_request() allocates the request.
-
-b. Any of the osd_req_* methods is used to encode a request of the specified
-   type.
-
-c. osd_req_add_{get,set}_attr_* may be called to add get/set attributes to the
-   CDB. "List" or "Page" mode can be used exclusively. The attribute-list API
-   can be called multiple times on the same request. However, only one
-   attribute-page can be read, as mandated by the OSD standard.
-
-d. osd_finalize_request() computes offsets into the data-in and data-out buffers
-   and signs the request using the provided capability key and integrity-
-   check parameters.
-
-e. osd_execute_request() may be called to execute the request via the block
-   layer and wait for its completion.  The request can be executed
-   asynchronously by calling the block layer API directly.
-
-f. After execution, osd_req_decode_sense() can be called to decode the request's
-   sense information.
-
-g. osd_req_decode_get_attr() may be called to retrieve osd_add_get_attr_list()
-   values.
-
-h. osd_end_request() must be called to deallocate the request and any resource
-   associated with it. Note that osd_end_request cleans up the request at any
-   stage and it must always be called after a successful osd_start_request().
-
-osd_request's structure:
-
-The OSD standard defines a complex structure of IO segments pointed to by
-members in the CDB. Up to 3 segments can be deployed in the IN-Buffer and up to
-4 in the OUT-Buffer. The ASCII illustration below depicts a secure-read with
-associated get+set of attributes-lists. Other combinations very on the same
-basic theme. From no-segments-used up to all-segments-used.
-
-|________OSD-CDB__________|
-|                         |
-|read_len (offset=0)     -|---------\
-|                         |         |
-|get_attrs_list_length    |         |
-|get_attrs_list_offset   -|----\    |
-|                         |    |    |
-|retrieved_attrs_alloc_len|    |    |
-|retrieved_attrs_offset  -|----|----|-\
-|                         |    |    | |
-|set_attrs_list_length    |    |    | |
-|set_attrs_list_offset   -|-\  |    | |
-|                         | |  |    | |
-|in_data_integ_offset    -|-|--|----|-|-\
-|out_data_integ_offset   -|-|--|--\ | | |
-\_________________________/ |  |  | | | |
-                            |  |  | | | |
-|_______OUT-BUFFER________| |  |  | | | |
-|      Set attr list      |</  |  | | | |
-|                         |    |  | | | |
-|-------------------------|    |  | | | |
-|   Get attr descriptors  |<---/  | | | |
-|                         |       | | | |
-|-------------------------|       | | | |
-|    Out-data integrity   |<------/ | | |
-|                         |         | | |
-\_________________________/         | | |
-                                    | | |
-|________IN-BUFFER________|         | | |
-|      In-Data read       |<--------/ | |
-|                         |           | |
-|-------------------------|           | |
-|      Get attr list      |<----------/ |
-|                         |             |
-|-------------------------|             |
-|    In-data integrity    |<------------/
-|                         |
-\_________________________/
-
-A block device request can carry bidirectional payload by means of associating
-a bidi_read request with a main write-request. Each in/out request is described
-by a chain of BIOs associated with each request.
-The CDB is of a SCSI VARLEN CDB format, as described by OSD standard.
-The OSD standard also mandates alignment restrictions at start of each segment.
-
-In the code, in struct osd_request, there are two _osd_io_info structures to
-describe the IN/OUT buffers above, two BIOs for the data payload and up to five
-_osd_req_data_segment structures to hold the different segments allocation and
-information.
-
-Important: We have chosen to disregard the assumption that a BIO-chain (and
-the resulting sg-list) describes a linear memory buffer. Meaning only first and
-last scatter chain can be incomplete and all the middle chains are of PAGE_SIZE.
-For us, a scatter-gather-list, as its name implies and as used by the Networking
-layer, is to describe a vector of buffers that will be transferred to/from the
-wire. It works very well with current iSCSI transport. iSCSI is currently the
-only deployed OSD transport. In the future we anticipate SAS and FC attached OSD
-devices as well.
-
-The OSD Testing ULD
-===================
-TODO: More user-mode control on tests.
-
-Authors, Mailing list
-=====================
-Please communicate with us on any deployment of osd, whether using this code
-or not.
-
-Any problems, questions, bug reports, lonely OSD nights, please email:
-   OSD Dev List <osd-dev@open-osd.org>
-
-More up-to-date information can be found on:
-http://open-osd.org
-
-Boaz Harrosh <ooo@electrozaur.com>
-
-References
-==========
-Weber, R., "SCSI Object-Based Storage Device Commands",
-T10/1355-D ANSI/INCITS 400-2004,
-http://www.t10.org/ftp/t10/drafts/osd/osd-r10.pdf
-
-Weber, R., "SCSI Object-Based Storage Device Commands -2 (OSD-2)"
-T10/1729-D, Working Draft, rev. 3
-http://www.t10.org/ftp/t10/drafts/osd2/osd2r03.pdf
diff --git a/MAINTAINERS b/MAINTAINERS
index 108b340ab625..0cd037950bc3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11385,12 +11385,6 @@ W:	http://www.nongnu.org/orinoco/
 S:	Orphan
 F:	drivers/net/wireless/intersil/orinoco/
 
-OSD LIBRARY and FILESYSTEM
-M:	Boaz Harrosh <ooo@electrozaur.com>
-S:	Maintained
-F:	drivers/scsi/osd/
-F:	include/scsi/osd_*
-
 OV2659 OMNIVISION SENSOR DRIVER
 M:	"Lad, Prabhakar" <prabhakar.csengg@gmail.com>
 L:	linux-media@vger.kernel.org
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 85c5c89f2fa5..237c6294738d 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1515,6 +1515,4 @@ source "drivers/scsi/pcmcia/Kconfig"
 
 source "drivers/scsi/device_handler/Kconfig"
 
-source "drivers/scsi/osd/Kconfig"
-
 endmenu
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index fcb41ae329c4..8826111fdf4a 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -150,7 +150,6 @@ obj-$(CONFIG_CHR_DEV_SG)	+= sg.o
 obj-$(CONFIG_CHR_DEV_SCH)	+= ch.o
 obj-$(CONFIG_SCSI_ENCLOSURE)	+= ses.o
 
-obj-$(CONFIG_SCSI_OSD_INITIATOR) += osd/
 obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/
 
 # This goes last, so that "real" scsi devices probe earlier
diff --git a/drivers/scsi/osd/Kbuild b/drivers/scsi/osd/Kbuild
deleted file mode 100644
index 58cecd45b0f5..000000000000
--- a/drivers/scsi/osd/Kbuild
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Kbuild for the OSD modules
-#
-# Copyright (C) 2008 Panasas Inc.  All rights reserved.
-#
-# Authors:
-#   Boaz Harrosh <ooo@electrozaur.com>
-#   Benny Halevy <bhalevy@panasas.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2
-#
-
-# libosd.ko - osd-initiator library
-libosd-y := osd_initiator.o
-obj-$(CONFIG_SCSI_OSD_INITIATOR) += libosd.o
-
-# osd.ko - SCSI ULD and char-device
-osd-y := osd_uld.o
-obj-$(CONFIG_SCSI_OSD_ULD) += osd.o
diff --git a/drivers/scsi/osd/Kconfig b/drivers/scsi/osd/Kconfig
deleted file mode 100644
index 347cc5e33749..000000000000
--- a/drivers/scsi/osd/Kconfig
+++ /dev/null
@@ -1,49 +0,0 @@
-#
-# Kernel configuration file for the OSD scsi protocol
-#
-# Copyright (C) 2008 Panasas Inc.  All rights reserved.
-#
-# Authors:
-#   Boaz Harrosh <ooo@electrozaur.com>
-#   Benny Halevy <bhalevy@panasas.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public version 2 License as
-# published by the Free Software Foundation
-#
-config SCSI_OSD_INITIATOR
-	tristate "OSD-Initiator library"
-	depends on SCSI
-	help
-		Enable the OSD-Initiator library (libosd.ko).
-		NOTE: You must also select CRYPTO_SHA1 + CRYPTO_HMAC and their
-		dependencies
-
-config SCSI_OSD_ULD
-	tristate "OSD Upper Level driver"
-	depends on SCSI_OSD_INITIATOR
-	help
-		Build a SCSI upper layer driver that exports /dev/osdX devices
-		to user-mode for testing and controlling OSD devices. It is also
-		needed by exofs, for mounting an OSD based file system.
-
-config SCSI_OSD_DPRINT_SENSE
-    int "(0-2) When sense is returned, DEBUG print all sense descriptors"
-    default 1
-    depends on SCSI_OSD_INITIATOR
-    help
-        When a CHECK_CONDITION status is returned from a target, and a
-        sense-buffer is retrieved, turning this on will dump a full
-        sense-decoding message. Setting to 2 will also print recoverable
-        errors that might be regularly returned for some filesystem
-        operations.
-
-config SCSI_OSD_DEBUG
-	bool "Compile All OSD modules with lots of DEBUG prints"
-	default n
-	depends on SCSI_OSD_INITIATOR
-	help
-		OSD Code is populated with lots of OSD_DEBUG(..) printouts to
-		dmesg. Enable this if you found a bug and you want to help us
-		track the problem (see also MAINTAINERS). Setting this will also
-		force SCSI_OSD_DPRINT_SENSE=2.
diff --git a/drivers/scsi/osd/osd_debug.h b/drivers/scsi/osd/osd_debug.h
deleted file mode 100644
index 26341261bb5c..000000000000
--- a/drivers/scsi/osd/osd_debug.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * osd_debug.h - Some kprintf macros
- *
- * Copyright (C) 2008 Panasas Inc.  All rights reserved.
- *
- * Authors:
- *   Boaz Harrosh <ooo@electrozaur.com>
- *   Benny Halevy <bhalevy@panasas.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- *
- */
-#ifndef __OSD_DEBUG_H__
-#define __OSD_DEBUG_H__
-
-#define OSD_ERR(fmt, a...) printk(KERN_ERR "osd: " fmt, ##a)
-#define OSD_INFO(fmt, a...) printk(KERN_NOTICE "osd: " fmt, ##a)
-
-#ifdef CONFIG_SCSI_OSD_DEBUG
-#define OSD_DEBUG(fmt, a...) \
-	printk(KERN_NOTICE "osd @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define OSD_DEBUG(fmt, a...) do {} while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-#endif /* ndef __OSD_DEBUG_H__ */
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
deleted file mode 100644
index 60cf7c5eb880..000000000000
--- a/drivers/scsi/osd/osd_initiator.c
+++ /dev/null
@@ -1,2076 +0,0 @@
-/*
- * osd_initiator - Main body of the osd initiator library.
- *
- * Note: The file does not contain the advanced security functionality which
- * is only needed by the security_manager's initiators.
- *
- * Copyright (C) 2008 Panasas Inc.  All rights reserved.
- *
- * Authors:
- *   Boaz Harrosh <ooo@electrozaur.com>
- *   Benny Halevy <bhalevy@panasas.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  1. Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *  3. Neither the name of the Panasas company nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-#include <linux/module.h>
-
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_sec.h>
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_sense.h>
-
-#include <scsi/scsi_device.h>
-#include <scsi/scsi_request.h>
-
-#include "osd_debug.h"
-
-#ifndef __unused
-#    define __unused			__attribute__((unused))
-#endif
-
-enum { OSD_REQ_RETRIES = 1 };
-
-MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
-MODULE_DESCRIPTION("open-osd initiator library libosd.ko");
-MODULE_LICENSE("GPL");
-
-static inline void build_test(void)
-{
-	/* structures were not packed */
-	BUILD_BUG_ON(sizeof(struct osd_capability) != OSD_CAP_LEN);
-	BUILD_BUG_ON(sizeof(struct osdv2_cdb) != OSD_TOTAL_CDB_LEN);
-	BUILD_BUG_ON(sizeof(struct osdv1_cdb) != OSDv1_TOTAL_CDB_LEN);
-}
-
-static const char *_osd_ver_desc(struct osd_request *or)
-{
-	return osd_req_is_ver1(or) ? "OSD1" : "OSD2";
-}
-
-#define ATTR_DEF_RI(id, len) ATTR_DEF(OSD_APAGE_ROOT_INFORMATION, id, len)
-
-static int _osd_get_print_system_info(struct osd_dev *od,
-	void *caps, struct osd_dev_info *odi)
-{
-	struct osd_request *or;
-	struct osd_attr get_attrs[] = {
-		ATTR_DEF_RI(OSD_ATTR_RI_VENDOR_IDENTIFICATION, 8),
-		ATTR_DEF_RI(OSD_ATTR_RI_PRODUCT_IDENTIFICATION, 16),
-		ATTR_DEF_RI(OSD_ATTR_RI_PRODUCT_MODEL, 32),
-		ATTR_DEF_RI(OSD_ATTR_RI_PRODUCT_REVISION_LEVEL, 4),
-		ATTR_DEF_RI(OSD_ATTR_RI_PRODUCT_SERIAL_NUMBER, 64 /*variable*/),
-		ATTR_DEF_RI(OSD_ATTR_RI_OSD_NAME, 64 /*variable*/),
-		ATTR_DEF_RI(OSD_ATTR_RI_TOTAL_CAPACITY, 8),
-		ATTR_DEF_RI(OSD_ATTR_RI_USED_CAPACITY, 8),
-		ATTR_DEF_RI(OSD_ATTR_RI_NUMBER_OF_PARTITIONS, 8),
-		ATTR_DEF_RI(OSD_ATTR_RI_CLOCK, 6),
-		/* IBM-OSD-SIM Has a bug with this one put it last */
-		ATTR_DEF_RI(OSD_ATTR_RI_OSD_SYSTEM_ID, 20),
-	};
-	void *iter = NULL, *pFirst;
-	int nelem = ARRAY_SIZE(get_attrs), a = 0;
-	int ret;
-
-	or = osd_start_request(od);
-	if (!or)
-		return -ENOMEM;
-
-	/* get attrs */
-	osd_req_get_attributes(or, &osd_root_object);
-	osd_req_add_get_attr_list(or, get_attrs, ARRAY_SIZE(get_attrs));
-
-	ret = osd_finalize_request(or, 0, caps, NULL);
-	if (ret)
-		goto out;
-
-	ret = osd_execute_request(or);
-	if (ret) {
-		OSD_ERR("Failed to detect %s => %d\n", _osd_ver_desc(or), ret);
-		goto out;
-	}
-
-	osd_req_decode_get_attr_list(or, get_attrs, &nelem, &iter);
-
-	OSD_INFO("Detected %s device\n",
-		_osd_ver_desc(or));
-
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("VENDOR_IDENTIFICATION  [%s]\n",
-		(char *)pFirst);
-
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("PRODUCT_IDENTIFICATION [%s]\n",
-		(char *)pFirst);
-
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("PRODUCT_MODEL          [%s]\n",
-		(char *)pFirst);
-
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("PRODUCT_REVISION_LEVEL [%u]\n",
-		pFirst ? get_unaligned_be32(pFirst) : ~0U);
-
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("PRODUCT_SERIAL_NUMBER  [%s]\n",
-		(char *)pFirst);
-
-	odi->osdname_len = get_attrs[a].len;
-	/* Avoid NULL for memcmp optimization 0-length is good enough */
-	odi->osdname = kzalloc(odi->osdname_len + 1, GFP_KERNEL);
-	if (!odi->osdname) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	if (odi->osdname_len)
-		memcpy(odi->osdname, get_attrs[a].val_ptr, odi->osdname_len);
-	OSD_INFO("OSD_NAME               [%s]\n", odi->osdname);
-	a++;
-
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("TOTAL_CAPACITY         [0x%llx]\n",
-		pFirst ? _LLU(get_unaligned_be64(pFirst)) : ~0ULL);
-
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("USED_CAPACITY          [0x%llx]\n",
-		pFirst ? _LLU(get_unaligned_be64(pFirst)) : ~0ULL);
-
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("NUMBER_OF_PARTITIONS   [%llu]\n",
-		pFirst ? _LLU(get_unaligned_be64(pFirst)) : ~0ULL);
-
-	if (a >= nelem)
-		goto out;
-
-	/* FIXME: Where are the time utilities */
-	pFirst = get_attrs[a++].val_ptr;
-	OSD_INFO("CLOCK                  [0x%6phN]\n", pFirst);
-
-	if (a < nelem) { /* IBM-OSD-SIM bug, Might not have it */
-		unsigned len = get_attrs[a].len;
-		char sid_dump[32*4 + 2]; /* 2nibbles+space+ASCII */
-
-		hex_dump_to_buffer(get_attrs[a].val_ptr, len, 32, 1,
-				   sid_dump, sizeof(sid_dump), true);
-		OSD_INFO("OSD_SYSTEM_ID(%d)\n"
-			 "        [%s]\n", len, sid_dump);
-
-		if (unlikely(len > sizeof(odi->systemid))) {
-			OSD_ERR("OSD Target error: OSD_SYSTEM_ID too long(%d). "
-				"device identification might not work\n", len);
-			len = sizeof(odi->systemid);
-		}
-		odi->systemid_len = len;
-		memcpy(odi->systemid, get_attrs[a].val_ptr, len);
-		a++;
-	}
-out:
-	osd_end_request(or);
-	return ret;
-}
-
-int osd_auto_detect_ver(struct osd_dev *od,
-	void *caps, struct osd_dev_info *odi)
-{
-	int ret;
-
-	/* Auto-detect the osd version */
-	ret = _osd_get_print_system_info(od, caps, odi);
-	if (ret) {
-		osd_dev_set_ver(od, OSD_VER1);
-		OSD_DEBUG("converting to OSD1\n");
-		ret = _osd_get_print_system_info(od, caps, odi);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL(osd_auto_detect_ver);
-
-static unsigned _osd_req_cdb_len(struct osd_request *or)
-{
-	return osd_req_is_ver1(or) ? OSDv1_TOTAL_CDB_LEN : OSD_TOTAL_CDB_LEN;
-}
-
-static unsigned _osd_req_alist_elem_size(struct osd_request *or, unsigned len)
-{
-	return osd_req_is_ver1(or) ?
-		osdv1_attr_list_elem_size(len) :
-		osdv2_attr_list_elem_size(len);
-}
-
-static void _osd_req_alist_elem_encode(struct osd_request *or,
-	void *attr_last, const struct osd_attr *oa)
-{
-	if (osd_req_is_ver1(or)) {
-		struct osdv1_attributes_list_element *attr = attr_last;
-
-		attr->attr_page = cpu_to_be32(oa->attr_page);
-		attr->attr_id = cpu_to_be32(oa->attr_id);
-		attr->attr_bytes = cpu_to_be16(oa->len);
-		memcpy(attr->attr_val, oa->val_ptr, oa->len);
-	} else {
-		struct osdv2_attributes_list_element *attr = attr_last;
-
-		attr->attr_page = cpu_to_be32(oa->attr_page);
-		attr->attr_id = cpu_to_be32(oa->attr_id);
-		attr->attr_bytes = cpu_to_be16(oa->len);
-		memcpy(attr->attr_val, oa->val_ptr, oa->len);
-	}
-}
-
-static int _osd_req_alist_elem_decode(struct osd_request *or,
-	void *cur_p, struct osd_attr *oa, unsigned max_bytes)
-{
-	unsigned inc;
-	if (osd_req_is_ver1(or)) {
-		struct osdv1_attributes_list_element *attr = cur_p;
-
-		if (max_bytes < sizeof(*attr))
-			return -1;
-
-		oa->len = be16_to_cpu(attr->attr_bytes);
-		inc = _osd_req_alist_elem_size(or, oa->len);
-		if (inc > max_bytes)
-			return -1;
-
-		oa->attr_page = be32_to_cpu(attr->attr_page);
-		oa->attr_id = be32_to_cpu(attr->attr_id);
-
-		/* OSD1: On empty attributes we return a pointer to 2 bytes
-		 * of zeros. This keeps similar behaviour with OSD2.
-		 * (See below)
-		 */
-		oa->val_ptr = likely(oa->len) ? attr->attr_val :
-						(u8 *)&attr->attr_bytes;
-	} else {
-		struct osdv2_attributes_list_element *attr = cur_p;
-
-		if (max_bytes < sizeof(*attr))
-			return -1;
-
-		oa->len = be16_to_cpu(attr->attr_bytes);
-		inc = _osd_req_alist_elem_size(or, oa->len);
-		if (inc > max_bytes)
-			return -1;
-
-		oa->attr_page = be32_to_cpu(attr->attr_page);
-		oa->attr_id = be32_to_cpu(attr->attr_id);
-
-		/* OSD2: For convenience, on empty attributes, we return 8 bytes
-		 * of zeros here. This keeps the same behaviour with OSD2r04,
-		 * and is nice with null terminating ASCII fields.
-		 * oa->val_ptr == NULL marks the end-of-list, or error.
-		 */
-		oa->val_ptr = likely(oa->len) ? attr->attr_val : attr->reserved;
-	}
-	return inc;
-}
-
-static unsigned _osd_req_alist_size(struct osd_request *or, void *list_head)
-{
-	return osd_req_is_ver1(or) ?
-		osdv1_list_size(list_head) :
-		osdv2_list_size(list_head);
-}
-
-static unsigned _osd_req_sizeof_alist_header(struct osd_request *or)
-{
-	return osd_req_is_ver1(or) ?
-		sizeof(struct osdv1_attributes_list_header) :
-		sizeof(struct osdv2_attributes_list_header);
-}
-
-static void _osd_req_set_alist_type(struct osd_request *or,
-	void *list, int list_type)
-{
-	if (osd_req_is_ver1(or)) {
-		struct osdv1_attributes_list_header *attr_list = list;
-
-		memset(attr_list, 0, sizeof(*attr_list));
-		attr_list->type = list_type;
-	} else {
-		struct osdv2_attributes_list_header *attr_list = list;
-
-		memset(attr_list, 0, sizeof(*attr_list));
-		attr_list->type = list_type;
-	}
-}
-
-static bool _osd_req_is_alist_type(struct osd_request *or,
-	void *list, int list_type)
-{
-	if (!list)
-		return false;
-
-	if (osd_req_is_ver1(or)) {
-		struct osdv1_attributes_list_header *attr_list = list;
-
-		return attr_list->type == list_type;
-	} else {
-		struct osdv2_attributes_list_header *attr_list = list;
-
-		return attr_list->type == list_type;
-	}
-}
-
-/* This is for List-objects not Attributes-Lists */
-static void _osd_req_encode_olist(struct osd_request *or,
-	struct osd_obj_id_list *list)
-{
-	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
-
-	if (osd_req_is_ver1(or)) {
-		cdbh->v1.list_identifier = list->list_identifier;
-		cdbh->v1.start_address = list->continuation_id;
-	} else {
-		cdbh->v2.list_identifier = list->list_identifier;
-		cdbh->v2.start_address = list->continuation_id;
-	}
-}
-
-static osd_cdb_offset osd_req_encode_offset(struct osd_request *or,
-	u64 offset, unsigned *padding)
-{
-	return __osd_encode_offset(offset, padding,
-			osd_req_is_ver1(or) ?
-				OSDv1_OFFSET_MIN_SHIFT : OSD_OFFSET_MIN_SHIFT,
-			OSD_OFFSET_MAX_SHIFT);
-}
-
-static struct osd_security_parameters *
-_osd_req_sec_params(struct osd_request *or)
-{
-	struct osd_cdb *ocdb = &or->cdb;
-
-	if (osd_req_is_ver1(or))
-		return (struct osd_security_parameters *)&ocdb->v1.sec_params;
-	else
-		return (struct osd_security_parameters *)&ocdb->v2.sec_params;
-}
-
-void osd_dev_init(struct osd_dev *osdd, struct scsi_device *scsi_device)
-{
-	memset(osdd, 0, sizeof(*osdd));
-	osdd->scsi_device = scsi_device;
-	osdd->def_timeout = BLK_DEFAULT_SG_TIMEOUT;
-#ifdef OSD_VER1_SUPPORT
-	osdd->version = OSD_VER2;
-#endif
-	/* TODO: Allocate pools for osd_request attributes ... */
-}
-EXPORT_SYMBOL(osd_dev_init);
-
-void osd_dev_fini(struct osd_dev *osdd)
-{
-	/* TODO: De-allocate pools */
-
-	osdd->scsi_device = NULL;
-}
-EXPORT_SYMBOL(osd_dev_fini);
-
-static struct osd_request *_osd_request_alloc(gfp_t gfp)
-{
-	struct osd_request *or;
-
-	/* TODO: Use mempool with one saved request */
-	or = kzalloc(sizeof(*or), gfp);
-	return or;
-}
-
-static void _osd_request_free(struct osd_request *or)
-{
-	kfree(or);
-}
-
-struct osd_request *osd_start_request(struct osd_dev *dev)
-{
-	struct osd_request *or;
-
-	or = _osd_request_alloc(GFP_KERNEL);
-	if (!or)
-		return NULL;
-
-	or->osd_dev = dev;
-	or->timeout = dev->def_timeout;
-	or->retries = OSD_REQ_RETRIES;
-
-	return or;
-}
-EXPORT_SYMBOL(osd_start_request);
-
-static void _osd_free_seg(struct osd_request *or __unused,
-	struct _osd_req_data_segment *seg)
-{
-	if (!seg->buff || !seg->alloc_size)
-		return;
-
-	kfree(seg->buff);
-	seg->buff = NULL;
-	seg->alloc_size = 0;
-}
-
-static void _put_request(struct request *rq)
-{
-	/*
-	 * If osd_finalize_request() was called but the request was not
-	 * executed through the block layer, then we must release BIOs.
-	 * TODO: Keep error code in or->async_error. Need to audit all
-	 *       code paths.
-	 */
-	if (unlikely(rq->bio))
-		blk_mq_end_request(rq, BLK_STS_IOERR);
-	else
-		blk_put_request(rq);
-}
-
-void osd_end_request(struct osd_request *or)
-{
-	struct request *rq = or->request;
-
-	if (rq) {
-		if (rq->next_rq) {
-			_put_request(rq->next_rq);
-			rq->next_rq = NULL;
-		}
-
-		_put_request(rq);
-	}
-
-	_osd_free_seg(or, &or->get_attr);
-	_osd_free_seg(or, &or->enc_get_attr);
-	_osd_free_seg(or, &or->set_attr);
-	_osd_free_seg(or, &or->cdb_cont);
-
-	_osd_request_free(or);
-}
-EXPORT_SYMBOL(osd_end_request);
-
-static void _set_error_resid(struct osd_request *or, struct request *req,
-			     blk_status_t error)
-{
-	or->async_error = error;
-	or->req_errors = scsi_req(req)->result;
-	or->sense_len = scsi_req(req)->sense_len;
-	if (or->sense_len)
-		memcpy(or->sense, scsi_req(req)->sense, or->sense_len);
-	if (or->out.req)
-		or->out.residual = scsi_req(or->out.req)->resid_len;
-	if (or->in.req)
-		or->in.residual = scsi_req(or->in.req)->resid_len;
-}
-
-int osd_execute_request(struct osd_request *or)
-{
-	blk_execute_rq(or->request->q, NULL, or->request, 0);
-
-	if (scsi_req(or->request)->result) {
-		_set_error_resid(or, or->request, BLK_STS_IOERR);
-		return -EIO;
-	}
-
-	_set_error_resid(or, or->request, BLK_STS_OK);
-	return 0;
-}
-EXPORT_SYMBOL(osd_execute_request);
-
-static void osd_request_async_done(struct request *req, blk_status_t error)
-{
-	struct osd_request *or = req->end_io_data;
-
-	_set_error_resid(or, req, error);
-	if (req->next_rq) {
-		blk_put_request(req->next_rq);
-		req->next_rq = NULL;
-	}
-
-	blk_put_request(req);
-	or->request = NULL;
-	or->in.req = NULL;
-	or->out.req = NULL;
-
-	if (or->async_done)
-		or->async_done(or, or->async_private);
-	else
-		osd_end_request(or);
-}
-
-int osd_execute_request_async(struct osd_request *or,
-	osd_req_done_fn *done, void *private)
-{
-	or->request->end_io_data = or;
-	or->async_private = private;
-	or->async_done = done;
-
-	blk_execute_rq_nowait(or->request->q, NULL, or->request, 0,
-			      osd_request_async_done);
-	return 0;
-}
-EXPORT_SYMBOL(osd_execute_request_async);
-
-u8 sg_out_pad_buffer[1 << OSDv1_OFFSET_MIN_SHIFT];
-u8 sg_in_pad_buffer[1 << OSDv1_OFFSET_MIN_SHIFT];
-
-static int _osd_realloc_seg(struct osd_request *or,
-	struct _osd_req_data_segment *seg, unsigned max_bytes)
-{
-	void *buff;
-
-	if (seg->alloc_size >= max_bytes)
-		return 0;
-
-	buff = krealloc(seg->buff, max_bytes, GFP_KERNEL);
-	if (!buff) {
-		OSD_ERR("Failed to Realloc %d-bytes was-%d\n", max_bytes,
-			seg->alloc_size);
-		return -ENOMEM;
-	}
-
-	memset(buff + seg->alloc_size, 0, max_bytes - seg->alloc_size);
-	seg->buff = buff;
-	seg->alloc_size = max_bytes;
-	return 0;
-}
-
-static int _alloc_cdb_cont(struct osd_request *or, unsigned total_bytes)
-{
-	OSD_DEBUG("total_bytes=%d\n", total_bytes);
-	return _osd_realloc_seg(or, &or->cdb_cont, total_bytes);
-}
-
-static int _alloc_set_attr_list(struct osd_request *or,
-	const struct osd_attr *oa, unsigned nelem, unsigned add_bytes)
-{
-	unsigned total_bytes = add_bytes;
-
-	for (; nelem; --nelem, ++oa)
-		total_bytes += _osd_req_alist_elem_size(or, oa->len);
-
-	OSD_DEBUG("total_bytes=%d\n", total_bytes);
-	return _osd_realloc_seg(or, &or->set_attr, total_bytes);
-}
-
-static int _alloc_get_attr_desc(struct osd_request *or, unsigned max_bytes)
-{
-	OSD_DEBUG("total_bytes=%d\n", max_bytes);
-	return _osd_realloc_seg(or, &or->enc_get_attr, max_bytes);
-}
-
-static int _alloc_get_attr_list(struct osd_request *or)
-{
-	OSD_DEBUG("total_bytes=%d\n", or->get_attr.total_bytes);
-	return _osd_realloc_seg(or, &or->get_attr, or->get_attr.total_bytes);
-}
-
-/*
- * Common to all OSD commands
- */
-
-static void _osdv1_req_encode_common(struct osd_request *or,
-	__be16 act, const struct osd_obj_id *obj, u64 offset, u64 len)
-{
-	struct osdv1_cdb *ocdb = &or->cdb.v1;
-
-	/*
-	 * For speed, the commands
-	 *	OSD_ACT_PERFORM_SCSI_COMMAND	, V1 0x8F7E, V2 0x8F7C
-	 *	OSD_ACT_SCSI_TASK_MANAGEMENT	, V1 0x8F7F, V2 0x8F7D
-	 * are not supported here. Should pass zero and set after the call
-	 */
-	act &= cpu_to_be16(~0x0080); /* V1 action code */
-
-	OSD_DEBUG("OSDv1 execute opcode 0x%x\n", be16_to_cpu(act));
-
-	ocdb->h.varlen_cdb.opcode = VARIABLE_LENGTH_CMD;
-	ocdb->h.varlen_cdb.additional_cdb_length = OSD_ADDITIONAL_CDB_LENGTH;
-	ocdb->h.varlen_cdb.service_action = act;
-
-	ocdb->h.partition = cpu_to_be64(obj->partition);
-	ocdb->h.object = cpu_to_be64(obj->id);
-	ocdb->h.v1.length = cpu_to_be64(len);
-	ocdb->h.v1.start_address = cpu_to_be64(offset);
-}
-
-static void _osdv2_req_encode_common(struct osd_request *or,
-	 __be16 act, const struct osd_obj_id *obj, u64 offset, u64 len)
-{
-	struct osdv2_cdb *ocdb = &or->cdb.v2;
-
-	OSD_DEBUG("OSDv2 execute opcode 0x%x\n", be16_to_cpu(act));
-
-	ocdb->h.varlen_cdb.opcode = VARIABLE_LENGTH_CMD;
-	ocdb->h.varlen_cdb.additional_cdb_length = OSD_ADDITIONAL_CDB_LENGTH;
-	ocdb->h.varlen_cdb.service_action = act;
-
-	ocdb->h.partition = cpu_to_be64(obj->partition);
-	ocdb->h.object = cpu_to_be64(obj->id);
-	ocdb->h.v2.length = cpu_to_be64(len);
-	ocdb->h.v2.start_address = cpu_to_be64(offset);
-}
-
-static void _osd_req_encode_common(struct osd_request *or,
-	__be16 act, const struct osd_obj_id *obj, u64 offset, u64 len)
-{
-	if (osd_req_is_ver1(or))
-		_osdv1_req_encode_common(or, act, obj, offset, len);
-	else
-		_osdv2_req_encode_common(or, act, obj, offset, len);
-}
-
-/*
- * Device commands
- */
-/*TODO: void osd_req_set_master_seed_xchg(struct osd_request *, ...); */
-/*TODO: void osd_req_set_master_key(struct osd_request *, ...); */
-
-void osd_req_format(struct osd_request *or, u64 tot_capacity)
-{
-	_osd_req_encode_common(or, OSD_ACT_FORMAT_OSD, &osd_root_object, 0,
-				tot_capacity);
-}
-EXPORT_SYMBOL(osd_req_format);
-
-int osd_req_list_dev_partitions(struct osd_request *or,
-	osd_id initial_id, struct osd_obj_id_list *list, unsigned nelem)
-{
-	return osd_req_list_partition_objects(or, 0, initial_id, list, nelem);
-}
-EXPORT_SYMBOL(osd_req_list_dev_partitions);
-
-static void _osd_req_encode_flush(struct osd_request *or,
-	enum osd_options_flush_scope_values op)
-{
-	struct osd_cdb_head *ocdb = osd_cdb_head(&or->cdb);
-
-	ocdb->command_specific_options = op;
-}
-
-void osd_req_flush_obsd(struct osd_request *or,
-	enum osd_options_flush_scope_values op)
-{
-	_osd_req_encode_common(or, OSD_ACT_FLUSH_OSD, &osd_root_object, 0, 0);
-	_osd_req_encode_flush(or, op);
-}
-EXPORT_SYMBOL(osd_req_flush_obsd);
-
-/*TODO: void osd_req_perform_scsi_command(struct osd_request *,
-	const u8 *cdb, ...); */
-/*TODO: void osd_req_task_management(struct osd_request *, ...); */
-
-/*
- * Partition commands
- */
-static void _osd_req_encode_partition(struct osd_request *or,
-	__be16 act, osd_id partition)
-{
-	struct osd_obj_id par = {
-		.partition = partition,
-		.id = 0,
-	};
-
-	_osd_req_encode_common(or, act, &par, 0, 0);
-}
-
-void osd_req_create_partition(struct osd_request *or, osd_id partition)
-{
-	_osd_req_encode_partition(or, OSD_ACT_CREATE_PARTITION, partition);
-}
-EXPORT_SYMBOL(osd_req_create_partition);
-
-void osd_req_remove_partition(struct osd_request *or, osd_id partition)
-{
-	_osd_req_encode_partition(or, OSD_ACT_REMOVE_PARTITION, partition);
-}
-EXPORT_SYMBOL(osd_req_remove_partition);
-
-/*TODO: void osd_req_set_partition_key(struct osd_request *,
-	osd_id partition, u8 new_key_id[OSD_CRYPTO_KEYID_SIZE],
-	u8 seed[OSD_CRYPTO_SEED_SIZE]); */
-
-static int _osd_req_list_objects(struct osd_request *or,
-	__be16 action, const struct osd_obj_id *obj, osd_id initial_id,
-	struct osd_obj_id_list *list, unsigned nelem)
-{
-	struct request_queue *q = osd_request_queue(or->osd_dev);
-	u64 len = nelem * sizeof(osd_id) + sizeof(*list);
-	struct bio *bio;
-
-	_osd_req_encode_common(or, action, obj, (u64)initial_id, len);
-
-	if (list->list_identifier)
-		_osd_req_encode_olist(or, list);
-
-	WARN_ON(or->in.bio);
-	bio = bio_map_kern(q, list, len, GFP_KERNEL);
-	if (IS_ERR(bio)) {
-		OSD_ERR("!!! Failed to allocate list_objects BIO\n");
-		return PTR_ERR(bio);
-	}
-
-	bio_set_op_attrs(bio, REQ_OP_READ, 0);
-	or->in.bio = bio;
-	or->in.total_bytes = bio->bi_iter.bi_size;
-	return 0;
-}
-
-int osd_req_list_partition_collections(struct osd_request *or,
-	osd_id partition, osd_id initial_id, struct osd_obj_id_list *list,
-	unsigned nelem)
-{
-	struct osd_obj_id par = {
-		.partition = partition,
-		.id = 0,
-	};
-
-	return osd_req_list_collection_objects(or, &par, initial_id, list,
-					       nelem);
-}
-EXPORT_SYMBOL(osd_req_list_partition_collections);
-
-int osd_req_list_partition_objects(struct osd_request *or,
-	osd_id partition, osd_id initial_id, struct osd_obj_id_list *list,
-	unsigned nelem)
-{
-	struct osd_obj_id par = {
-		.partition = partition,
-		.id = 0,
-	};
-
-	return _osd_req_list_objects(or, OSD_ACT_LIST, &par, initial_id, list,
-				     nelem);
-}
-EXPORT_SYMBOL(osd_req_list_partition_objects);
-
-void osd_req_flush_partition(struct osd_request *or,
-	osd_id partition, enum osd_options_flush_scope_values op)
-{
-	_osd_req_encode_partition(or, OSD_ACT_FLUSH_PARTITION, partition);
-	_osd_req_encode_flush(or, op);
-}
-EXPORT_SYMBOL(osd_req_flush_partition);
-
-/*
- * Collection commands
- */
-/*TODO: void osd_req_create_collection(struct osd_request *,
-	const struct osd_obj_id *); */
-/*TODO: void osd_req_remove_collection(struct osd_request *,
-	const struct osd_obj_id *); */
-
-int osd_req_list_collection_objects(struct osd_request *or,
-	const struct osd_obj_id *obj, osd_id initial_id,
-	struct osd_obj_id_list *list, unsigned nelem)
-{
-	return _osd_req_list_objects(or, OSD_ACT_LIST_COLLECTION, obj,
-				     initial_id, list, nelem);
-}
-EXPORT_SYMBOL(osd_req_list_collection_objects);
-
-/*TODO: void query(struct osd_request *, ...); V2 */
-
-void osd_req_flush_collection(struct osd_request *or,
-	const struct osd_obj_id *obj, enum osd_options_flush_scope_values op)
-{
-	_osd_req_encode_common(or, OSD_ACT_FLUSH_PARTITION, obj, 0, 0);
-	_osd_req_encode_flush(or, op);
-}
-EXPORT_SYMBOL(osd_req_flush_collection);
-
-/*TODO: void get_member_attrs(struct osd_request *, ...); V2 */
-/*TODO: void set_member_attrs(struct osd_request *, ...); V2 */
-
-/*
- * Object commands
- */
-void osd_req_create_object(struct osd_request *or, struct osd_obj_id *obj)
-{
-	_osd_req_encode_common(or, OSD_ACT_CREATE, obj, 0, 0);
-}
-EXPORT_SYMBOL(osd_req_create_object);
-
-void osd_req_remove_object(struct osd_request *or, struct osd_obj_id *obj)
-{
-	_osd_req_encode_common(or, OSD_ACT_REMOVE, obj, 0, 0);
-}
-EXPORT_SYMBOL(osd_req_remove_object);
-
-
-/*TODO: void osd_req_create_multi(struct osd_request *or,
-	struct osd_obj_id *first, struct osd_obj_id_list *list, unsigned nelem);
-*/
-
-void osd_req_write(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset,
-	struct bio *bio, u64 len)
-{
-	_osd_req_encode_common(or, OSD_ACT_WRITE, obj, offset, len);
-	WARN_ON(or->out.bio || or->out.total_bytes);
-	WARN_ON(!op_is_write(bio_op(bio)));
-	or->out.bio = bio;
-	or->out.total_bytes = len;
-}
-EXPORT_SYMBOL(osd_req_write);
-
-int osd_req_write_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
-{
-	struct request_queue *req_q = osd_request_queue(or->osd_dev);
-	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
-
-	if (IS_ERR(bio))
-		return PTR_ERR(bio);
-
-	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-	osd_req_write(or, obj, offset, bio, len);
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_write_kern);
-
-/*TODO: void osd_req_append(struct osd_request *,
-	const struct osd_obj_id *, struct bio *data_out); */
-/*TODO: void osd_req_create_write(struct osd_request *,
-	const struct osd_obj_id *, struct bio *data_out, u64 offset); */
-/*TODO: void osd_req_clear(struct osd_request *,
-	const struct osd_obj_id *, u64 offset, u64 len); */
-/*TODO: void osd_req_punch(struct osd_request *,
-	const struct osd_obj_id *, u64 offset, u64 len); V2 */
-
-void osd_req_flush_object(struct osd_request *or,
-	const struct osd_obj_id *obj, enum osd_options_flush_scope_values op,
-	/*V2*/ u64 offset, /*V2*/ u64 len)
-{
-	if (unlikely(osd_req_is_ver1(or) && (offset || len))) {
-		OSD_DEBUG("OSD Ver1 flush on specific range ignored\n");
-		offset = 0;
-		len = 0;
-	}
-
-	_osd_req_encode_common(or, OSD_ACT_FLUSH, obj, offset, len);
-	_osd_req_encode_flush(or, op);
-}
-EXPORT_SYMBOL(osd_req_flush_object);
-
-void osd_req_read(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset,
-	struct bio *bio, u64 len)
-{
-	_osd_req_encode_common(or, OSD_ACT_READ, obj, offset, len);
-	WARN_ON(or->in.bio || or->in.total_bytes);
-	WARN_ON(op_is_write(bio_op(bio)));
-	or->in.bio = bio;
-	or->in.total_bytes = len;
-}
-EXPORT_SYMBOL(osd_req_read);
-
-int osd_req_read_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
-{
-	struct request_queue *req_q = osd_request_queue(or->osd_dev);
-	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
-
-	if (IS_ERR(bio))
-		return PTR_ERR(bio);
-
-	osd_req_read(or, obj, offset, bio, len);
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_read_kern);
-
-static int _add_sg_continuation_descriptor(struct osd_request *or,
-	const struct osd_sg_entry *sglist, unsigned numentries, u64 *len)
-{
-	struct osd_sg_continuation_descriptor *oscd;
-	u32 oscd_size;
-	unsigned i;
-	int ret;
-
-	oscd_size = sizeof(*oscd) + numentries * sizeof(oscd->entries[0]);
-
-	if (!or->cdb_cont.total_bytes) {
-		/* First time, jump over the header, we will write to:
-		 *	cdb_cont.buff + cdb_cont.total_bytes
-		 */
-		or->cdb_cont.total_bytes =
-				sizeof(struct osd_continuation_segment_header);
-	}
-
-	ret = _alloc_cdb_cont(or, or->cdb_cont.total_bytes + oscd_size);
-	if (unlikely(ret))
-		return ret;
-
-	oscd = or->cdb_cont.buff + or->cdb_cont.total_bytes;
-	oscd->hdr.type = cpu_to_be16(SCATTER_GATHER_LIST);
-	oscd->hdr.pad_length = 0;
-	oscd->hdr.length = cpu_to_be32(oscd_size - sizeof(*oscd));
-
-	*len = 0;
-	/* copy the sg entries and convert to network byte order */
-	for (i = 0; i < numentries; i++) {
-		oscd->entries[i].offset = cpu_to_be64(sglist[i].offset);
-		oscd->entries[i].len    = cpu_to_be64(sglist[i].len);
-		*len += sglist[i].len;
-	}
-
-	or->cdb_cont.total_bytes += oscd_size;
-	OSD_DEBUG("total_bytes=%d oscd_size=%d numentries=%d\n",
-		  or->cdb_cont.total_bytes, oscd_size, numentries);
-	return 0;
-}
-
-static int _osd_req_finalize_cdb_cont(struct osd_request *or, const u8 *cap_key)
-{
-	struct request_queue *req_q = osd_request_queue(or->osd_dev);
-	struct bio *bio;
-	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
-	struct osd_continuation_segment_header *cont_seg_hdr;
-
-	if (!or->cdb_cont.total_bytes)
-		return 0;
-
-	cont_seg_hdr = or->cdb_cont.buff;
-	cont_seg_hdr->format = CDB_CONTINUATION_FORMAT_V2;
-	cont_seg_hdr->service_action = cdbh->varlen_cdb.service_action;
-
-	/* create a bio for continuation segment */
-	bio = bio_map_kern(req_q, or->cdb_cont.buff, or->cdb_cont.total_bytes,
-			   GFP_KERNEL);
-	if (IS_ERR(bio))
-		return PTR_ERR(bio);
-
-	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-
-	/* integrity check the continuation before the bio is linked
-	 * with the other data segments since the continuation
-	 * integrity is separate from the other data segments.
-	 */
-	osd_sec_sign_data(cont_seg_hdr->integrity_check, bio, cap_key);
-
-	cdbh->v2.cdb_continuation_length = cpu_to_be32(or->cdb_cont.total_bytes);
-
-	/* we can't use _req_append_segment, because we need to link in the
-	 * continuation bio to the head of the bio list - the
-	 * continuation segment (if it exists) is always the first segment in
-	 * the out data buffer.
-	 */
-	bio->bi_next = or->out.bio;
-	or->out.bio = bio;
-	or->out.total_bytes += or->cdb_cont.total_bytes;
-
-	return 0;
-}
-
-/* osd_req_write_sg: Takes a @bio that points to the data out buffer and an
- * @sglist that has the scatter gather entries. Scatter-gather enables a write
- * of multiple none-contiguous areas of an object, in a single call. The extents
- * may overlap and/or be in any order. The only constrain is that:
- *	total_bytes(sglist) >= total_bytes(bio)
- */
-int osd_req_write_sg(struct osd_request *or,
-	const struct osd_obj_id *obj, struct bio *bio,
-	const struct osd_sg_entry *sglist, unsigned numentries)
-{
-	u64 len;
-	int ret = _add_sg_continuation_descriptor(or, sglist, numentries, &len);
-
-	if (ret)
-		return ret;
-	osd_req_write(or, obj, 0, bio, len);
-
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_write_sg);
-
-/* osd_req_read_sg: Read multiple extents of an object into @bio
- * See osd_req_write_sg
- */
-int osd_req_read_sg(struct osd_request *or,
-	const struct osd_obj_id *obj, struct bio *bio,
-	const struct osd_sg_entry *sglist, unsigned numentries)
-{
-	u64 len;
-	u64 off;
-	int ret;
-
-	if (numentries > 1) {
-		off = 0;
-		ret = _add_sg_continuation_descriptor(or, sglist, numentries,
-						      &len);
-		if (ret)
-			return ret;
-	} else {
-		/* Optimize the case of single segment, read_sg is a
-		 * bidi operation.
-		 */
-		len = sglist->len;
-		off = sglist->offset;
-	}
-	osd_req_read(or, obj, off, bio, len);
-
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_read_sg);
-
-/* SG-list write/read Kern API
- *
- * osd_req_{write,read}_sg_kern takes an array of @buff pointers and an array
- * of sg_entries. @numentries indicates how many pointers and sg_entries there
- * are.  By requiring an array of buff pointers. This allows a caller to do a
- * single write/read and scatter into multiple buffers.
- * NOTE: Each buffer + len should not cross a page boundary.
- */
-static struct bio *_create_sg_bios(struct osd_request *or,
-	void **buff, const struct osd_sg_entry *sglist, unsigned numentries)
-{
-	struct request_queue *q = osd_request_queue(or->osd_dev);
-	struct bio *bio;
-	unsigned i;
-
-	bio = bio_kmalloc(GFP_KERNEL, numentries);
-	if (unlikely(!bio)) {
-		OSD_DEBUG("Failed to allocate BIO size=%u\n", numentries);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	for (i = 0; i < numentries; i++) {
-		unsigned offset = offset_in_page(buff[i]);
-		struct page *page = virt_to_page(buff[i]);
-		unsigned len = sglist[i].len;
-		unsigned added_len;
-
-		BUG_ON(offset + len > PAGE_SIZE);
-		added_len = bio_add_pc_page(q, bio, page, len, offset);
-		if (unlikely(len != added_len)) {
-			OSD_DEBUG("bio_add_pc_page len(%d) != added_len(%d)\n",
-				  len, added_len);
-			bio_put(bio);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-
-	return bio;
-}
-
-int osd_req_write_sg_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, void **buff,
-	const struct osd_sg_entry *sglist, unsigned numentries)
-{
-	struct bio *bio = _create_sg_bios(or, buff, sglist, numentries);
-	if (IS_ERR(bio))
-		return PTR_ERR(bio);
-
-	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-	osd_req_write_sg(or, obj, bio, sglist, numentries);
-
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_write_sg_kern);
-
-int osd_req_read_sg_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, void **buff,
-	const struct osd_sg_entry *sglist, unsigned numentries)
-{
-	struct bio *bio = _create_sg_bios(or, buff, sglist, numentries);
-	if (IS_ERR(bio))
-		return PTR_ERR(bio);
-
-	osd_req_read_sg(or, obj, bio, sglist, numentries);
-
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_read_sg_kern);
-
-
-
-void osd_req_get_attributes(struct osd_request *or,
-	const struct osd_obj_id *obj)
-{
-	_osd_req_encode_common(or, OSD_ACT_GET_ATTRIBUTES, obj, 0, 0);
-}
-EXPORT_SYMBOL(osd_req_get_attributes);
-
-void osd_req_set_attributes(struct osd_request *or,
-	const struct osd_obj_id *obj)
-{
-	_osd_req_encode_common(or, OSD_ACT_SET_ATTRIBUTES, obj, 0, 0);
-}
-EXPORT_SYMBOL(osd_req_set_attributes);
-
-/*
- * Attributes List-mode
- */
-
-int osd_req_add_set_attr_list(struct osd_request *or,
-	const struct osd_attr *oa, unsigned nelem)
-{
-	unsigned total_bytes = or->set_attr.total_bytes;
-	void *attr_last;
-	int ret;
-
-	if (or->attributes_mode &&
-	    or->attributes_mode != OSD_CDB_GET_SET_ATTR_LISTS) {
-		WARN_ON(1);
-		return -EINVAL;
-	}
-	or->attributes_mode = OSD_CDB_GET_SET_ATTR_LISTS;
-
-	if (!total_bytes) { /* first-time: allocate and put list header */
-		total_bytes = _osd_req_sizeof_alist_header(or);
-		ret = _alloc_set_attr_list(or, oa, nelem, total_bytes);
-		if (ret)
-			return ret;
-		_osd_req_set_alist_type(or, or->set_attr.buff,
-					OSD_ATTR_LIST_SET_RETRIEVE);
-	}
-	attr_last = or->set_attr.buff + total_bytes;
-
-	for (; nelem; --nelem) {
-		unsigned elem_size = _osd_req_alist_elem_size(or, oa->len);
-
-		total_bytes += elem_size;
-		if (unlikely(or->set_attr.alloc_size < total_bytes)) {
-			or->set_attr.total_bytes = total_bytes - elem_size;
-			ret = _alloc_set_attr_list(or, oa, nelem, total_bytes);
-			if (ret)
-				return ret;
-			attr_last =
-				or->set_attr.buff + or->set_attr.total_bytes;
-		}
-
-		_osd_req_alist_elem_encode(or, attr_last, oa);
-
-		attr_last += elem_size;
-		++oa;
-	}
-
-	or->set_attr.total_bytes = total_bytes;
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_add_set_attr_list);
-
-static int _req_append_segment(struct osd_request *or,
-	unsigned padding, struct _osd_req_data_segment *seg,
-	struct _osd_req_data_segment *last_seg, struct _osd_io_info *io)
-{
-	void *pad_buff;
-	int ret;
-
-	if (padding) {
-		/* check if we can just add it to last buffer */
-		if (last_seg &&
-		    (padding <= last_seg->alloc_size - last_seg->total_bytes))
-			pad_buff = last_seg->buff + last_seg->total_bytes;
-		else
-			pad_buff = io->pad_buff;
-
-		ret = blk_rq_map_kern(io->req->q, io->req, pad_buff, padding,
-				       GFP_KERNEL);
-		if (ret)
-			return ret;
-		io->total_bytes += padding;
-	}
-
-	ret = blk_rq_map_kern(io->req->q, io->req, seg->buff, seg->total_bytes,
-			       GFP_KERNEL);
-	if (ret)
-		return ret;
-
-	io->total_bytes += seg->total_bytes;
-	OSD_DEBUG("padding=%d buff=%p total_bytes=%d\n", padding, seg->buff,
-		  seg->total_bytes);
-	return 0;
-}
-
-static int _osd_req_finalize_set_attr_list(struct osd_request *or)
-{
-	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
-	unsigned padding;
-	int ret;
-
-	if (!or->set_attr.total_bytes) {
-		cdbh->attrs_list.set_attr_offset = OSD_OFFSET_UNUSED;
-		return 0;
-	}
-
-	cdbh->attrs_list.set_attr_bytes = cpu_to_be32(or->set_attr.total_bytes);
-	cdbh->attrs_list.set_attr_offset =
-		osd_req_encode_offset(or, or->out.total_bytes, &padding);
-
-	ret = _req_append_segment(or, padding, &or->set_attr,
-				  or->out.last_seg, &or->out);
-	if (ret)
-		return ret;
-
-	or->out.last_seg = &or->set_attr;
-	return 0;
-}
-
-int osd_req_add_get_attr_list(struct osd_request *or,
-	const struct osd_attr *oa, unsigned nelem)
-{
-	unsigned total_bytes = or->enc_get_attr.total_bytes;
-	void *attr_last;
-	int ret;
-
-	if (or->attributes_mode &&
-	    or->attributes_mode != OSD_CDB_GET_SET_ATTR_LISTS) {
-		WARN_ON(1);
-		return -EINVAL;
-	}
-	or->attributes_mode = OSD_CDB_GET_SET_ATTR_LISTS;
-
-	/* first time calc data-in list header size */
-	if (!or->get_attr.total_bytes)
-		or->get_attr.total_bytes = _osd_req_sizeof_alist_header(or);
-
-	/* calc data-out info */
-	if (!total_bytes) { /* first-time: allocate and put list header */
-		unsigned max_bytes;
-
-		total_bytes = _osd_req_sizeof_alist_header(or);
-		max_bytes = total_bytes +
-			nelem * sizeof(struct osd_attributes_list_attrid);
-		ret = _alloc_get_attr_desc(or, max_bytes);
-		if (ret)
-			return ret;
-
-		_osd_req_set_alist_type(or, or->enc_get_attr.buff,
-					OSD_ATTR_LIST_GET);
-	}
-	attr_last = or->enc_get_attr.buff + total_bytes;
-
-	for (; nelem; --nelem) {
-		struct osd_attributes_list_attrid *attrid;
-		const unsigned cur_size = sizeof(*attrid);
-
-		total_bytes += cur_size;
-		if (unlikely(or->enc_get_attr.alloc_size < total_bytes)) {
-			or->enc_get_attr.total_bytes = total_bytes - cur_size;
-			ret = _alloc_get_attr_desc(or,
-					total_bytes + nelem * sizeof(*attrid));
-			if (ret)
-				return ret;
-			attr_last = or->enc_get_attr.buff +
-				or->enc_get_attr.total_bytes;
-		}
-
-		attrid = attr_last;
-		attrid->attr_page = cpu_to_be32(oa->attr_page);
-		attrid->attr_id = cpu_to_be32(oa->attr_id);
-
-		attr_last += cur_size;
-
-		/* calc data-in size */
-		or->get_attr.total_bytes +=
-			_osd_req_alist_elem_size(or, oa->len);
-		++oa;
-	}
-
-	or->enc_get_attr.total_bytes = total_bytes;
-
-	OSD_DEBUG(
-	       "get_attr.total_bytes=%u(%u) enc_get_attr.total_bytes=%u(%zu)\n",
-	       or->get_attr.total_bytes,
-	       or->get_attr.total_bytes - _osd_req_sizeof_alist_header(or),
-	       or->enc_get_attr.total_bytes,
-	       (or->enc_get_attr.total_bytes - _osd_req_sizeof_alist_header(or))
-			/ sizeof(struct osd_attributes_list_attrid));
-
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_add_get_attr_list);
-
-static int _osd_req_finalize_get_attr_list(struct osd_request *or)
-{
-	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
-	unsigned out_padding;
-	unsigned in_padding;
-	int ret;
-
-	if (!or->enc_get_attr.total_bytes) {
-		cdbh->attrs_list.get_attr_desc_offset = OSD_OFFSET_UNUSED;
-		cdbh->attrs_list.get_attr_offset = OSD_OFFSET_UNUSED;
-		return 0;
-	}
-
-	ret = _alloc_get_attr_list(or);
-	if (ret)
-		return ret;
-
-	/* The out-going buffer info update */
-	OSD_DEBUG("out-going\n");
-	cdbh->attrs_list.get_attr_desc_bytes =
-		cpu_to_be32(or->enc_get_attr.total_bytes);
-
-	cdbh->attrs_list.get_attr_desc_offset =
-		osd_req_encode_offset(or, or->out.total_bytes, &out_padding);
-
-	ret = _req_append_segment(or, out_padding, &or->enc_get_attr,
-				  or->out.last_seg, &or->out);
-	if (ret)
-		return ret;
-	or->out.last_seg = &or->enc_get_attr;
-
-	/* The incoming buffer info update */
-	OSD_DEBUG("in-coming\n");
-	cdbh->attrs_list.get_attr_alloc_length =
-		cpu_to_be32(or->get_attr.total_bytes);
-
-	cdbh->attrs_list.get_attr_offset =
-		osd_req_encode_offset(or, or->in.total_bytes, &in_padding);
-
-	ret = _req_append_segment(or, in_padding, &or->get_attr, NULL,
-				  &or->in);
-	if (ret)
-		return ret;
-	or->in.last_seg = &or->get_attr;
-
-	return 0;
-}
-
-int osd_req_decode_get_attr_list(struct osd_request *or,
-	struct osd_attr *oa, int *nelem, void **iterator)
-{
-	unsigned cur_bytes, returned_bytes;
-	int n;
-	const unsigned sizeof_attr_list = _osd_req_sizeof_alist_header(or);
-	void *cur_p;
-
-	if (!_osd_req_is_alist_type(or, or->get_attr.buff,
-				    OSD_ATTR_LIST_SET_RETRIEVE)) {
-		oa->attr_page = 0;
-		oa->attr_id = 0;
-		oa->val_ptr = NULL;
-		oa->len = 0;
-		*iterator = NULL;
-		return 0;
-	}
-
-	if (*iterator) {
-		BUG_ON((*iterator < or->get_attr.buff) ||
-		     (or->get_attr.buff + or->get_attr.alloc_size < *iterator));
-		cur_p = *iterator;
-		cur_bytes = (*iterator - or->get_attr.buff) - sizeof_attr_list;
-		returned_bytes = or->get_attr.total_bytes;
-	} else { /* first time decode the list header */
-		cur_bytes = sizeof_attr_list;
-		returned_bytes = _osd_req_alist_size(or, or->get_attr.buff) +
-					sizeof_attr_list;
-
-		cur_p = or->get_attr.buff + sizeof_attr_list;
-
-		if (returned_bytes > or->get_attr.alloc_size) {
-			OSD_DEBUG("target report: space was not big enough! "
-				  "Allocate=%u Needed=%u\n",
-				  or->get_attr.alloc_size,
-				  returned_bytes + sizeof_attr_list);
-
-			returned_bytes =
-				or->get_attr.alloc_size - sizeof_attr_list;
-		}
-		or->get_attr.total_bytes = returned_bytes;
-	}
-
-	for (n = 0; (n < *nelem) && (cur_bytes < returned_bytes); ++n) {
-		int inc = _osd_req_alist_elem_decode(or, cur_p, oa,
-						 returned_bytes - cur_bytes);
-
-		if (inc < 0) {
-			OSD_ERR("BAD FOOD from target. list not valid!"
-				"c=%d r=%d n=%d\n",
-				cur_bytes, returned_bytes, n);
-			oa->val_ptr = NULL;
-			cur_bytes = returned_bytes; /* break the caller loop */
-			break;
-		}
-
-		cur_bytes += inc;
-		cur_p += inc;
-		++oa;
-	}
-
-	*iterator = (returned_bytes - cur_bytes) ? cur_p : NULL;
-	*nelem = n;
-	return returned_bytes - cur_bytes;
-}
-EXPORT_SYMBOL(osd_req_decode_get_attr_list);
-
-/*
- * Attributes Page-mode
- */
-
-int osd_req_add_get_attr_page(struct osd_request *or,
-	u32 page_id, void *attar_page, unsigned max_page_len,
-	const struct osd_attr *set_one_attr)
-{
-	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
-
-	if (or->attributes_mode &&
-	    or->attributes_mode != OSD_CDB_GET_ATTR_PAGE_SET_ONE) {
-		WARN_ON(1);
-		return -EINVAL;
-	}
-	or->attributes_mode = OSD_CDB_GET_ATTR_PAGE_SET_ONE;
-
-	or->get_attr.buff = attar_page;
-	or->get_attr.total_bytes = max_page_len;
-
-	cdbh->attrs_page.get_attr_page = cpu_to_be32(page_id);
-	cdbh->attrs_page.get_attr_alloc_length = cpu_to_be32(max_page_len);
-
-	if (!set_one_attr || !set_one_attr->attr_page)
-		return 0; /* The set is optional */
-
-	or->set_attr.buff = set_one_attr->val_ptr;
-	or->set_attr.total_bytes = set_one_attr->len;
-
-	cdbh->attrs_page.set_attr_page = cpu_to_be32(set_one_attr->attr_page);
-	cdbh->attrs_page.set_attr_id = cpu_to_be32(set_one_attr->attr_id);
-	cdbh->attrs_page.set_attr_length = cpu_to_be32(set_one_attr->len);
-	return 0;
-}
-EXPORT_SYMBOL(osd_req_add_get_attr_page);
-
-static int _osd_req_finalize_attr_page(struct osd_request *or)
-{
-	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
-	unsigned in_padding, out_padding;
-	int ret;
-
-	/* returned page */
-	cdbh->attrs_page.get_attr_offset =
-		osd_req_encode_offset(or, or->in.total_bytes, &in_padding);
-
-	ret = _req_append_segment(or, in_padding, &or->get_attr, NULL,
-				  &or->in);
-	if (ret)
-		return ret;
-
-	if (or->set_attr.total_bytes == 0)
-		return 0;
-
-	/* set one value */
-	cdbh->attrs_page.set_attr_offset =
-		osd_req_encode_offset(or, or->out.total_bytes, &out_padding);
-
-	ret = _req_append_segment(or, out_padding, &or->set_attr, NULL,
-				  &or->out);
-	return ret;
-}
-
-static inline void osd_sec_parms_set_out_offset(bool is_v1,
-	struct osd_security_parameters *sec_parms, osd_cdb_offset offset)
-{
-	if (is_v1)
-		sec_parms->v1.data_out_integrity_check_offset = offset;
-	else
-		sec_parms->v2.data_out_integrity_check_offset = offset;
-}
-
-static inline void osd_sec_parms_set_in_offset(bool is_v1,
-	struct osd_security_parameters *sec_parms, osd_cdb_offset offset)
-{
-	if (is_v1)
-		sec_parms->v1.data_in_integrity_check_offset = offset;
-	else
-		sec_parms->v2.data_in_integrity_check_offset = offset;
-}
-
-static int _osd_req_finalize_data_integrity(struct osd_request *or,
-	bool has_in, bool has_out, struct bio *out_data_bio, u64 out_data_bytes,
-	const u8 *cap_key)
-{
-	struct osd_security_parameters *sec_parms = _osd_req_sec_params(or);
-	int ret;
-
-	if (!osd_is_sec_alldata(sec_parms))
-		return 0;
-
-	if (has_out) {
-		struct _osd_req_data_segment seg = {
-			.buff = &or->out_data_integ,
-			.total_bytes = sizeof(or->out_data_integ),
-		};
-		unsigned pad;
-
-		or->out_data_integ.data_bytes = cpu_to_be64(out_data_bytes);
-		or->out_data_integ.set_attributes_bytes = cpu_to_be64(
-			or->set_attr.total_bytes);
-		or->out_data_integ.get_attributes_bytes = cpu_to_be64(
-			or->enc_get_attr.total_bytes);
-
-		osd_sec_parms_set_out_offset(osd_req_is_ver1(or), sec_parms,
-			osd_req_encode_offset(or, or->out.total_bytes, &pad));
-
-		ret = _req_append_segment(or, pad, &seg, or->out.last_seg,
-					  &or->out);
-		if (ret)
-			return ret;
-		or->out.last_seg = NULL;
-
-		/* they are now all chained to request sign them all together */
-		osd_sec_sign_data(&or->out_data_integ, out_data_bio,
-				  cap_key);
-	}
-
-	if (has_in) {
-		struct _osd_req_data_segment seg = {
-			.buff = &or->in_data_integ,
-			.total_bytes = sizeof(or->in_data_integ),
-		};
-		unsigned pad;
-
-		osd_sec_parms_set_in_offset(osd_req_is_ver1(or), sec_parms,
-			osd_req_encode_offset(or, or->in.total_bytes, &pad));
-
-		ret = _req_append_segment(or, pad, &seg, or->in.last_seg,
-					  &or->in);
-		if (ret)
-			return ret;
-
-		or->in.last_seg = NULL;
-	}
-
-	return 0;
-}
-
-/*
- * osd_finalize_request and helpers
- */
-static struct request *_make_request(struct request_queue *q, bool has_write,
-			      struct _osd_io_info *oii)
-{
-	struct request *req;
-	struct bio *bio = oii->bio;
-	int ret;
-
-	req = blk_get_request(q, has_write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			0);
-	if (IS_ERR(req))
-		return req;
-
-	for_each_bio(bio) {
-		struct bio *bounce_bio = bio;
-
-		ret = blk_rq_append_bio(req, &bounce_bio);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-
-	return req;
-}
-
-static int _init_blk_request(struct osd_request *or,
-	bool has_in, bool has_out)
-{
-	struct scsi_device *scsi_device = or->osd_dev->scsi_device;
-	struct request_queue *q = scsi_device->request_queue;
-	struct request *req;
-	int ret;
-
-	req = _make_request(q, has_out, has_out ? &or->out : &or->in);
-	if (IS_ERR(req)) {
-		ret = PTR_ERR(req);
-		goto out;
-	}
-
-	or->request = req;
-	req->rq_flags |= RQF_QUIET;
-
-	req->timeout = or->timeout;
-	scsi_req(req)->retries = or->retries;
-
-	if (has_out) {
-		or->out.req = req;
-		if (has_in) {
-			/* allocate bidi request */
-			req = _make_request(q, false, &or->in);
-			if (IS_ERR(req)) {
-				OSD_DEBUG("blk_get_request for bidi failed\n");
-				ret = PTR_ERR(req);
-				goto out;
-			}
-			or->in.req = or->request->next_rq = req;
-		}
-	} else if (has_in)
-		or->in.req = req;
-
-	ret = 0;
-out:
-	OSD_DEBUG("or=%p has_in=%d has_out=%d => %d, %p\n",
-			or, has_in, has_out, ret, or->request);
-	return ret;
-}
-
-int osd_finalize_request(struct osd_request *or,
-	u8 options, const void *cap, const u8 *cap_key)
-{
-	struct osd_cdb_head *cdbh = osd_cdb_head(&or->cdb);
-	bool has_in, has_out;
-	 /* Save for data_integrity without the cdb_continuation */
-	struct bio *out_data_bio = or->out.bio;
-	u64 out_data_bytes = or->out.total_bytes;
-	int ret;
-
-	if (options & OSD_REQ_FUA)
-		cdbh->options |= OSD_CDB_FUA;
-
-	if (options & OSD_REQ_DPO)
-		cdbh->options |= OSD_CDB_DPO;
-
-	if (options & OSD_REQ_BYPASS_TIMESTAMPS)
-		cdbh->timestamp_control = OSD_CDB_BYPASS_TIMESTAMPS;
-
-	osd_set_caps(&or->cdb, cap);
-
-	has_in = or->in.bio || or->get_attr.total_bytes;
-	has_out = or->out.bio || or->cdb_cont.total_bytes ||
-		or->set_attr.total_bytes || or->enc_get_attr.total_bytes;
-
-	ret = _osd_req_finalize_cdb_cont(or, cap_key);
-	if (ret) {
-		OSD_DEBUG("_osd_req_finalize_cdb_cont failed\n");
-		return ret;
-	}
-	ret = _init_blk_request(or, has_in, has_out);
-	if (ret) {
-		OSD_DEBUG("_init_blk_request failed\n");
-		return ret;
-	}
-
-	or->out.pad_buff = sg_out_pad_buffer;
-	or->in.pad_buff = sg_in_pad_buffer;
-
-	if (!or->attributes_mode)
-		or->attributes_mode = OSD_CDB_GET_SET_ATTR_LISTS;
-	cdbh->command_specific_options |= or->attributes_mode;
-	if (or->attributes_mode == OSD_CDB_GET_ATTR_PAGE_SET_ONE) {
-		ret = _osd_req_finalize_attr_page(or);
-		if (ret) {
-			OSD_DEBUG("_osd_req_finalize_attr_page failed\n");
-			return ret;
-		}
-	} else {
-		/* TODO: I think that for the GET_ATTR command these 2 should
-		 * be reversed to keep them in execution order (for embedded
-		 * targets with low memory footprint)
-		 */
-		ret = _osd_req_finalize_set_attr_list(or);
-		if (ret) {
-			OSD_DEBUG("_osd_req_finalize_set_attr_list failed\n");
-			return ret;
-		}
-
-		ret = _osd_req_finalize_get_attr_list(or);
-		if (ret) {
-			OSD_DEBUG("_osd_req_finalize_get_attr_list failed\n");
-			return ret;
-		}
-	}
-
-	ret = _osd_req_finalize_data_integrity(or, has_in, has_out,
-					       out_data_bio, out_data_bytes,
-					       cap_key);
-	if (ret)
-		return ret;
-
-	osd_sec_sign_cdb(&or->cdb, cap_key);
-
-	scsi_req(or->request)->cmd = or->cdb.buff;
-	scsi_req(or->request)->cmd_len = _osd_req_cdb_len(or);
-
-	return 0;
-}
-EXPORT_SYMBOL(osd_finalize_request);
-
-static bool _is_osd_security_code(int code)
-{
-	return	(code == osd_security_audit_value_frozen) ||
-		(code == osd_security_working_key_frozen) ||
-		(code == osd_nonce_not_unique) ||
-		(code == osd_nonce_timestamp_out_of_range) ||
-		(code == osd_invalid_dataout_buffer_integrity_check_value);
-}
-
-#define OSD_SENSE_PRINT1(fmt, a...) \
-	do { \
-		if (__cur_sense_need_output) \
-			OSD_ERR(fmt, ##a); \
-	} while (0)
-
-#define OSD_SENSE_PRINT2(fmt, a...) OSD_SENSE_PRINT1("    " fmt, ##a)
-
-int osd_req_decode_sense_full(struct osd_request *or,
-	struct osd_sense_info *osi, bool silent,
-	struct osd_obj_id *bad_obj_list __unused, int max_obj __unused,
-	struct osd_attr *bad_attr_list, int max_attr)
-{
-	int sense_len, original_sense_len;
-	struct osd_sense_info local_osi;
-	struct scsi_sense_descriptor_based *ssdb;
-	void *cur_descriptor;
-#if (CONFIG_SCSI_OSD_DPRINT_SENSE == 0)
-	const bool __cur_sense_need_output = false;
-#else
-	bool __cur_sense_need_output = !silent;
-#endif
-	int ret;
-
-	if (likely(!or->req_errors))
-		return 0;
-
-	osi = osi ? : &local_osi;
-	memset(osi, 0, sizeof(*osi));
-
-	ssdb = (typeof(ssdb))or->sense;
-	sense_len = or->sense_len;
-	if ((sense_len < (int)sizeof(*ssdb) || !ssdb->sense_key)) {
-		OSD_ERR("Block-layer returned error(0x%x) but "
-			"sense_len(%u) || key(%d) is empty\n",
-			or->req_errors, sense_len, ssdb->sense_key);
-		goto analyze;
-	}
-
-	if ((ssdb->response_code != 0x72) && (ssdb->response_code != 0x73)) {
-		OSD_ERR("Unrecognized scsi sense: rcode=%x length=%d\n",
-			ssdb->response_code, sense_len);
-		goto analyze;
-	}
-
-	osi->key = ssdb->sense_key;
-	osi->additional_code = be16_to_cpu(ssdb->additional_sense_code);
-	original_sense_len = ssdb->additional_sense_length + 8;
-
-#if (CONFIG_SCSI_OSD_DPRINT_SENSE == 1)
-	if (__cur_sense_need_output)
-		__cur_sense_need_output = (osi->key > scsi_sk_recovered_error);
-#endif
-	OSD_SENSE_PRINT1("Main Sense information key=0x%x length(%d, %d) "
-			"additional_code=0x%x async_error=%d errors=0x%x\n",
-			osi->key, original_sense_len, sense_len,
-			osi->additional_code, or->async_error,
-			or->req_errors);
-
-	if (original_sense_len < sense_len)
-		sense_len = original_sense_len;
-
-	cur_descriptor = ssdb->ssd;
-	sense_len -= sizeof(*ssdb);
-	while (sense_len > 0) {
-		struct scsi_sense_descriptor *ssd = cur_descriptor;
-		int cur_len = ssd->additional_length + 2;
-
-		sense_len -= cur_len;
-
-		if (sense_len < 0)
-			break; /* sense was truncated */
-
-		switch (ssd->descriptor_type) {
-		case scsi_sense_information:
-		case scsi_sense_command_specific_information:
-		{
-			struct scsi_sense_command_specific_data_descriptor
-				*sscd = cur_descriptor;
-
-			osi->command_info =
-				get_unaligned_be64(&sscd->information) ;
-			OSD_SENSE_PRINT2(
-				"command_specific_information 0x%llx \n",
-				_LLU(osi->command_info));
-			break;
-		}
-		case scsi_sense_key_specific:
-		{
-			struct scsi_sense_key_specific_data_descriptor
-				*ssks = cur_descriptor;
-
-			osi->sense_info = get_unaligned_be16(&ssks->value);
-			OSD_SENSE_PRINT2(
-				"sense_key_specific_information %u"
-				"sksv_cd_bpv_bp (0x%x)\n",
-				osi->sense_info, ssks->sksv_cd_bpv_bp);
-			break;
-		}
-		case osd_sense_object_identification:
-		{ /*FIXME: Keep first not last, Store in array*/
-			struct osd_sense_identification_data_descriptor
-				*osidd = cur_descriptor;
-
-			osi->not_initiated_command_functions =
-				le32_to_cpu(osidd->not_initiated_functions);
-			osi->completed_command_functions =
-				le32_to_cpu(osidd->completed_functions);
-			osi->obj.partition = be64_to_cpu(osidd->partition_id);
-			osi->obj.id = be64_to_cpu(osidd->object_id);
-			OSD_SENSE_PRINT2(
-				"object_identification pid=0x%llx oid=0x%llx\n",
-				_LLU(osi->obj.partition), _LLU(osi->obj.id));
-			OSD_SENSE_PRINT2(
-				"not_initiated_bits(%x) "
-				"completed_command_bits(%x)\n",
-				osi->not_initiated_command_functions,
-				osi->completed_command_functions);
-			break;
-		}
-		case osd_sense_response_integrity_check:
-		{
-			struct osd_sense_response_integrity_check_descriptor
-				*d = cur_descriptor;
-			/* 2nibbles+space+ASCII */
-			char dump[sizeof(d->integrity_check_value) * 4 + 2];
-
-			hex_dump_to_buffer(d->integrity_check_value,
-					sizeof(d->integrity_check_value),
-					32, 1, dump, sizeof(dump), true);
-			OSD_SENSE_PRINT2("response_integrity [%s]\n", dump);
-		}
-		case osd_sense_attribute_identification:
-		{
-			struct osd_sense_attributes_data_descriptor
-				*osadd = cur_descriptor;
-			unsigned len = min(cur_len, sense_len);
-			struct osd_sense_attr *pattr = osadd->sense_attrs;
-
-			while (len >= sizeof(*pattr)) {
-				u32 attr_page = be32_to_cpu(pattr->attr_page);
-				u32 attr_id = be32_to_cpu(pattr->attr_id);
-
-				if (!osi->attr.attr_page) {
-					osi->attr.attr_page = attr_page;
-					osi->attr.attr_id = attr_id;
-				}
-
-				if (bad_attr_list && max_attr) {
-					bad_attr_list->attr_page = attr_page;
-					bad_attr_list->attr_id = attr_id;
-					bad_attr_list++;
-					max_attr--;
-				}
-
-				len -= sizeof(*pattr);
-				OSD_SENSE_PRINT2(
-					"osd_sense_attribute_identification"
-					"attr_page=0x%x attr_id=0x%x\n",
-					attr_page, attr_id);
-			}
-		}
-		/*These are not legal for OSD*/
-		case scsi_sense_field_replaceable_unit:
-			OSD_SENSE_PRINT2("scsi_sense_field_replaceable_unit\n");
-			break;
-		case scsi_sense_stream_commands:
-			OSD_SENSE_PRINT2("scsi_sense_stream_commands\n");
-			break;
-		case scsi_sense_block_commands:
-			OSD_SENSE_PRINT2("scsi_sense_block_commands\n");
-			break;
-		case scsi_sense_ata_return:
-			OSD_SENSE_PRINT2("scsi_sense_ata_return\n");
-			break;
-		default:
-			if (ssd->descriptor_type <= scsi_sense_Reserved_last)
-				OSD_SENSE_PRINT2(
-					"scsi_sense Reserved descriptor (0x%x)",
-					ssd->descriptor_type);
-			else
-				OSD_SENSE_PRINT2(
-					"scsi_sense Vendor descriptor (0x%x)",
-					ssd->descriptor_type);
-		}
-
-		cur_descriptor += cur_len;
-	}
-
-analyze:
-	if (!osi->key) {
-		/* scsi sense is Empty, the request was never issued to target
-		 * linux return code might tell us what happened.
-		 */
-		if (or->async_error == BLK_STS_RESOURCE)
-			osi->osd_err_pri = OSD_ERR_PRI_RESOURCE;
-		else
-			osi->osd_err_pri = OSD_ERR_PRI_UNREACHABLE;
-		ret = or->async_error;
-	} else if (osi->key <= scsi_sk_recovered_error) {
-		osi->osd_err_pri = 0;
-		ret = 0;
-	} else if (osi->additional_code == scsi_invalid_field_in_cdb) {
-		if (osi->cdb_field_offset == OSD_CFO_STARTING_BYTE) {
-			osi->osd_err_pri = OSD_ERR_PRI_CLEAR_PAGES;
-			ret = -EFAULT; /* caller should recover from this */
-		} else if (osi->cdb_field_offset == OSD_CFO_OBJECT_ID) {
-			osi->osd_err_pri = OSD_ERR_PRI_NOT_FOUND;
-			ret = -ENOENT;
-		} else if (osi->cdb_field_offset == OSD_CFO_PERMISSIONS) {
-			osi->osd_err_pri = OSD_ERR_PRI_NO_ACCESS;
-			ret = -EACCES;
-		} else {
-			osi->osd_err_pri = OSD_ERR_PRI_BAD_CRED;
-			ret = -EINVAL;
-		}
-	} else if (osi->additional_code == osd_quota_error) {
-		osi->osd_err_pri = OSD_ERR_PRI_NO_SPACE;
-		ret = -ENOSPC;
-	} else if (_is_osd_security_code(osi->additional_code)) {
-		osi->osd_err_pri = OSD_ERR_PRI_BAD_CRED;
-		ret = -EINVAL;
-	} else {
-		osi->osd_err_pri = OSD_ERR_PRI_EIO;
-		ret = -EIO;
-	}
-
-	if (!or->out.residual)
-		or->out.residual = or->out.total_bytes;
-	if (!or->in.residual)
-		or->in.residual = or->in.total_bytes;
-
-	return ret;
-}
-EXPORT_SYMBOL(osd_req_decode_sense_full);
-
-/*
- * Implementation of osd_sec.h API
- * TODO: Move to a separate osd_sec.c file at a later stage.
- */
-
-enum { OSD_SEC_CAP_V1_ALL_CAPS =
-	OSD_SEC_CAP_APPEND | OSD_SEC_CAP_OBJ_MGMT | OSD_SEC_CAP_REMOVE   |
-	OSD_SEC_CAP_CREATE | OSD_SEC_CAP_SET_ATTR | OSD_SEC_CAP_GET_ATTR |
-	OSD_SEC_CAP_WRITE  | OSD_SEC_CAP_READ     | OSD_SEC_CAP_POL_SEC  |
-	OSD_SEC_CAP_GLOBAL | OSD_SEC_CAP_DEV_MGMT
-};
-
-enum { OSD_SEC_CAP_V2_ALL_CAPS =
-	OSD_SEC_CAP_V1_ALL_CAPS | OSD_SEC_CAP_QUERY | OSD_SEC_CAP_M_OBJECT
-};
-
-void osd_sec_init_nosec_doall_caps(void *caps,
-	const struct osd_obj_id *obj, bool is_collection, const bool is_v1)
-{
-	struct osd_capability *cap = caps;
-	u8 type;
-	u8 descriptor_type;
-
-	if (likely(obj->id)) {
-		if (unlikely(is_collection)) {
-			type = OSD_SEC_OBJ_COLLECTION;
-			descriptor_type = is_v1 ? OSD_SEC_OBJ_DESC_OBJ :
-						  OSD_SEC_OBJ_DESC_COL;
-		} else {
-			type = OSD_SEC_OBJ_USER;
-			descriptor_type = OSD_SEC_OBJ_DESC_OBJ;
-		}
-		WARN_ON(!obj->partition);
-	} else {
-		type = obj->partition ? OSD_SEC_OBJ_PARTITION :
-					OSD_SEC_OBJ_ROOT;
-		descriptor_type = OSD_SEC_OBJ_DESC_PAR;
-	}
-
-	memset(cap, 0, sizeof(*cap));
-
-	cap->h.format = OSD_SEC_CAP_FORMAT_VER1;
-	cap->h.integrity_algorithm__key_version = 0; /* MAKE_BYTE(0, 0); */
-	cap->h.security_method = OSD_SEC_NOSEC;
-/*	cap->expiration_time;
-	cap->AUDIT[30-10];
-	cap->discriminator[42-30];
-	cap->object_created_time; */
-	cap->h.object_type = type;
-	osd_sec_set_caps(&cap->h, OSD_SEC_CAP_V1_ALL_CAPS);
-	cap->h.object_descriptor_type = descriptor_type;
-	cap->od.obj_desc.policy_access_tag = 0;
-	cap->od.obj_desc.allowed_partition_id = cpu_to_be64(obj->partition);
-	cap->od.obj_desc.allowed_object_id = cpu_to_be64(obj->id);
-}
-EXPORT_SYMBOL(osd_sec_init_nosec_doall_caps);
-
-/* FIXME: Extract version from caps pointer.
- *        Also Pete's target only supports caps from OSDv1 for now
- */
-void osd_set_caps(struct osd_cdb *cdb, const void *caps)
-{
-	/* NOTE: They start at same address */
-	memcpy(&cdb->v1.caps, caps, OSDv1_CAP_LEN);
-}
-
-bool osd_is_sec_alldata(struct osd_security_parameters *sec_parms __unused)
-{
-	return false;
-}
-
-void osd_sec_sign_cdb(struct osd_cdb *ocdb __unused, const u8 *cap_key __unused)
-{
-}
-
-void osd_sec_sign_data(void *data_integ __unused,
-		       struct bio *bio __unused, const u8 *cap_key __unused)
-{
-}
-
-/*
- * Declared in osd_protocol.h
- * 4.12.5 Data-In and Data-Out buffer offsets
- * byte offset = mantissa * (2^(exponent+8))
- * Returns the smallest allowed encoded offset that contains given @offset
- * The actual encoded offset returned is @offset + *@padding.
- */
-osd_cdb_offset __osd_encode_offset(
-	u64 offset, unsigned *padding, int min_shift, int max_shift)
-{
-	u64 try_offset = -1, mod, align;
-	osd_cdb_offset be32_offset;
-	int shift;
-
-	*padding = 0;
-	if (!offset)
-		return 0;
-
-	for (shift = min_shift; shift < max_shift; ++shift) {
-		try_offset = offset >> shift;
-		if (try_offset < (1 << OSD_OFFSET_MAX_BITS))
-			break;
-	}
-
-	BUG_ON(shift == max_shift);
-
-	align = 1 << shift;
-	mod = offset & (align - 1);
-	if (mod) {
-		*padding = align - mod;
-		try_offset += 1;
-	}
-
-	try_offset |= ((shift - 8) & 0xf) << 28;
-	be32_offset = cpu_to_be32((u32)try_offset);
-
-	OSD_DEBUG("offset=%llu mantissa=%llu exp=%d encoded=%x pad=%d\n",
-		 _LLU(offset), _LLU(try_offset & 0x0FFFFFFF), shift,
-		 be32_offset, *padding);
-	return be32_offset;
-}
diff --git a/drivers/scsi/osd/osd_uld.c b/drivers/scsi/osd/osd_uld.c
deleted file mode 100644
index eaf36ccf58db..000000000000
--- a/drivers/scsi/osd/osd_uld.c
+++ /dev/null
@@ -1,571 +0,0 @@
-/*
- * osd_uld.c - OSD Upper Layer Driver
- *
- * A Linux driver module that registers as a SCSI ULD and probes
- * for OSD type SCSI devices.
- * It's main function is to export osd devices to in-kernel users like
- * osdfs and pNFS-objects-LD. It also provides one ioctl for running
- * in Kernel tests.
- *
- * Copyright (C) 2008 Panasas Inc.  All rights reserved.
- *
- * Authors:
- *   Boaz Harrosh <ooo@electrozaur.com>
- *   Benny Halevy <bhalevy@panasas.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  1. Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *  3. Neither the name of the Panasas company nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/namei.h>
-#include <linux/cdev.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/idr.h>
-#include <linux/major.h>
-#include <linux/file.h>
-#include <linux/slab.h>
-
-#include <scsi/scsi.h>
-#include <scsi/scsi_driver.h>
-#include <scsi/scsi_device.h>
-#include <scsi/scsi_ioctl.h>
-
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_sec.h>
-
-#include "osd_debug.h"
-
-#ifndef TYPE_OSD
-#  define TYPE_OSD 0x11
-#endif
-
-#ifndef SCSI_OSD_MAJOR
-#  define SCSI_OSD_MAJOR 260
-#endif
-#define SCSI_OSD_MAX_MINOR MINORMASK
-
-static const char osd_name[] = "osd";
-static const char *osd_version_string = "open-osd 0.2.1";
-
-MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
-MODULE_DESCRIPTION("open-osd Upper-Layer-Driver osd.ko");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CHARDEV_MAJOR(SCSI_OSD_MAJOR);
-MODULE_ALIAS_SCSI_DEVICE(TYPE_OSD);
-
-struct osd_uld_device {
-	int minor;
-	struct device class_dev;
-	struct cdev cdev;
-	struct osd_dev od;
-	struct osd_dev_info odi;
-	struct gendisk *disk;
-};
-
-struct osd_dev_handle {
-	struct osd_dev od;
-	struct file *file;
-	struct osd_uld_device *oud;
-} ;
-
-static DEFINE_IDA(osd_minor_ida);
-
-/*
- * scsi sysfs attribute operations
- */
-static ssize_t osdname_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct osd_uld_device *ould = container_of(dev, struct osd_uld_device,
-						   class_dev);
-	return sprintf(buf, "%s\n", ould->odi.osdname);
-}
-static DEVICE_ATTR_RO(osdname);
-
-static ssize_t systemid_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct osd_uld_device *ould = container_of(dev, struct osd_uld_device,
-						   class_dev);
-
-	memcpy(buf, ould->odi.systemid, ould->odi.systemid_len);
-	return ould->odi.systemid_len;
-}
-static DEVICE_ATTR_RO(systemid);
-
-static struct attribute *osd_uld_attrs[] = {
-	&dev_attr_osdname.attr,
-	&dev_attr_systemid.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(osd_uld);
-
-static struct class osd_uld_class = {
-	.owner		= THIS_MODULE,
-	.name		= "scsi_osd",
-	.dev_groups	= osd_uld_groups,
-};
-
-/*
- * Char Device operations
- */
-
-static int osd_uld_open(struct inode *inode, struct file *file)
-{
-	struct osd_uld_device *oud = container_of(inode->i_cdev,
-					struct osd_uld_device, cdev);
-
-	get_device(&oud->class_dev);
-	/* cache osd_uld_device on file handle */
-	file->private_data = oud;
-	OSD_DEBUG("osd_uld_open %p\n", oud);
-	return 0;
-}
-
-static int osd_uld_release(struct inode *inode, struct file *file)
-{
-	struct osd_uld_device *oud = file->private_data;
-
-	OSD_DEBUG("osd_uld_release %p\n", file->private_data);
-	file->private_data = NULL;
-	put_device(&oud->class_dev);
-	return 0;
-}
-
-/* FIXME: Only one vector for now */
-unsigned g_test_ioctl;
-do_test_fn *g_do_test;
-
-int osduld_register_test(unsigned ioctl, do_test_fn *do_test)
-{
-	if (g_test_ioctl)
-		return -EINVAL;
-
-	g_test_ioctl = ioctl;
-	g_do_test = do_test;
-	return 0;
-}
-EXPORT_SYMBOL(osduld_register_test);
-
-void osduld_unregister_test(unsigned ioctl)
-{
-	if (ioctl == g_test_ioctl) {
-		g_test_ioctl = 0;
-		g_do_test = NULL;
-	}
-}
-EXPORT_SYMBOL(osduld_unregister_test);
-
-static do_test_fn *_find_ioctl(unsigned cmd)
-{
-	if (g_test_ioctl == cmd)
-		return g_do_test;
-	else
-		return NULL;
-}
-
-static long osd_uld_ioctl(struct file *file, unsigned int cmd,
-	unsigned long arg)
-{
-	struct osd_uld_device *oud = file->private_data;
-	int ret;
-	do_test_fn *do_test;
-
-	do_test = _find_ioctl(cmd);
-	if (do_test)
-		ret = do_test(&oud->od, cmd, arg);
-	else {
-		OSD_ERR("Unknown ioctl %d: osd_uld_device=%p\n", cmd, oud);
-		ret = -ENOIOCTLCMD;
-	}
-	return ret;
-}
-
-static const struct file_operations osd_fops = {
-	.owner          = THIS_MODULE,
-	.open           = osd_uld_open,
-	.release        = osd_uld_release,
-	.unlocked_ioctl = osd_uld_ioctl,
-	.llseek		= noop_llseek,
-};
-
-struct osd_dev *osduld_path_lookup(const char *name)
-{
-	struct osd_uld_device *oud;
-	struct osd_dev_handle *odh;
-	struct file *file;
-	int error;
-
-	if (!name || !*name) {
-		OSD_ERR("Mount with !path || !*path\n");
-		return ERR_PTR(-EINVAL);
-	}
-
-	odh = kzalloc(sizeof(*odh), GFP_KERNEL);
-	if (unlikely(!odh))
-		return ERR_PTR(-ENOMEM);
-
-	file = filp_open(name, O_RDWR, 0);
-	if (IS_ERR(file)) {
-		error = PTR_ERR(file);
-		goto free_od;
-	}
-
-	if (file->f_op != &osd_fops){
-		error = -EINVAL;
-		goto close_file;
-	}
-
-	oud = file->private_data;
-
-	odh->od = oud->od;
-	odh->file = file;
-	odh->oud = oud;
-
-	return &odh->od;
-
-close_file:
-	fput(file);
-free_od:
-	kfree(odh);
-	return ERR_PTR(error);
-}
-EXPORT_SYMBOL(osduld_path_lookup);
-
-static inline bool _the_same_or_null(const u8 *a1, unsigned a1_len,
-				     const u8 *a2, unsigned a2_len)
-{
-	if (!a2_len) /* User string is Empty means don't care */
-		return true;
-
-	if (a1_len != a2_len)
-		return false;
-
-	return 0 == memcmp(a1, a2, a1_len);
-}
-
-static int _match_odi(struct device *dev, const void *find_data)
-{
-	struct osd_uld_device *oud = container_of(dev, struct osd_uld_device,
-						  class_dev);
-	const struct osd_dev_info *odi = find_data;
-
-	if (_the_same_or_null(oud->odi.systemid, oud->odi.systemid_len,
-			      odi->systemid, odi->systemid_len) &&
-	    _the_same_or_null(oud->odi.osdname, oud->odi.osdname_len,
-			      odi->osdname, odi->osdname_len)) {
-		OSD_DEBUG("found device sysid_len=%d osdname=%d\n",
-			  odi->systemid_len, odi->osdname_len);
-		return 1;
-	} else {
-		return 0;
-	}
-}
-
-/* osduld_info_lookup - Loop through all devices, return the requested osd_dev.
- *
- * if @odi->systemid_len and/or @odi->osdname_len are zero, they act as a don't
- * care. .e.g if they're both zero /dev/osd0 is returned.
- */
-struct osd_dev *osduld_info_lookup(const struct osd_dev_info *odi)
-{
-	struct device *dev = class_find_device(&osd_uld_class, NULL, odi, _match_odi);
-	if (likely(dev)) {
-		struct osd_dev_handle *odh = kzalloc(sizeof(*odh), GFP_KERNEL);
-		struct osd_uld_device *oud = container_of(dev,
-			struct osd_uld_device, class_dev);
-
-		if (unlikely(!odh)) {
-			put_device(dev);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		odh->od = oud->od;
-		odh->oud = oud;
-
-		return &odh->od;
-	}
-
-	return ERR_PTR(-ENODEV);
-}
-EXPORT_SYMBOL(osduld_info_lookup);
-
-void osduld_put_device(struct osd_dev *od)
-{
-	if (od && !IS_ERR(od)) {
-		struct osd_dev_handle *odh =
-				container_of(od, struct osd_dev_handle, od);
-		struct osd_uld_device *oud = odh->oud;
-
-		BUG_ON(od->scsi_device != oud->od.scsi_device);
-
-		/* If scsi has released the device (logout), and exofs has last
-		 * reference on oud it will be freed by above osd_uld_release
-		 * within fput below. But this will oops in cdev_release which
-		 * is called after the fops->release. A get_/put_ pair makes
-		 * sure we have a cdev for the duration of fput
-		 */
-		if (odh->file) {
-			get_device(&oud->class_dev);
-			fput(odh->file);
-		}
-		put_device(&oud->class_dev);
-		kfree(odh);
-	}
-}
-EXPORT_SYMBOL(osduld_put_device);
-
-const struct osd_dev_info *osduld_device_info(struct osd_dev *od)
-{
-	struct osd_dev_handle *odh =
-				container_of(od, struct osd_dev_handle, od);
-	return &odh->oud->odi;
-}
-EXPORT_SYMBOL(osduld_device_info);
-
-bool osduld_device_same(struct osd_dev *od, const struct osd_dev_info *odi)
-{
-	struct osd_dev_handle *odh =
-				container_of(od, struct osd_dev_handle, od);
-	struct osd_uld_device *oud = odh->oud;
-
-	return (oud->odi.systemid_len == odi->systemid_len) &&
-		_the_same_or_null(oud->odi.systemid, oud->odi.systemid_len,
-				 odi->systemid, odi->systemid_len) &&
-		(oud->odi.osdname_len == odi->osdname_len) &&
-		_the_same_or_null(oud->odi.osdname, oud->odi.osdname_len,
-				  odi->osdname, odi->osdname_len);
-}
-EXPORT_SYMBOL(osduld_device_same);
-
-/*
- * Scsi Device operations
- */
-
-static int __detect_osd(struct osd_uld_device *oud)
-{
-	struct scsi_device *scsi_device = oud->od.scsi_device;
-	struct scsi_sense_hdr sense_hdr;
-	char caps[OSD_CAP_LEN];
-	int error;
-
-	/* sending a test_unit_ready as first command seems to be needed
-	 * by some targets
-	 */
-	OSD_DEBUG("start scsi_test_unit_ready %p %p %p\n",
-			oud, scsi_device, scsi_device->request_queue);
-	error = scsi_test_unit_ready(scsi_device, 10*HZ, 5, &sense_hdr);
-	if (error)
-		OSD_ERR("warning: scsi_test_unit_ready failed\n");
-
-	osd_sec_init_nosec_doall_caps(caps, &osd_root_object, false, true);
-	if (osd_auto_detect_ver(&oud->od, caps, &oud->odi))
-		return -ENODEV;
-
-	return 0;
-}
-
-static void __remove(struct device *dev)
-{
-	struct osd_uld_device *oud = container_of(dev, struct osd_uld_device,
-						  class_dev);
-	struct scsi_device *scsi_device = oud->od.scsi_device;
-
-	kfree(oud->odi.osdname);
-
-	osd_dev_fini(&oud->od);
-	scsi_device_put(scsi_device);
-
-	OSD_INFO("osd_remove %s\n",
-		 oud->disk ? oud->disk->disk_name : NULL);
-
-	if (oud->disk)
-		put_disk(oud->disk);
-
-	kfree(oud);
-}
-
-static int osd_probe(struct device *dev)
-{
-	struct scsi_device *scsi_device = to_scsi_device(dev);
-	struct gendisk *disk;
-	struct osd_uld_device *oud;
-	int minor;
-	int error;
-
-	if (scsi_device->type != TYPE_OSD)
-		return -ENODEV;
-
-	minor = ida_alloc_max(&osd_minor_ida, SCSI_OSD_MAX_MINOR, GFP_KERNEL);
-	if (minor == -ENOSPC)
-		return -EBUSY;
-	if (minor < 0)
-		return -ENODEV;
-
-	error = -ENOMEM;
-	oud = kzalloc(sizeof(*oud), GFP_KERNEL);
-	if (NULL == oud)
-		goto err_retract_minor;
-
-	/* class device member */
-	device_initialize(&oud->class_dev);
-	dev_set_drvdata(dev, oud);
-	oud->minor = minor;
-	oud->class_dev.devt = MKDEV(SCSI_OSD_MAJOR, oud->minor);
-	oud->class_dev.class = &osd_uld_class;
-	oud->class_dev.parent = dev;
-	oud->class_dev.release = __remove;
-
-	/* hold one more reference to the scsi_device that will get released
-	 * in __release, in case a logout is happening while fs is mounted
-	 */
-	if (scsi_device_get(scsi_device))
-		goto err_retract_minor;
-	osd_dev_init(&oud->od, scsi_device);
-
-	/* allocate a disk and set it up */
-	/* FIXME: do we need this since sg has already done that */
-	disk = alloc_disk(1);
-	if (!disk) {
-		OSD_ERR("alloc_disk failed\n");
-		goto err_free_osd;
-	}
-	disk->major = SCSI_OSD_MAJOR;
-	disk->first_minor = oud->minor;
-	sprintf(disk->disk_name, "osd%d", oud->minor);
-	oud->disk = disk;
-
-	/* Detect the OSD Version */
-	error = __detect_osd(oud);
-	if (error) {
-		OSD_ERR("osd detection failed, non-compatible OSD device\n");
-		goto err_free_osd;
-	}
-
-	/* init the char-device for communication with user-mode */
-	cdev_init(&oud->cdev, &osd_fops);
-	oud->cdev.owner = THIS_MODULE;
-
-	error = dev_set_name(&oud->class_dev, "%s", disk->disk_name);
-	if (error) {
-		OSD_ERR("dev_set_name failed => %d\n", error);
-		goto err_free_osd;
-	}
-
-	error = cdev_device_add(&oud->cdev, &oud->class_dev);
-	if (error) {
-		OSD_ERR("device_register failed => %d\n", error);
-		goto err_free_osd;
-	}
-
-	OSD_INFO("osd_probe %s\n", disk->disk_name);
-	return 0;
-
-err_free_osd:
-	put_device(&oud->class_dev);
-err_retract_minor:
-	ida_free(&osd_minor_ida, minor);
-	return error;
-}
-
-static int osd_remove(struct device *dev)
-{
-	struct scsi_device *scsi_device = to_scsi_device(dev);
-	struct osd_uld_device *oud = dev_get_drvdata(dev);
-
-	if (oud->od.scsi_device != scsi_device) {
-		OSD_ERR("Half cooked osd-device %p, || %p!=%p",
-			dev, oud->od.scsi_device, scsi_device);
-	}
-
-	cdev_device_del(&oud->cdev, &oud->class_dev);
-	ida_free(&osd_minor_ida, oud->minor);
-	put_device(&oud->class_dev);
-
-	return 0;
-}
-
-/*
- * Global driver and scsi registration
- */
-
-static struct scsi_driver osd_driver = {
-	.gendrv = {
-		.name		= osd_name,
-		.owner		= THIS_MODULE,
-		.probe		= osd_probe,
-		.remove		= osd_remove,
-	}
-};
-
-static int __init osd_uld_init(void)
-{
-	int err;
-
-	err = class_register(&osd_uld_class);
-	if (err) {
-		OSD_ERR("Unable to register sysfs class => %d\n", err);
-		return err;
-	}
-
-	err = register_chrdev_region(MKDEV(SCSI_OSD_MAJOR, 0),
-				     SCSI_OSD_MAX_MINOR, osd_name);
-	if (err) {
-		OSD_ERR("Unable to register major %d for osd ULD => %d\n",
-			SCSI_OSD_MAJOR, err);
-		goto err_out;
-	}
-
-	err = scsi_register_driver(&osd_driver.gendrv);
-	if (err) {
-		OSD_ERR("scsi_register_driver failed => %d\n", err);
-		goto err_out_chrdev;
-	}
-
-	OSD_INFO("LOADED %s\n", osd_version_string);
-	return 0;
-
-err_out_chrdev:
-	unregister_chrdev_region(MKDEV(SCSI_OSD_MAJOR, 0), SCSI_OSD_MAX_MINOR);
-err_out:
-	class_unregister(&osd_uld_class);
-	return err;
-}
-
-static void __exit osd_uld_exit(void)
-{
-	scsi_unregister_driver(&osd_driver.gendrv);
-	unregister_chrdev_region(MKDEV(SCSI_OSD_MAJOR, 0), SCSI_OSD_MAX_MINOR);
-	class_unregister(&osd_uld_class);
-	OSD_INFO("UNLOADED %s\n", osd_version_string);
-}
-
-module_init(osd_uld_init);
-module_exit(osd_uld_exit);
diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h
deleted file mode 100644
index 86a569d008b2..000000000000
--- a/include/scsi/osd_initiator.h
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * osd_initiator.h - OSD initiator API definition
- *
- * Copyright (C) 2008 Panasas Inc.  All rights reserved.
- *
- * Authors:
- *   Boaz Harrosh <ooo@electrozaur.com>
- *   Benny Halevy <bhalevy@panasas.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- *
- */
-#ifndef __OSD_INITIATOR_H__
-#define __OSD_INITIATOR_H__
-
-#include <scsi/osd_protocol.h>
-#include <scsi/osd_types.h>
-
-#include <linux/blkdev.h>
-#include <scsi/scsi_device.h>
-
-/* Note: "NI" in comments below means "Not Implemented yet" */
-
-/* Configure of code:
- * #undef if you *don't* want OSD v1 support in runtime.
- * If #defined the initiator will dynamically configure to encode OSD v1
- * CDB's if the target is detected to be OSD v1 only.
- * OSD v2 only commands, options, and attributes will be ignored if target
- * is v1 only.
- * If #defined will result in bigger/slower code (OK Slower maybe not)
- * Q: Should this be CONFIG_SCSI_OSD_VER1_SUPPORT and set from Kconfig?
- */
-#define OSD_VER1_SUPPORT y
-
-enum osd_std_version {
-	OSD_VER_NONE = 0,
-	OSD_VER1 = 1,
-	OSD_VER2 = 2,
-};
-
-/*
- * Object-based Storage Device.
- * This object represents an OSD device.
- * It is not a full linux device in any way. It is only
- * a place to hang resources associated with a Linux
- * request Q and some default properties.
- */
-struct osd_dev {
-	struct scsi_device *scsi_device;
-	unsigned def_timeout;
-
-#ifdef OSD_VER1_SUPPORT
-	enum osd_std_version version;
-#endif
-};
-
-/* Unique Identification of an OSD device */
-struct osd_dev_info {
-	unsigned systemid_len;
-	u8 systemid[OSD_SYSTEMID_LEN];
-	unsigned osdname_len;
-	u8 *osdname;
-};
-
-/* Retrieve/return osd_dev(s) for use by Kernel clients
- * Use IS_ERR/ERR_PTR on returned "osd_dev *".
- */
-struct osd_dev *osduld_path_lookup(const char *dev_name);
-struct osd_dev *osduld_info_lookup(const struct osd_dev_info *odi);
-void osduld_put_device(struct osd_dev *od);
-
-const struct osd_dev_info *osduld_device_info(struct osd_dev *od);
-bool osduld_device_same(struct osd_dev *od, const struct osd_dev_info *odi);
-
-/* Add/remove test ioctls from external modules */
-typedef int (do_test_fn)(struct osd_dev *od, unsigned cmd, unsigned long arg);
-int osduld_register_test(unsigned ioctl, do_test_fn *do_test);
-void osduld_unregister_test(unsigned ioctl);
-
-/* These are called by uld at probe time */
-void osd_dev_init(struct osd_dev *od, struct scsi_device *scsi_device);
-void osd_dev_fini(struct osd_dev *od);
-
-/**
- * osd_auto_detect_ver - Detect the OSD version, return Unique Identification
- *
- * @od:     OSD target lun handle
- * @caps:   Capabilities authorizing OSD root read attributes access
- * @odi:    Retrieved information uniquely identifying the osd target lun
- *          Note: odi->osdname must be kfreed by caller.
- *
- * Auto detects the OSD version of the OSD target and sets the @od
- * accordingly. Meanwhile also returns the "system id" and "osd name" root
- * attributes which uniquely identify the OSD target. This member is usually
- * called by the ULD. ULD users should call osduld_device_info().
- * This rutine allocates osd requests and memory at GFP_KERNEL level and might
- * sleep.
- */
-int osd_auto_detect_ver(struct osd_dev *od,
-	void *caps, struct osd_dev_info *odi);
-
-static inline struct request_queue *osd_request_queue(struct osd_dev *od)
-{
-	return od->scsi_device->request_queue;
-}
-
-/* we might want to use function vector in the future */
-static inline void osd_dev_set_ver(struct osd_dev *od, enum osd_std_version v)
-{
-#ifdef OSD_VER1_SUPPORT
-	od->version = v;
-#endif
-}
-
-static inline bool osd_dev_is_ver1(struct osd_dev *od)
-{
-#ifdef OSD_VER1_SUPPORT
-	return od->version == OSD_VER1;
-#else
-	return false;
-#endif
-}
-
-struct osd_request;
-typedef void (osd_req_done_fn)(struct osd_request *or, void *private);
-
-struct osd_request {
-	struct osd_cdb cdb;
-	struct osd_data_out_integrity_info out_data_integ;
-	struct osd_data_in_integrity_info in_data_integ;
-
-	struct osd_dev *osd_dev;
-	struct request *request;
-
-	struct _osd_req_data_segment {
-		void *buff;
-		unsigned alloc_size; /* 0 here means: don't call kfree */
-		unsigned total_bytes;
-	} cdb_cont, set_attr, enc_get_attr, get_attr;
-
-	struct _osd_io_info {
-		struct bio *bio;
-		u64 total_bytes;
-		u64 residual;
-		struct request *req;
-		struct _osd_req_data_segment *last_seg;
-		u8 *pad_buff;
-	} out, in;
-
-	unsigned timeout;
-	unsigned retries;
-	unsigned sense_len;
-	u8 sense[OSD_MAX_SENSE_LEN];
-	enum osd_attributes_mode attributes_mode;
-
-	osd_req_done_fn *async_done;
-	void *async_private;
-	blk_status_t async_error;
-	int req_errors;
-};
-
-static inline bool osd_req_is_ver1(struct osd_request *or)
-{
-	return osd_dev_is_ver1(or->osd_dev);
-}
-
-/*
- * How to use the osd library:
- *
- * osd_start_request
- *	Allocates a request.
- *
- * osd_req_*
- *	Call one of, to encode the desired operation.
- *
- * osd_add_{get,set}_attr
- *	Optionally add attributes to the CDB, list or page mode.
- *
- * osd_finalize_request
- *	Computes final data out/in offsets and signs the request,
- *	making it ready for execution.
- *
- * osd_execute_request
- *	May be called to execute it through the block layer. Other wise submit
- *	the associated block request in some other way.
- *
- * After execution:
- * osd_req_decode_sense
- *	Decodes sense information to verify execution results.
- *
- * osd_req_decode_get_attr
- *	Retrieve osd_add_get_attr_list() values if used.
- *
- * osd_end_request
- *	Must be called to deallocate the request.
- */
-
-/**
- * osd_start_request - Allocate and initialize an osd_request
- *
- * @osd_dev:    OSD device that holds the scsi-device and default values
- *              that the request is associated with.
- *
- * Allocate osd_request and initialize all members to the
- * default/initial state.
- */
-struct osd_request *osd_start_request(struct osd_dev *od);
-
-enum osd_req_options {
-	OSD_REQ_FUA = 0x08,	/* Force Unit Access */
-	OSD_REQ_DPO = 0x10,	/* Disable Page Out */
-
-	OSD_REQ_BYPASS_TIMESTAMPS = 0x80,
-};
-
-/**
- * osd_finalize_request - Sign request and prepare request for execution
- *
- * @or:		osd_request to prepare
- * @options:	combination of osd_req_options bit flags or 0.
- * @cap:	A Pointer to an OSD_CAP_LEN bytes buffer that is received from
- *              The security manager as capabilities for this cdb.
- * @cap_key:	The cryptographic key used to sign the cdb/data. Can be null
- *              if NOSEC is used.
- *
- * The actual request and bios are only allocated here, so are the get_attr
- * buffers that will receive the returned attributes. Copy's @cap to cdb.
- * Sign the cdb/data with @cap_key.
- */
-int osd_finalize_request(struct osd_request *or,
-	u8 options, const void *cap, const u8 *cap_key);
-
-/**
- * osd_execute_request - Execute the request synchronously through block-layer
- *
- * @or:		osd_request to Executed
- *
- * Calls blk_execute_rq to q the command and waits for completion.
- */
-int osd_execute_request(struct osd_request *or);
-
-/**
- * osd_execute_request_async - Execute the request without waitting.
- *
- * @or:                      - osd_request to Executed
- * @done: (Optional)         - Called at end of execution
- * @private:                 - Will be passed to @done function
- *
- * Calls blk_execute_rq_nowait to queue the command. When execution is done
- * optionally calls @done with @private as parameter. @or->async_error will
- * have the return code
- */
-int osd_execute_request_async(struct osd_request *or,
-	osd_req_done_fn *done, void *private);
-
-/**
- * osd_req_decode_sense_full - Decode sense information after execution.
- *
- * @or:           - osd_request to examine
- * @osi           - Receives a more detailed error report information (optional).
- * @silent        - Do not print to dmsg (Even if enabled)
- * @bad_obj_list  - Some commands act on multiple objects. Failed objects will
- *                  be received here (optional)
- * @max_obj       - Size of @bad_obj_list.
- * @bad_attr_list - List of failing attributes (optional)
- * @max_attr      - Size of @bad_attr_list.
- *
- * After execution, osd_request results are analyzed using this function. The
- * return code is the final disposition on the error. So it is possible that a
- * CHECK_CONDITION was returned from target but this will return NO_ERROR, for
- * example on recovered errors. All parameters are optional if caller does
- * not need any returned information.
- * Note: This function will also dump the error to dmsg according to settings
- * of the SCSI_OSD_DPRINT_SENSE Kconfig value. Set @silent if you know the
- * command would routinely fail, to not spam the dmsg file.
- */
-
-/**
- * osd_err_priority - osd categorized return codes in ascending severity.
- *
- * The categories are borrowed from the pnfs_osd_errno enum.
- * See comments for translated Linux codes returned by osd_req_decode_sense.
- */
-enum osd_err_priority {
-	OSD_ERR_PRI_NO_ERROR	= 0,
-	/* Recoverable, caller should clear_highpage() all pages */
-	OSD_ERR_PRI_CLEAR_PAGES = 1, /* -EFAULT */
-	OSD_ERR_PRI_RESOURCE	= 2, /* -ENOMEM */
-	OSD_ERR_PRI_BAD_CRED	= 3, /* -EINVAL */
-	OSD_ERR_PRI_NO_ACCESS	= 4, /* -EACCES */
-	OSD_ERR_PRI_UNREACHABLE	= 5, /* any other */
-	OSD_ERR_PRI_NOT_FOUND	= 6, /* -ENOENT */
-	OSD_ERR_PRI_NO_SPACE	= 7, /* -ENOSPC */
-	OSD_ERR_PRI_EIO		= 8, /* -EIO    */
-};
-
-struct osd_sense_info {
-	enum osd_err_priority osd_err_pri;
-
-	int key;		/* one of enum scsi_sense_keys */
-	int additional_code ;	/* enum osd_additional_sense_codes */
-	union { /* Sense specific information */
-		u16 sense_info;
-		u16 cdb_field_offset; 	/* scsi_invalid_field_in_cdb */
-	};
-	union { /* Command specific information */
-		u64 command_info;
-	};
-
-	u32 not_initiated_command_functions; /* osd_command_functions_bits */
-	u32 completed_command_functions; /* osd_command_functions_bits */
-	struct osd_obj_id obj;
-	struct osd_attr attr;
-};
-
-int osd_req_decode_sense_full(struct osd_request *or,
-	struct osd_sense_info *osi, bool silent,
-	struct osd_obj_id *bad_obj_list, int max_obj,
-	struct osd_attr *bad_attr_list, int max_attr);
-
-static inline int osd_req_decode_sense(struct osd_request *or,
-	struct osd_sense_info *osi)
-{
-	return osd_req_decode_sense_full(or, osi, false, NULL, 0, NULL, 0);
-}
-
-/**
- * osd_end_request - return osd_request to free store
- *
- * @or:		osd_request to free
- *
- * Deallocate all osd_request resources (struct req's, BIOs, buffers, etc.)
- */
-void osd_end_request(struct osd_request *or);
-
-/*
- * CDB Encoding
- *
- * Note: call only one of the following methods.
- */
-
-/*
- * Device commands
- */
-void osd_req_set_master_seed_xchg(struct osd_request *or, ...);/* NI */
-void osd_req_set_master_key(struct osd_request *or, ...);/* NI */
-
-void osd_req_format(struct osd_request *or, u64 tot_capacity);
-
-/* list all partitions
- * @list header must be initialized to zero on first run.
- *
- * Call osd_is_obj_list_done() to find if we got the complete list.
- */
-int osd_req_list_dev_partitions(struct osd_request *or,
-	osd_id initial_id, struct osd_obj_id_list *list, unsigned nelem);
-
-void osd_req_flush_obsd(struct osd_request *or,
-	enum osd_options_flush_scope_values);
-
-void osd_req_perform_scsi_command(struct osd_request *or,
-	const u8 *cdb, ...);/* NI */
-void osd_req_task_management(struct osd_request *or, ...);/* NI */
-
-/*
- * Partition commands
- */
-void osd_req_create_partition(struct osd_request *or, osd_id partition);
-void osd_req_remove_partition(struct osd_request *or, osd_id partition);
-
-void osd_req_set_partition_key(struct osd_request *or,
-	osd_id partition, u8 new_key_id[OSD_CRYPTO_KEYID_SIZE],
-	u8 seed[OSD_CRYPTO_SEED_SIZE]);/* NI */
-
-/* list all collections in the partition
- * @list header must be init to zero on first run.
- *
- * Call osd_is_obj_list_done() to find if we got the complete list.
- */
-int osd_req_list_partition_collections(struct osd_request *or,
-	osd_id partition, osd_id initial_id, struct osd_obj_id_list *list,
-	unsigned nelem);
-
-/* list all objects in the partition
- * @list header must be init to zero on first run.
- *
- * Call osd_is_obj_list_done() to find if we got the complete list.
- */
-int osd_req_list_partition_objects(struct osd_request *or,
-	osd_id partition, osd_id initial_id, struct osd_obj_id_list *list,
-	unsigned nelem);
-
-void osd_req_flush_partition(struct osd_request *or,
-	osd_id partition, enum osd_options_flush_scope_values);
-
-/*
- * Collection commands
- */
-void osd_req_create_collection(struct osd_request *or,
-	const struct osd_obj_id *);/* NI */
-void osd_req_remove_collection(struct osd_request *or,
-	const struct osd_obj_id *);/* NI */
-
-/* list all objects in the collection */
-int osd_req_list_collection_objects(struct osd_request *or,
-	const struct osd_obj_id *, osd_id initial_id,
-	struct osd_obj_id_list *list, unsigned nelem);
-
-/* V2 only filtered list of objects in the collection */
-void osd_req_query(struct osd_request *or, ...);/* NI */
-
-void osd_req_flush_collection(struct osd_request *or,
-	const struct osd_obj_id *, enum osd_options_flush_scope_values);
-
-void osd_req_get_member_attrs(struct osd_request *or, ...);/* V2-only NI */
-void osd_req_set_member_attrs(struct osd_request *or, ...);/* V2-only NI */
-
-/*
- * Object commands
- */
-void osd_req_create_object(struct osd_request *or, struct osd_obj_id *);
-void osd_req_remove_object(struct osd_request *or, struct osd_obj_id *);
-
-void osd_req_write(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, struct bio *bio, u64 len);
-int osd_req_write_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
-void osd_req_append(struct osd_request *or,
-	const struct osd_obj_id *, struct bio *data_out);/* NI */
-void osd_req_create_write(struct osd_request *or,
-	const struct osd_obj_id *, struct bio *data_out, u64 offset);/* NI */
-void osd_req_clear(struct osd_request *or,
-	const struct osd_obj_id *, u64 offset, u64 len);/* NI */
-void osd_req_punch(struct osd_request *or,
-	const struct osd_obj_id *, u64 offset, u64 len);/* V2-only NI */
-
-void osd_req_flush_object(struct osd_request *or,
-	const struct osd_obj_id *, enum osd_options_flush_scope_values,
-	/*V2*/ u64 offset, /*V2*/ u64 len);
-
-void osd_req_read(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, struct bio *bio, u64 len);
-int osd_req_read_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
-
-/* Scatter/Gather write/read commands */
-int osd_req_write_sg(struct osd_request *or,
-	const struct osd_obj_id *obj, struct bio *bio,
-	const struct osd_sg_entry *sglist, unsigned numentries);
-int osd_req_read_sg(struct osd_request *or,
-	const struct osd_obj_id *obj, struct bio *bio,
-	const struct osd_sg_entry *sglist, unsigned numentries);
-int osd_req_write_sg_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, void **buff,
-	const struct osd_sg_entry *sglist, unsigned numentries);
-int osd_req_read_sg_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, void **buff,
-	const struct osd_sg_entry *sglist, unsigned numentries);
-
-/*
- * Root/Partition/Collection/Object Attributes commands
- */
-
-/* get before set */
-void osd_req_get_attributes(struct osd_request *or, const struct osd_obj_id *);
-
-/* set before get */
-void osd_req_set_attributes(struct osd_request *or, const struct osd_obj_id *);
-
-/*
- * Attributes appended to most commands
- */
-
-/* Attributes List mode (or V2 CDB) */
-  /*
-   * TODO: In ver2 if at finalize time only one attr was set and no gets,
-   * then the Attributes CDB mode is used automatically to save IO.
-   */
-
-/* set a list of attributes. */
-int osd_req_add_set_attr_list(struct osd_request *or,
-	const struct osd_attr *, unsigned nelem);
-
-/* get a list of attributes */
-int osd_req_add_get_attr_list(struct osd_request *or,
-	const struct osd_attr *, unsigned nelem);
-
-/*
- * Attributes list decoding
- * Must be called after osd_request.request was executed
- * It is called in a loop to decode the returned get_attr
- * (see osd_add_get_attr)
- */
-int osd_req_decode_get_attr_list(struct osd_request *or,
-	struct osd_attr *, int *nelem, void **iterator);
-
-/* Attributes Page mode */
-
-/*
- * Read an attribute page and optionally set one attribute
- *
- * Retrieves the attribute page directly to a user buffer.
- * @attr_page_data shall stay valid until end of execution.
- * See osd_attributes.h for common page structures
- */
-int osd_req_add_get_attr_page(struct osd_request *or,
-	u32 page_id, void *attr_page_data, unsigned max_page_len,
-	const struct osd_attr *set_one);
-
-#endif /* __OSD_LIB_H__ */
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
deleted file mode 100644
index 7a8d2cd30328..000000000000
--- a/include/scsi/osd_ore.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (C) 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Public Declarations of the ORE API
- *
- * This file is part of the ORE (Object Raid Engine) library.
- *
- * ORE is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation. (GPL v2)
- *
- * ORE is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with the ORE; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __ORE_H__
-#define __ORE_H__
-
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_sec.h>
-#include <linux/pnfs_osd_xdr.h>
-#include <linux/bug.h>
-
-struct ore_comp {
-	struct osd_obj_id	obj;
-	u8			cred[OSD_CAP_LEN];
-};
-
-struct ore_layout {
-	/* Our way of looking at the data_map */
-	enum pnfs_osd_raid_algorithm4
-		 raid_algorithm;
-	unsigned stripe_unit;
-	unsigned mirrors_p1;
-
-	unsigned group_width;
-	unsigned parity;
-	u64	 group_depth;
-	unsigned group_count;
-
-	/* Cached often needed calculations filled in by
-	 * ore_verify_layout
-	 */
-	unsigned long max_io_length;	/* Max length that should be passed to
-					 * ore_get_rw_state
-					 */
-};
-
-struct ore_dev {
-	struct osd_dev *od;
-};
-
-struct ore_components {
-	unsigned	first_dev;		/* First logical device no    */
-	unsigned	numdevs;		/* Num of devices in array    */
-	/* If @single_comp == EC_SINGLE_COMP, @comps points to a single
-	 * component. else there are @numdevs components
-	 */
-	enum EC_COMP_USAGE {
-		EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff
-	}		single_comp;
-	struct ore_comp	*comps;
-
-	/* Array of pointers to ore_dev-* . User will usually have these pointed
-	 * too a bigger struct which contain an "ore_dev ored" member and use
-	 * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger
-	 * structure.
-	 */
-	struct ore_dev	**ods;
-};
-
-/* ore_comp_dev Recievies a logical device index */
-static inline struct osd_dev *ore_comp_dev(
-	const struct ore_components *oc, unsigned i)
-{
-	BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i));
-	return oc->ods[i - oc->first_dev]->od;
-}
-
-static inline void ore_comp_set_dev(
-	struct ore_components *oc, unsigned i, struct osd_dev *od)
-{
-	oc->ods[i - oc->first_dev]->od = od;
-}
-
-struct ore_striping_info {
-	u64 offset;
-	u64 obj_offset;
-	u64 length;
-	u64 first_stripe_start; /* only used in raid writes */
-	u64 M; /* for truncate */
-	unsigned bytes_in_stripe;
-	unsigned dev;
-	unsigned par_dev;
-	unsigned unit_off;
-	unsigned cur_pg;
-	unsigned cur_comp;
-	unsigned maxdevUnits;
-};
-
-struct ore_io_state;
-typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private);
-struct _ore_r4w_op {
-	/* @Priv given here is passed ios->private */
-	struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate);
-	void (*put_page)(void *priv, struct page *page);
-};
-
-struct ore_io_state {
-	struct kref		kref;
-	struct ore_striping_info si;
-
-	void			*private;
-	ore_io_done_fn	done;
-
-	struct ore_layout	*layout;
-	struct ore_components	*oc;
-
-	/* Global read/write IO*/
-	loff_t			offset;
-	unsigned long		length;
-	void			*kern_buff;
-
-	struct page		**pages;
-	unsigned		nr_pages;
-	unsigned		pgbase;
-	unsigned		pages_consumed;
-
-	/* Attributes */
-	unsigned		in_attr_len;
-	struct osd_attr		*in_attr;
-	unsigned		out_attr_len;
-	struct osd_attr		*out_attr;
-
-	bool			reading;
-
-	/* House keeping of Parity pages */
-	bool			extra_part_alloc;
-	struct page		**parity_pages;
-	unsigned		max_par_pages;
-	unsigned		cur_par_page;
-	unsigned		sgs_per_dev;
-	struct __stripe_pages_2d *sp2d;
-	struct ore_io_state	 *ios_read_4_write;
-	const struct _ore_r4w_op *r4w;
-
-	/* Variable array of size numdevs */
-	unsigned numdevs;
-	struct ore_per_dev_state {
-		struct osd_request *or;
-		struct bio *bio;
-		loff_t offset;
-		unsigned length;
-		unsigned last_sgs_total;
-		unsigned dev;
-		struct osd_sg_entry *sglist;
-		unsigned cur_sg;
-	} per_dev[];
-};
-
-static inline unsigned ore_io_state_size(unsigned numdevs)
-{
-	return sizeof(struct ore_io_state) +
-		sizeof(struct ore_per_dev_state) * numdevs;
-}
-
-/* ore.c */
-int ore_verify_layout(unsigned total_comps, struct ore_layout *layout);
-void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
-			  u64 length, struct ore_striping_info *si);
-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
-		     bool is_reading, u64 offset, u64 length,
-		     struct ore_io_state **ios);
-int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
-		     struct ore_io_state **ios);
-void ore_put_io_state(struct ore_io_state *ios);
-
-typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od,
-	unsigned dev_index, enum osd_err_priority oep,
-	u64 dev_offset, u64  dev_len);
-int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep);
-
-int ore_create(struct ore_io_state *ios);
-int ore_remove(struct ore_io_state *ios);
-int ore_write(struct ore_io_state *ios);
-int ore_read(struct ore_io_state *ios);
-int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
-		 u64 size);
-
-int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr);
-
-extern const struct osd_attr g_attr_logical_length;
-
-#endif
-- 
cgit v1.2.3


From ae3d56d81507c33024ba7c1eae2ef433aa9bc0d5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 29 Jan 2019 09:33:07 +0100
Subject: scsi: remove bidirectional command support

No real need for bidi support once the OSD code is gone.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/cxgbi/libcxgbi.c      | 13 +++---
 drivers/scsi/iscsi_tcp.c           |  9 +---
 drivers/scsi/libiscsi.c            | 64 +++-------------------------
 drivers/scsi/libiscsi_tcp.c        |  8 ++--
 drivers/scsi/scsi_debug.c          | 85 ++++----------------------------------
 drivers/scsi/scsi_error.c          |  3 --
 drivers/scsi/scsi_lib.c            | 60 ++-------------------------
 drivers/scsi/virtio_scsi.c         | 14 ++-----
 drivers/target/loopback/tcm_loop.c | 15 -------
 drivers/usb/storage/uas.c          | 11 +----
 include/scsi/scsi_cmnd.h           | 19 +--------
 include/scsi/scsi_eh.h             |  1 -
 12 files changed, 34 insertions(+), 268 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 75f876409fb9..4466ae5c9a74 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -1211,7 +1211,7 @@ scmd_get_params(struct scsi_cmnd *sc, struct scatterlist **sgl,
 		unsigned int *sgcnt, unsigned int *dlen,
 		unsigned int prot)
 {
-	struct scsi_data_buffer *sdb = prot ? scsi_prot(sc) : scsi_out(sc);
+	struct scsi_data_buffer *sdb = prot ? scsi_prot(sc) : &sc->sdb;
 
 	*sgl = sdb->table.sgl;
 	*sgcnt = sdb->table.nents;
@@ -1427,8 +1427,7 @@ static void task_release_itt(struct iscsi_task *task, itt_t hdr_itt)
 	log_debug(1 << CXGBI_DBG_DDP,
 		  "cdev 0x%p, task 0x%p, release tag 0x%x.\n",
 		  cdev, task, tag);
-	if (sc &&
-	    (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_FROM_DEVICE) &&
+	if (sc && sc->sc_data_direction == DMA_FROM_DEVICE &&
 	    cxgbi_ppm_is_ddp_tag(ppm, tag)) {
 		struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task);
 		struct cxgbi_task_tag_info *ttinfo = &tdata->ttinfo;
@@ -1460,9 +1459,7 @@ static int task_reserve_itt(struct iscsi_task *task, itt_t *hdr_itt)
 	u32 tag = 0;
 	int err = -EINVAL;
 
-	if (sc &&
-	    (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_FROM_DEVICE)
-	) {
+	if (sc && sc->sc_data_direction == DMA_FROM_DEVICE) {
 		struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task);
 		struct cxgbi_task_tag_info *ttinfo = &tdata->ttinfo;
 
@@ -1896,7 +1893,7 @@ int cxgbi_conn_alloc_pdu(struct iscsi_task *task, u8 opcode)
 	if (SKB_MAX_HEAD(cdev->skb_tx_rsvd) > (512 * MAX_SKB_FRAGS) &&
 	    (opcode == ISCSI_OP_SCSI_DATA_OUT ||
 	     (opcode == ISCSI_OP_SCSI_CMD &&
-	      (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_TO_DEVICE))))
+	      sc->sc_data_direction == DMA_TO_DEVICE)))
 		/* data could goes into skb head */
 		headroom += min_t(unsigned int,
 				SKB_MAX_HEAD(cdev->skb_tx_rsvd),
@@ -1971,7 +1968,7 @@ int cxgbi_conn_init_pdu(struct iscsi_task *task, unsigned int offset,
 		return 0;
 
 	if (task->sc) {
-		struct scsi_data_buffer *sdb = scsi_out(task->sc);
+		struct scsi_data_buffer *sdb = &task->sc->sdb;
 		struct scatterlist *sg = NULL;
 		int err;
 
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index cae6368ebb98..ed3debce2819 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -518,7 +518,7 @@ static int iscsi_sw_tcp_pdu_init(struct iscsi_task *task,
 	if (!task->sc)
 		iscsi_sw_tcp_send_linear_data_prep(conn, task->data, count);
 	else {
-		struct scsi_data_buffer *sdb = scsi_out(task->sc);
+		struct scsi_data_buffer *sdb = &task->sc->sdb;
 
 		err = iscsi_sw_tcp_send_data_prep(conn, sdb->table.sgl,
 						  sdb->table.nents, offset,
@@ -952,12 +952,6 @@ static umode_t iscsi_sw_tcp_attr_is_visible(int param_type, int param)
 	return 0;
 }
 
-static int iscsi_sw_tcp_slave_alloc(struct scsi_device *sdev)
-{
-	blk_queue_flag_set(QUEUE_FLAG_BIDI, sdev->request_queue);
-	return 0;
-}
-
 static int iscsi_sw_tcp_slave_configure(struct scsi_device *sdev)
 {
 	struct iscsi_sw_tcp_host *tcp_sw_host = iscsi_host_priv(sdev->host);
@@ -985,7 +979,6 @@ static struct scsi_host_template iscsi_sw_tcp_sht = {
 	.eh_device_reset_handler= iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
 	.dma_boundary		= PAGE_SIZE - 1,
-	.slave_alloc            = iscsi_sw_tcp_slave_alloc,
 	.slave_configure        = iscsi_sw_tcp_slave_configure,
 	.target_alloc		= iscsi_target_alloc,
 	.proc_name		= "iscsi_tcp",
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index b8d325ce8754..bca3a8636c27 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -228,32 +228,6 @@ static int iscsi_prep_ecdb_ahs(struct iscsi_task *task)
 	return 0;
 }
 
-static int iscsi_prep_bidi_ahs(struct iscsi_task *task)
-{
-	struct scsi_cmnd *sc = task->sc;
-	struct iscsi_rlength_ahdr *rlen_ahdr;
-	int rc;
-
-	rlen_ahdr = iscsi_next_hdr(task);
-	rc = iscsi_add_hdr(task, sizeof(*rlen_ahdr));
-	if (rc)
-		return rc;
-
-	rlen_ahdr->ahslength =
-		cpu_to_be16(sizeof(rlen_ahdr->read_length) +
-						  sizeof(rlen_ahdr->reserved));
-	rlen_ahdr->ahstype = ISCSI_AHSTYPE_RLENGTH;
-	rlen_ahdr->reserved = 0;
-	rlen_ahdr->read_length = cpu_to_be32(scsi_in(sc)->length);
-
-	ISCSI_DBG_SESSION(task->conn->session,
-			  "bidi-in rlen_ahdr->read_length(%d) "
-		          "rlen_ahdr->ahslength(%d)\n",
-		          be32_to_cpu(rlen_ahdr->read_length),
-		          be16_to_cpu(rlen_ahdr->ahslength));
-	return 0;
-}
-
 /**
  * iscsi_check_tmf_restrictions - check if a task is affected by TMF
  * @task: iscsi task
@@ -392,13 +366,6 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
 	memcpy(hdr->cdb, sc->cmnd, cmd_len);
 
 	task->imm_count = 0;
-	if (scsi_bidi_cmnd(sc)) {
-		hdr->flags |= ISCSI_FLAG_CMD_READ;
-		rc = iscsi_prep_bidi_ahs(task);
-		if (rc)
-			return rc;
-	}
-
 	if (scsi_get_prot_op(sc) != SCSI_PROT_NORMAL)
 		task->protected = true;
 
@@ -473,12 +440,10 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
 
 	conn->scsicmd_pdus_cnt++;
 	ISCSI_DBG_SESSION(session, "iscsi prep [%s cid %d sc %p cdb 0x%x "
-			  "itt 0x%x len %d bidi_len %d cmdsn %d win %d]\n",
-			  scsi_bidi_cmnd(sc) ? "bidirectional" :
+			  "itt 0x%x len %d cmdsn %d win %d]\n",
 			  sc->sc_data_direction == DMA_TO_DEVICE ?
 			  "write" : "read", conn->id, sc, sc->cmnd[0],
 			  task->itt, transfer_length,
-			  scsi_bidi_cmnd(sc) ? scsi_in(sc)->length : 0,
 			  session->cmdsn,
 			  session->max_cmdsn - session->exp_cmdsn + 1);
 	return 0;
@@ -647,12 +612,7 @@ static void fail_scsi_task(struct iscsi_task *task, int err)
 		state = ISCSI_TASK_ABRT_TMF;
 
 	sc->result = err << 16;
-	if (!scsi_bidi_cmnd(sc))
-		scsi_set_resid(sc, scsi_bufflen(sc));
-	else {
-		scsi_out(sc)->resid = scsi_out(sc)->length;
-		scsi_in(sc)->resid = scsi_in(sc)->length;
-	}
+	scsi_set_resid(sc, scsi_bufflen(sc));
 
 	/* regular RX path uses back_lock */
 	spin_lock_bh(&conn->session->back_lock);
@@ -907,14 +867,7 @@ invalid_datalen:
 
 	if (rhdr->flags & (ISCSI_FLAG_CMD_BIDI_UNDERFLOW |
 			   ISCSI_FLAG_CMD_BIDI_OVERFLOW)) {
-		int res_count = be32_to_cpu(rhdr->bi_residual_count);
-
-		if (scsi_bidi_cmnd(sc) && res_count > 0 &&
-				(rhdr->flags & ISCSI_FLAG_CMD_BIDI_OVERFLOW ||
-				 res_count <= scsi_in(sc)->length))
-			scsi_in(sc)->resid = res_count;
-		else
-			sc->result = (DID_BAD_TARGET << 16) | rhdr->cmd_status;
+		sc->result = (DID_BAD_TARGET << 16) | rhdr->cmd_status;
 	}
 
 	if (rhdr->flags & (ISCSI_FLAG_CMD_UNDERFLOW |
@@ -961,8 +914,8 @@ iscsi_data_in_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
 
 		if (res_count > 0 &&
 		    (rhdr->flags & ISCSI_FLAG_CMD_OVERFLOW ||
-		     res_count <= scsi_in(sc)->length))
-			scsi_in(sc)->resid = res_count;
+		     res_count <= sc->sdb.length))
+			sc->sdb.resid = res_count;
 		else
 			sc->result = (DID_BAD_TARGET << 16) | rhdr->cmd_status;
 	}
@@ -1804,12 +1757,7 @@ fault:
 	spin_unlock_bh(&session->frwd_lock);
 	ISCSI_DBG_SESSION(session, "iscsi: cmd 0x%x is not queued (%d)\n",
 			  sc->cmnd[0], reason);
-	if (!scsi_bidi_cmnd(sc))
-		scsi_set_resid(sc, scsi_bufflen(sc));
-	else {
-		scsi_out(sc)->resid = scsi_out(sc)->length;
-		scsi_in(sc)->resid = scsi_in(sc)->length;
-	}
+	scsi_set_resid(sc, scsi_bufflen(sc));
 	sc->scsi_done(sc);
 	return 0;
 }
diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
index 8a6b1b3f8277..9923e9e3b884 100644
--- a/drivers/scsi/libiscsi_tcp.c
+++ b/drivers/scsi/libiscsi_tcp.c
@@ -495,7 +495,7 @@ static int iscsi_tcp_data_in(struct iscsi_conn *conn, struct iscsi_task *task)
 	struct iscsi_tcp_task *tcp_task = task->dd_data;
 	struct iscsi_data_rsp *rhdr = (struct iscsi_data_rsp *)tcp_conn->in.hdr;
 	int datasn = be32_to_cpu(rhdr->datasn);
-	unsigned total_in_length = scsi_in(task->sc)->length;
+	unsigned total_in_length = task->sc->sdb.length;
 
 	/*
 	 * lib iscsi will update this in the completion handling if there
@@ -580,11 +580,11 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
 			      data_length, session->max_burst);
 
 	data_offset = be32_to_cpu(rhdr->data_offset);
-	if (data_offset + data_length > scsi_out(task->sc)->length) {
+	if (data_offset + data_length > task->sc->sdb.length) {
 		iscsi_conn_printk(KERN_ERR, conn,
 				  "invalid R2T with data len %u at offset %u "
 				  "and total length %d\n", data_length,
-				  data_offset, scsi_out(task->sc)->length);
+				  data_offset, task->sc->sdb.length);
 		return ISCSI_ERR_DATALEN;
 	}
 
@@ -696,7 +696,7 @@ iscsi_tcp_hdr_dissect(struct iscsi_conn *conn, struct iscsi_hdr *hdr)
 		if (tcp_conn->in.datalen) {
 			struct iscsi_tcp_task *tcp_task = task->dd_data;
 			struct ahash_request *rx_hash = NULL;
-			struct scsi_data_buffer *sdb = scsi_in(task->sc);
+			struct scsi_data_buffer *sdb = &task->sc->sdb;
 
 			/*
 			 * Setup copy of Data-In into the struct scsi_cmnd
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 661512bec3ac..8044bb08455d 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -430,7 +430,6 @@ static int resp_rsup_opcodes(struct scsi_cmnd *, struct sdebug_dev_info *);
 static int resp_rsup_tmfs(struct scsi_cmnd *, struct sdebug_dev_info *);
 static int resp_write_same_10(struct scsi_cmnd *, struct sdebug_dev_info *);
 static int resp_write_same_16(struct scsi_cmnd *, struct sdebug_dev_info *);
-static int resp_xdwriteread_10(struct scsi_cmnd *, struct sdebug_dev_info *);
 static int resp_comp_write(struct scsi_cmnd *, struct sdebug_dev_info *);
 static int resp_write_buffer(struct scsi_cmnd *, struct sdebug_dev_info *);
 static int resp_sync_cache(struct scsi_cmnd *, struct sdebug_dev_info *);
@@ -600,9 +599,6 @@ static const struct opcode_info_t opcode_info_arr[SDEB_I_LAST_ELEMENT + 1] = {
 	{0, 0x42, 0, F_D_OUT | FF_MEDIA_IO, resp_unmap, NULL, /* UNMAP */
 	    {10,  0x1, 0, 0, 0, 0, 0x3f, 0xff, 0xff, 0xc7, 0, 0, 0, 0, 0, 0} },
 /* 25 */
-	{0, 0x53, 0, F_D_IN | F_D_OUT | FF_MEDIA_IO, resp_xdwriteread_10,
-	    NULL, {10,  0xff, 0xff, 0xff, 0xff, 0xff, 0x3f, 0xff, 0xff, 0xc7,
-		   0, 0, 0, 0, 0, 0} },		/* XDWRITEREAD(10) */
 	{0, 0x3b, 0, F_D_OUT_MAYBE, resp_write_buffer, NULL,
 	    {10,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xc7, 0, 0,
 	     0, 0, 0, 0} },			/* WRITE_BUFFER */
@@ -1010,11 +1006,11 @@ static int fill_from_dev_buffer(struct scsi_cmnd *scp, unsigned char *arr,
 				int arr_len)
 {
 	int act_len;
-	struct scsi_data_buffer *sdb = scsi_in(scp);
+	struct scsi_data_buffer *sdb = &scp->sdb;
 
 	if (!sdb->length)
 		return 0;
-	if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
+	if (scp->sc_data_direction != DMA_FROM_DEVICE)
 		return DID_ERROR << 16;
 
 	act_len = sg_copy_from_buffer(sdb->table.sgl, sdb->table.nents,
@@ -1033,12 +1029,12 @@ static int p_fill_from_dev_buffer(struct scsi_cmnd *scp, const void *arr,
 				  int arr_len, unsigned int off_dst)
 {
 	int act_len, n;
-	struct scsi_data_buffer *sdb = scsi_in(scp);
+	struct scsi_data_buffer *sdb = &scp->sdb;
 	off_t skip = off_dst;
 
 	if (sdb->length <= off_dst)
 		return 0;
-	if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
+	if (scp->sc_data_direction != DMA_FROM_DEVICE)
 		return DID_ERROR << 16;
 
 	act_len = sg_pcopy_from_buffer(sdb->table.sgl, sdb->table.nents,
@@ -1058,7 +1054,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd *scp, unsigned char *arr,
 {
 	if (!scsi_bufflen(scp))
 		return 0;
-	if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE))
+	if (scp->sc_data_direction != DMA_TO_DEVICE)
 		return -1;
 
 	return scsi_sg_copy_to_buffer(scp, arr, arr_len);
@@ -2477,21 +2473,19 @@ static int do_device_access(struct scsi_cmnd *scmd, u32 sg_skip, u64 lba,
 {
 	int ret;
 	u64 block, rest = 0;
-	struct scsi_data_buffer *sdb;
+	struct scsi_data_buffer *sdb = &scmd->sdb;
 	enum dma_data_direction dir;
 
 	if (do_write) {
-		sdb = scsi_out(scmd);
 		dir = DMA_TO_DEVICE;
 		write_since_sync = true;
 	} else {
-		sdb = scsi_in(scmd);
 		dir = DMA_FROM_DEVICE;
 	}
 
 	if (!sdb->length)
 		return 0;
-	if (!(scsi_bidi_cmnd(scmd) || scmd->sc_data_direction == dir))
+	if (scmd->sc_data_direction != dir)
 		return -1;
 
 	block = do_div(lba, sdebug_store_sectors);
@@ -2774,7 +2768,7 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip)
 	if (unlikely(ret == -1))
 		return DID_ERROR << 16;
 
-	scsi_in(scp)->resid = scsi_bufflen(scp) - ret;
+	scp->sdb.resid = scsi_bufflen(scp) - ret;
 
 	if (unlikely(sqcp)) {
 		if (sqcp->inj_recovered) {
@@ -3718,68 +3712,6 @@ static int resp_report_luns(struct scsi_cmnd *scp,
 	return res;
 }
 
-static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
-			    unsigned int num, struct sdebug_dev_info *devip)
-{
-	int j;
-	unsigned char *kaddr, *buf;
-	unsigned int offset;
-	struct scsi_data_buffer *sdb = scsi_in(scp);
-	struct sg_mapping_iter miter;
-
-	/* better not to use temporary buffer. */
-	buf = kzalloc(scsi_bufflen(scp), GFP_ATOMIC);
-	if (!buf) {
-		mk_sense_buffer(scp, ILLEGAL_REQUEST, INSUFF_RES_ASC,
-				INSUFF_RES_ASCQ);
-		return check_condition_result;
-	}
-
-	scsi_sg_copy_to_buffer(scp, buf, scsi_bufflen(scp));
-
-	offset = 0;
-	sg_miter_start(&miter, sdb->table.sgl, sdb->table.nents,
-			SG_MITER_ATOMIC | SG_MITER_TO_SG);
-
-	while (sg_miter_next(&miter)) {
-		kaddr = miter.addr;
-		for (j = 0; j < miter.length; j++)
-			*(kaddr + j) ^= *(buf + offset + j);
-
-		offset += miter.length;
-	}
-	sg_miter_stop(&miter);
-	kfree(buf);
-
-	return 0;
-}
-
-static int resp_xdwriteread_10(struct scsi_cmnd *scp,
-			       struct sdebug_dev_info *devip)
-{
-	u8 *cmd = scp->cmnd;
-	u64 lba;
-	u32 num;
-	int errsts;
-
-	if (!scsi_bidi_cmnd(scp)) {
-		mk_sense_buffer(scp, ILLEGAL_REQUEST, INSUFF_RES_ASC,
-				INSUFF_RES_ASCQ);
-		return check_condition_result;
-	}
-	errsts = resp_read_dt0(scp, devip);
-	if (errsts)
-		return errsts;
-	if (!(cmd[1] & 0x4)) {		/* DISABLE_WRITE is not set */
-		errsts = resp_write_dt0(scp, devip);
-		if (errsts)
-			return errsts;
-	}
-	lba = get_unaligned_be32(cmd + 2);
-	num = get_unaligned_be16(cmd + 7);
-	return resp_xdwriteread(scp, lba, num, devip);
-}
-
 static struct sdebug_queue *get_queue(struct scsi_cmnd *cmnd)
 {
 	u32 tag = blk_mq_unique_tag(cmnd->request);
@@ -3953,7 +3885,6 @@ static int scsi_debug_slave_alloc(struct scsi_device *sdp)
 	if (sdebug_verbose)
 		pr_info("slave_alloc <%u %u %u %llu>\n",
 		       sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
-	blk_queue_flag_set(QUEUE_FLAG_BIDI, sdp->request_queue);
 	return 0;
 }
 
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 16eef068e9e9..1b8378f36139 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -965,7 +965,6 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses,
 	ses->cmnd = scmd->cmnd;
 	ses->data_direction = scmd->sc_data_direction;
 	ses->sdb = scmd->sdb;
-	ses->next_rq = scmd->request->next_rq;
 	ses->result = scmd->result;
 	ses->underflow = scmd->underflow;
 	ses->prot_op = scmd->prot_op;
@@ -976,7 +975,6 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses,
 	scmd->cmnd = ses->eh_cmnd;
 	memset(scmd->cmnd, 0, BLK_MAX_CDB);
 	memset(&scmd->sdb, 0, sizeof(scmd->sdb));
-	scmd->request->next_rq = NULL;
 	scmd->result = 0;
 
 	if (sense_bytes) {
@@ -1029,7 +1027,6 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses)
 	scmd->cmnd = ses->cmnd;
 	scmd->sc_data_direction = ses->data_direction;
 	scmd->sdb = ses->sdb;
-	scmd->request->next_rq = ses->next_rq;
 	scmd->result = ses->result;
 	scmd->underflow = ses->underflow;
 	scmd->prot_op = ses->prot_op;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 00cd365fb7d2..2667bb95a441 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -556,15 +556,8 @@ static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
 
 static void scsi_mq_free_sgtables(struct scsi_cmnd *cmd)
 {
-	struct scsi_data_buffer *sdb;
-
 	if (cmd->sdb.table.nents)
 		sg_free_table_chained(&cmd->sdb.table, true);
-	if (cmd->request->next_rq) {
-		sdb = cmd->request->next_rq->special;
-		if (sdb)
-			sg_free_table_chained(&sdb->table, true);
-	}
 	if (scsi_prot_sg_count(cmd))
 		sg_free_table_chained(&cmd->prot_sdb->table, true);
 }
@@ -578,7 +571,7 @@ static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
 
 /* Returns false when no more bytes to process, true if there are more */
 static bool scsi_end_request(struct request *req, blk_status_t error,
-		unsigned int bytes, unsigned int bidi_bytes)
+		unsigned int bytes)
 {
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
 	struct scsi_device *sdev = cmd->device;
@@ -587,11 +580,6 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
 	if (blk_update_request(req, error, bytes))
 		return true;
 
-	/* Bidi request must be completed as a whole */
-	if (unlikely(bidi_bytes) &&
-	    blk_update_request(req->next_rq, error, bidi_bytes))
-		return true;
-
 	if (blk_queue_add_random(q))
 		add_disk_randomness(req->rq_disk);
 
@@ -816,7 +804,7 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result)
 				scsi_print_command(cmd);
 			}
 		}
-		if (!scsi_end_request(req, blk_stat, blk_rq_err_bytes(req), 0))
+		if (!scsi_end_request(req, blk_stat, blk_rq_err_bytes(req)))
 			return;
 		/*FALLTHRU*/
 	case ACTION_REPREP:
@@ -951,29 +939,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		 */
 		scsi_req(req)->result = cmd->result;
 		scsi_req(req)->resid_len = scsi_get_resid(cmd);
-
-		if (unlikely(scsi_bidi_cmnd(cmd))) {
-			/*
-			 * Bidi commands Must be complete as a whole,
-			 * both sides at once.
-			 */
-			scsi_req(req->next_rq)->resid_len = scsi_in(cmd)->resid;
-			if (scsi_end_request(req, BLK_STS_OK, blk_rq_bytes(req),
-					blk_rq_bytes(req->next_rq)))
-				WARN_ONCE(true,
-					  "Bidi command with remaining bytes");
-			return;
-		}
-	}
-
-	/* no bidi support yet, other than in pass-through */
-	if (unlikely(blk_bidi_rq(req))) {
-		WARN_ONCE(true, "Only support bidi command in passthrough");
-		scmd_printk(KERN_ERR, cmd, "Killing bidi command\n");
-		if (scsi_end_request(req, BLK_STS_IOERR, blk_rq_bytes(req),
-				     blk_rq_bytes(req->next_rq)))
-			WARN_ONCE(true, "Bidi command with remaining bytes");
-		return;
 	}
 
 	/*
@@ -990,13 +955,13 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 	 * to retry code. Fast path should return in this block.
 	 */
 	if (likely(blk_rq_bytes(req) > 0 || blk_stat == BLK_STS_OK)) {
-		if (likely(!scsi_end_request(req, blk_stat, good_bytes, 0)))
+		if (likely(!scsi_end_request(req, blk_stat, good_bytes)))
 			return; /* no bytes remaining */
 	}
 
 	/* Kill remainder if no retries. */
 	if (unlikely(blk_stat && scsi_noretry_cmd(cmd))) {
-		if (scsi_end_request(req, blk_stat, blk_rq_bytes(req), 0))
+		if (scsi_end_request(req, blk_stat, blk_rq_bytes(req)))
 			WARN_ONCE(true,
 			    "Bytes remaining after failed, no-retry command");
 		return;
@@ -1058,12 +1023,6 @@ blk_status_t scsi_init_io(struct scsi_cmnd *cmd)
 	if (ret)
 		return ret;
 
-	if (blk_bidi_rq(rq)) {
-		ret = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
-		if (ret)
-			goto out_free_sgtables;
-	}
-
 	if (blk_integrity_rq(rq)) {
 		struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
 		int ivecs, count;
@@ -1624,17 +1583,6 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
 			(struct scatterlist *)(cmd->prot_sdb + 1);
 	}
 
-	if (blk_bidi_rq(req)) {
-		struct request *next_rq = req->next_rq;
-		struct scsi_data_buffer *bidi_sdb = blk_mq_rq_to_pdu(next_rq);
-
-		memset(bidi_sdb, 0, sizeof(struct scsi_data_buffer));
-		bidi_sdb->table.sgl =
-			(struct scatterlist *)(bidi_sdb + 1);
-
-		next_rq->special = bidi_sdb;
-	}
-
 	blk_mq_start_request(req);
 
 	return scsi_setup_cmnd(sdev, req);
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 772b976e4ee4..1a6f150cd2d8 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -100,16 +100,8 @@ static inline struct Scsi_Host *virtio_scsi_host(struct virtio_device *vdev)
 
 static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
 {
-	if (!resid)
-		return;
-
-	if (!scsi_bidi_cmnd(sc)) {
+	if (resid)
 		scsi_set_resid(sc, resid);
-		return;
-	}
-
-	scsi_in(sc)->resid = min(resid, scsi_in(sc)->length);
-	scsi_out(sc)->resid = resid - scsi_in(sc)->resid;
 }
 
 /**
@@ -403,9 +395,9 @@ static int virtscsi_add_cmd(struct virtqueue *vq,
 
 	if (sc && sc->sc_data_direction != DMA_NONE) {
 		if (sc->sc_data_direction != DMA_FROM_DEVICE)
-			out = &scsi_out(sc)->table;
+			out = &sc->sdb.table;
 		if (sc->sc_data_direction != DMA_TO_DEVICE)
-			in = &scsi_in(sc)->table;
+			in = &sc->sdb.table;
 	}
 
 	/* Request header.  */
diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index 89c6e3719d2b..3305b47fdf53 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -128,14 +128,6 @@ static void tcm_loop_submission_work(struct work_struct *work)
 		set_host_byte(sc, DID_ERROR);
 		goto out_done;
 	}
-	if (scsi_bidi_cmnd(sc)) {
-		struct scsi_data_buffer *sdb = scsi_in(sc);
-
-		sgl_bidi = sdb->table.sgl;
-		sgl_bidi_count = sdb->table.nents;
-		se_cmd->se_cmd_flags |= SCF_BIDI;
-
-	}
 
 	transfer_length = scsi_transfer_length(sc);
 	if (!scsi_prot_sg_count(sc) &&
@@ -304,12 +296,6 @@ static int tcm_loop_target_reset(struct scsi_cmnd *sc)
 	return FAILED;
 }
 
-static int tcm_loop_slave_alloc(struct scsi_device *sd)
-{
-	blk_queue_flag_set(QUEUE_FLAG_BIDI, sd->request_queue);
-	return 0;
-}
-
 static struct scsi_host_template tcm_loop_driver_template = {
 	.show_info		= tcm_loop_show_info,
 	.proc_name		= "tcm_loopback",
@@ -325,7 +311,6 @@ static struct scsi_host_template tcm_loop_driver_template = {
 	.cmd_per_lun		= 1024,
 	.max_sectors		= 0xFFFF,
 	.dma_boundary		= PAGE_SIZE - 1,
-	.slave_alloc		= tcm_loop_slave_alloc,
 	.module			= THIS_MODULE,
 	.track_queue_depth	= 1,
 };
diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index 36742e8e7edc..95f2142093d5 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c
@@ -368,25 +368,19 @@ static void uas_data_cmplt(struct urb *urb)
 	struct scsi_cmnd *cmnd = urb->context;
 	struct uas_cmd_info *cmdinfo = (void *)&cmnd->SCp;
 	struct uas_dev_info *devinfo = (void *)cmnd->device->hostdata;
-	struct scsi_data_buffer *sdb = NULL;
+	struct scsi_data_buffer *sdb = &cmnd->sdb;
 	unsigned long flags;
 	int status = urb->status;
 
 	spin_lock_irqsave(&devinfo->lock, flags);
 
 	if (cmdinfo->data_in_urb == urb) {
-		sdb = scsi_in(cmnd);
 		cmdinfo->state &= ~DATA_IN_URB_INFLIGHT;
 		cmdinfo->data_in_urb = NULL;
 	} else if (cmdinfo->data_out_urb == urb) {
-		sdb = scsi_out(cmnd);
 		cmdinfo->state &= ~DATA_OUT_URB_INFLIGHT;
 		cmdinfo->data_out_urb = NULL;
 	}
-	if (sdb == NULL) {
-		WARN_ON_ONCE(1);
-		goto out;
-	}
 
 	if (devinfo->resetting)
 		goto out;
@@ -426,8 +420,7 @@ static struct urb *uas_alloc_data_urb(struct uas_dev_info *devinfo, gfp_t gfp,
 	struct usb_device *udev = devinfo->udev;
 	struct uas_cmd_info *cmdinfo = (void *)&cmnd->SCp;
 	struct urb *urb = usb_alloc_urb(0, gfp);
-	struct scsi_data_buffer *sdb = (dir == DMA_FROM_DEVICE)
-		? scsi_in(cmnd) : scsi_out(cmnd);
+	struct scsi_data_buffer *sdb = &cmnd->sdb;
 	unsigned int pipe = (dir == DMA_FROM_DEVICE)
 		? devinfo->data_in_pipe : devinfo->data_out_pipe;
 
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index d85e6befa26b..af6ae3e1eba6 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -213,23 +213,6 @@ static inline int scsi_get_resid(struct scsi_cmnd *cmd)
 #define scsi_for_each_sg(cmd, sg, nseg, __i)			\
 	for_each_sg(scsi_sglist(cmd), sg, nseg, __i)
 
-static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd)
-{
-	return blk_bidi_rq(cmd->request) &&
-		(cmd->request->next_rq->special != NULL);
-}
-
-static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd)
-{
-	return scsi_bidi_cmnd(cmd) ?
-		cmd->request->next_rq->special : &cmd->sdb;
-}
-
-static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd)
-{
-	return &cmd->sdb;
-}
-
 static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd,
 					   void *buf, int buflen)
 {
@@ -351,7 +334,7 @@ static inline void set_driver_byte(struct scsi_cmnd *cmd, char status)
 
 static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd)
 {
-	unsigned int xfer_len = scsi_out(scmd)->length;
+	unsigned int xfer_len = scmd->sdb.length;
 	unsigned int prot_interval = scsi_prot_interval(scmd);
 
 	if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI)
diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h
index 2b7e227960e1..3810b340551c 100644
--- a/include/scsi/scsi_eh.h
+++ b/include/scsi/scsi_eh.h
@@ -39,7 +39,6 @@ struct scsi_eh_save {
 	unsigned char prot_op;
 	unsigned char *cmnd;
 	struct scsi_data_buffer sdb;
-	struct request *next_rq;
 	/* new command support */
 	unsigned char eh_cmnd[BLK_MAX_CDB];
 	struct scatterlist sense_sgl;
-- 
cgit v1.2.3


From 69ed175c195595c73901e18366cb0ebeaeb68b8a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 19:35:11 +0100
Subject: scsi: block: remove req->special

No users left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/blk-mq.c         | 1 -
 drivers/scsi/sd.c      | 2 --
 include/linux/blkdev.h | 2 --
 3 files changed, 5 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ba37b9e15e9..502cbf964a3b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -331,7 +331,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
 #endif
-	rq->special = NULL;
 	/* tag was already set */
 	rq->extra_len = 0;
 	WRITE_ONCE(rq->deadline, 0);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3db9b1fe7516..c124459041dc 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1171,8 +1171,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
 	if (ret != BLK_STS_OK)
 		return ret;
 
-	WARN_ON_ONCE(cmd != rq->special);
-
 	if (!scsi_device_online(sdp) || sdp->changed) {
 		scmd_printk(KERN_ERR, cmd, "device offline or changed\n");
 		return BLK_STS_IOERR;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 338604dff7d0..fd1450d53f1c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -216,8 +216,6 @@ struct request {
 	unsigned short write_hint;
 	unsigned short ioprio;
 
-	void *special;		/* opaque pointer available for LLD use */
-
 	unsigned int extra_len;	/* length of alignment and padding */
 
 	enum mq_rq_state state;
-- 
cgit v1.2.3


From 8b3238cabd50e2715b6544e724e74685209b190a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Dec 2018 08:01:10 -0800
Subject: scsi: block: remove bidi support

Unused now, and another field in struct request bites the dust.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/blk-mq-debugfs.c | 1 -
 block/blk-mq.c         | 3 ---
 include/linux/blkdev.h | 6 ------
 3 files changed, 10 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 90d68760af08..ac832547160a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -115,7 +115,6 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
 static const char *const blk_queue_flag_name[] = {
 	QUEUE_FLAG_NAME(STOPPED),
 	QUEUE_FLAG_NAME(DYING),
-	QUEUE_FLAG_NAME(BIDI),
 	QUEUE_FLAG_NAME(NOMERGES),
 	QUEUE_FLAG_NAME(SAME_COMP),
 	QUEUE_FLAG_NAME(FAIL_IO),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 502cbf964a3b..820d131a6893 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -339,7 +339,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
 	rq->end_io = NULL;
 	rq->end_io_data = NULL;
-	rq->next_rq = NULL;
 
 	data->ctx->rq_dispatched[op_is_sync(op)]++;
 	refcount_set(&rq->ref, 1);
@@ -549,8 +548,6 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 		rq_qos_done(rq->q, rq);
 		rq->end_io(rq, error);
 	} else {
-		if (unlikely(blk_bidi_rq(rq)))
-			blk_mq_free_request(rq->next_rq);
 		blk_mq_free_request(rq);
 	}
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fd1450d53f1c..21beb456b97a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -234,9 +234,6 @@ struct request {
 	 */
 	rq_end_io_fn *end_io;
 	void *end_io_data;
-
-	/* for bidi */
-	struct request *next_rq;
 };
 
 static inline bool blk_op_is_scsi(unsigned int op)
@@ -572,7 +569,6 @@ struct request_queue {
 
 #define QUEUE_FLAG_STOPPED	1	/* queue is stopped */
 #define QUEUE_FLAG_DYING	2	/* queue being torn down */
-#define QUEUE_FLAG_BIDI		4	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     5	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	6	/* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO	7	/* fake timeout */
@@ -644,8 +640,6 @@ static inline bool blk_account_rq(struct request *rq)
 	return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq);
 }
 
-#define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
-
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
 #define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)
-- 
cgit v1.2.3


From 5870970b9a828d8693aa6d15742573289d7dbcd0 Mon Sep 17 00:00:00 2001
From: Julien Thierry <julien.thierry@arm.com>
Date: Thu, 31 Jan 2019 14:58:39 +0000
Subject: arm64: Fix HCR.TGE status for NMI contexts

When using VHE, the host needs to clear HCR_EL2.TGE bit in order
to interact with guest TLBs, switching from EL2&0 translation regime
to EL1&0.

However, some non-maskable asynchronous event could happen while TGE is
cleared like SDEI. Because of this address translation operations
relying on EL2&0 translation regime could fail (tlb invalidation,
userspace access, ...).

Fix this by properly setting HCR_EL2.TGE when entering NMI context and
clear it if necessary when returning to the interrupted context.

Signed-off-by: Julien Thierry <julien.thierry@arm.com>
Suggested-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: James Morse <james.morse@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: linux-arch@vger.kernel.org
Cc: stable@vger.kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/hardirq.h | 31 +++++++++++++++++++++++++++++++
 arch/arm64/kernel/irq.c          |  3 +++
 include/linux/hardirq.h          |  7 +++++++
 3 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index 1473fc2f7ab7..89691c86640a 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -17,8 +17,12 @@
 #define __ASM_HARDIRQ_H
 
 #include <linux/cache.h>
+#include <linux/percpu.h>
 #include <linux/threads.h>
+#include <asm/barrier.h>
 #include <asm/irq.h>
+#include <asm/kvm_arm.h>
+#include <asm/sysreg.h>
 
 #define NR_IPI	7
 
@@ -37,6 +41,33 @@ u64 smp_irq_stat_cpu(unsigned int cpu);
 
 #define __ARCH_IRQ_EXIT_IRQS_DISABLED	1
 
+struct nmi_ctx {
+	u64 hcr;
+};
+
+DECLARE_PER_CPU(struct nmi_ctx, nmi_contexts);
+
+#define arch_nmi_enter()							\
+	do {									\
+		if (is_kernel_in_hyp_mode()) {					\
+			struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts);	\
+			nmi_ctx->hcr = read_sysreg(hcr_el2);			\
+			if (!(nmi_ctx->hcr & HCR_TGE)) {			\
+				write_sysreg(nmi_ctx->hcr | HCR_TGE, hcr_el2);	\
+				isb();						\
+			}							\
+		}								\
+	} while (0)
+
+#define arch_nmi_exit()								\
+	do {									\
+		if (is_kernel_in_hyp_mode()) {					\
+			struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts);	\
+			if (!(nmi_ctx->hcr & HCR_TGE))				\
+				write_sysreg(nmi_ctx->hcr, hcr_el2);		\
+		}								\
+	} while (0)
+
 static inline void ack_bad_irq(unsigned int irq)
 {
 	extern unsigned long irq_err_count;
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 780a12f59a8f..92fa81798fb9 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -33,6 +33,9 @@
 
 unsigned long irq_err_count;
 
+/* Only access this in an NMI enter/exit */
+DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts);
+
 DEFINE_PER_CPU(unsigned long *, irq_stack_ptr);
 
 int arch_show_interrupts(struct seq_file *p, int prec)
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 0fbbcdf0c178..da0af631ded5 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -60,8 +60,14 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
+#ifndef arch_nmi_enter
+#define arch_nmi_enter()	do { } while (0)
+#define arch_nmi_exit()		do { } while (0)
+#endif
+
 #define nmi_enter()						\
 	do {							\
+		arch_nmi_enter();				\
 		printk_nmi_enter();				\
 		lockdep_off();					\
 		ftrace_nmi_enter();				\
@@ -80,6 +86,7 @@ extern void irq_exit(void);
 		ftrace_nmi_exit();				\
 		lockdep_on();					\
 		printk_nmi_exit();				\
+		arch_nmi_exit();				\
 	} while (0)
 
 #endif /* LINUX_HARDIRQ_H */
-- 
cgit v1.2.3


From 13b210ddf474d9f3368766008a89fe82a6f90b48 Mon Sep 17 00:00:00 2001
From: Julien Thierry <julien.thierry@arm.com>
Date: Thu, 31 Jan 2019 14:58:49 +0000
Subject: efi: Let architectures decide the flags that should be saved/restored

Currently, irqflags are saved before calling runtime services and
checked for mismatch on return.

Provide a pair of overridable macros to save and restore (if needed) the
state that need to be preserved on return from a runtime service.
This allows to check for flags that are not necesarly related to
irqflags.

Signed-off-by: Julien Thierry <julien.thierry@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: linux-efi@vger.kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/firmware/efi/runtime-wrappers.c | 17 +++++++++++++++--
 include/linux/efi.h                     |  5 +++--
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c
index 8903b9ccfc2b..c70df5ae7c4a 100644
--- a/drivers/firmware/efi/runtime-wrappers.c
+++ b/drivers/firmware/efi/runtime-wrappers.c
@@ -89,11 +89,24 @@ exit:									\
 	efi_rts_work.status;						\
 })
 
+#ifndef arch_efi_save_flags
+#define arch_efi_save_flags(state_flags)	local_save_flags(state_flags)
+#define arch_efi_restore_flags(state_flags)	local_irq_restore(state_flags)
+#endif
+
+unsigned long efi_call_virt_save_flags(void)
+{
+	unsigned long flags;
+
+	arch_efi_save_flags(flags);
+	return flags;
+}
+
 void efi_call_virt_check_flags(unsigned long flags, const char *call)
 {
 	unsigned long cur_flags, mismatch;
 
-	local_save_flags(cur_flags);
+	cur_flags = efi_call_virt_save_flags();
 
 	mismatch = flags ^ cur_flags;
 	if (!WARN_ON_ONCE(mismatch & ARCH_EFI_IRQ_FLAGS_MASK))
@@ -102,7 +115,7 @@ void efi_call_virt_check_flags(unsigned long flags, const char *call)
 	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_NOW_UNRELIABLE);
 	pr_err_ratelimited(FW_BUG "IRQ flags corrupted (0x%08lx=>0x%08lx) by EFI %s\n",
 			   flags, cur_flags, call);
-	local_irq_restore(flags);
+	arch_efi_restore_flags(flags);
 }
 
 /*
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 45ff763fba76..bd80b7ec35db 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1607,6 +1607,7 @@ efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
 
 bool efi_runtime_disabled(void);
 extern void efi_call_virt_check_flags(unsigned long flags, const char *call);
+extern unsigned long efi_call_virt_save_flags(void);
 
 enum efi_secureboot_mode {
 	efi_secureboot_mode_unset,
@@ -1652,7 +1653,7 @@ void efi_retrieve_tpm2_eventlog(efi_system_table_t *sys_table);
 									\
 	arch_efi_call_virt_setup();					\
 									\
-	local_save_flags(__flags);					\
+	__flags = efi_call_virt_save_flags();				\
 	__s = arch_efi_call_virt(p, f, args);				\
 	efi_call_virt_check_flags(__flags, __stringify(f));		\
 									\
@@ -1667,7 +1668,7 @@ void efi_retrieve_tpm2_eventlog(efi_system_table_t *sys_table);
 									\
 	arch_efi_call_virt_setup();					\
 									\
-	local_save_flags(__flags);					\
+	__flags = efi_call_virt_save_flags();				\
 	arch_efi_call_virt(p, f, args);					\
 	efi_call_virt_check_flags(__flags, __stringify(f));		\
 									\
-- 
cgit v1.2.3


From d325c402964e7c63db94e9138c530832269a1297 Mon Sep 17 00:00:00 2001
From: Miroslav Benes <mbenes@suse.cz>
Date: Fri, 28 Dec 2018 14:38:47 +0100
Subject: ring-buffer: Remove unused function ring_buffer_page_len()

Commit 6b7e633fe9c2 ("tracing: Remove extra zeroing out of the ring
buffer page") removed the only caller of ring_buffer_page_len(). The
function is now unused and may be removed.

Link: http://lkml.kernel.org/r/20181228133847.106177-1-mbenes@suse.cz

Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  2 --
 kernel/trace/ring_buffer.c  | 14 --------------
 2 files changed, 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 5b9ae62272bb..f1429675f252 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -187,8 +187,6 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
 void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs);
 bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
 
-size_t ring_buffer_page_len(void *page);
-
 size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu);
 size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 06e864a334bb..9a91479bbbfe 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -353,20 +353,6 @@ static void rb_init_page(struct buffer_data_page *bpage)
 	local_set(&bpage->commit, 0);
 }
 
-/**
- * ring_buffer_page_len - the size of data on the page.
- * @page: The page to read
- *
- * Returns the amount of data on the page, including buffer page header.
- */
-size_t ring_buffer_page_len(void *page)
-{
-	struct buffer_data_page *bpage = page;
-
-	return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
-		+ BUF_PAGE_HDR_SIZE;
-}
-
 /*
  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
  * this issue out.
-- 
cgit v1.2.3


From fd9dc93e36231fb6d520e0edd467058fad4fd12d Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Wed, 6 Feb 2019 13:07:11 -0500
Subject: XArray: Change xa_insert to return -EBUSY

Userspace translates EEXIST to "File exists" which isn't a very good
error message for the problem.  "Device or resource busy" is a better
indication of what went wrong.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 Documentation/core-api/xarray.rst | 2 +-
 fs/nilfs2/btnode.c                | 2 +-
 include/linux/xarray.h            | 6 +++---
 lib/test_xarray.c                 | 4 ++--
 lib/xarray.c                      | 4 ++--
 5 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index 5d54b27c6eba..42bb1a62650f 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -85,7 +85,7 @@ which was at that index; if it returns the same entry which was passed as
 
 If you want to only store a new entry to an index if the current entry
 at that index is ``NULL``, you can use :c:func:`xa_insert` which
-returns ``-EEXIST`` if the entry is not empty.
+returns ``-EBUSY`` if the entry is not empty.
 
 You can enquire whether a mark is set on an entry by using
 :c:func:`xa_get_mark`.  If the entry is not ``NULL``, you can set a mark
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f2129a5d9f23..4391fd3abd8f 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -189,7 +189,7 @@ retry:
 		 */
 		if (!err)
 			return 0;
-		else if (err != -EEXIST)
+		else if (err != -EBUSY)
 			goto failed_unlock;
 
 		err = invalidate_inode_pages2_range(btnc, newkey, newkey);
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index e11841537631..57cf35c4d094 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -664,7 +664,7 @@ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
  *
  * Context: Any context.  Takes and releases the xa_lock.  May sleep if
  * the @gfp flags permit.
- * Return: 0 if the store succeeded.  -EEXIST if another entry was present.
+ * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
  * -ENOMEM if memory could not be allocated.
  */
 static inline int xa_insert(struct xarray *xa, unsigned long index,
@@ -693,7 +693,7 @@ static inline int xa_insert(struct xarray *xa, unsigned long index,
  *
  * Context: Any context.  Takes and releases the xa_lock while
  * disabling softirqs.  May sleep if the @gfp flags permit.
- * Return: 0 if the store succeeded.  -EEXIST if another entry was present.
+ * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
  * -ENOMEM if memory could not be allocated.
  */
 static inline int xa_insert_bh(struct xarray *xa, unsigned long index,
@@ -722,7 +722,7 @@ static inline int xa_insert_bh(struct xarray *xa, unsigned long index,
  *
  * Context: Process context.  Takes and releases the xa_lock while
  * disabling interrupts.  May sleep if the @gfp flags permit.
- * Return: 0 if the store succeeded.  -EEXIST if another entry was present.
+ * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
  * -ENOMEM if memory could not be allocated.
  */
 static inline int xa_insert_irq(struct xarray *xa, unsigned long index,
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 671a93ee09e6..9d894e93456c 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -346,7 +346,7 @@ static noinline void check_cmpxchg(struct xarray *xa)
 
 	XA_BUG_ON(xa, !xa_empty(xa));
 	XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_KERNEL) != NULL);
-	XA_BUG_ON(xa, xa_insert(xa, 12345678, xa, GFP_KERNEL) != -EEXIST);
+	XA_BUG_ON(xa, xa_insert(xa, 12345678, xa, GFP_KERNEL) != -EBUSY);
 	XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, SIX, FIVE, GFP_KERNEL) != LOTS);
 	XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, LOTS, FIVE, GFP_KERNEL) != LOTS);
 	XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, FIVE, LOTS, GFP_KERNEL) != FIVE);
@@ -388,7 +388,7 @@ static noinline void check_reserve(struct xarray *xa)
 	/* But xa_insert does not */
 	xa_reserve(xa, 12345678, GFP_KERNEL);
 	XA_BUG_ON(xa, xa_insert(xa, 12345678, xa_mk_value(12345678), 0) !=
-			-EEXIST);
+			-EBUSY);
 	XA_BUG_ON(xa, xa_empty(xa));
 	XA_BUG_ON(xa, xa_erase(xa, 12345678) != NULL);
 	XA_BUG_ON(xa, !xa_empty(xa));
diff --git a/lib/xarray.c b/lib/xarray.c
index fb783bf2a441..1b97ca58bd15 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1451,7 +1451,7 @@ EXPORT_SYMBOL(__xa_cmpxchg);
  *
  * Context: Any context.  Expects xa_lock to be held on entry.  May
  * release and reacquire xa_lock if @gfp flags permit.
- * Return: 0 if the store succeeded.  -EEXIST if another entry was present.
+ * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
  * -ENOMEM if memory could not be allocated.
  */
 int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
@@ -1471,7 +1471,7 @@ int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
 			if (xa_track_free(xa))
 				xas_clear_mark(&xas, XA_FREE_MARK);
 		} else {
-			xas_set_err(&xas, -EEXIST);
+			xas_set_err(&xas, -EBUSY);
 		}
 	} while (__xas_nomem(&xas, gfp));
 
-- 
cgit v1.2.3


From 3ccaf57a6a63ad171a951dcaddffc453b2414c7b Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Fri, 26 Oct 2018 14:43:22 -0400
Subject: XArray: Add support for 1s-based allocation

A lot of places want to allocate IDs starting at 1 instead of 0.
While the xa_alloc() API supports this, it's not very efficient if lots
of IDs are allocated, due to having to walk down to the bottom of the
tree to see if ID 1 is available, then all the way over to the next
non-allocated ID.  This method marks ID 0 as being occupied which wastes
one slot in the XArray, but preserves xa_empty() as working.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 Documentation/core-api/xarray.rst | 10 +++--
 include/linux/xarray.h            | 14 ++++++-
 lib/test_xarray.c                 | 88 ++++++++++++++++++++++++---------------
 lib/xarray.c                      | 11 +++++
 4 files changed, 86 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index 42bb1a62650f..e90c4925cd37 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -131,17 +131,21 @@ If you use :c:func:`DEFINE_XARRAY_ALLOC` to define the XArray, or
 initialise it by passing ``XA_FLAGS_ALLOC`` to :c:func:`xa_init_flags`,
 the XArray changes to track whether entries are in use or not.
 
-You can call :c:func:`xa_alloc` to store the entry at any unused index
+You can call :c:func:`xa_alloc` to store the entry at an unused index
 in the XArray.  If you need to modify the array from interrupt context,
 you can use :c:func:`xa_alloc_bh` or :c:func:`xa_alloc_irq` to disable
 interrupts while allocating the ID.
 
-Using :c:func:`xa_store`, :c:func:`xa_cmpxchg` or :c:func:`xa_insert`
-will mark the entry as being allocated.  Unlike a normal XArray, storing
+Using :c:func:`xa_store`, :c:func:`xa_cmpxchg` or :c:func:`xa_insert` will
+also mark the entry as being allocated.  Unlike a normal XArray, storing
 ``NULL`` will mark the entry as being in use, like :c:func:`xa_reserve`.
 To free an entry, use :c:func:`xa_erase` (or :c:func:`xa_release` if
 you only want to free the entry if it's ``NULL``).
 
+By default, the lowest free entry is allocated starting from 0.  If you
+want to allocate entries starting at 1, it is more efficient to use
+:c:func:`DEFINE_XARRAY_ALLOC1` or ``XA_FLAGS_ALLOC1``.
+
 You cannot use ``XA_MARK_0`` with an allocating XArray as this mark
 is used to track whether an entry is free or not.  The other marks are
 available for your use.
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 57cf35c4d094..99dd0838b4ba 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -220,10 +220,13 @@ enum xa_lock_type {
 #define XA_FLAGS_LOCK_IRQ	((__force gfp_t)XA_LOCK_IRQ)
 #define XA_FLAGS_LOCK_BH	((__force gfp_t)XA_LOCK_BH)
 #define XA_FLAGS_TRACK_FREE	((__force gfp_t)4U)
+#define XA_FLAGS_ZERO_BUSY	((__force gfp_t)8U)
 #define XA_FLAGS_MARK(mark)	((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
 						(__force unsigned)(mark)))
 
+/* ALLOC is for a normal 0-based alloc.  ALLOC1 is for an 1-based alloc */
 #define XA_FLAGS_ALLOC	(XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
+#define XA_FLAGS_ALLOC1	(XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY)
 
 /**
  * struct xarray - The anchor of the XArray.
@@ -279,7 +282,7 @@ struct xarray {
 #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)
 
 /**
- * DEFINE_XARRAY_ALLOC() - Define an XArray which can allocate IDs.
+ * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0.
  * @name: A string that names your XArray.
  *
  * This is intended for file scope definitions of allocating XArrays.
@@ -287,6 +290,15 @@ struct xarray {
  */
 #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)
 
+/**
+ * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1.
+ * @name: A string that names your XArray.
+ *
+ * This is intended for file scope definitions of allocating XArrays.
+ * See also DEFINE_XARRAY().
+ */
+#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1)
+
 void *xa_load(struct xarray *, unsigned long index);
 void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
 void *xa_erase(struct xarray *, unsigned long index);
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 9d894e93456c..cd74f8f32abe 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -589,64 +589,86 @@ static noinline void check_multi_store(struct xarray *xa)
 #endif
 }
 
-static DEFINE_XARRAY_ALLOC(xa0);
-
-static noinline void check_xa_alloc(void)
+static noinline void check_xa_alloc_1(struct xarray *xa, unsigned int base)
 {
 	int i;
 	u32 id;
 
-	/* An empty array should assign 0 to the first alloc */
-	xa_alloc_index(&xa0, 0, GFP_KERNEL);
+	XA_BUG_ON(xa, !xa_empty(xa));
+	/* An empty array should assign %base to the first alloc */
+	xa_alloc_index(xa, base, GFP_KERNEL);
 
 	/* Erasing it should make the array empty again */
-	xa_erase_index(&xa0, 0);
-	XA_BUG_ON(&xa0, !xa_empty(&xa0));
+	xa_erase_index(xa, base);
+	XA_BUG_ON(xa, !xa_empty(xa));
+
+	/* And it should assign %base again */
+	xa_alloc_index(xa, base, GFP_KERNEL);
+
+	/* Allocating and then erasing a lot should not lose base */
+	for (i = base + 1; i < 2 * XA_CHUNK_SIZE; i++)
+		xa_alloc_index(xa, i, GFP_KERNEL);
+	for (i = base; i < 2 * XA_CHUNK_SIZE; i++)
+		xa_erase_index(xa, i);
+	xa_alloc_index(xa, base, GFP_KERNEL);
+
+	/* Destroying the array should do the same as erasing */
+	xa_destroy(xa);
 
-	/* And it should assign 0 again */
-	xa_alloc_index(&xa0, 0, GFP_KERNEL);
+	/* And it should assign %base again */
+	xa_alloc_index(xa, base, GFP_KERNEL);
 
-	/* The next assigned ID should be 1 */
-	xa_alloc_index(&xa0, 1, GFP_KERNEL);
-	xa_erase_index(&xa0, 1);
+	/* The next assigned ID should be base+1 */
+	xa_alloc_index(xa, base + 1, GFP_KERNEL);
+	xa_erase_index(xa, base + 1);
 
 	/* Storing a value should mark it used */
-	xa_store_index(&xa0, 1, GFP_KERNEL);
-	xa_alloc_index(&xa0, 2, GFP_KERNEL);
+	xa_store_index(xa, base + 1, GFP_KERNEL);
+	xa_alloc_index(xa, base + 2, GFP_KERNEL);
 
-	/* If we then erase 0, it should be free */
-	xa_erase_index(&xa0, 0);
-	xa_alloc_index(&xa0, 0, GFP_KERNEL);
+	/* If we then erase base, it should be free */
+	xa_erase_index(xa, base);
+	xa_alloc_index(xa, base, GFP_KERNEL);
 
-	xa_erase_index(&xa0, 1);
-	xa_erase_index(&xa0, 2);
+	xa_erase_index(xa, base + 1);
+	xa_erase_index(xa, base + 2);
 
 	for (i = 1; i < 5000; i++) {
-		xa_alloc_index(&xa0, i, GFP_KERNEL);
+		xa_alloc_index(xa, base + i, GFP_KERNEL);
 	}
 
-	xa_destroy(&xa0);
+	xa_destroy(xa);
 
+	/* Check that we fail properly at the limit of allocation */
 	id = 0xfffffffeU;
-	XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_index(id),
+	XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_index(id),
 				GFP_KERNEL) != 0);
-	XA_BUG_ON(&xa0, id != 0xfffffffeU);
-	XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_index(id),
+	XA_BUG_ON(xa, id != 0xfffffffeU);
+	XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_index(id),
 				GFP_KERNEL) != 0);
-	XA_BUG_ON(&xa0, id != 0xffffffffU);
-	XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_index(id),
+	XA_BUG_ON(xa, id != 0xffffffffU);
+	XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_index(id),
 				GFP_KERNEL) != -ENOSPC);
-	XA_BUG_ON(&xa0, id != 0xffffffffU);
-	xa_destroy(&xa0);
+	XA_BUG_ON(xa, id != 0xffffffffU);
+	xa_destroy(xa);
 
 	id = 10;
-	XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, 5, xa_mk_index(id),
+	XA_BUG_ON(xa, xa_alloc(xa, &id, 5, xa_mk_index(id),
 				GFP_KERNEL) != -ENOSPC);
-	XA_BUG_ON(&xa0, xa_store_index(&xa0, 3, GFP_KERNEL) != 0);
-	XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, 5, xa_mk_index(id),
+	XA_BUG_ON(xa, xa_store_index(xa, 3, GFP_KERNEL) != 0);
+	XA_BUG_ON(xa, xa_alloc(xa, &id, 5, xa_mk_index(id),
 				GFP_KERNEL) != -ENOSPC);
-	xa_erase_index(&xa0, 3);
-	XA_BUG_ON(&xa0, !xa_empty(&xa0));
+	xa_erase_index(xa, 3);
+	XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static DEFINE_XARRAY_ALLOC(xa0);
+static DEFINE_XARRAY_ALLOC1(xa1);
+
+static noinline void check_xa_alloc(void)
+{
+	check_xa_alloc_1(&xa0, 0);
+	check_xa_alloc_1(&xa1, 1);
 }
 
 static noinline void __check_store_iter(struct xarray *xa, unsigned long start,
diff --git a/lib/xarray.c b/lib/xarray.c
index 1b97ca58bd15..468fb7b7963f 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -57,6 +57,11 @@ static inline bool xa_track_free(const struct xarray *xa)
 	return xa->xa_flags & XA_FLAGS_TRACK_FREE;
 }
 
+static inline bool xa_zero_busy(const struct xarray *xa)
+{
+	return xa->xa_flags & XA_FLAGS_ZERO_BUSY;
+}
+
 static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
 {
 	if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
@@ -432,6 +437,8 @@ static void xas_shrink(struct xa_state *xas)
 			break;
 		if (!xa_is_node(entry) && node->shift)
 			break;
+		if (xa_is_zero(entry) && xa_zero_busy(xa))
+			entry = NULL;
 		xas->xa_node = XAS_BOUNDS;
 
 		RCU_INIT_POINTER(xa->xa_head, entry);
@@ -628,6 +635,8 @@ static void *xas_create(struct xa_state *xas, bool allow_root)
 	if (xas_top(node)) {
 		entry = xa_head_locked(xa);
 		xas->xa_node = NULL;
+		if (!entry && xa_zero_busy(xa))
+			entry = XA_ZERO_ENTRY;
 		shift = xas_expand(xas, entry);
 		if (shift < 0)
 			return NULL;
@@ -1942,6 +1951,8 @@ void xa_destroy(struct xarray *xa)
 	entry = xa_head_locked(xa);
 	RCU_INIT_POINTER(xa->xa_head, NULL);
 	xas_init_marks(&xas);
+	if (xa_zero_busy(xa))
+		xa_mark_clear(xa, XA_FREE_MARK);
 	/* lockdep checks we're still holding the lock in xas_free_nodes() */
 	if (xa_is_node(entry))
 		xas_free_nodes(&xas, xa_to_node(entry));
-- 
cgit v1.2.3


From a3e4d3f97ec844de005a679585c04c5c03dfbdb6 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Mon, 31 Dec 2018 10:41:01 -0500
Subject: XArray: Redesign xa_alloc API

It was too easy to forget to initialise the start index.  Add an
xa_limit data structure which can be used to pass min & max, and
define a couple of special values for common cases.  Also add some
more tests cribbed from the IDR test suite.  Change the return value
from -ENOSPC to -EBUSY to match xa_insert().

Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 include/linux/xarray.h | 80 +++++++++++++++++++++++++++++-----------------
 lib/test_xarray.c      | 86 ++++++++++++++++++++++++++++++++++++++++----------
 lib/xarray.c           | 29 ++++++++---------
 3 files changed, 135 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 99dd0838b4ba..883bb958e462 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -200,6 +200,27 @@ static inline int xa_err(void *entry)
 	return 0;
 }
 
+/**
+ * struct xa_limit - Represents a range of IDs.
+ * @min: The lowest ID to allocate (inclusive).
+ * @max: The maximum ID to allocate (inclusive).
+ *
+ * This structure is used either directly or via the XA_LIMIT() macro
+ * to communicate the range of IDs that are valid for allocation.
+ * Two common ranges are predefined for you:
+ *  * xa_limit_32b	- [0 - UINT_MAX]
+ *  * xa_limit_31b	- [0 - INT_MAX]
+ */
+struct xa_limit {
+	u32 max;
+	u32 min;
+};
+
+#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max }
+
+#define xa_limit_32b	XA_LIMIT(0, UINT_MAX)
+#define xa_limit_31b	XA_LIMIT(0, INT_MAX)
+
 typedef unsigned __bitwise xa_mark_t;
 #define XA_MARK_0		((__force xa_mark_t)0U)
 #define XA_MARK_1		((__force xa_mark_t)1U)
@@ -476,7 +497,8 @@ void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
 void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
 		void *entry, gfp_t);
 int __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t);
-int __xa_alloc(struct xarray *, u32 *id, u32 max, void *entry, gfp_t);
+int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
+		struct xa_limit, gfp_t);
 int __xa_reserve(struct xarray *, unsigned long index, gfp_t);
 void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
 void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
@@ -753,26 +775,26 @@ static inline int xa_insert_irq(struct xarray *xa, unsigned long index,
  * xa_alloc() - Find somewhere to store this entry in the XArray.
  * @xa: XArray.
  * @id: Pointer to ID.
- * @max: Maximum ID to allocate (inclusive).
  * @entry: New entry.
+ * @limit: Range of ID to allocate.
  * @gfp: Memory allocation flags.
  *
- * Allocates an unused ID in the range specified by @id and @max.
- * Updates the @id pointer with the index, then stores the entry at that
- * index.  A concurrent lookup will not see an uninitialised @id.
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index.  A concurrent lookup will not see an uninitialised @id.
  *
- * Context: Process context.  Takes and releases the xa_lock.  May sleep if
+ * Context: Any context.  Takes and releases the xa_lock.  May sleep if
  * the @gfp flags permit.
- * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
- * there is no more space in the XArray.
+ * Return: 0 on success, -ENOMEM if memory could not be allocated or
+ * -EBUSY if there are no free entries in @limit.
  */
-static inline int xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry,
-		gfp_t gfp)
+static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
+		void *entry, struct xa_limit limit, gfp_t gfp)
 {
 	int err;
 
 	xa_lock(xa);
-	err = __xa_alloc(xa, id, max, entry, gfp);
+	err = __xa_alloc(xa, id, entry, limit, gfp);
 	xa_unlock(xa);
 
 	return err;
@@ -782,26 +804,26 @@ static inline int xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry,
  * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
  * @xa: XArray.
  * @id: Pointer to ID.
- * @max: Maximum ID to allocate (inclusive).
  * @entry: New entry.
+ * @limit: Range of ID to allocate.
  * @gfp: Memory allocation flags.
  *
- * Allocates an unused ID in the range specified by @id and @max.
- * Updates the @id pointer with the index, then stores the entry at that
- * index.  A concurrent lookup will not see an uninitialised @id.
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index.  A concurrent lookup will not see an uninitialised @id.
  *
  * Context: Any context.  Takes and releases the xa_lock while
  * disabling softirqs.  May sleep if the @gfp flags permit.
- * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
- * there is no more space in the XArray.
+ * Return: 0 on success, -ENOMEM if memory could not be allocated or
+ * -EBUSY if there are no free entries in @limit.
  */
-static inline int xa_alloc_bh(struct xarray *xa, u32 *id, u32 max, void *entry,
-		gfp_t gfp)
+static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
+		void *entry, struct xa_limit limit, gfp_t gfp)
 {
 	int err;
 
 	xa_lock_bh(xa);
-	err = __xa_alloc(xa, id, max, entry, gfp);
+	err = __xa_alloc(xa, id, entry, limit, gfp);
 	xa_unlock_bh(xa);
 
 	return err;
@@ -811,26 +833,26 @@ static inline int xa_alloc_bh(struct xarray *xa, u32 *id, u32 max, void *entry,
  * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
  * @xa: XArray.
  * @id: Pointer to ID.
- * @max: Maximum ID to allocate (inclusive).
  * @entry: New entry.
+ * @limit: Range of ID to allocate.
  * @gfp: Memory allocation flags.
  *
- * Allocates an unused ID in the range specified by @id and @max.
- * Updates the @id pointer with the index, then stores the entry at that
- * index.  A concurrent lookup will not see an uninitialised @id.
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index.  A concurrent lookup will not see an uninitialised @id.
  *
  * Context: Process context.  Takes and releases the xa_lock while
  * disabling interrupts.  May sleep if the @gfp flags permit.
- * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
- * there is no more space in the XArray.
+ * Return: 0 on success, -ENOMEM if memory could not be allocated or
+ * -EBUSY if there are no free entries in @limit.
  */
-static inline int xa_alloc_irq(struct xarray *xa, u32 *id, u32 max, void *entry,
-		gfp_t gfp)
+static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
+		void *entry, struct xa_limit limit, gfp_t gfp)
 {
 	int err;
 
 	xa_lock_irq(xa);
-	err = __xa_alloc(xa, id, max, entry, gfp);
+	err = __xa_alloc(xa, id, entry, limit, gfp);
 	xa_unlock_irq(xa);
 
 	return err;
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index cd74f8f32abe..b5a6b981454d 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -40,9 +40,9 @@ static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp)
 
 static void xa_alloc_index(struct xarray *xa, unsigned long index, gfp_t gfp)
 {
-	u32 id = 0;
+	u32 id;
 
-	XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_index(index),
+	XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_index(index), xa_limit_32b,
 				gfp) != 0);
 	XA_BUG_ON(xa, id != index);
 }
@@ -640,28 +640,81 @@ static noinline void check_xa_alloc_1(struct xarray *xa, unsigned int base)
 	xa_destroy(xa);
 
 	/* Check that we fail properly at the limit of allocation */
-	id = 0xfffffffeU;
-	XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_index(id),
+	XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_index(UINT_MAX - 1),
+				XA_LIMIT(UINT_MAX - 1, UINT_MAX),
 				GFP_KERNEL) != 0);
 	XA_BUG_ON(xa, id != 0xfffffffeU);
-	XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_index(id),
+	XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_index(UINT_MAX),
+				XA_LIMIT(UINT_MAX - 1, UINT_MAX),
 				GFP_KERNEL) != 0);
 	XA_BUG_ON(xa, id != 0xffffffffU);
-	XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_index(id),
-				GFP_KERNEL) != -ENOSPC);
-	XA_BUG_ON(xa, id != 0xffffffffU);
+	id = 3;
+	XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_index(0),
+				XA_LIMIT(UINT_MAX - 1, UINT_MAX),
+				GFP_KERNEL) != -EBUSY);
+	XA_BUG_ON(xa, id != 3);
 	xa_destroy(xa);
 
-	id = 10;
-	XA_BUG_ON(xa, xa_alloc(xa, &id, 5, xa_mk_index(id),
-				GFP_KERNEL) != -ENOSPC);
+	XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_index(10), XA_LIMIT(10, 5),
+				GFP_KERNEL) != -EBUSY);
 	XA_BUG_ON(xa, xa_store_index(xa, 3, GFP_KERNEL) != 0);
-	XA_BUG_ON(xa, xa_alloc(xa, &id, 5, xa_mk_index(id),
-				GFP_KERNEL) != -ENOSPC);
+	XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_index(10), XA_LIMIT(10, 5),
+				GFP_KERNEL) != -EBUSY);
 	xa_erase_index(xa, 3);
 	XA_BUG_ON(xa, !xa_empty(xa));
 }
 
+static noinline void check_xa_alloc_2(struct xarray *xa, unsigned int base)
+{
+	unsigned int i, id;
+	unsigned long index;
+	void *entry;
+
+	/* Allocate and free a NULL and check xa_empty() behaves */
+	XA_BUG_ON(xa, !xa_empty(xa));
+	XA_BUG_ON(xa, xa_alloc(xa, &id, NULL, xa_limit_32b, GFP_KERNEL) != 0);
+	XA_BUG_ON(xa, id != base);
+	XA_BUG_ON(xa, xa_empty(xa));
+	XA_BUG_ON(xa, xa_erase(xa, id) != NULL);
+	XA_BUG_ON(xa, !xa_empty(xa));
+
+	/* Ditto, but check destroy instead of erase */
+	XA_BUG_ON(xa, !xa_empty(xa));
+	XA_BUG_ON(xa, xa_alloc(xa, &id, NULL, xa_limit_32b, GFP_KERNEL) != 0);
+	XA_BUG_ON(xa, id != base);
+	XA_BUG_ON(xa, xa_empty(xa));
+	xa_destroy(xa);
+	XA_BUG_ON(xa, !xa_empty(xa));
+
+	for (i = base; i < base + 10; i++) {
+		XA_BUG_ON(xa, xa_alloc(xa, &id, NULL, xa_limit_32b,
+					GFP_KERNEL) != 0);
+		XA_BUG_ON(xa, id != i);
+	}
+
+	XA_BUG_ON(xa, xa_store(xa, 3, xa_mk_index(3), GFP_KERNEL) != NULL);
+	XA_BUG_ON(xa, xa_store(xa, 4, xa_mk_index(4), GFP_KERNEL) != NULL);
+	XA_BUG_ON(xa, xa_store(xa, 4, NULL, GFP_KERNEL) != xa_mk_index(4));
+	XA_BUG_ON(xa, xa_erase(xa, 5) != NULL);
+	XA_BUG_ON(xa, xa_alloc(xa, &id, NULL, xa_limit_32b, GFP_KERNEL) != 0);
+	XA_BUG_ON(xa, id != 5);
+
+	xa_for_each(xa, index, entry) {
+		xa_erase_index(xa, index);
+	}
+
+	for (i = base; i < base + 9; i++) {
+		XA_BUG_ON(xa, xa_erase(xa, i) != NULL);
+		XA_BUG_ON(xa, xa_empty(xa));
+	}
+	XA_BUG_ON(xa, xa_erase(xa, 8) != NULL);
+	XA_BUG_ON(xa, xa_empty(xa));
+	XA_BUG_ON(xa, xa_erase(xa, base + 9) != NULL);
+	XA_BUG_ON(xa, !xa_empty(xa));
+
+	xa_destroy(xa);
+}
+
 static DEFINE_XARRAY_ALLOC(xa0);
 static DEFINE_XARRAY_ALLOC1(xa1);
 
@@ -669,6 +722,8 @@ static noinline void check_xa_alloc(void)
 {
 	check_xa_alloc_1(&xa0, 0);
 	check_xa_alloc_1(&xa1, 1);
+	check_xa_alloc_2(&xa0, 0);
+	check_xa_alloc_2(&xa1, 1);
 }
 
 static noinline void __check_store_iter(struct xarray *xa, unsigned long start,
@@ -1219,9 +1274,8 @@ static void check_align_1(struct xarray *xa, char *name)
 	void *entry;
 
 	for (i = 0; i < 8; i++) {
-		id = 0;
-		XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, name + i, GFP_KERNEL)
-				!= 0);
+		XA_BUG_ON(xa, xa_alloc(xa, &id, name + i, xa_limit_32b,
+					GFP_KERNEL) != 0);
 		XA_BUG_ON(xa, id != i);
 	}
 	xa_for_each(xa, index, entry)
diff --git a/lib/xarray.c b/lib/xarray.c
index 468fb7b7963f..c707388fb05e 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1615,23 +1615,23 @@ EXPORT_SYMBOL(xa_store_range);
  * __xa_alloc() - Find somewhere to store this entry in the XArray.
  * @xa: XArray.
  * @id: Pointer to ID.
- * @max: Maximum ID to allocate (inclusive).
+ * @limit: Range for allocated ID.
  * @entry: New entry.
  * @gfp: Memory allocation flags.
  *
- * Allocates an unused ID in the range specified by @id and @max.
- * Updates the @id pointer with the index, then stores the entry at that
- * index.  A concurrent lookup will not see an uninitialised @id.
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index.  A concurrent lookup will not see an uninitialised @id.
  *
  * Context: Any context.  Expects xa_lock to be held on entry.  May
  * release and reacquire xa_lock if @gfp flags permit.
- * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
- * there is no more space in the XArray.
+ * Return: 0 on success, -ENOMEM if memory could not be allocated or
+ * -EBUSY if there are no free entries in @limit.
  */
-int __xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, gfp_t gfp)
+int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
+		struct xa_limit limit, gfp_t gfp)
 {
 	XA_STATE(xas, xa, 0);
-	int err;
 
 	if (WARN_ON_ONCE(xa_is_advanced(entry)))
 		return -EINVAL;
@@ -1642,18 +1642,17 @@ int __xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, gfp_t gfp)
 		entry = XA_ZERO_ENTRY;
 
 	do {
-		xas.xa_index = *id;
-		xas_find_marked(&xas, max, XA_FREE_MARK);
+		xas.xa_index = limit.min;
+		xas_find_marked(&xas, limit.max, XA_FREE_MARK);
 		if (xas.xa_node == XAS_RESTART)
-			xas_set_err(&xas, -ENOSPC);
+			xas_set_err(&xas, -EBUSY);
+		else
+			*id = xas.xa_index;
 		xas_store(&xas, entry);
 		xas_clear_mark(&xas, XA_FREE_MARK);
 	} while (__xas_nomem(&xas, gfp));
 
-	err = xas_error(&xas);
-	if (!err)
-		*id = xas.xa_index;
-	return err;
+	return xas_error(&xas);
 }
 EXPORT_SYMBOL(__xa_alloc);
 
-- 
cgit v1.2.3


From 2fa044e51a1f35d7b04cbde07ec513b0ba195e38 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Tue, 6 Nov 2018 14:13:35 -0500
Subject: XArray: Add cyclic allocation

This differs slightly from the IDR equivalent in five ways.

1. It can allocate up to UINT_MAX instead of being limited to INT_MAX,
   like xa_alloc().  Also like xa_alloc(), it will write to the 'id'
   pointer before placing the entry in the XArray.
2. The 'next' cursor is allocated separately from the XArray instead
   of being part of the IDR.  This saves memory for all the users which
   do not use the cyclic allocation API and suits some users better.
3. It returns -EBUSY instead of -ENOSPC.
4. It will attempt to wrap back to the minimum value on memory allocation
   failure as well as on an -EBUSY error, assuming that a user would
   rather allocate a small ID than suffer an ID allocation failure.
5. It reports whether it has wrapped, which is important to some users.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 Documentation/core-api/xarray.rst |   4 +-
 include/linux/xarray.h            | 102 ++++++++++++++++++++++++++++++++++++++
 lib/test_xarray.c                 |  53 ++++++++++++++++++++
 lib/xarray.c                      |  50 +++++++++++++++++++
 4 files changed, 208 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index e90c4925cd37..c7436da5c4ad 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -144,7 +144,9 @@ you only want to free the entry if it's ``NULL``).
 
 By default, the lowest free entry is allocated starting from 0.  If you
 want to allocate entries starting at 1, it is more efficient to use
-:c:func:`DEFINE_XARRAY_ALLOC1` or ``XA_FLAGS_ALLOC1``.
+:c:func:`DEFINE_XARRAY_ALLOC1` or ``XA_FLAGS_ALLOC1``.  If you want to
+allocate IDs up to a maximum, then wrap back around to the lowest free
+ID, you can use :c:func:`xa_alloc_cyclic`.
 
 You cannot use ``XA_MARK_0`` with an allocating XArray as this mark
 is used to track whether an entry is free or not.  The other marks are
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 883bb958e462..5ed6b462e754 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -242,6 +242,7 @@ enum xa_lock_type {
 #define XA_FLAGS_LOCK_BH	((__force gfp_t)XA_LOCK_BH)
 #define XA_FLAGS_TRACK_FREE	((__force gfp_t)4U)
 #define XA_FLAGS_ZERO_BUSY	((__force gfp_t)8U)
+#define XA_FLAGS_ALLOC_WRAPPED	((__force gfp_t)16U)
 #define XA_FLAGS_MARK(mark)	((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
 						(__force unsigned)(mark)))
 
@@ -499,6 +500,8 @@ void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
 int __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t);
 int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
 		struct xa_limit, gfp_t);
+int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
+		struct xa_limit, u32 *next, gfp_t);
 int __xa_reserve(struct xarray *, unsigned long index, gfp_t);
 void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
 void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
@@ -858,6 +861,105 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
 	return err;
 }
 
+/**
+ * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @entry: New entry.
+ * @limit: Range of allocated ID.
+ * @next: Pointer to next ID to allocate.
+ * @gfp: Memory allocation flags.
+ *
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index.  A concurrent lookup will not see an uninitialised @id.
+ * The search for an empty entry will start at @next and will wrap
+ * around if necessary.
+ *
+ * Context: Any context.  Takes and releases the xa_lock.  May sleep if
+ * the @gfp flags permit.
+ * Return: 0 if the allocation succeeded without wrapping.  1 if the
+ * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * allocated or -EBUSY if there are no free entries in @limit.
+ */
+static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
+		struct xa_limit limit, u32 *next, gfp_t gfp)
+{
+	int err;
+
+	xa_lock(xa);
+	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
+	xa_unlock(xa);
+
+	return err;
+}
+
+/**
+ * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @entry: New entry.
+ * @limit: Range of allocated ID.
+ * @next: Pointer to next ID to allocate.
+ * @gfp: Memory allocation flags.
+ *
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index.  A concurrent lookup will not see an uninitialised @id.
+ * The search for an empty entry will start at @next and will wrap
+ * around if necessary.
+ *
+ * Context: Any context.  Takes and releases the xa_lock while
+ * disabling softirqs.  May sleep if the @gfp flags permit.
+ * Return: 0 if the allocation succeeded without wrapping.  1 if the
+ * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * allocated or -EBUSY if there are no free entries in @limit.
+ */
+static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
+		struct xa_limit limit, u32 *next, gfp_t gfp)
+{
+	int err;
+
+	xa_lock_bh(xa);
+	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
+	xa_unlock_bh(xa);
+
+	return err;
+}
+
+/**
+ * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @entry: New entry.
+ * @limit: Range of allocated ID.
+ * @next: Pointer to next ID to allocate.
+ * @gfp: Memory allocation flags.
+ *
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index.  A concurrent lookup will not see an uninitialised @id.
+ * The search for an empty entry will start at @next and will wrap
+ * around if necessary.
+ *
+ * Context: Process context.  Takes and releases the xa_lock while
+ * disabling interrupts.  May sleep if the @gfp flags permit.
+ * Return: 0 if the allocation succeeded without wrapping.  1 if the
+ * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * allocated or -EBUSY if there are no free entries in @limit.
+ */
+static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
+		struct xa_limit limit, u32 *next, gfp_t gfp)
+{
+	int err;
+
+	xa_lock_irq(xa);
+	err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
+	xa_unlock_irq(xa);
+
+	return err;
+}
+
 /**
  * xa_reserve() - Reserve this index in the XArray.
  * @xa: XArray.
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index b5a6b981454d..eaf53f742c72 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -715,6 +715,57 @@ static noinline void check_xa_alloc_2(struct xarray *xa, unsigned int base)
 	xa_destroy(xa);
 }
 
+static noinline void check_xa_alloc_3(struct xarray *xa, unsigned int base)
+{
+	struct xa_limit limit = XA_LIMIT(1, 0x3fff);
+	u32 next = 0;
+	unsigned int i, id;
+	unsigned long index;
+	void *entry;
+
+	XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(1), limit,
+				&next, GFP_KERNEL) != 0);
+	XA_BUG_ON(xa, id != 1);
+
+	next = 0x3ffd;
+	XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(0x3ffd), limit,
+				&next, GFP_KERNEL) != 0);
+	XA_BUG_ON(xa, id != 0x3ffd);
+	xa_erase_index(xa, 0x3ffd);
+	xa_erase_index(xa, 1);
+	XA_BUG_ON(xa, !xa_empty(xa));
+
+	for (i = 0x3ffe; i < 0x4003; i++) {
+		if (i < 0x4000)
+			entry = xa_mk_index(i);
+		else
+			entry = xa_mk_index(i - 0x3fff);
+		XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, entry, limit,
+					&next, GFP_KERNEL) != (id == 1));
+		XA_BUG_ON(xa, xa_mk_index(id) != entry);
+	}
+
+	/* Check wrap-around is handled correctly */
+	if (base != 0)
+		xa_erase_index(xa, base);
+	xa_erase_index(xa, base + 1);
+	next = UINT_MAX;
+	XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(UINT_MAX),
+				xa_limit_32b, &next, GFP_KERNEL) != 0);
+	XA_BUG_ON(xa, id != UINT_MAX);
+	XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(base),
+				xa_limit_32b, &next, GFP_KERNEL) != 1);
+	XA_BUG_ON(xa, id != base);
+	XA_BUG_ON(xa, xa_alloc_cyclic(xa, &id, xa_mk_index(base + 1),
+				xa_limit_32b, &next, GFP_KERNEL) != 0);
+	XA_BUG_ON(xa, id != base + 1);
+
+	xa_for_each(xa, index, entry)
+		xa_erase_index(xa, index);
+
+	XA_BUG_ON(xa, !xa_empty(xa));
+}
+
 static DEFINE_XARRAY_ALLOC(xa0);
 static DEFINE_XARRAY_ALLOC1(xa1);
 
@@ -724,6 +775,8 @@ static noinline void check_xa_alloc(void)
 	check_xa_alloc_1(&xa1, 1);
 	check_xa_alloc_2(&xa0, 0);
 	check_xa_alloc_2(&xa1, 1);
+	check_xa_alloc_3(&xa0, 0);
+	check_xa_alloc_3(&xa1, 1);
 }
 
 static noinline void __check_store_iter(struct xarray *xa, unsigned long start,
diff --git a/lib/xarray.c b/lib/xarray.c
index c707388fb05e..89e37ac50850 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1656,6 +1656,56 @@ int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
 }
 EXPORT_SYMBOL(__xa_alloc);
 
+/**
+ * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @entry: New entry.
+ * @limit: Range of allocated ID.
+ * @next: Pointer to next ID to allocate.
+ * @gfp: Memory allocation flags.
+ *
+ * Finds an empty entry in @xa between @limit.min and @limit.max,
+ * stores the index into the @id pointer, then stores the entry at
+ * that index.  A concurrent lookup will not see an uninitialised @id.
+ * The search for an empty entry will start at @next and will wrap
+ * around if necessary.
+ *
+ * Context: Any context.  Expects xa_lock to be held on entry.  May
+ * release and reacquire xa_lock if @gfp flags permit.
+ * Return: 0 if the allocation succeeded without wrapping.  1 if the
+ * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * allocated or -EBUSY if there are no free entries in @limit.
+ */
+int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
+		struct xa_limit limit, u32 *next, gfp_t gfp)
+{
+	u32 min = limit.min;
+	int ret;
+
+	limit.min = max(min, *next);
+	ret = __xa_alloc(xa, id, entry, limit, gfp);
+	if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) {
+		xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED;
+		ret = 1;
+	}
+
+	if (ret < 0 && limit.min > min) {
+		limit.min = min;
+		ret = __xa_alloc(xa, id, entry, limit, gfp);
+		if (ret == 0)
+			ret = 1;
+	}
+
+	if (ret >= 0) {
+		*next = *id + 1;
+		if (*next == 0)
+			xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__xa_alloc_cyclic);
+
 /**
  * __xa_set_mark() - Set this mark on this entry while locked.
  * @xa: XArray.
-- 
cgit v1.2.3


From 60b8f0ddf1a927ef02141a6610fd52575134f821 Mon Sep 17 00:00:00 2001
From: Phil Edworthy <phil.edworthy@renesas.com>
Date: Mon, 3 Dec 2018 11:13:09 +0000
Subject: clk: Add (devm_)clk_get_optional() functions

This adds clk_get_optional() and devm_clk_get_optional() functions to get
optional clocks.

They behave the same as (devm_)clk_get() except where there is no clock
producer. In this case, instead of returning -ENOENT, the function
returns NULL. This makes error checking simpler and allows
clk_prepare_enable, etc to be called on the returned reference
without additional checks.

Signed-off-by: Phil Edworthy <phil.edworthy@renesas.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Russell King <linux@armlinux.org.uk>
[sboyd@kernel.org: Document in devres.txt]
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 Documentation/driver-model/devres.txt |  1 +
 drivers/clk/clk-devres.c              | 11 +++++++++++
 include/linux/clk.h                   | 36 +++++++++++++++++++++++++++++++++++
 3 files changed, 48 insertions(+)

(limited to 'include')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index b277cafce71e..83f38c0439cd 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -242,6 +242,7 @@ certainly invest a bit more effort into libata core layer).
 
 CLOCK
   devm_clk_get()
+  devm_clk_get_optional()
   devm_clk_put()
   devm_clk_hw_register()
   devm_of_clk_add_hw_provider()
diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index c9a86156ced8..daa1fc8fba53 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -29,6 +29,17 @@ struct clk *devm_clk_get(struct device *dev, const char *id)
 }
 EXPORT_SYMBOL(devm_clk_get);
 
+struct clk *devm_clk_get_optional(struct device *dev, const char *id)
+{
+	struct clk *clk = devm_clk_get(dev, id);
+
+	if (clk == ERR_PTR(-ENOENT))
+		return NULL;
+
+	return clk;
+}
+EXPORT_SYMBOL(devm_clk_get_optional);
+
 struct clk_bulk_devres {
 	struct clk_bulk_data *clks;
 	int num_clks;
diff --git a/include/linux/clk.h b/include/linux/clk.h
index a7773b5c0b9f..d8bc1a856b39 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -383,6 +383,17 @@ int __must_check devm_clk_bulk_get_all(struct device *dev,
  */
 struct clk *devm_clk_get(struct device *dev, const char *id);
 
+/**
+ * devm_clk_get_optional - lookup and obtain a managed reference to an optional
+ *			   clock producer.
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Behaves the same as devm_clk_get() except where there is no clock producer.
+ * In this case, instead of returning -ENOENT, the function returns NULL.
+ */
+struct clk *devm_clk_get_optional(struct device *dev, const char *id);
+
 /**
  * devm_get_clk_from_child - lookup and obtain a managed reference to a
  *			     clock producer from child node.
@@ -718,6 +729,12 @@ static inline struct clk *devm_clk_get(struct device *dev, const char *id)
 	return NULL;
 }
 
+static inline struct clk *devm_clk_get_optional(struct device *dev,
+						const char *id)
+{
+	return NULL;
+}
+
 static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 						 struct clk_bulk_data *clks)
 {
@@ -862,6 +879,25 @@ static inline void clk_bulk_disable_unprepare(int num_clks,
 	clk_bulk_unprepare(num_clks, clks);
 }
 
+/**
+ * clk_get_optional - lookup and obtain a reference to an optional clock
+ *		      producer.
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Behaves the same as clk_get() except where there is no clock producer. In
+ * this case, instead of returning -ENOENT, the function returns NULL.
+ */
+static inline struct clk *clk_get_optional(struct device *dev, const char *id)
+{
+	struct clk *clk = clk_get(dev, id);
+
+	if (clk == ERR_PTR(-ENOENT))
+		return NULL;
+
+	return clk;
+}
+
 #if defined(CONFIG_OF) && defined(CONFIG_COMMON_CLK)
 struct clk *of_clk_get(struct device_node *np, int index);
 struct clk *of_clk_get_by_name(struct device_node *np, const char *name);
-- 
cgit v1.2.3


From 3eee6c7d119cd8563ad25898f94d6c1b514da548 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Fri, 7 Dec 2018 13:09:39 +0200
Subject: clkdev: add managed clkdev lookup registration

Clkdev registration lacks of managed registration functions and it
seems few drivers do not drop clkdev lookups at exit. Add
devm_clk_hw_register_clkdev and devm_clk_release_clkdev to ease lookup
releasing at exit.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 Documentation/driver-model/devres.txt |   1 +
 drivers/clk/clkdev.c                  | 111 +++++++++++++++++++++++++++-------
 include/linux/clkdev.h                |   4 ++
 3 files changed, 93 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index b277cafce71e..805b7bf5d98f 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -245,6 +245,7 @@ CLOCK
   devm_clk_put()
   devm_clk_hw_register()
   devm_of_clk_add_hw_provider()
+  devm_clk_hw_register_clkdev()
 
 DMA
   dmaenginem_async_device_register()
diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c
index 9ab3db8b3988..4621f8a91fc0 100644
--- a/drivers/clk/clkdev.c
+++ b/drivers/clk/clkdev.c
@@ -401,6 +401,23 @@ static struct clk_lookup *__clk_register_clkdev(struct clk_hw *hw,
 	return cl;
 }
 
+static int do_clk_register_clkdev(struct clk_hw *hw,
+	struct clk_lookup **cl, const char *con_id, const char *dev_id)
+{
+	if (IS_ERR(hw))
+		return PTR_ERR(hw);
+	/*
+	 * Since dev_id can be NULL, and NULL is handled specially, we must
+	 * pass it as either a NULL format string, or with "%s".
+	 */
+	if (dev_id)
+		*cl = __clk_register_clkdev(hw, con_id, "%s", dev_id);
+	else
+		*cl = __clk_register_clkdev(hw, con_id, NULL);
+
+	return *cl ? 0 : -ENOMEM;
+}
+
 /**
  * clk_register_clkdev - register one clock lookup for a struct clk
  * @clk: struct clk to associate with all clk_lookups
@@ -423,17 +440,8 @@ int clk_register_clkdev(struct clk *clk, const char *con_id,
 	if (IS_ERR(clk))
 		return PTR_ERR(clk);
 
-	/*
-	 * Since dev_id can be NULL, and NULL is handled specially, we must
-	 * pass it as either a NULL format string, or with "%s".
-	 */
-	if (dev_id)
-		cl = __clk_register_clkdev(__clk_get_hw(clk), con_id, "%s",
-					   dev_id);
-	else
-		cl = __clk_register_clkdev(__clk_get_hw(clk), con_id, NULL);
-
-	return cl ? 0 : -ENOMEM;
+	return do_clk_register_clkdev(__clk_get_hw(clk), &cl, con_id,
+					      dev_id);
 }
 EXPORT_SYMBOL(clk_register_clkdev);
 
@@ -456,18 +464,75 @@ int clk_hw_register_clkdev(struct clk_hw *hw, const char *con_id,
 {
 	struct clk_lookup *cl;
 
-	if (IS_ERR(hw))
-		return PTR_ERR(hw);
+	return do_clk_register_clkdev(hw, &cl, con_id, dev_id);
+}
+EXPORT_SYMBOL(clk_hw_register_clkdev);
 
-	/*
-	 * Since dev_id can be NULL, and NULL is handled specially, we must
-	 * pass it as either a NULL format string, or with "%s".
-	 */
-	if (dev_id)
-		cl = __clk_register_clkdev(hw, con_id, "%s", dev_id);
-	else
-		cl = __clk_register_clkdev(hw, con_id, NULL);
+static void devm_clkdev_release(struct device *dev, void *res)
+{
+	clkdev_drop(*(struct clk_lookup **)res);
+}
 
-	return cl ? 0 : -ENOMEM;
+static int devm_clk_match_clkdev(struct device *dev, void *res, void *data)
+{
+	struct clk_lookup **l = res;
+
+	return *l == data;
 }
-EXPORT_SYMBOL(clk_hw_register_clkdev);
+
+/**
+ * devm_clk_release_clkdev - Resource managed clkdev lookup release
+ * @dev: device this lookup is bound
+ * @con_id: connection ID string on device
+ * @dev_id: format string describing device name
+ *
+ * Drop the clkdev lookup created with devm_clk_hw_register_clkdev.
+ * Normally this function will not need to be called and the resource
+ * management code will ensure that the resource is freed.
+ */
+void devm_clk_release_clkdev(struct device *dev, const char *con_id,
+			     const char *dev_id)
+{
+	struct clk_lookup *cl;
+	int rval;
+
+	cl = clk_find(dev_id, con_id);
+	WARN_ON(!cl);
+	rval = devres_release(dev, devm_clkdev_release,
+			      devm_clk_match_clkdev, cl);
+	WARN_ON(rval);
+}
+EXPORT_SYMBOL(devm_clk_release_clkdev);
+
+/**
+ * devm_clk_hw_register_clkdev - managed clk lookup registration for clk_hw
+ * @dev: device this lookup is bound
+ * @hw: struct clk_hw to associate with all clk_lookups
+ * @con_id: connection ID string on device
+ * @dev_id: format string describing device name
+ *
+ * con_id or dev_id may be NULL as a wildcard, just as in the rest of
+ * clkdev.
+ *
+ * To make things easier for mass registration, we detect error clk_hws
+ * from a previous clk_hw_register_*() call, and return the error code for
+ * those.  This is to permit this function to be called immediately
+ * after clk_hw_register_*().
+ */
+int devm_clk_hw_register_clkdev(struct device *dev, struct clk_hw *hw,
+				const char *con_id, const char *dev_id)
+{
+	int rval = -ENOMEM;
+	struct clk_lookup **cl;
+
+	cl = devres_alloc(devm_clkdev_release, sizeof(*cl), GFP_KERNEL);
+	if (cl) {
+		rval = do_clk_register_clkdev(hw, cl, con_id, dev_id);
+		if (!rval)
+			devres_add(dev, cl);
+		else
+			devres_free(cl);
+	}
+	return rval;
+}
+EXPORT_SYMBOL(devm_clk_hw_register_clkdev);
diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index 4890ff033220..ccb32af5848b 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -52,4 +52,8 @@ int clk_add_alias(const char *, const char *, const char *, struct device *);
 int clk_register_clkdev(struct clk *, const char *, const char *);
 int clk_hw_register_clkdev(struct clk_hw *, const char *, const char *);
 
+int devm_clk_hw_register_clkdev(struct device *dev, struct clk_hw *hw,
+				const char *con_id, const char *dev_id);
+void devm_clk_release_clkdev(struct device *dev, const char *con_id,
+			     const char *dev_id);
 #endif
-- 
cgit v1.2.3


From 626feb863274da93e44d644a9fd4a59b46851794 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Wed, 6 Feb 2019 21:53:48 -0800
Subject: Input: ili210x - drop platform data support

There is not a single user of the ili210x platform data in the kernel,
just drop it.

Signed-off-by: Marek Vasut <marex@denx.de>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/ili210x.c | 12 ++----------
 include/linux/input/ili210x.h       | 11 -----------
 2 files changed, 2 insertions(+), 21 deletions(-)
 delete mode 100644 include/linux/input/ili210x.h

(limited to 'include')

diff --git a/drivers/input/touchscreen/ili210x.c b/drivers/input/touchscreen/ili210x.c
index 6f76eeedf465..25b0ca6c07d9 100644
--- a/drivers/input/touchscreen/ili210x.c
+++ b/drivers/input/touchscreen/ili210x.c
@@ -6,7 +6,6 @@
 #include <linux/input/mt.h>
 #include <linux/delay.h>
 #include <linux/workqueue.h>
-#include <linux/input/ili210x.h>
 
 #define MAX_TOUCHES		2
 #define DEFAULT_POLL_PERIOD	20
@@ -184,7 +183,6 @@ static int ili210x_i2c_probe(struct i2c_client *client,
 				       const struct i2c_device_id *id)
 {
 	struct device *dev = &client->dev;
-	const struct ili210x_platform_data *pdata = dev_get_platdata(dev);
 	struct ili210x *priv;
 	struct input_dev *input;
 	struct panel_info panel;
@@ -194,11 +192,6 @@ static int ili210x_i2c_probe(struct i2c_client *client,
 
 	dev_dbg(dev, "Probing for ILI210X I2C Touschreen driver");
 
-	if (!pdata) {
-		dev_err(dev, "No platform data!\n");
-		return -EINVAL;
-	}
-
 	if (client->irq <= 0) {
 		dev_err(dev, "No IRQ!\n");
 		return -EINVAL;
@@ -233,8 +226,7 @@ static int ili210x_i2c_probe(struct i2c_client *client,
 
 	priv->client = client;
 	priv->input = input;
-	priv->get_pendown_state = pdata->get_pendown_state;
-	priv->poll_period = pdata->poll_period ? : DEFAULT_POLL_PERIOD;
+	priv->poll_period = DEFAULT_POLL_PERIOD;
 	INIT_DELAYED_WORK(&priv->dwork, ili210x_work);
 
 	/* Setup input device */
@@ -258,7 +250,7 @@ static int ili210x_i2c_probe(struct i2c_client *client,
 
 	i2c_set_clientdata(client, priv);
 
-	error = request_irq(client->irq, ili210x_irq, pdata->irq_flags,
+	error = request_irq(client->irq, ili210x_irq, 0,
 			    client->name, priv);
 	if (error) {
 		dev_err(dev, "Unable to request touchscreen IRQ, err: %d\n",
diff --git a/include/linux/input/ili210x.h b/include/linux/input/ili210x.h
deleted file mode 100644
index b76e7c1404cd..000000000000
--- a/include/linux/input/ili210x.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ILI210X_H
-#define _ILI210X_H
-
-struct ili210x_platform_data {
-	unsigned long irq_flags;
-	unsigned int poll_period;
-	bool (*get_pendown_state)(void);
-};
-
-#endif
-- 
cgit v1.2.3


From fdc98f070b149b038a66e57dea11f2fabb5fdfbc Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 30 Jan 2019 11:41:23 +0000
Subject: mfd: lochnagar: Add initial binding documentation

Lochnagar is an evaluation and development board for Cirrus
Logic Smart CODEC and Amp devices. It allows the connection of
most Cirrus Logic devices on mini-cards, as well as allowing
connection of various application processor systems to provide a
full evaluation platform. This driver supports the board
controller chip on the Lochnagar board.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 .../devicetree/bindings/mfd/cirrus,lochnagar.txt   |  68 +++++++++++
 include/dt-bindings/clk/lochnagar.h                |  26 ++++
 include/dt-bindings/pinctrl/lochnagar.h            | 132 +++++++++++++++++++++
 3 files changed, 226 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mfd/cirrus,lochnagar.txt
 create mode 100644 include/dt-bindings/clk/lochnagar.h
 create mode 100644 include/dt-bindings/pinctrl/lochnagar.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.txt b/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.txt
new file mode 100644
index 000000000000..004b0158cf4d
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.txt
@@ -0,0 +1,68 @@
+Cirrus Logic Lochnagar Audio Development Board
+
+Lochnagar is an evaluation and development board for Cirrus Logic
+Smart CODEC and Amp devices. It allows the connection of most Cirrus
+Logic devices on mini-cards, as well as allowing connection of
+various application processor systems to provide a full evaluation
+platform.  Audio system topology, clocking and power can all be
+controlled through the Lochnagar, allowing the device under test
+to be used in a variety of possible use cases.
+
+Also see these documents for generic binding information:
+  [1] GPIO : ../gpio/gpio.txt
+
+And these for relevant defines:
+  [2] include/dt-bindings/pinctrl/lochnagar.h
+  [3] include/dt-bindings/clock/lochnagar.h
+
+And these documents for the required sub-node binding details:
+  [4] Clock: ../clock/cirrus,lochnagar.txt
+  [5] Pinctrl: ../pinctrl/cirrus,lochnagar.txt
+  [6] Regulator: ../regulator/cirrus,lochnagar.txt
+
+Required properties:
+
+  - compatible : One of the following strings:
+                 "cirrus,lochnagar1"
+                 "cirrus,lochnagar2"
+
+  - reg : I2C slave address
+
+  - reset-gpios : Reset line to the Lochnagar, see [1].
+
+Required sub-nodes:
+
+  - lochnagar-clk : Binding for the clocking components, see [4].
+
+  - lochnagar-pinctrl : Binding for the pin control components, see [5].
+
+Optional sub-nodes:
+
+  - Bindings for the regulator components, see [6]. Only available on
+    Lochnagar 2.
+
+Optional properties:
+
+  - present-gpios : Host present line, indicating the presence of a
+    host system, see [1]. This can be omitted if the present line is
+    tied in hardware.
+
+Example:
+
+lochnagar: lochnagar@22 {
+	compatible = "cirrus,lochnagar2";
+	reg = <0x22>;
+
+	reset-gpios = <&gpio0 55 0>;
+	present-gpios = <&gpio0 60 0>;
+
+	lochnagar-clk {
+		compatible = "cirrus,lochnagar2-clk";
+		...
+	};
+
+	lochnagar-pinctrl {
+		compatible = "cirrus,lochnagar-pinctrl";
+		...
+	};
+};
diff --git a/include/dt-bindings/clk/lochnagar.h b/include/dt-bindings/clk/lochnagar.h
new file mode 100644
index 000000000000..8fa20551ff17
--- /dev/null
+++ b/include/dt-bindings/clk/lochnagar.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Device Tree defines for Lochnagar clocking
+ *
+ * Copyright (c) 2017-2018 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * Author: Charles Keepax <ckeepax@opensource.cirrus.com>
+ */
+
+#ifndef DT_BINDINGS_CLK_LOCHNAGAR_H
+#define DT_BINDINGS_CLK_LOCHNAGAR_H
+
+#define LOCHNAGAR_CDC_MCLK1		0
+#define LOCHNAGAR_CDC_MCLK2		1
+#define LOCHNAGAR_DSP_CLKIN		2
+#define LOCHNAGAR_GF_CLKOUT1		3
+#define LOCHNAGAR_GF_CLKOUT2		4
+#define LOCHNAGAR_PSIA1_MCLK		5
+#define LOCHNAGAR_PSIA2_MCLK		6
+#define LOCHNAGAR_SPDIF_MCLK		7
+#define LOCHNAGAR_ADAT_MCLK		8
+#define LOCHNAGAR_SOUNDCARD_MCLK	9
+#define LOCHNAGAR_SPDIF_CLKOUT		10
+
+#endif
diff --git a/include/dt-bindings/pinctrl/lochnagar.h b/include/dt-bindings/pinctrl/lochnagar.h
new file mode 100644
index 000000000000..644760bf5725
--- /dev/null
+++ b/include/dt-bindings/pinctrl/lochnagar.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Device Tree defines for Lochnagar pinctrl
+ *
+ * Copyright (c) 2018 Cirrus Logic, Inc. and
+ *                    Cirrus Logic International Semiconductor Ltd.
+ *
+ * Author: Charles Keepax <ckeepax@opensource.cirrus.com>
+ */
+
+#ifndef DT_BINDINGS_PINCTRL_LOCHNAGAR_H
+#define DT_BINDINGS_PINCTRL_LOCHNAGAR_H
+
+#define LOCHNAGAR1_PIN_CDC_RESET		0
+#define LOCHNAGAR1_PIN_DSP_RESET		1
+#define LOCHNAGAR1_PIN_CDC_CIF1MODE		2
+#define LOCHNAGAR1_PIN_NUM_GPIOS		3
+
+#define LOCHNAGAR2_PIN_CDC_RESET		0
+#define LOCHNAGAR2_PIN_DSP_RESET		1
+#define LOCHNAGAR2_PIN_CDC_CIF1MODE		2
+#define LOCHNAGAR2_PIN_CDC_LDOENA		3
+#define LOCHNAGAR2_PIN_SPDIF_HWMODE		4
+#define LOCHNAGAR2_PIN_SPDIF_RESET		5
+#define LOCHNAGAR2_PIN_FPGA_GPIO1		6
+#define LOCHNAGAR2_PIN_FPGA_GPIO2		7
+#define LOCHNAGAR2_PIN_FPGA_GPIO3		8
+#define LOCHNAGAR2_PIN_FPGA_GPIO4		9
+#define LOCHNAGAR2_PIN_FPGA_GPIO5		10
+#define LOCHNAGAR2_PIN_FPGA_GPIO6		11
+#define LOCHNAGAR2_PIN_CDC_GPIO1		12
+#define LOCHNAGAR2_PIN_CDC_GPIO2		13
+#define LOCHNAGAR2_PIN_CDC_GPIO3		14
+#define LOCHNAGAR2_PIN_CDC_GPIO4		15
+#define LOCHNAGAR2_PIN_CDC_GPIO5		16
+#define LOCHNAGAR2_PIN_CDC_GPIO6		17
+#define LOCHNAGAR2_PIN_CDC_GPIO7		18
+#define LOCHNAGAR2_PIN_CDC_GPIO8		19
+#define LOCHNAGAR2_PIN_DSP_GPIO1		20
+#define LOCHNAGAR2_PIN_DSP_GPIO2		21
+#define LOCHNAGAR2_PIN_DSP_GPIO3		22
+#define LOCHNAGAR2_PIN_DSP_GPIO4		23
+#define LOCHNAGAR2_PIN_DSP_GPIO5		24
+#define LOCHNAGAR2_PIN_DSP_GPIO6		25
+#define LOCHNAGAR2_PIN_GF_GPIO2			26
+#define LOCHNAGAR2_PIN_GF_GPIO3			27
+#define LOCHNAGAR2_PIN_GF_GPIO7			28
+#define LOCHNAGAR2_PIN_CDC_AIF1_BCLK		29
+#define LOCHNAGAR2_PIN_CDC_AIF1_RXDAT		30
+#define LOCHNAGAR2_PIN_CDC_AIF1_LRCLK		31
+#define LOCHNAGAR2_PIN_CDC_AIF1_TXDAT		32
+#define LOCHNAGAR2_PIN_CDC_AIF2_BCLK		33
+#define LOCHNAGAR2_PIN_CDC_AIF2_RXDAT		34
+#define LOCHNAGAR2_PIN_CDC_AIF2_LRCLK		35
+#define LOCHNAGAR2_PIN_CDC_AIF2_TXDAT		36
+#define LOCHNAGAR2_PIN_CDC_AIF3_BCLK		37
+#define LOCHNAGAR2_PIN_CDC_AIF3_RXDAT		38
+#define LOCHNAGAR2_PIN_CDC_AIF3_LRCLK		39
+#define LOCHNAGAR2_PIN_CDC_AIF3_TXDAT		40
+#define LOCHNAGAR2_PIN_DSP_AIF1_BCLK		41
+#define LOCHNAGAR2_PIN_DSP_AIF1_RXDAT		42
+#define LOCHNAGAR2_PIN_DSP_AIF1_LRCLK		43
+#define LOCHNAGAR2_PIN_DSP_AIF1_TXDAT		44
+#define LOCHNAGAR2_PIN_DSP_AIF2_BCLK		45
+#define LOCHNAGAR2_PIN_DSP_AIF2_RXDAT		46
+#define LOCHNAGAR2_PIN_DSP_AIF2_LRCLK		47
+#define LOCHNAGAR2_PIN_DSP_AIF2_TXDAT		48
+#define LOCHNAGAR2_PIN_PSIA1_BCLK		49
+#define LOCHNAGAR2_PIN_PSIA1_RXDAT		50
+#define LOCHNAGAR2_PIN_PSIA1_LRCLK		51
+#define LOCHNAGAR2_PIN_PSIA1_TXDAT		52
+#define LOCHNAGAR2_PIN_PSIA2_BCLK		53
+#define LOCHNAGAR2_PIN_PSIA2_RXDAT		54
+#define LOCHNAGAR2_PIN_PSIA2_LRCLK		55
+#define LOCHNAGAR2_PIN_PSIA2_TXDAT		56
+#define LOCHNAGAR2_PIN_GF_AIF3_BCLK		57
+#define LOCHNAGAR2_PIN_GF_AIF3_RXDAT		58
+#define LOCHNAGAR2_PIN_GF_AIF3_LRCLK		59
+#define LOCHNAGAR2_PIN_GF_AIF3_TXDAT		60
+#define LOCHNAGAR2_PIN_GF_AIF4_BCLK		61
+#define LOCHNAGAR2_PIN_GF_AIF4_RXDAT		62
+#define LOCHNAGAR2_PIN_GF_AIF4_LRCLK		63
+#define LOCHNAGAR2_PIN_GF_AIF4_TXDAT		64
+#define LOCHNAGAR2_PIN_GF_AIF1_BCLK		65
+#define LOCHNAGAR2_PIN_GF_AIF1_RXDAT		66
+#define LOCHNAGAR2_PIN_GF_AIF1_LRCLK		67
+#define LOCHNAGAR2_PIN_GF_AIF1_TXDAT		68
+#define LOCHNAGAR2_PIN_GF_AIF2_BCLK		69
+#define LOCHNAGAR2_PIN_GF_AIF2_RXDAT		70
+#define LOCHNAGAR2_PIN_GF_AIF2_LRCLK		71
+#define LOCHNAGAR2_PIN_GF_AIF2_TXDAT		72
+#define LOCHNAGAR2_PIN_DSP_UART1_RX		73
+#define LOCHNAGAR2_PIN_DSP_UART1_TX		74
+#define LOCHNAGAR2_PIN_DSP_UART2_RX		75
+#define LOCHNAGAR2_PIN_DSP_UART2_TX		76
+#define LOCHNAGAR2_PIN_GF_UART2_RX		77
+#define LOCHNAGAR2_PIN_GF_UART2_TX		78
+#define LOCHNAGAR2_PIN_USB_UART_RX		79
+#define LOCHNAGAR2_PIN_CDC_PDMCLK1		80
+#define LOCHNAGAR2_PIN_CDC_PDMDAT1		81
+#define LOCHNAGAR2_PIN_CDC_PDMCLK2		82
+#define LOCHNAGAR2_PIN_CDC_PDMDAT2		83
+#define LOCHNAGAR2_PIN_CDC_DMICCLK1		84
+#define LOCHNAGAR2_PIN_CDC_DMICDAT1		85
+#define LOCHNAGAR2_PIN_CDC_DMICCLK2		86
+#define LOCHNAGAR2_PIN_CDC_DMICDAT2		87
+#define LOCHNAGAR2_PIN_CDC_DMICCLK3		88
+#define LOCHNAGAR2_PIN_CDC_DMICDAT3		89
+#define LOCHNAGAR2_PIN_CDC_DMICCLK4		90
+#define LOCHNAGAR2_PIN_CDC_DMICDAT4		91
+#define LOCHNAGAR2_PIN_DSP_DMICCLK1		92
+#define LOCHNAGAR2_PIN_DSP_DMICDAT1		93
+#define LOCHNAGAR2_PIN_DSP_DMICCLK2		94
+#define LOCHNAGAR2_PIN_DSP_DMICDAT2		95
+#define LOCHNAGAR2_PIN_I2C2_SCL			96
+#define LOCHNAGAR2_PIN_I2C2_SDA			97
+#define LOCHNAGAR2_PIN_I2C3_SCL			98
+#define LOCHNAGAR2_PIN_I2C3_SDA			99
+#define LOCHNAGAR2_PIN_I2C4_SCL			100
+#define LOCHNAGAR2_PIN_I2C4_SDA			101
+#define LOCHNAGAR2_PIN_DSP_STANDBY		102
+#define LOCHNAGAR2_PIN_CDC_MCLK1		103
+#define LOCHNAGAR2_PIN_CDC_MCLK2		104
+#define LOCHNAGAR2_PIN_DSP_CLKIN		105
+#define LOCHNAGAR2_PIN_PSIA1_MCLK		106
+#define LOCHNAGAR2_PIN_PSIA2_MCLK		107
+#define LOCHNAGAR2_PIN_GF_GPIO1			108
+#define LOCHNAGAR2_PIN_GF_GPIO5			109
+#define LOCHNAGAR2_PIN_DSP_GPIO20		110
+#define LOCHNAGAR2_PIN_NUM_GPIOS		111
+
+#endif
-- 
cgit v1.2.3


From 422dcafe477c7240d03c7b150704c45e0b17be57 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Wed, 30 Jan 2019 11:41:26 +0000
Subject: mfd: lochnagar: Add support for the Cirrus Logic Lochnagar

Lochnagar is an evaluation and development board for Cirrus
Logic Smart CODEC and Amp devices. It allows the connection of
most Cirrus Logic devices on mini-cards, as well as allowing
connection of various application processor systems to provide a
full evaluation platform. This driver supports the board
controller chip on the Lochnagar board. Audio system topology,
clocking and power can all be controlled through the Lochnagar
controller chip, allowing the device under test to be used in
a variety of possible use cases.

As the Lochnagar is a fairly complex device this MFD driver
allows the drivers for the various features to be bound
in. Initially clocking, regulator and pinctrl will be added as
these are necessary to configure the system. But in time at least
audio and voltage/current monitoring will also be added.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 MAINTAINERS                         |  17 ++
 drivers/mfd/Kconfig                 |   8 +
 drivers/mfd/Makefile                |   2 +
 drivers/mfd/lochnagar-i2c.c         | 398 ++++++++++++++++++++++++++++++++++++
 include/linux/mfd/lochnagar.h       |  55 +++++
 include/linux/mfd/lochnagar1_regs.h | 157 ++++++++++++++
 include/linux/mfd/lochnagar2_regs.h | 291 ++++++++++++++++++++++++++
 7 files changed, 928 insertions(+)
 create mode 100644 drivers/mfd/lochnagar-i2c.c
 create mode 100644 include/linux/mfd/lochnagar.h
 create mode 100644 include/linux/mfd/lochnagar1_regs.h
 create mode 100644 include/linux/mfd/lochnagar2_regs.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 51029a425dbe..3e3f0384362b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3700,6 +3700,23 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/cirrus/ep93xx_eth.c
 
+CIRRUS LOGIC LOCHNAGAR DRIVER
+M:	Charles Keepax <ckeepax@opensource.cirrus.com>
+M:	Richard Fitzgerald <rf@opensource.cirrus.com>
+L:	patches@opensource.cirrus.com
+S:	Supported
+F:	drivers/clk/clk-lochnagar.c
+F:	drivers/mfd/lochnagar-i2c.c
+F:	drivers/pinctrl/cirrus/pinctrl-lochnagar.c
+F:	drivers/regulator/lochnagar-regulator.c
+F:	include/dt-bindings/clk/lochnagar.h
+F:	include/dt-bindings/pinctrl/lochnagar.h
+F:	include/linux/mfd/lochnagar*
+F:	Documentation/devicetree/bindings/mfd/cirrus,lochnagar.txt
+F:	Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt
+F:	Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.txt
+F:	Documentation/devicetree/bindings/regulator/cirrus,lochnagar.txt
+
 CISCO FCOE HBA DRIVER
 M:	Satish Kharat <satishkh@cisco.com>
 M:	Sesidhar Baddela <sebaddel@cisco.com>
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 6e58221f5c28..f38f8741c68e 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1686,6 +1686,14 @@ config MFD_VX855
 	  VIA VX855/VX875 south bridge. You will need to enable the vx855_spi
 	  and/or vx855_gpio drivers for this to do anything useful.
 
+config MFD_LOCHNAGAR
+	bool "Cirrus Logic Lochnagar Audio Development Board"
+	select MFD_CORE
+	select REGMAP_I2C
+	depends on I2C=y && OF
+	help
+	  Support for Cirrus Logic Lochnagar audio development board.
+
 config MFD_ARIZONA
 	select REGMAP
 	select REGMAP_IRQ
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index a62fb0112d9f..a406fd3b8681 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -37,6 +37,8 @@ obj-$(CONFIG_MFD_T7L66XB)	+= t7l66xb.o tmio_core.o
 obj-$(CONFIG_MFD_TC6387XB)	+= tc6387xb.o tmio_core.o
 obj-$(CONFIG_MFD_TC6393XB)	+= tc6393xb.o tmio_core.o
 
+obj-$(CONFIG_MFD_LOCHNAGAR)	+= lochnagar-i2c.o
+
 obj-$(CONFIG_MFD_ARIZONA)	+= arizona-core.o
 obj-$(CONFIG_MFD_ARIZONA)	+= arizona-irq.o
 obj-$(CONFIG_MFD_ARIZONA_I2C)	+= arizona-i2c.o
diff --git a/drivers/mfd/lochnagar-i2c.c b/drivers/mfd/lochnagar-i2c.c
new file mode 100644
index 000000000000..3a65d9938902
--- /dev/null
+++ b/drivers/mfd/lochnagar-i2c.c
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Lochnagar I2C bus interface
+ *
+ * Copyright (c) 2012-2018 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * Author: Charles Keepax <ckeepax@opensource.cirrus.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/gpio/consumer.h>
+#include <linux/i2c.h>
+#include <linux/lockdep.h>
+#include <linux/mfd/core.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/regmap.h>
+
+#include <linux/mfd/lochnagar.h>
+#include <linux/mfd/lochnagar1_regs.h>
+#include <linux/mfd/lochnagar2_regs.h>
+
+#define LOCHNAGAR_BOOT_RETRIES		10
+#define LOCHNAGAR_BOOT_DELAY_MS		350
+
+#define LOCHNAGAR_CONFIG_POLL_US	10000
+
+static bool lochnagar1_readable_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case LOCHNAGAR_SOFTWARE_RESET:
+	case LOCHNAGAR_FIRMWARE_ID1...LOCHNAGAR_FIRMWARE_ID2:
+	case LOCHNAGAR1_CDC_AIF1_SEL...LOCHNAGAR1_CDC_AIF3_SEL:
+	case LOCHNAGAR1_CDC_MCLK1_SEL...LOCHNAGAR1_CDC_MCLK2_SEL:
+	case LOCHNAGAR1_CDC_AIF_CTRL1...LOCHNAGAR1_CDC_AIF_CTRL2:
+	case LOCHNAGAR1_EXT_AIF_CTRL:
+	case LOCHNAGAR1_DSP_AIF1_SEL...LOCHNAGAR1_DSP_AIF2_SEL:
+	case LOCHNAGAR1_DSP_CLKIN_SEL:
+	case LOCHNAGAR1_DSP_AIF:
+	case LOCHNAGAR1_GF_AIF1...LOCHNAGAR1_GF_AIF2:
+	case LOCHNAGAR1_PSIA_AIF:
+	case LOCHNAGAR1_PSIA1_SEL...LOCHNAGAR1_PSIA2_SEL:
+	case LOCHNAGAR1_SPDIF_AIF_SEL:
+	case LOCHNAGAR1_GF_AIF3_SEL...LOCHNAGAR1_GF_AIF4_SEL:
+	case LOCHNAGAR1_GF_CLKOUT1_SEL:
+	case LOCHNAGAR1_GF_AIF1_SEL...LOCHNAGAR1_GF_AIF2_SEL:
+	case LOCHNAGAR1_GF_GPIO2...LOCHNAGAR1_GF_GPIO7:
+	case LOCHNAGAR1_RST:
+	case LOCHNAGAR1_LED1...LOCHNAGAR1_LED2:
+	case LOCHNAGAR1_I2C_CTRL:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static const struct regmap_config lochnagar1_i2c_regmap = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = 0x50,
+	.readable_reg = lochnagar1_readable_register,
+
+	.use_single_read = true,
+	.use_single_write = true,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+
+static const struct reg_sequence lochnagar1_patch[] = {
+	{ 0x40, 0x0083 },
+	{ 0x47, 0x0018 },
+	{ 0x50, 0x0000 },
+};
+
+static bool lochnagar2_readable_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case LOCHNAGAR_SOFTWARE_RESET:
+	case LOCHNAGAR_FIRMWARE_ID1...LOCHNAGAR_FIRMWARE_ID2:
+	case LOCHNAGAR2_CDC_AIF1_CTRL...LOCHNAGAR2_CDC_AIF3_CTRL:
+	case LOCHNAGAR2_DSP_AIF1_CTRL...LOCHNAGAR2_DSP_AIF2_CTRL:
+	case LOCHNAGAR2_PSIA1_CTRL...LOCHNAGAR2_PSIA2_CTRL:
+	case LOCHNAGAR2_GF_AIF3_CTRL...LOCHNAGAR2_GF_AIF4_CTRL:
+	case LOCHNAGAR2_GF_AIF1_CTRL...LOCHNAGAR2_GF_AIF2_CTRL:
+	case LOCHNAGAR2_SPDIF_AIF_CTRL:
+	case LOCHNAGAR2_USB_AIF1_CTRL...LOCHNAGAR2_USB_AIF2_CTRL:
+	case LOCHNAGAR2_ADAT_AIF_CTRL:
+	case LOCHNAGAR2_CDC_MCLK1_CTRL...LOCHNAGAR2_CDC_MCLK2_CTRL:
+	case LOCHNAGAR2_DSP_CLKIN_CTRL:
+	case LOCHNAGAR2_PSIA1_MCLK_CTRL...LOCHNAGAR2_PSIA2_MCLK_CTRL:
+	case LOCHNAGAR2_SPDIF_MCLK_CTRL:
+	case LOCHNAGAR2_GF_CLKOUT1_CTRL...LOCHNAGAR2_GF_CLKOUT2_CTRL:
+	case LOCHNAGAR2_ADAT_MCLK_CTRL:
+	case LOCHNAGAR2_SOUNDCARD_MCLK_CTRL:
+	case LOCHNAGAR2_GPIO_FPGA_GPIO1...LOCHNAGAR2_GPIO_FPGA_GPIO6:
+	case LOCHNAGAR2_GPIO_CDC_GPIO1...LOCHNAGAR2_GPIO_CDC_GPIO8:
+	case LOCHNAGAR2_GPIO_DSP_GPIO1...LOCHNAGAR2_GPIO_DSP_GPIO6:
+	case LOCHNAGAR2_GPIO_GF_GPIO2...LOCHNAGAR2_GPIO_GF_GPIO7:
+	case LOCHNAGAR2_GPIO_CDC_AIF1_BCLK...LOCHNAGAR2_GPIO_CDC_AIF3_TXDAT:
+	case LOCHNAGAR2_GPIO_DSP_AIF1_BCLK...LOCHNAGAR2_GPIO_DSP_AIF2_TXDAT:
+	case LOCHNAGAR2_GPIO_PSIA1_BCLK...LOCHNAGAR2_GPIO_PSIA2_TXDAT:
+	case LOCHNAGAR2_GPIO_GF_AIF3_BCLK...LOCHNAGAR2_GPIO_GF_AIF4_TXDAT:
+	case LOCHNAGAR2_GPIO_GF_AIF1_BCLK...LOCHNAGAR2_GPIO_GF_AIF2_TXDAT:
+	case LOCHNAGAR2_GPIO_DSP_UART1_RX...LOCHNAGAR2_GPIO_DSP_UART2_TX:
+	case LOCHNAGAR2_GPIO_GF_UART2_RX...LOCHNAGAR2_GPIO_GF_UART2_TX:
+	case LOCHNAGAR2_GPIO_USB_UART_RX:
+	case LOCHNAGAR2_GPIO_CDC_PDMCLK1...LOCHNAGAR2_GPIO_CDC_PDMDAT2:
+	case LOCHNAGAR2_GPIO_CDC_DMICCLK1...LOCHNAGAR2_GPIO_CDC_DMICDAT4:
+	case LOCHNAGAR2_GPIO_DSP_DMICCLK1...LOCHNAGAR2_GPIO_DSP_DMICDAT2:
+	case LOCHNAGAR2_GPIO_I2C2_SCL...LOCHNAGAR2_GPIO_I2C4_SDA:
+	case LOCHNAGAR2_GPIO_DSP_STANDBY:
+	case LOCHNAGAR2_GPIO_CDC_MCLK1...LOCHNAGAR2_GPIO_CDC_MCLK2:
+	case LOCHNAGAR2_GPIO_DSP_CLKIN:
+	case LOCHNAGAR2_GPIO_PSIA1_MCLK...LOCHNAGAR2_GPIO_PSIA2_MCLK:
+	case LOCHNAGAR2_GPIO_GF_GPIO1...LOCHNAGAR2_GPIO_GF_GPIO5:
+	case LOCHNAGAR2_GPIO_DSP_GPIO20:
+	case LOCHNAGAR2_GPIO_CHANNEL1...LOCHNAGAR2_GPIO_CHANNEL16:
+	case LOCHNAGAR2_MINICARD_RESETS:
+	case LOCHNAGAR2_ANALOGUE_PATH_CTRL1...LOCHNAGAR2_ANALOGUE_PATH_CTRL2:
+	case LOCHNAGAR2_COMMS_CTRL4:
+	case LOCHNAGAR2_SPDIF_CTRL:
+	case LOCHNAGAR2_IMON_CTRL1...LOCHNAGAR2_IMON_CTRL4:
+	case LOCHNAGAR2_IMON_DATA1...LOCHNAGAR2_IMON_DATA2:
+	case LOCHNAGAR2_POWER_CTRL:
+	case LOCHNAGAR2_MICVDD_CTRL1:
+	case LOCHNAGAR2_MICVDD_CTRL2:
+	case LOCHNAGAR2_VDDCORE_CDC_CTRL1:
+	case LOCHNAGAR2_VDDCORE_CDC_CTRL2:
+	case LOCHNAGAR2_SOUNDCARD_AIF_CTRL:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool lochnagar2_volatile_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case LOCHNAGAR2_GPIO_CHANNEL1...LOCHNAGAR2_GPIO_CHANNEL16:
+	case LOCHNAGAR2_ANALOGUE_PATH_CTRL1:
+	case LOCHNAGAR2_IMON_CTRL3...LOCHNAGAR2_IMON_CTRL4:
+	case LOCHNAGAR2_IMON_DATA1...LOCHNAGAR2_IMON_DATA2:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static const struct regmap_config lochnagar2_i2c_regmap = {
+	.reg_bits = 16,
+	.val_bits = 16,
+	.reg_format_endian = REGMAP_ENDIAN_BIG,
+	.val_format_endian = REGMAP_ENDIAN_BIG,
+
+	.max_register = 0x1F1F,
+	.readable_reg = lochnagar2_readable_register,
+	.volatile_reg = lochnagar2_volatile_register,
+
+	.cache_type = REGCACHE_RBTREE,
+};
+
+static const struct reg_sequence lochnagar2_patch[] = {
+	{ 0x00EE, 0x0000 },
+};
+
+struct lochnagar_config {
+	int id;
+	const char * const name;
+	enum lochnagar_type type;
+	const struct regmap_config *regmap;
+	const struct reg_sequence *patch;
+	int npatch;
+};
+
+static struct lochnagar_config lochnagar_configs[] = {
+	{
+		.id = 0x50,
+		.name = "lochnagar1",
+		.type = LOCHNAGAR1,
+		.regmap = &lochnagar1_i2c_regmap,
+		.patch = lochnagar1_patch,
+		.npatch = ARRAY_SIZE(lochnagar1_patch),
+	},
+	{
+		.id = 0xCB58,
+		.name = "lochnagar2",
+		.type = LOCHNAGAR2,
+		.regmap = &lochnagar2_i2c_regmap,
+		.patch = lochnagar2_patch,
+		.npatch = ARRAY_SIZE(lochnagar2_patch),
+	},
+};
+
+static const struct of_device_id lochnagar_of_match[] = {
+	{ .compatible = "cirrus,lochnagar1", .data = &lochnagar_configs[0] },
+	{ .compatible = "cirrus,lochnagar2", .data = &lochnagar_configs[1] },
+	{},
+};
+
+static int lochnagar_wait_for_boot(struct regmap *regmap, unsigned int *id)
+{
+	int i, ret;
+
+	for (i = 0; i < LOCHNAGAR_BOOT_RETRIES; ++i) {
+		msleep(LOCHNAGAR_BOOT_DELAY_MS);
+
+		/* The reset register will return the device ID when read */
+		ret = regmap_read(regmap, LOCHNAGAR_SOFTWARE_RESET, id);
+		if (!ret)
+			return ret;
+	}
+
+	return -ETIMEDOUT;
+}
+
+/**
+ * lochnagar_update_config - Synchronise the boards analogue configuration to
+ *                           the hardware.
+ *
+ * @lochnagar: A pointer to the primary core data structure.
+ *
+ * Return: Zero on success or an appropriate negative error code on failure.
+ */
+int lochnagar_update_config(struct lochnagar *lochnagar)
+{
+	struct regmap *regmap = lochnagar->regmap;
+	unsigned int done = LOCHNAGAR2_ANALOGUE_PATH_UPDATE_STS_MASK;
+	int timeout_ms = LOCHNAGAR_BOOT_DELAY_MS * LOCHNAGAR_BOOT_RETRIES;
+	unsigned int val = 0;
+	int ret;
+
+	lockdep_assert_held(&lochnagar->analogue_config_lock);
+
+	if (lochnagar->type != LOCHNAGAR2)
+		return 0;
+
+	/*
+	 * Toggle the ANALOGUE_PATH_UPDATE bit and wait for the device to
+	 * acknowledge that any outstanding changes to the analogue
+	 * configuration have been applied.
+	 */
+	ret = regmap_write(regmap, LOCHNAGAR2_ANALOGUE_PATH_CTRL1, 0);
+	if (ret < 0)
+		return ret;
+
+	ret = regmap_write(regmap, LOCHNAGAR2_ANALOGUE_PATH_CTRL1,
+			   LOCHNAGAR2_ANALOGUE_PATH_UPDATE_MASK);
+	if (ret < 0)
+		return ret;
+
+	ret = regmap_read_poll_timeout(regmap,
+				       LOCHNAGAR2_ANALOGUE_PATH_CTRL1, val,
+				       (val & done), LOCHNAGAR_CONFIG_POLL_US,
+				       timeout_ms * 1000);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(lochnagar_update_config);
+
+static int lochnagar_i2c_probe(struct i2c_client *i2c)
+{
+	struct device *dev = &i2c->dev;
+	const struct lochnagar_config *config = NULL;
+	const struct of_device_id *of_id;
+	struct lochnagar *lochnagar;
+	struct gpio_desc *reset, *present;
+	unsigned int val;
+	unsigned int firmwareid;
+	unsigned int devid, rev;
+	int ret;
+
+	lochnagar = devm_kzalloc(dev, sizeof(*lochnagar), GFP_KERNEL);
+	if (!lochnagar)
+		return -ENOMEM;
+
+	of_id = of_match_device(lochnagar_of_match, dev);
+	if (!of_id)
+		return -EINVAL;
+
+	config = of_id->data;
+
+	lochnagar->dev = dev;
+	mutex_init(&lochnagar->analogue_config_lock);
+
+	dev_set_drvdata(dev, lochnagar);
+
+	reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW);
+	if (IS_ERR(reset)) {
+		ret = PTR_ERR(reset);
+		dev_err(dev, "Failed to get reset GPIO: %d\n", ret);
+		return ret;
+	}
+
+	present = devm_gpiod_get_optional(dev, "present", GPIOD_OUT_HIGH);
+	if (IS_ERR(present)) {
+		ret = PTR_ERR(present);
+		dev_err(dev, "Failed to get present GPIO: %d\n", ret);
+		return ret;
+	}
+
+	/* Leave the Lochnagar in reset for a reasonable amount of time */
+	msleep(20);
+
+	/* Bring Lochnagar out of reset */
+	gpiod_set_value_cansleep(reset, 1);
+
+	/* Identify Lochnagar */
+	lochnagar->type = config->type;
+
+	lochnagar->regmap = devm_regmap_init_i2c(i2c, config->regmap);
+	if (IS_ERR(lochnagar->regmap)) {
+		ret = PTR_ERR(lochnagar->regmap);
+		dev_err(dev, "Failed to allocate register map: %d\n", ret);
+		return ret;
+	}
+
+	/* Wait for Lochnagar to boot */
+	ret = lochnagar_wait_for_boot(lochnagar->regmap, &val);
+	if (ret < 0) {
+		dev_err(dev, "Failed to read device ID: %d\n", ret);
+		return ret;
+	}
+
+	devid = val & LOCHNAGAR_DEVICE_ID_MASK;
+	rev = val & LOCHNAGAR_REV_ID_MASK;
+
+	if (devid != config->id) {
+		dev_err(dev,
+			"ID does not match %s (expected 0x%x got 0x%x)\n",
+			config->name, config->id, devid);
+		return -ENODEV;
+	}
+
+	/* Identify firmware */
+	ret = regmap_read(lochnagar->regmap, LOCHNAGAR_FIRMWARE_ID1, &val);
+	if (ret < 0) {
+		dev_err(dev, "Failed to read firmware id 1: %d\n", ret);
+		return ret;
+	}
+
+	firmwareid = val;
+
+	ret = regmap_read(lochnagar->regmap, LOCHNAGAR_FIRMWARE_ID2, &val);
+	if (ret < 0) {
+		dev_err(dev, "Failed to read firmware id 2: %d\n", ret);
+		return ret;
+	}
+
+	firmwareid |= (val << config->regmap->val_bits);
+
+	dev_info(dev, "Found %s (0x%x) revision %u firmware 0x%.6x\n",
+		 config->name, devid, rev + 1, firmwareid);
+
+	ret = regmap_register_patch(lochnagar->regmap, config->patch,
+				    config->npatch);
+	if (ret < 0) {
+		dev_err(dev, "Failed to register patch: %d\n", ret);
+		return ret;
+	}
+
+	ret = devm_of_platform_populate(dev);
+	if (ret < 0) {
+		dev_err(dev, "Failed to populate child nodes: %d\n", ret);
+		return ret;
+	}
+
+	return ret;
+}
+
+static struct i2c_driver lochnagar_i2c_driver = {
+	.driver = {
+		.name = "lochnagar",
+		.of_match_table = of_match_ptr(lochnagar_of_match),
+		.suppress_bind_attrs = true,
+	},
+	.probe_new = lochnagar_i2c_probe,
+};
+
+static int __init lochnagar_i2c_init(void)
+{
+	int ret;
+
+	ret = i2c_add_driver(&lochnagar_i2c_driver);
+	if (ret)
+		pr_err("Failed to register Lochnagar driver: %d\n", ret);
+
+	return ret;
+}
+subsys_initcall(lochnagar_i2c_init);
diff --git a/include/linux/mfd/lochnagar.h b/include/linux/mfd/lochnagar.h
new file mode 100644
index 000000000000..ff9e64cfc9fb
--- /dev/null
+++ b/include/linux/mfd/lochnagar.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Lochnagar internals
+ *
+ * Copyright (c) 2013-2018 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * Author: Charles Keepax <ckeepax@opensource.cirrus.com>
+ */
+
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/regmap.h>
+
+#ifndef CIRRUS_LOCHNAGAR_H
+#define CIRRUS_LOCHNAGAR_H
+
+enum lochnagar_type {
+	LOCHNAGAR1,
+	LOCHNAGAR2,
+};
+
+/**
+ * struct lochnagar - Core data for the Lochnagar audio board driver.
+ *
+ * @type: The type of Lochnagar device connected.
+ * @dev: A pointer to the struct device for the main MFD.
+ * @regmap: The devices main register map.
+ * @analogue_config_lock: Lock used to protect updates in the analogue
+ * configuration as these must not be changed whilst the hardware is processing
+ * the last update.
+ */
+struct lochnagar {
+	enum lochnagar_type type;
+	struct device *dev;
+	struct regmap *regmap;
+
+	/* Lock to protect updates to the analogue configuration */
+	struct mutex analogue_config_lock;
+};
+
+/* Register Addresses */
+#define LOCHNAGAR_SOFTWARE_RESET                             0x00
+#define LOCHNAGAR_FIRMWARE_ID1                               0x01
+#define LOCHNAGAR_FIRMWARE_ID2                               0x02
+
+/* (0x0000)  Software Reset */
+#define LOCHNAGAR_DEVICE_ID_MASK                           0xFFFC
+#define LOCHNAGAR_DEVICE_ID_SHIFT                               2
+#define LOCHNAGAR_REV_ID_MASK                              0x0003
+#define LOCHNAGAR_REV_ID_SHIFT                                  0
+
+int lochnagar_update_config(struct lochnagar *lochnagar);
+
+#endif
diff --git a/include/linux/mfd/lochnagar1_regs.h b/include/linux/mfd/lochnagar1_regs.h
new file mode 100644
index 000000000000..114b846245d9
--- /dev/null
+++ b/include/linux/mfd/lochnagar1_regs.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Lochnagar1 register definitions
+ *
+ * Copyright (c) 2017-2018 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * Author: Charles Keepax <ckeepax@opensource.cirrus.com>
+ */
+
+#ifndef LOCHNAGAR1_REGISTERS_H
+#define LOCHNAGAR1_REGISTERS_H
+
+/* Register Addresses */
+#define LOCHNAGAR1_CDC_AIF1_SEL                       0x0008
+#define LOCHNAGAR1_CDC_AIF2_SEL                       0x0009
+#define LOCHNAGAR1_CDC_AIF3_SEL                       0x000A
+#define LOCHNAGAR1_CDC_MCLK1_SEL                      0x000B
+#define LOCHNAGAR1_CDC_MCLK2_SEL                      0x000C
+#define LOCHNAGAR1_CDC_AIF_CTRL1                      0x000D
+#define LOCHNAGAR1_CDC_AIF_CTRL2                      0x000E
+#define LOCHNAGAR1_EXT_AIF_CTRL                       0x000F
+#define LOCHNAGAR1_DSP_AIF1_SEL                       0x0010
+#define LOCHNAGAR1_DSP_AIF2_SEL                       0x0011
+#define LOCHNAGAR1_DSP_CLKIN_SEL                      0x0012
+#define LOCHNAGAR1_DSP_AIF                            0x0013
+#define LOCHNAGAR1_GF_AIF1                            0x0014
+#define LOCHNAGAR1_GF_AIF2                            0x0015
+#define LOCHNAGAR1_PSIA_AIF                           0x0016
+#define LOCHNAGAR1_PSIA1_SEL                          0x0017
+#define LOCHNAGAR1_PSIA2_SEL                          0x0018
+#define LOCHNAGAR1_SPDIF_AIF_SEL                      0x0019
+#define LOCHNAGAR1_GF_AIF3_SEL                        0x001C
+#define LOCHNAGAR1_GF_AIF4_SEL                        0x001D
+#define LOCHNAGAR1_GF_CLKOUT1_SEL                     0x001E
+#define LOCHNAGAR1_GF_AIF1_SEL                        0x001F
+#define LOCHNAGAR1_GF_AIF2_SEL                        0x0020
+#define LOCHNAGAR1_GF_GPIO2                           0x0026
+#define LOCHNAGAR1_GF_GPIO3                           0x0027
+#define LOCHNAGAR1_GF_GPIO7                           0x0028
+#define LOCHNAGAR1_RST                                0x0029
+#define LOCHNAGAR1_LED1                               0x002A
+#define LOCHNAGAR1_LED2                               0x002B
+#define LOCHNAGAR1_I2C_CTRL                           0x0046
+
+/*
+ * (0x0008 - 0x000C, 0x0010 - 0x0012, 0x0017 - 0x0020)
+ * CDC_AIF1_SEL - GF_AIF2_SEL
+ */
+#define LOCHNAGAR1_SRC_MASK                             0xFF
+#define LOCHNAGAR1_SRC_SHIFT                               0
+
+/* (0x000D)  CDC_AIF_CTRL1 */
+#define LOCHNAGAR1_CDC_AIF2_LRCLK_DIR_MASK              0x40
+#define LOCHNAGAR1_CDC_AIF2_LRCLK_DIR_SHIFT                6
+#define LOCHNAGAR1_CDC_AIF2_BCLK_DIR_MASK               0x20
+#define LOCHNAGAR1_CDC_AIF2_BCLK_DIR_SHIFT                 5
+#define LOCHNAGAR1_CDC_AIF2_ENA_MASK                    0x10
+#define LOCHNAGAR1_CDC_AIF2_ENA_SHIFT                      4
+#define LOCHNAGAR1_CDC_AIF1_LRCLK_DIR_MASK              0x04
+#define LOCHNAGAR1_CDC_AIF1_LRCLK_DIR_SHIFT                2
+#define LOCHNAGAR1_CDC_AIF1_BCLK_DIR_MASK               0x02
+#define LOCHNAGAR1_CDC_AIF1_BCLK_DIR_SHIFT                 1
+#define LOCHNAGAR1_CDC_AIF1_ENA_MASK                    0x01
+#define LOCHNAGAR1_CDC_AIF1_ENA_SHIFT                      0
+
+/* (0x000E)  CDC_AIF_CTRL2 */
+#define LOCHNAGAR1_CDC_AIF3_LRCLK_DIR_MASK              0x40
+#define LOCHNAGAR1_CDC_AIF3_LRCLK_DIR_SHIFT                6
+#define LOCHNAGAR1_CDC_AIF3_BCLK_DIR_MASK               0x20
+#define LOCHNAGAR1_CDC_AIF3_BCLK_DIR_SHIFT                 5
+#define LOCHNAGAR1_CDC_AIF3_ENA_MASK                    0x10
+#define LOCHNAGAR1_CDC_AIF3_ENA_SHIFT                      4
+#define LOCHNAGAR1_CDC_MCLK1_ENA_MASK                   0x02
+#define LOCHNAGAR1_CDC_MCLK1_ENA_SHIFT                     1
+#define LOCHNAGAR1_CDC_MCLK2_ENA_MASK                   0x01
+#define LOCHNAGAR1_CDC_MCLK2_ENA_SHIFT                     0
+
+/* (0x000F)  EXT_AIF_CTRL */
+#define LOCHNAGAR1_SPDIF_AIF_LRCLK_DIR_MASK             0x20
+#define LOCHNAGAR1_SPDIF_AIF_LRCLK_DIR_SHIFT               5
+#define LOCHNAGAR1_SPDIF_AIF_BCLK_DIR_MASK              0x10
+#define LOCHNAGAR1_SPDIF_AIF_BCLK_DIR_SHIFT                4
+#define LOCHNAGAR1_SPDIF_AIF_ENA_MASK                   0x08
+#define LOCHNAGAR1_SPDIF_AIF_ENA_SHIFT                     3
+
+/* (0x0013)  DSP_AIF */
+#define LOCHNAGAR1_DSP_AIF2_LRCLK_DIR_MASK              0x40
+#define LOCHNAGAR1_DSP_AIF2_LRCLK_DIR_SHIFT                6
+#define LOCHNAGAR1_DSP_AIF2_BCLK_DIR_MASK               0x20
+#define LOCHNAGAR1_DSP_AIF2_BCLK_DIR_SHIFT                 5
+#define LOCHNAGAR1_DSP_AIF2_ENA_MASK                    0x10
+#define LOCHNAGAR1_DSP_AIF2_ENA_SHIFT                      4
+#define LOCHNAGAR1_DSP_CLKIN_ENA_MASK                   0x08
+#define LOCHNAGAR1_DSP_CLKIN_ENA_SHIFT                     3
+#define LOCHNAGAR1_DSP_AIF1_LRCLK_DIR_MASK              0x04
+#define LOCHNAGAR1_DSP_AIF1_LRCLK_DIR_SHIFT                2
+#define LOCHNAGAR1_DSP_AIF1_BCLK_DIR_MASK               0x02
+#define LOCHNAGAR1_DSP_AIF1_BCLK_DIR_SHIFT                 1
+#define LOCHNAGAR1_DSP_AIF1_ENA_MASK                    0x01
+#define LOCHNAGAR1_DSP_AIF1_ENA_SHIFT                      0
+
+/* (0x0014)  GF_AIF1 */
+#define LOCHNAGAR1_GF_CLKOUT1_ENA_MASK                  0x40
+#define LOCHNAGAR1_GF_CLKOUT1_ENA_SHIFT                    6
+#define LOCHNAGAR1_GF_AIF3_LRCLK_DIR_MASK               0x20
+#define LOCHNAGAR1_GF_AIF3_LRCLK_DIR_SHIFT                 5
+#define LOCHNAGAR1_GF_AIF3_BCLK_DIR_MASK                0x10
+#define LOCHNAGAR1_GF_AIF3_BCLK_DIR_SHIFT                  4
+#define LOCHNAGAR1_GF_AIF3_ENA_MASK                     0x08
+#define LOCHNAGAR1_GF_AIF3_ENA_SHIFT                       3
+#define LOCHNAGAR1_GF_AIF1_LRCLK_DIR_MASK               0x04
+#define LOCHNAGAR1_GF_AIF1_LRCLK_DIR_SHIFT                 2
+#define LOCHNAGAR1_GF_AIF1_BCLK_DIR_MASK                0x02
+#define LOCHNAGAR1_GF_AIF1_BCLK_DIR_SHIFT                  1
+#define LOCHNAGAR1_GF_AIF1_ENA_MASK                     0x01
+#define LOCHNAGAR1_GF_AIF1_ENA_SHIFT                       0
+
+/* (0x0015)  GF_AIF2 */
+#define LOCHNAGAR1_GF_AIF4_LRCLK_DIR_MASK               0x20
+#define LOCHNAGAR1_GF_AIF4_LRCLK_DIR_SHIFT                 5
+#define LOCHNAGAR1_GF_AIF4_BCLK_DIR_MASK                0x10
+#define LOCHNAGAR1_GF_AIF4_BCLK_DIR_SHIFT                  4
+#define LOCHNAGAR1_GF_AIF4_ENA_MASK                     0x08
+#define LOCHNAGAR1_GF_AIF4_ENA_SHIFT                       3
+#define LOCHNAGAR1_GF_AIF2_LRCLK_DIR_MASK               0x04
+#define LOCHNAGAR1_GF_AIF2_LRCLK_DIR_SHIFT                 2
+#define LOCHNAGAR1_GF_AIF2_BCLK_DIR_MASK                0x02
+#define LOCHNAGAR1_GF_AIF2_BCLK_DIR_SHIFT                  1
+#define LOCHNAGAR1_GF_AIF2_ENA_MASK                     0x01
+#define LOCHNAGAR1_GF_AIF2_ENA_SHIFT                       0
+
+/* (0x0016)  PSIA_AIF */
+#define LOCHNAGAR1_PSIA2_LRCLK_DIR_MASK                 0x40
+#define LOCHNAGAR1_PSIA2_LRCLK_DIR_SHIFT                   6
+#define LOCHNAGAR1_PSIA2_BCLK_DIR_MASK                  0x20
+#define LOCHNAGAR1_PSIA2_BCLK_DIR_SHIFT                    5
+#define LOCHNAGAR1_PSIA2_ENA_MASK                       0x10
+#define LOCHNAGAR1_PSIA2_ENA_SHIFT                         4
+#define LOCHNAGAR1_PSIA1_LRCLK_DIR_MASK                 0x04
+#define LOCHNAGAR1_PSIA1_LRCLK_DIR_SHIFT                   2
+#define LOCHNAGAR1_PSIA1_BCLK_DIR_MASK                  0x02
+#define LOCHNAGAR1_PSIA1_BCLK_DIR_SHIFT                    1
+#define LOCHNAGAR1_PSIA1_ENA_MASK                       0x01
+#define LOCHNAGAR1_PSIA1_ENA_SHIFT                         0
+
+/* (0x0029)  RST */
+#define LOCHNAGAR1_DSP_RESET_MASK                       0x02
+#define LOCHNAGAR1_DSP_RESET_SHIFT                         1
+#define LOCHNAGAR1_CDC_RESET_MASK                       0x01
+#define LOCHNAGAR1_CDC_RESET_SHIFT                         0
+
+/* (0x0046)  I2C_CTRL */
+#define LOCHNAGAR1_CDC_CIF_MODE_MASK                    0x01
+#define LOCHNAGAR1_CDC_CIF_MODE_SHIFT                      0
+
+#endif
diff --git a/include/linux/mfd/lochnagar2_regs.h b/include/linux/mfd/lochnagar2_regs.h
new file mode 100644
index 000000000000..419b25a332fd
--- /dev/null
+++ b/include/linux/mfd/lochnagar2_regs.h
@@ -0,0 +1,291 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Lochnagar2 register definitions
+ *
+ * Copyright (c) 2017-2018 Cirrus Logic, Inc. and
+ *                         Cirrus Logic International Semiconductor Ltd.
+ *
+ * Author: Charles Keepax <ckeepax@opensource.cirrus.com>
+ */
+
+#ifndef LOCHNAGAR2_REGISTERS_H
+#define LOCHNAGAR2_REGISTERS_H
+
+/* Register Addresses */
+#define LOCHNAGAR2_CDC_AIF1_CTRL                      0x000D
+#define LOCHNAGAR2_CDC_AIF2_CTRL                      0x000E
+#define LOCHNAGAR2_CDC_AIF3_CTRL                      0x000F
+#define LOCHNAGAR2_DSP_AIF1_CTRL                      0x0010
+#define LOCHNAGAR2_DSP_AIF2_CTRL                      0x0011
+#define LOCHNAGAR2_PSIA1_CTRL                         0x0012
+#define LOCHNAGAR2_PSIA2_CTRL                         0x0013
+#define LOCHNAGAR2_GF_AIF3_CTRL                       0x0014
+#define LOCHNAGAR2_GF_AIF4_CTRL                       0x0015
+#define LOCHNAGAR2_GF_AIF1_CTRL                       0x0016
+#define LOCHNAGAR2_GF_AIF2_CTRL                       0x0017
+#define LOCHNAGAR2_SPDIF_AIF_CTRL                     0x0018
+#define LOCHNAGAR2_USB_AIF1_CTRL                      0x0019
+#define LOCHNAGAR2_USB_AIF2_CTRL                      0x001A
+#define LOCHNAGAR2_ADAT_AIF_CTRL                      0x001B
+#define LOCHNAGAR2_CDC_MCLK1_CTRL                     0x001E
+#define LOCHNAGAR2_CDC_MCLK2_CTRL                     0x001F
+#define LOCHNAGAR2_DSP_CLKIN_CTRL                     0x0020
+#define LOCHNAGAR2_PSIA1_MCLK_CTRL                    0x0021
+#define LOCHNAGAR2_PSIA2_MCLK_CTRL                    0x0022
+#define LOCHNAGAR2_SPDIF_MCLK_CTRL                    0x0023
+#define LOCHNAGAR2_GF_CLKOUT1_CTRL                    0x0024
+#define LOCHNAGAR2_GF_CLKOUT2_CTRL                    0x0025
+#define LOCHNAGAR2_ADAT_MCLK_CTRL                     0x0026
+#define LOCHNAGAR2_SOUNDCARD_MCLK_CTRL                0x0027
+#define LOCHNAGAR2_GPIO_FPGA_GPIO1                    0x0031
+#define LOCHNAGAR2_GPIO_FPGA_GPIO2                    0x0032
+#define LOCHNAGAR2_GPIO_FPGA_GPIO3                    0x0033
+#define LOCHNAGAR2_GPIO_FPGA_GPIO4                    0x0034
+#define LOCHNAGAR2_GPIO_FPGA_GPIO5                    0x0035
+#define LOCHNAGAR2_GPIO_FPGA_GPIO6                    0x0036
+#define LOCHNAGAR2_GPIO_CDC_GPIO1                     0x0037
+#define LOCHNAGAR2_GPIO_CDC_GPIO2                     0x0038
+#define LOCHNAGAR2_GPIO_CDC_GPIO3                     0x0039
+#define LOCHNAGAR2_GPIO_CDC_GPIO4                     0x003A
+#define LOCHNAGAR2_GPIO_CDC_GPIO5                     0x003B
+#define LOCHNAGAR2_GPIO_CDC_GPIO6                     0x003C
+#define LOCHNAGAR2_GPIO_CDC_GPIO7                     0x003D
+#define LOCHNAGAR2_GPIO_CDC_GPIO8                     0x003E
+#define LOCHNAGAR2_GPIO_DSP_GPIO1                     0x003F
+#define LOCHNAGAR2_GPIO_DSP_GPIO2                     0x0040
+#define LOCHNAGAR2_GPIO_DSP_GPIO3                     0x0041
+#define LOCHNAGAR2_GPIO_DSP_GPIO4                     0x0042
+#define LOCHNAGAR2_GPIO_DSP_GPIO5                     0x0043
+#define LOCHNAGAR2_GPIO_DSP_GPIO6                     0x0044
+#define LOCHNAGAR2_GPIO_GF_GPIO2                      0x0045
+#define LOCHNAGAR2_GPIO_GF_GPIO3                      0x0046
+#define LOCHNAGAR2_GPIO_GF_GPIO7                      0x0047
+#define LOCHNAGAR2_GPIO_CDC_AIF1_BCLK                 0x0048
+#define LOCHNAGAR2_GPIO_CDC_AIF1_RXDAT                0x0049
+#define LOCHNAGAR2_GPIO_CDC_AIF1_LRCLK                0x004A
+#define LOCHNAGAR2_GPIO_CDC_AIF1_TXDAT                0x004B
+#define LOCHNAGAR2_GPIO_CDC_AIF2_BCLK                 0x004C
+#define LOCHNAGAR2_GPIO_CDC_AIF2_RXDAT                0x004D
+#define LOCHNAGAR2_GPIO_CDC_AIF2_LRCLK                0x004E
+#define LOCHNAGAR2_GPIO_CDC_AIF2_TXDAT                0x004F
+#define LOCHNAGAR2_GPIO_CDC_AIF3_BCLK                 0x0050
+#define LOCHNAGAR2_GPIO_CDC_AIF3_RXDAT                0x0051
+#define LOCHNAGAR2_GPIO_CDC_AIF3_LRCLK                0x0052
+#define LOCHNAGAR2_GPIO_CDC_AIF3_TXDAT                0x0053
+#define LOCHNAGAR2_GPIO_DSP_AIF1_BCLK                 0x0054
+#define LOCHNAGAR2_GPIO_DSP_AIF1_RXDAT                0x0055
+#define LOCHNAGAR2_GPIO_DSP_AIF1_LRCLK                0x0056
+#define LOCHNAGAR2_GPIO_DSP_AIF1_TXDAT                0x0057
+#define LOCHNAGAR2_GPIO_DSP_AIF2_BCLK                 0x0058
+#define LOCHNAGAR2_GPIO_DSP_AIF2_RXDAT                0x0059
+#define LOCHNAGAR2_GPIO_DSP_AIF2_LRCLK                0x005A
+#define LOCHNAGAR2_GPIO_DSP_AIF2_TXDAT                0x005B
+#define LOCHNAGAR2_GPIO_PSIA1_BCLK                    0x005C
+#define LOCHNAGAR2_GPIO_PSIA1_RXDAT                   0x005D
+#define LOCHNAGAR2_GPIO_PSIA1_LRCLK                   0x005E
+#define LOCHNAGAR2_GPIO_PSIA1_TXDAT                   0x005F
+#define LOCHNAGAR2_GPIO_PSIA2_BCLK                    0x0060
+#define LOCHNAGAR2_GPIO_PSIA2_RXDAT                   0x0061
+#define LOCHNAGAR2_GPIO_PSIA2_LRCLK                   0x0062
+#define LOCHNAGAR2_GPIO_PSIA2_TXDAT                   0x0063
+#define LOCHNAGAR2_GPIO_GF_AIF3_BCLK                  0x0064
+#define LOCHNAGAR2_GPIO_GF_AIF3_RXDAT                 0x0065
+#define LOCHNAGAR2_GPIO_GF_AIF3_LRCLK                 0x0066
+#define LOCHNAGAR2_GPIO_GF_AIF3_TXDAT                 0x0067
+#define LOCHNAGAR2_GPIO_GF_AIF4_BCLK                  0x0068
+#define LOCHNAGAR2_GPIO_GF_AIF4_RXDAT                 0x0069
+#define LOCHNAGAR2_GPIO_GF_AIF4_LRCLK                 0x006A
+#define LOCHNAGAR2_GPIO_GF_AIF4_TXDAT                 0x006B
+#define LOCHNAGAR2_GPIO_GF_AIF1_BCLK                  0x006C
+#define LOCHNAGAR2_GPIO_GF_AIF1_RXDAT                 0x006D
+#define LOCHNAGAR2_GPIO_GF_AIF1_LRCLK                 0x006E
+#define LOCHNAGAR2_GPIO_GF_AIF1_TXDAT                 0x006F
+#define LOCHNAGAR2_GPIO_GF_AIF2_BCLK                  0x0070
+#define LOCHNAGAR2_GPIO_GF_AIF2_RXDAT                 0x0071
+#define LOCHNAGAR2_GPIO_GF_AIF2_LRCLK                 0x0072
+#define LOCHNAGAR2_GPIO_GF_AIF2_TXDAT                 0x0073
+#define LOCHNAGAR2_GPIO_DSP_UART1_RX                  0x0074
+#define LOCHNAGAR2_GPIO_DSP_UART1_TX                  0x0075
+#define LOCHNAGAR2_GPIO_DSP_UART2_RX                  0x0076
+#define LOCHNAGAR2_GPIO_DSP_UART2_TX                  0x0077
+#define LOCHNAGAR2_GPIO_GF_UART2_RX                   0x0078
+#define LOCHNAGAR2_GPIO_GF_UART2_TX                   0x0079
+#define LOCHNAGAR2_GPIO_USB_UART_RX                   0x007A
+#define LOCHNAGAR2_GPIO_CDC_PDMCLK1                   0x007C
+#define LOCHNAGAR2_GPIO_CDC_PDMDAT1                   0x007D
+#define LOCHNAGAR2_GPIO_CDC_PDMCLK2                   0x007E
+#define LOCHNAGAR2_GPIO_CDC_PDMDAT2                   0x007F
+#define LOCHNAGAR2_GPIO_CDC_DMICCLK1                  0x0080
+#define LOCHNAGAR2_GPIO_CDC_DMICDAT1                  0x0081
+#define LOCHNAGAR2_GPIO_CDC_DMICCLK2                  0x0082
+#define LOCHNAGAR2_GPIO_CDC_DMICDAT2                  0x0083
+#define LOCHNAGAR2_GPIO_CDC_DMICCLK3                  0x0084
+#define LOCHNAGAR2_GPIO_CDC_DMICDAT3                  0x0085
+#define LOCHNAGAR2_GPIO_CDC_DMICCLK4                  0x0086
+#define LOCHNAGAR2_GPIO_CDC_DMICDAT4                  0x0087
+#define LOCHNAGAR2_GPIO_DSP_DMICCLK1                  0x0088
+#define LOCHNAGAR2_GPIO_DSP_DMICDAT1                  0x0089
+#define LOCHNAGAR2_GPIO_DSP_DMICCLK2                  0x008A
+#define LOCHNAGAR2_GPIO_DSP_DMICDAT2                  0x008B
+#define LOCHNAGAR2_GPIO_I2C2_SCL                      0x008C
+#define LOCHNAGAR2_GPIO_I2C2_SDA                      0x008D
+#define LOCHNAGAR2_GPIO_I2C3_SCL                      0x008E
+#define LOCHNAGAR2_GPIO_I2C3_SDA                      0x008F
+#define LOCHNAGAR2_GPIO_I2C4_SCL                      0x0090
+#define LOCHNAGAR2_GPIO_I2C4_SDA                      0x0091
+#define LOCHNAGAR2_GPIO_DSP_STANDBY                   0x0092
+#define LOCHNAGAR2_GPIO_CDC_MCLK1                     0x0093
+#define LOCHNAGAR2_GPIO_CDC_MCLK2                     0x0094
+#define LOCHNAGAR2_GPIO_DSP_CLKIN                     0x0095
+#define LOCHNAGAR2_GPIO_PSIA1_MCLK                    0x0096
+#define LOCHNAGAR2_GPIO_PSIA2_MCLK                    0x0097
+#define LOCHNAGAR2_GPIO_GF_GPIO1                      0x0098
+#define LOCHNAGAR2_GPIO_GF_GPIO5                      0x0099
+#define LOCHNAGAR2_GPIO_DSP_GPIO20                    0x009A
+#define LOCHNAGAR2_GPIO_CHANNEL1                      0x00B9
+#define LOCHNAGAR2_GPIO_CHANNEL2                      0x00BA
+#define LOCHNAGAR2_GPIO_CHANNEL3                      0x00BB
+#define LOCHNAGAR2_GPIO_CHANNEL4                      0x00BC
+#define LOCHNAGAR2_GPIO_CHANNEL5                      0x00BD
+#define LOCHNAGAR2_GPIO_CHANNEL6                      0x00BE
+#define LOCHNAGAR2_GPIO_CHANNEL7                      0x00BF
+#define LOCHNAGAR2_GPIO_CHANNEL8                      0x00C0
+#define LOCHNAGAR2_GPIO_CHANNEL9                      0x00C1
+#define LOCHNAGAR2_GPIO_CHANNEL10                     0x00C2
+#define LOCHNAGAR2_GPIO_CHANNEL11                     0x00C3
+#define LOCHNAGAR2_GPIO_CHANNEL12                     0x00C4
+#define LOCHNAGAR2_GPIO_CHANNEL13                     0x00C5
+#define LOCHNAGAR2_GPIO_CHANNEL14                     0x00C6
+#define LOCHNAGAR2_GPIO_CHANNEL15                     0x00C7
+#define LOCHNAGAR2_GPIO_CHANNEL16                     0x00C8
+#define LOCHNAGAR2_MINICARD_RESETS                    0x00DF
+#define LOCHNAGAR2_ANALOGUE_PATH_CTRL1                0x00E3
+#define LOCHNAGAR2_ANALOGUE_PATH_CTRL2                0x00E4
+#define LOCHNAGAR2_COMMS_CTRL4                        0x00F0
+#define LOCHNAGAR2_SPDIF_CTRL                         0x00FE
+#define LOCHNAGAR2_IMON_CTRL1                         0x0108
+#define LOCHNAGAR2_IMON_CTRL2                         0x0109
+#define LOCHNAGAR2_IMON_CTRL3                         0x010A
+#define LOCHNAGAR2_IMON_CTRL4                         0x010B
+#define LOCHNAGAR2_IMON_DATA1                         0x010C
+#define LOCHNAGAR2_IMON_DATA2                         0x010D
+#define LOCHNAGAR2_POWER_CTRL                         0x0116
+#define LOCHNAGAR2_MICVDD_CTRL1                       0x0119
+#define LOCHNAGAR2_MICVDD_CTRL2                       0x011B
+#define LOCHNAGAR2_VDDCORE_CDC_CTRL1                  0x011E
+#define LOCHNAGAR2_VDDCORE_CDC_CTRL2                  0x0120
+#define LOCHNAGAR2_SOUNDCARD_AIF_CTRL                 0x0180
+
+/* (0x000D-0x001B, 0x0180)  CDC_AIF1_CTRL - SOUNCARD_AIF_CTRL */
+#define LOCHNAGAR2_AIF_ENA_MASK                       0x8000
+#define LOCHNAGAR2_AIF_ENA_SHIFT                          15
+#define LOCHNAGAR2_AIF_LRCLK_DIR_MASK                 0x4000
+#define LOCHNAGAR2_AIF_LRCLK_DIR_SHIFT                    14
+#define LOCHNAGAR2_AIF_BCLK_DIR_MASK                  0x2000
+#define LOCHNAGAR2_AIF_BCLK_DIR_SHIFT                     13
+#define LOCHNAGAR2_AIF_SRC_MASK                       0x00FF
+#define LOCHNAGAR2_AIF_SRC_SHIFT                           0
+
+/* (0x001E - 0x0027)  CDC_MCLK1_CTRL - SOUNDCARD_MCLK_CTRL */
+#define LOCHNAGAR2_CLK_ENA_MASK                       0x8000
+#define LOCHNAGAR2_CLK_ENA_SHIFT                          15
+#define LOCHNAGAR2_CLK_SRC_MASK                       0x00FF
+#define LOCHNAGAR2_CLK_SRC_SHIFT                           0
+
+/* (0x0031 - 0x009A)  GPIO_FPGA_GPIO1 - GPIO_DSP_GPIO20 */
+#define LOCHNAGAR2_GPIO_SRC_MASK                      0x00FF
+#define LOCHNAGAR2_GPIO_SRC_SHIFT                          0
+
+/* (0x00B9 - 0x00C8)  GPIO_CHANNEL1 - GPIO_CHANNEL16 */
+#define LOCHNAGAR2_GPIO_CHANNEL_STS_MASK              0x8000
+#define LOCHNAGAR2_GPIO_CHANNEL_STS_SHIFT                 15
+#define LOCHNAGAR2_GPIO_CHANNEL_SRC_MASK              0x00FF
+#define LOCHNAGAR2_GPIO_CHANNEL_SRC_SHIFT                  0
+
+/* (0x00DF)  MINICARD_RESETS */
+#define LOCHNAGAR2_DSP_RESET_MASK                     0x0002
+#define LOCHNAGAR2_DSP_RESET_SHIFT                         1
+#define LOCHNAGAR2_CDC_RESET_MASK                     0x0001
+#define LOCHNAGAR2_CDC_RESET_SHIFT                         0
+
+/* (0x00E3)  ANALOGUE_PATH_CTRL1 */
+#define LOCHNAGAR2_ANALOGUE_PATH_UPDATE_MASK          0x8000
+#define LOCHNAGAR2_ANALOGUE_PATH_UPDATE_SHIFT             15
+#define LOCHNAGAR2_ANALOGUE_PATH_UPDATE_STS_MASK      0x4000
+#define LOCHNAGAR2_ANALOGUE_PATH_UPDATE_STS_SHIFT         14
+
+/* (0x00E4)  ANALOGUE_PATH_CTRL2 */
+#define LOCHNAGAR2_P2_INPUT_BIAS_ENA_MASK             0x0080
+#define LOCHNAGAR2_P2_INPUT_BIAS_ENA_SHIFT                 7
+#define LOCHNAGAR2_P1_INPUT_BIAS_ENA_MASK             0x0040
+#define LOCHNAGAR2_P1_INPUT_BIAS_ENA_SHIFT                 6
+#define LOCHNAGAR2_P2_MICBIAS_SRC_MASK                0x0038
+#define LOCHNAGAR2_P2_MICBIAS_SRC_SHIFT                    3
+#define LOCHNAGAR2_P1_MICBIAS_SRC_MASK                0x0007
+#define LOCHNAGAR2_P1_MICBIAS_SRC_SHIFT                    0
+
+/* (0x00F0)  COMMS_CTRL4 */
+#define LOCHNAGAR2_CDC_CIF1MODE_MASK                  0x0001
+#define LOCHNAGAR2_CDC_CIF1MODE_SHIFT                      0
+
+/* (0x00FE)  SPDIF_CTRL */
+#define LOCHNAGAR2_SPDIF_HWMODE_MASK                  0x0008
+#define LOCHNAGAR2_SPDIF_HWMODE_SHIFT                      3
+#define LOCHNAGAR2_SPDIF_RESET_MASK                   0x0001
+#define LOCHNAGAR2_SPDIF_RESET_SHIFT                       0
+
+/* (0x0108)  IMON_CTRL1 */
+#define LOCHNAGAR2_IMON_ENA_MASK                      0x8000
+#define LOCHNAGAR2_IMON_ENA_SHIFT                         15
+#define LOCHNAGAR2_IMON_MEASURED_CHANNELS_MASK        0x03FC
+#define LOCHNAGAR2_IMON_MEASURED_CHANNELS_SHIFT            2
+#define LOCHNAGAR2_IMON_MODE_SEL_MASK                 0x0003
+#define LOCHNAGAR2_IMON_MODE_SEL_SHIFT                     0
+
+/* (0x0109)  IMON_CTRL2 */
+#define LOCHNAGAR2_IMON_FSR_MASK                      0x03FF
+#define LOCHNAGAR2_IMON_FSR_SHIFT                          0
+
+/* (0x010A)  IMON_CTRL3 */
+#define LOCHNAGAR2_IMON_DONE_MASK                     0x0004
+#define LOCHNAGAR2_IMON_DONE_SHIFT                         2
+#define LOCHNAGAR2_IMON_CONFIGURE_MASK                0x0002
+#define LOCHNAGAR2_IMON_CONFIGURE_SHIFT                    1
+#define LOCHNAGAR2_IMON_MEASURE_MASK                  0x0001
+#define LOCHNAGAR2_IMON_MEASURE_SHIFT                      0
+
+/* (0x010B)  IMON_CTRL4 */
+#define LOCHNAGAR2_IMON_DATA_REQ_MASK                 0x0080
+#define LOCHNAGAR2_IMON_DATA_REQ_SHIFT                     7
+#define LOCHNAGAR2_IMON_CH_SEL_MASK                   0x0070
+#define LOCHNAGAR2_IMON_CH_SEL_SHIFT                       4
+#define LOCHNAGAR2_IMON_DATA_RDY_MASK                 0x0008
+#define LOCHNAGAR2_IMON_DATA_RDY_SHIFT                     3
+#define LOCHNAGAR2_IMON_CH_SRC_MASK                   0x0007
+#define LOCHNAGAR2_IMON_CH_SRC_SHIFT                       0
+
+/* (0x010C, 0x010D)  IMON_DATA1, IMON_DATA2 */
+#define LOCHNAGAR2_IMON_DATA_MASK                     0xFFFF
+#define LOCHNAGAR2_IMON_DATA_SHIFT                         0
+
+/* (0x0116)  POWER_CTRL */
+#define LOCHNAGAR2_PWR_ENA_MASK                       0x0001
+#define LOCHNAGAR2_PWR_ENA_SHIFT                           0
+
+/* (0x0119)  MICVDD_CTRL1 */
+#define LOCHNAGAR2_MICVDD_REG_ENA_MASK                0x8000
+#define LOCHNAGAR2_MICVDD_REG_ENA_SHIFT                   15
+
+/* (0x011B)  MICVDD_CTRL2 */
+#define LOCHNAGAR2_MICVDD_VSEL_MASK                   0x001F
+#define LOCHNAGAR2_MICVDD_VSEL_SHIFT                       0
+
+/* (0x011E)  VDDCORE_CDC_CTRL1 */
+#define LOCHNAGAR2_VDDCORE_CDC_REG_ENA_MASK           0x8000
+#define LOCHNAGAR2_VDDCORE_CDC_REG_ENA_SHIFT              15
+
+/* (0x0120)  VDDCORE_CDC_CTRL2 */
+#define LOCHNAGAR2_VDDCORE_CDC_VSEL_MASK              0x007F
+#define LOCHNAGAR2_VDDCORE_CDC_VSEL_SHIFT                  0
+
+#endif
-- 
cgit v1.2.3


From 63635b54e07f646aa1d8e54c96b099162a226f7d Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Mon, 21 Jan 2019 08:32:22 -0500
Subject: media: v4l2-event: keep track of the timestamp in ns

Internally use ktime_get_ns() to get the timestamp of the event.
Only convert to timespec when interfacing with userspace.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/v4l2-core/v4l2-event.c | 19 +++++++++----------
 include/media/v4l2-event.h           |  2 ++
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-event.c b/drivers/media/v4l2-core/v4l2-event.c
index 481e3c65cf97..c46d14c996fc 100644
--- a/drivers/media/v4l2-core/v4l2-event.c
+++ b/drivers/media/v4l2-core/v4l2-event.c
@@ -52,6 +52,7 @@ static int __v4l2_event_dequeue(struct v4l2_fh *fh, struct v4l2_event *event)
 
 	kev->event.pending = fh->navailable;
 	*event = kev->event;
+	event->timestamp = ns_to_timespec(kev->ts);
 	kev->sev->first = sev_pos(kev->sev, 1);
 	kev->sev->in_use--;
 
@@ -103,8 +104,8 @@ static struct v4l2_subscribed_event *v4l2_event_subscribed(
 	return NULL;
 }
 
-static void __v4l2_event_queue_fh(struct v4l2_fh *fh, const struct v4l2_event *ev,
-		const struct timespec *ts)
+static void __v4l2_event_queue_fh(struct v4l2_fh *fh,
+				  const struct v4l2_event *ev, u64 ts)
 {
 	struct v4l2_subscribed_event *sev;
 	struct v4l2_kevent *kev;
@@ -144,7 +145,7 @@ static void __v4l2_event_queue_fh(struct v4l2_fh *fh, const struct v4l2_event *e
 	if (copy_payload)
 		kev->event.u = ev->u;
 	kev->event.id = ev->id;
-	kev->event.timestamp = *ts;
+	kev->ts = ts;
 	kev->event.sequence = fh->sequence;
 	sev->in_use++;
 	list_add_tail(&kev->list, &fh->available);
@@ -158,17 +159,17 @@ void v4l2_event_queue(struct video_device *vdev, const struct v4l2_event *ev)
 {
 	struct v4l2_fh *fh;
 	unsigned long flags;
-	struct timespec timestamp;
+	u64 ts;
 
 	if (vdev == NULL)
 		return;
 
-	ktime_get_ts(&timestamp);
+	ts = ktime_get_ns();
 
 	spin_lock_irqsave(&vdev->fh_lock, flags);
 
 	list_for_each_entry(fh, &vdev->fh_list, list)
-		__v4l2_event_queue_fh(fh, ev, &timestamp);
+		__v4l2_event_queue_fh(fh, ev, ts);
 
 	spin_unlock_irqrestore(&vdev->fh_lock, flags);
 }
@@ -177,12 +178,10 @@ EXPORT_SYMBOL_GPL(v4l2_event_queue);
 void v4l2_event_queue_fh(struct v4l2_fh *fh, const struct v4l2_event *ev)
 {
 	unsigned long flags;
-	struct timespec timestamp;
-
-	ktime_get_ts(&timestamp);
+	u64 ts = ktime_get_ns();
 
 	spin_lock_irqsave(&fh->vdev->fh_lock, flags);
-	__v4l2_event_queue_fh(fh, ev, &timestamp);
+	__v4l2_event_queue_fh(fh, ev, ts);
 	spin_unlock_irqrestore(&fh->vdev->fh_lock, flags);
 }
 EXPORT_SYMBOL_GPL(v4l2_event_queue_fh);
diff --git a/include/media/v4l2-event.h b/include/media/v4l2-event.h
index 17833e886e11..c2b6cdc714d2 100644
--- a/include/media/v4l2-event.h
+++ b/include/media/v4l2-event.h
@@ -34,11 +34,13 @@ struct video_device;
  * @list:	List node for the v4l2_fh->available list.
  * @sev:	Pointer to parent v4l2_subscribed_event.
  * @event:	The event itself.
+ * @ts:		The timestamp of the event.
  */
 struct v4l2_kevent {
 	struct list_head	list;
 	struct v4l2_subscribed_event *sev;
 	struct v4l2_event	event;
+	u64			ts;
 };
 
 /**
-- 
cgit v1.2.3


From 15a40b27beb0a85d7f11d747bfc587dbeb69a96c Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Mon, 21 Jan 2019 08:32:23 -0500
Subject: media: videobuf: use u64 for the timestamp internally

Just like vb2 does, use u64 internally to store the timestamps
of the buffers. Only convert to timeval when interfacing with
userspace.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/common/saa7146/saa7146_fops.c   |  2 +-
 drivers/media/pci/bt8xx/bttv-driver.c         |  8 +++-----
 drivers/media/pci/cx18/cx18-mailbox.c         |  2 +-
 drivers/media/platform/davinci/vpfe_capture.c |  2 +-
 drivers/media/platform/fsl-viu.c              |  2 +-
 drivers/media/platform/omap/omap_vout.c       | 12 ++++++------
 drivers/media/usb/cx231xx/cx231xx-417.c       |  4 ++--
 drivers/media/usb/cx231xx/cx231xx-vbi.c       |  2 +-
 drivers/media/usb/cx231xx/cx231xx-video.c     |  2 +-
 drivers/media/usb/tm6000/tm6000-video.c       |  2 +-
 drivers/media/usb/zr364xx/zr364xx.c           |  4 ++--
 drivers/media/v4l2-core/videobuf-core.c       |  4 ++--
 include/media/videobuf-core.h                 |  2 +-
 13 files changed, 23 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/saa7146/saa7146_fops.c b/drivers/media/common/saa7146/saa7146_fops.c
index c790ae264464..be4355a4c126 100644
--- a/drivers/media/common/saa7146/saa7146_fops.c
+++ b/drivers/media/common/saa7146/saa7146_fops.c
@@ -105,7 +105,7 @@ void saa7146_buffer_finish(struct saa7146_dev *dev,
 	}
 
 	q->curr->vb.state = state;
-	v4l2_get_timestamp(&q->curr->vb.ts);
+	q->curr->vb.ts = ktime_get_ns();
 	wake_up(&q->curr->vb.done);
 
 	q->curr = NULL;
diff --git a/drivers/media/pci/bt8xx/bttv-driver.c b/drivers/media/pci/bt8xx/bttv-driver.c
index d09785fd37a8..5e769c09dbd0 100644
--- a/drivers/media/pci/bt8xx/bttv-driver.c
+++ b/drivers/media/pci/bt8xx/bttv-driver.c
@@ -3600,9 +3600,7 @@ static void
 bttv_irq_wakeup_video(struct bttv *btv, struct bttv_buffer_set *wakeup,
 		      struct bttv_buffer_set *curr, unsigned int state)
 {
-	struct timeval ts;
-
-	v4l2_get_timestamp(&ts);
+	u64 ts = ktime_get_ns();
 
 	if (wakeup->top == wakeup->bottom) {
 		if (NULL != wakeup->top && curr->top != wakeup->top) {
@@ -3643,7 +3641,7 @@ bttv_irq_wakeup_vbi(struct bttv *btv, struct bttv_buffer *wakeup,
 	if (NULL == wakeup)
 		return;
 
-	v4l2_get_timestamp(&wakeup->vb.ts);
+	wakeup->vb.ts = ktime_get_ns();
 	wakeup->vb.field_count = btv->field_count;
 	wakeup->vb.state = state;
 	wake_up(&wakeup->vb.done);
@@ -3713,7 +3711,7 @@ bttv_irq_wakeup_top(struct bttv *btv)
 	btv->curr.top = NULL;
 	bttv_risc_hook(btv, RISC_SLOT_O_FIELD, NULL, 0);
 
-	v4l2_get_timestamp(&wakeup->vb.ts);
+	wakeup->vb.ts = ktime_get_ns();
 	wakeup->vb.field_count = btv->field_count;
 	wakeup->vb.state = VIDEOBUF_DONE;
 	wake_up(&wakeup->vb.done);
diff --git a/drivers/media/pci/cx18/cx18-mailbox.c b/drivers/media/pci/cx18/cx18-mailbox.c
index f66dd63e1994..0ffd2196a980 100644
--- a/drivers/media/pci/cx18/cx18-mailbox.c
+++ b/drivers/media/pci/cx18/cx18-mailbox.c
@@ -197,7 +197,7 @@ static void cx18_mdl_send_to_videobuf(struct cx18_stream *s,
 	}
 
 	if (dispatch) {
-		v4l2_get_timestamp(&vb_buf->vb.ts);
+		vb_buf->vb.ts = ktime_get_ns();
 		list_del(&vb_buf->vb.queue);
 		vb_buf->vb.state = VIDEOBUF_DONE;
 		wake_up(&vb_buf->vb.done);
diff --git a/drivers/media/platform/davinci/vpfe_capture.c b/drivers/media/platform/davinci/vpfe_capture.c
index 9996bab98fe3..26dadbba930f 100644
--- a/drivers/media/platform/davinci/vpfe_capture.c
+++ b/drivers/media/platform/davinci/vpfe_capture.c
@@ -518,7 +518,7 @@ static void vpfe_schedule_bottom_field(struct vpfe_device *vpfe_dev)
 
 static void vpfe_process_buffer_complete(struct vpfe_device *vpfe_dev)
 {
-	v4l2_get_timestamp(&vpfe_dev->cur_frm->ts);
+	vpfe_dev->cur_frm->ts = ktime_get_ns();
 	vpfe_dev->cur_frm->state = VIDEOBUF_DONE;
 	vpfe_dev->cur_frm->size = vpfe_dev->fmt.fmt.pix.sizeimage;
 	wake_up_interruptible(&vpfe_dev->cur_frm->done);
diff --git a/drivers/media/platform/fsl-viu.c b/drivers/media/platform/fsl-viu.c
index ca6d0317ab42..cffebcaacb90 100644
--- a/drivers/media/platform/fsl-viu.c
+++ b/drivers/media/platform/fsl-viu.c
@@ -1090,7 +1090,7 @@ static void viu_capture_intr(struct viu_dev *dev, u32 status)
 
 		if (waitqueue_active(&buf->vb.done)) {
 			list_del(&buf->vb.queue);
-			v4l2_get_timestamp(&buf->vb.ts);
+			buf->vb.ts = ktime_get_ns();
 			buf->vb.state = VIDEOBUF_DONE;
 			buf->vb.field_count++;
 			wake_up(&buf->vb.done);
diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c
index f447ae3bb465..ff3de2dce5a2 100644
--- a/drivers/media/platform/omap/omap_vout.c
+++ b/drivers/media/platform/omap/omap_vout.c
@@ -513,7 +513,7 @@ static int omapvid_apply_changes(struct omap_vout_device *vout)
 }
 
 static int omapvid_handle_interlace_display(struct omap_vout_device *vout,
-		unsigned int irqstatus, struct timeval timevalue)
+		unsigned int irqstatus, u64 ts)
 {
 	u32 fid;
 
@@ -537,7 +537,7 @@ static int omapvid_handle_interlace_display(struct omap_vout_device *vout,
 		if (vout->cur_frm == vout->next_frm)
 			goto err;
 
-		vout->cur_frm->ts = timevalue;
+		vout->cur_frm->ts = ts;
 		vout->cur_frm->state = VIDEOBUF_DONE;
 		wake_up_interruptible(&vout->cur_frm->done);
 		vout->cur_frm = vout->next_frm;
@@ -557,7 +557,7 @@ static void omap_vout_isr(void *arg, unsigned int irqstatus)
 	int ret, fid, mgr_id;
 	u32 addr, irq;
 	struct omap_overlay *ovl;
-	struct timeval timevalue;
+	u64 ts;
 	struct omapvideo_info *ovid;
 	struct omap_dss_device *cur_display;
 	struct omap_vout_device *vout = (struct omap_vout_device *)arg;
@@ -577,7 +577,7 @@ static void omap_vout_isr(void *arg, unsigned int irqstatus)
 		return;
 
 	spin_lock(&vout->vbq_lock);
-	v4l2_get_timestamp(&timevalue);
+	ts = ktime_get_ns();
 
 	switch (cur_display->type) {
 	case OMAP_DISPLAY_TYPE_DSI:
@@ -595,7 +595,7 @@ static void omap_vout_isr(void *arg, unsigned int irqstatus)
 		break;
 	case OMAP_DISPLAY_TYPE_VENC:
 		fid = omapvid_handle_interlace_display(vout, irqstatus,
-				timevalue);
+				ts);
 		if (!fid)
 			goto vout_isr_err;
 		break;
@@ -608,7 +608,7 @@ static void omap_vout_isr(void *arg, unsigned int irqstatus)
 	}
 
 	if (!vout->first_int && (vout->cur_frm != vout->next_frm)) {
-		vout->cur_frm->ts = timevalue;
+		vout->cur_frm->ts = ts;
 		vout->cur_frm->state = VIDEOBUF_DONE;
 		wake_up_interruptible(&vout->cur_frm->done);
 		vout->cur_frm = vout->next_frm;
diff --git a/drivers/media/usb/cx231xx/cx231xx-417.c b/drivers/media/usb/cx231xx/cx231xx-417.c
index 1c48c497bd6a..0f8ae81f4820 100644
--- a/drivers/media/usb/cx231xx/cx231xx-417.c
+++ b/drivers/media/usb/cx231xx/cx231xx-417.c
@@ -1316,7 +1316,7 @@ static void buffer_copy(struct cx231xx *dev, char *data, int len, struct urb *ur
 
 		buf->vb.state = VIDEOBUF_DONE;
 		buf->vb.field_count++;
-		v4l2_get_timestamp(&buf->vb.ts);
+		buf->vb.ts = ktime_get_ns();
 		list_del(&buf->vb.queue);
 		wake_up(&buf->vb.done);
 		dma_q->mpeg_buffer_completed = 0;
@@ -1347,7 +1347,7 @@ static void buffer_filled(char *data, int len, struct urb *urb,
 	memcpy(vbuf, data, len);
 	buf->vb.state = VIDEOBUF_DONE;
 	buf->vb.field_count++;
-	v4l2_get_timestamp(&buf->vb.ts);
+	buf->vb.ts = ktime_get_ns();
 	list_del(&buf->vb.queue);
 	wake_up(&buf->vb.done);
 }
diff --git a/drivers/media/usb/cx231xx/cx231xx-vbi.c b/drivers/media/usb/cx231xx/cx231xx-vbi.c
index 10b2eb7338ad..d16b73c04445 100644
--- a/drivers/media/usb/cx231xx/cx231xx-vbi.c
+++ b/drivers/media/usb/cx231xx/cx231xx-vbi.c
@@ -528,7 +528,7 @@ static inline void vbi_buffer_filled(struct cx231xx *dev,
 
 	buf->vb.state = VIDEOBUF_DONE;
 	buf->vb.field_count++;
-	v4l2_get_timestamp(&buf->vb.ts);
+	buf->vb.ts = ktime_get_ns();
 
 	dev->vbi_mode.bulk_ctl.buf = NULL;
 
diff --git a/drivers/media/usb/cx231xx/cx231xx-video.c b/drivers/media/usb/cx231xx/cx231xx-video.c
index 0d451c4ea3b9..aebbaf9d92a6 100644
--- a/drivers/media/usb/cx231xx/cx231xx-video.c
+++ b/drivers/media/usb/cx231xx/cx231xx-video.c
@@ -182,7 +182,7 @@ static inline void buffer_filled(struct cx231xx *dev,
 	cx231xx_isocdbg("[%p/%d] wakeup\n", buf, buf->vb.i);
 	buf->vb.state = VIDEOBUF_DONE;
 	buf->vb.field_count++;
-	v4l2_get_timestamp(&buf->vb.ts);
+	buf->vb.ts = ktime_get_ns();
 
 	if (dev->USE_ISO)
 		dev->video_mode.isoc_ctl.buf = NULL;
diff --git a/drivers/media/usb/tm6000/tm6000-video.c b/drivers/media/usb/tm6000/tm6000-video.c
index ee7b5318b351..5127be71dd03 100644
--- a/drivers/media/usb/tm6000/tm6000-video.c
+++ b/drivers/media/usb/tm6000/tm6000-video.c
@@ -106,7 +106,7 @@ static inline void buffer_filled(struct tm6000_core *dev,
 	dprintk(dev, V4L2_DEBUG_ISOC, "[%p/%d] wakeup\n", buf, buf->vb.i);
 	buf->vb.state = VIDEOBUF_DONE;
 	buf->vb.field_count++;
-	v4l2_get_timestamp(&buf->vb.ts);
+	buf->vb.ts = ktime_get_ns();
 
 	list_del(&buf->vb.queue);
 	wake_up(&buf->vb.done);
diff --git a/drivers/media/usb/zr364xx/zr364xx.c b/drivers/media/usb/zr364xx/zr364xx.c
index ab35554cbffa..51aad2cf742f 100644
--- a/drivers/media/usb/zr364xx/zr364xx.c
+++ b/drivers/media/usb/zr364xx/zr364xx.c
@@ -521,7 +521,7 @@ static void zr364xx_fillbuff(struct zr364xx_camera *cam,
 	/* tell v4l buffer was filled */
 
 	buf->vb.field_count = cam->frame_count * 2;
-	v4l2_get_timestamp(&buf->vb.ts);
+	buf->vb.ts = ktime_get_ns();
 	buf->vb.state = VIDEOBUF_DONE;
 }
 
@@ -549,7 +549,7 @@ static int zr364xx_got_frame(struct zr364xx_camera *cam, int jpgsize)
 		goto unlock;
 	}
 	list_del(&buf->vb.queue);
-	v4l2_get_timestamp(&buf->vb.ts);
+	buf->vb.ts = ktime_get_ns();
 	DBG("[%p/%d] wakeup\n", buf, buf->vb.i);
 	zr364xx_fillbuff(cam, buf, jpgsize);
 	wake_up(&buf->vb.done);
diff --git a/drivers/media/v4l2-core/videobuf-core.c b/drivers/media/v4l2-core/videobuf-core.c
index 7491b337002c..d1bcfa91aaf8 100644
--- a/drivers/media/v4l2-core/videobuf-core.c
+++ b/drivers/media/v4l2-core/videobuf-core.c
@@ -367,7 +367,7 @@ static void videobuf_status(struct videobuf_queue *q, struct v4l2_buffer *b,
 	}
 
 	b->field     = vb->field;
-	b->timestamp = vb->ts;
+	b->timestamp = ns_to_timeval(vb->ts);
 	b->bytesused = vb->size;
 	b->sequence  = vb->field_count >> 1;
 }
@@ -581,7 +581,7 @@ int videobuf_qbuf(struct videobuf_queue *q, struct v4l2_buffer *b)
 		    || q->type == V4L2_BUF_TYPE_SDR_OUTPUT) {
 			buf->size = b->bytesused;
 			buf->field = b->field;
-			buf->ts = b->timestamp;
+			buf->ts = v4l2_timeval_to_ns(&b->timestamp);
 		}
 		break;
 	case V4L2_MEMORY_USERPTR:
diff --git a/include/media/videobuf-core.h b/include/media/videobuf-core.h
index 60a664febba0..5684dc6f0d0d 100644
--- a/include/media/videobuf-core.h
+++ b/include/media/videobuf-core.h
@@ -80,7 +80,7 @@ struct videobuf_buffer {
 	struct list_head        queue;
 	wait_queue_head_t       done;
 	unsigned int            field_count;
-	struct timeval          ts;
+	u64			ts;
 
 	/* Memory type */
 	enum v4l2_memory        memory;
-- 
cgit v1.2.3


From 276c1f066bdaaed6fe82ed231e1eff2f41c34882 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Mon, 21 Jan 2019 08:32:29 -0500
Subject: media: v4l2-common: drop v4l2_get_timestamp

This function is no longer used, so drop it.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/v4l2-core/v4l2-common.c | 10 ----------
 include/media/v4l2-common.h           |  9 ---------
 2 files changed, 19 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index 50763fb42a1b..663730f088cd 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -398,16 +398,6 @@ __v4l2_find_nearest_size(const void *array, size_t array_size,
 }
 EXPORT_SYMBOL_GPL(__v4l2_find_nearest_size);
 
-void v4l2_get_timestamp(struct timeval *tv)
-{
-	struct timespec ts;
-
-	ktime_get_ts(&ts);
-	tv->tv_sec = ts.tv_sec;
-	tv->tv_usec = ts.tv_nsec / NSEC_PER_USEC;
-}
-EXPORT_SYMBOL_GPL(v4l2_get_timestamp);
-
 int v4l2_g_parm_cap(struct video_device *vdev,
 		    struct v4l2_subdev *sd, struct v4l2_streamparm *a)
 {
diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
index 0c511ed8ffb0..2b93cb281fa5 100644
--- a/include/media/v4l2-common.h
+++ b/include/media/v4l2-common.h
@@ -361,15 +361,6 @@ __v4l2_find_nearest_size(const void *array, size_t array_size,
 			 size_t entry_size, size_t width_offset,
 			 size_t height_offset, s32 width, s32 height);
 
-/**
- * v4l2_get_timestamp - helper routine to get a timestamp to be used when
- *	filling streaming metadata. Internally, it uses ktime_get_ts(),
- *	which is the recommended way to get it.
- *
- * @tv: pointer to &struct timeval to be filled.
- */
-void v4l2_get_timestamp(struct timeval *tv);
-
 /**
  * v4l2_g_parm_cap - helper routine for vidioc_g_parm to fill this in by
  *      calling the g_frame_interval op of the given subdev. It only works
-- 
cgit v1.2.3


From c2d88e7d66f9d327593e0d0f9d7f94755b637e5c Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@collabora.com>
Date: Tue, 5 Feb 2019 16:11:15 -0500
Subject: media: vb2: Fix buf_out_validate documentation

The .buf_out_validate callback is mandatory for OUTPUT
queues. Mark it as such in the callback's doc.

Fixes: 28d77c21cb ("media: vb2: add buf_out_validate callback")

Signed-off-by: Ezequiel Garcia <ezequiel@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/media/videobuf2-core.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 4849b865b908..06142c1469cc 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -345,7 +345,8 @@ struct vb2_buffer {
  *			waiting for a new buffer to arrive.
  * @buf_out_validate:	called when the output buffer is prepared or queued
  *			to a request; drivers can use this to validate
- *			userspace-provided information; optional.
+ *			userspace-provided information; this is required only
+ *			for OUTPUT queues.
  * @buf_init:		called once after allocating a buffer (in MMAP case)
  *			or after acquiring a new USERPTR buffer; drivers may
  *			perform additional buffer-related initialization;
-- 
cgit v1.2.3


From a4d3d61254d3645d8de738102c3c473b176180a5 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@collabora.com>
Date: Tue, 5 Feb 2019 16:20:33 -0500
Subject: media: v4l2-mem2mem: Rename v4l2_m2m_buf_copy_data to
 v4l2_m2m_buf_copy_metadata

The v4l2_m2m_buf_copy_data helper is used to copy the buffer
metadata, such as its timestamp and its flags.

Therefore, the v4l2_m2m_buf_copy_metadata name is more clear
and avoids confusion with a payload data copy.

Signed-off-by: Ezequiel Garcia <ezequiel@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
[hverkuil-cisco@xs4all.nl: also fix cedrus_dec.c]
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/platform/vicodec/vicodec-core.c   |  4 ++--
 drivers/media/platform/vim2m.c                  |  2 +-
 drivers/media/v4l2-core/v4l2-mem2mem.c          |  8 ++++----
 drivers/staging/media/sunxi/cedrus/cedrus_dec.c |  2 +-
 include/media/v4l2-mem2mem.h                    | 14 +++++++-------
 5 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/media/platform/vicodec/vicodec-core.c b/drivers/media/platform/vicodec/vicodec-core.c
index 212da185e4f2..9d739ea5542d 100644
--- a/drivers/media/platform/vicodec/vicodec-core.c
+++ b/drivers/media/platform/vicodec/vicodec-core.c
@@ -199,7 +199,7 @@ static int device_process(struct vicodec_ctx *ctx,
 
 	dst_vb->sequence = q_dst->sequence++;
 	dst_vb->flags &= ~V4L2_BUF_FLAG_LAST;
-	v4l2_m2m_buf_copy_data(src_vb, dst_vb, !ctx->is_enc);
+	v4l2_m2m_buf_copy_metadata(src_vb, dst_vb, !ctx->is_enc);
 
 	return 0;
 }
@@ -414,7 +414,7 @@ static void set_last_buffer(struct vb2_v4l2_buffer *dst_buf,
 	vb2_set_plane_payload(&dst_buf->vb2_buf, 0, 0);
 	dst_buf->sequence = q_dst->sequence++;
 
-	v4l2_m2m_buf_copy_data(src_buf, dst_buf, !ctx->is_enc);
+	v4l2_m2m_buf_copy_metadata(src_buf, dst_buf, !ctx->is_enc);
 	dst_buf->flags |= V4L2_BUF_FLAG_LAST;
 	v4l2_m2m_buf_done(dst_buf, VB2_BUF_STATE_DONE);
 }
diff --git a/drivers/media/platform/vim2m.c b/drivers/media/platform/vim2m.c
index bd125ad34343..98be92a4e0b0 100644
--- a/drivers/media/platform/vim2m.c
+++ b/drivers/media/platform/vim2m.c
@@ -432,7 +432,7 @@ static int device_process(struct vim2m_ctx *ctx,
 	out_vb->sequence = get_q_data(ctx,
 				      V4L2_BUF_TYPE_VIDEO_CAPTURE)->sequence++;
 	in_vb->sequence = q_data_in->sequence++;
-	v4l2_m2m_buf_copy_data(in_vb, out_vb, true);
+	v4l2_m2m_buf_copy_metadata(in_vb, out_vb, true);
 
 	if (ctx->mode & MEM2MEM_VFLIP) {
 		start = height - 1;
diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index 631f4e2aa942..1494d0d5951a 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -975,9 +975,9 @@ void v4l2_m2m_buf_queue(struct v4l2_m2m_ctx *m2m_ctx,
 }
 EXPORT_SYMBOL_GPL(v4l2_m2m_buf_queue);
 
-void v4l2_m2m_buf_copy_data(const struct vb2_v4l2_buffer *out_vb,
-			    struct vb2_v4l2_buffer *cap_vb,
-			    bool copy_frame_flags)
+void v4l2_m2m_buf_copy_metadata(const struct vb2_v4l2_buffer *out_vb,
+				struct vb2_v4l2_buffer *cap_vb,
+				bool copy_frame_flags)
 {
 	u32 mask = V4L2_BUF_FLAG_TIMECODE | V4L2_BUF_FLAG_TSTAMP_SRC_MASK;
 
@@ -993,7 +993,7 @@ void v4l2_m2m_buf_copy_data(const struct vb2_v4l2_buffer *out_vb,
 	cap_vb->flags &= ~mask;
 	cap_vb->flags |= out_vb->flags & mask;
 }
-EXPORT_SYMBOL_GPL(v4l2_m2m_buf_copy_data);
+EXPORT_SYMBOL_GPL(v4l2_m2m_buf_copy_metadata);
 
 void v4l2_m2m_request_queue(struct media_request *req)
 {
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
index 443fb037e1cf..4d6d602cdde6 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
@@ -50,7 +50,7 @@ void cedrus_device_run(void *priv)
 		break;
 	}
 
-	v4l2_m2m_buf_copy_data(run.src, run.dst, true);
+	v4l2_m2m_buf_copy_metadata(run.src, run.dst, true);
 
 	dev->dec_ops[ctx->current_codec]->setup(ctx, &run);
 
diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h
index 43e447dcf69d..47c6d9aa0bf4 100644
--- a/include/media/v4l2-mem2mem.h
+++ b/include/media/v4l2-mem2mem.h
@@ -623,11 +623,11 @@ v4l2_m2m_dst_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx)
 }
 
 /**
- * v4l2_m2m_buf_copy_data() - copy buffer data from the output buffer to the
- * capture buffer
+ * v4l2_m2m_buf_copy_metadata() - copy buffer metadata from
+ * the output buffer to the capture buffer
  *
- * @out_vb: the output buffer that is the source of the data.
- * @cap_vb: the capture buffer that will receive the data.
+ * @out_vb: the output buffer that is the source of the metadata.
+ * @cap_vb: the capture buffer that will receive the metadata.
  * @copy_frame_flags: copy the KEY/B/PFRAME flags as well.
  *
  * This helper function copies the timestamp, timecode (if the TIMECODE
@@ -638,9 +638,9 @@ v4l2_m2m_dst_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx)
  * flags are not copied. This is typically needed for encoders that
  * set this bits explicitly.
  */
-void v4l2_m2m_buf_copy_data(const struct vb2_v4l2_buffer *out_vb,
-			    struct vb2_v4l2_buffer *cap_vb,
-			    bool copy_frame_flags);
+void v4l2_m2m_buf_copy_metadata(const struct vb2_v4l2_buffer *out_vb,
+				struct vb2_v4l2_buffer *cap_vb,
+				bool copy_frame_flags);
 
 /* v4l2 request helper */
 
-- 
cgit v1.2.3


From 70f8a3ca68d3e1f3344d959981ca55d5f6ec77f7 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 6 Feb 2019 09:59:15 -0800
Subject: mm: make mm->pinned_vm an atomic64 counter

Taking a sleeping lock to _only_ increment a variable is quite the
overkill, and pretty much all users do this. Furthermore, some drivers
(ie: infiniband and scif) that need pinned semantics can go to quite
some trouble to actually delay via workqueue (un)accounting for pinned
pages when not possible to acquire it.

By making the counter atomic we no longer need to hold the mmap_sem and
can simply some code around it for pinned_vm users. The counter is 64-bit
such that we need not worry about overflows such as rdma user input
controlled from userspace.

Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/umem.c             | 12 ++++++------
 drivers/infiniband/hw/hfi1/user_pages.c    |  6 +++---
 drivers/infiniband/hw/qib/qib_user_pages.c |  4 ++--
 drivers/infiniband/hw/usnic/usnic_uiom.c   |  8 ++++----
 drivers/misc/mic/scif/scif_rma.c           |  6 +++---
 fs/proc/task_mmu.c                         |  2 +-
 include/linux/mm_types.h                   |  2 +-
 kernel/events/core.c                       |  8 ++++----
 kernel/fork.c                              |  2 +-
 mm/debug.c                                 |  5 +++--
 10 files changed, 28 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 1efe0a74e06b..678abe1afcba 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -166,13 +166,13 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
 	down_write(&mm->mmap_sem);
-	if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) ||
-	    (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) {
+	new_pinned = atomic64_read(&mm->pinned_vm) + npages;
+	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
 		up_write(&mm->mmap_sem);
 		ret = -ENOMEM;
 		goto out;
 	}
-	mm->pinned_vm = new_pinned;
+	atomic64_set(&mm->pinned_vm, new_pinned);
 	up_write(&mm->mmap_sem);
 
 	cur_base = addr & PAGE_MASK;
@@ -234,7 +234,7 @@ umem_release:
 	__ib_umem_release(context->device, umem, 0);
 vma:
 	down_write(&mm->mmap_sem);
-	mm->pinned_vm -= ib_umem_num_pages(umem);
+	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
 	up_write(&mm->mmap_sem);
 out:
 	if (vma_list)
@@ -263,7 +263,7 @@ static void ib_umem_release_defer(struct work_struct *work)
 	struct ib_umem *umem = container_of(work, struct ib_umem, work);
 
 	down_write(&umem->owning_mm->mmap_sem);
-	umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
+	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
 	up_write(&umem->owning_mm->mmap_sem);
 
 	__ib_umem_release_tail(umem);
@@ -302,7 +302,7 @@ void ib_umem_release(struct ib_umem *umem)
 	} else {
 		down_write(&umem->owning_mm->mmap_sem);
 	}
-	umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
+	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
 	up_write(&umem->owning_mm->mmap_sem);
 
 	__ib_umem_release_tail(umem);
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index e341e6dcc388..40a6e434190f 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -92,7 +92,7 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
 	size = DIV_ROUND_UP(size, PAGE_SIZE);
 
 	down_read(&mm->mmap_sem);
-	pinned = mm->pinned_vm;
+	pinned = atomic64_read(&mm->pinned_vm);
 	up_read(&mm->mmap_sem);
 
 	/* First, check the absolute limit against all pinned pages. */
@@ -112,7 +112,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
 		return ret;
 
 	down_write(&mm->mmap_sem);
-	mm->pinned_vm += ret;
+	atomic64_add(ret, &mm->pinned_vm);
 	up_write(&mm->mmap_sem);
 
 	return ret;
@@ -131,7 +131,7 @@ void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
 
 	if (mm) { /* during close after signal, mm can be NULL */
 		down_write(&mm->mmap_sem);
-		mm->pinned_vm -= npages;
+		atomic64_sub(npages, &mm->pinned_vm);
 		up_write(&mm->mmap_sem);
 	}
 }
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 075f09fb7ce3..c6c81022d313 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -75,7 +75,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
 			goto bail_release;
 	}
 
-	current->mm->pinned_vm += num_pages;
+	atomic64_add(num_pages, &current->mm->pinned_vm);
 
 	ret = 0;
 	goto bail;
@@ -156,7 +156,7 @@ void qib_release_user_pages(struct page **p, size_t num_pages)
 	__qib_release_user_pages(p, num_pages, 1);
 
 	if (current->mm) {
-		current->mm->pinned_vm -= num_pages;
+		atomic64_sub(num_pages, &current->mm->pinned_vm);
 		up_write(&current->mm->mmap_sem);
 	}
 }
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index ce01a59fccc4..854436a2b437 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -129,7 +129,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
 	uiomr->owning_mm = mm = current->mm;
 	down_write(&mm->mmap_sem);
 
-	locked = npages + current->mm->pinned_vm;
+	locked = npages + atomic64_read(&current->mm->pinned_vm);
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
 	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
@@ -187,7 +187,7 @@ out:
 	if (ret < 0)
 		usnic_uiom_put_pages(chunk_list, 0);
 	else {
-		mm->pinned_vm = locked;
+		atomic64_set(&mm->pinned_vm, locked);
 		mmgrab(uiomr->owning_mm);
 	}
 
@@ -441,7 +441,7 @@ static void usnic_uiom_release_defer(struct work_struct *work)
 		container_of(work, struct usnic_uiom_reg, work);
 
 	down_write(&uiomr->owning_mm->mmap_sem);
-	uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr);
+	atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm);
 	up_write(&uiomr->owning_mm->mmap_sem);
 
 	__usnic_uiom_release_tail(uiomr);
@@ -469,7 +469,7 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
 	} else {
 		down_write(&uiomr->owning_mm->mmap_sem);
 	}
-	uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr);
+	atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm);
 	up_write(&uiomr->owning_mm->mmap_sem);
 
 	__usnic_uiom_release_tail(uiomr);
diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c
index 749321eb91ae..2448368f181e 100644
--- a/drivers/misc/mic/scif/scif_rma.c
+++ b/drivers/misc/mic/scif/scif_rma.c
@@ -285,7 +285,7 @@ __scif_dec_pinned_vm_lock(struct mm_struct *mm,
 	} else {
 		down_write(&mm->mmap_sem);
 	}
-	mm->pinned_vm -= nr_pages;
+	atomic64_sub(nr_pages, &mm->pinned_vm);
 	up_write(&mm->mmap_sem);
 	return 0;
 }
@@ -299,7 +299,7 @@ static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm,
 		return 0;
 
 	locked = nr_pages;
-	locked += mm->pinned_vm;
+	locked += atomic64_read(&mm->pinned_vm);
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
 		dev_err(scif_info.mdev.this_device,
@@ -307,7 +307,7 @@ static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm,
 			locked, lock_limit);
 		return -ENOMEM;
 	}
-	mm->pinned_vm = locked;
+	atomic64_set(&mm->pinned_vm, locked);
 	return 0;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0ec9edab2f3..d2902962244d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
-	SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
+	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
 	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
 	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
 	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2c471a2c43fa..acea2ea2d6c4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -405,7 +405,7 @@ struct mm_struct {
 
 		unsigned long total_vm;	   /* Total pages mapped */
 		unsigned long locked_vm;   /* Pages that have PG_mlocked set */
-		unsigned long pinned_vm;   /* Refcount permanently increased */
+		atomic64_t    pinned_vm;   /* Refcount permanently increased */
 		unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
 		unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
 		unsigned long stack_vm;	   /* VM_STACK */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5ede6918050..29e9f2473656 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5459,7 +5459,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
 		/* now it's safe to free the pages */
 		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
-		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+		atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
 
 		/* this has to be the last one */
 		rb_free_aux(rb);
@@ -5532,7 +5532,7 @@ again:
 	 */
 
 	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-	vma->vm_mm->pinned_vm -= mmap_locked;
+	atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
 	free_uid(mmap_user);
 
 out_put:
@@ -5680,7 +5680,7 @@ accounting:
 
 	lock_limit = rlimit(RLIMIT_MEMLOCK);
 	lock_limit >>= PAGE_SHIFT;
-	locked = vma->vm_mm->pinned_vm + extra;
+	locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
 
 	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
 		!capable(CAP_IPC_LOCK)) {
@@ -5721,7 +5721,7 @@ accounting:
 unlock:
 	if (!ret) {
 		atomic_long_add(user_extra, &user->locked_vm);
-		vma->vm_mm->pinned_vm += extra;
+		atomic64_add(extra, &vma->vm_mm->pinned_vm);
 
 		atomic_inc(&event->mmap_count);
 	} else if (rb) {
diff --git a/kernel/fork.c b/kernel/fork.c
index b69248e6f0e0..85e08c379a9e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -981,7 +981,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
-	mm->pinned_vm = 0;
+	atomic64_set(&mm->pinned_vm, 0);
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);
 	spin_lock_init(&mm->arg_lock);
diff --git a/mm/debug.c b/mm/debug.c
index 0abb987dad9b..7d13941a72f9 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -135,7 +135,7 @@ void dump_mm(const struct mm_struct *mm)
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
 		"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
-		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
+		"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
 		"start_brk %lx brk %lx start_stack %lx\n"
 		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -166,7 +166,8 @@ void dump_mm(const struct mm_struct *mm)
 		mm_pgtables_bytes(mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
-		mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
+		atomic64_read(&mm->pinned_vm),
+		mm->data_vm, mm->exec_vm, mm->stack_vm,
 		mm->start_code, mm->end_code, mm->start_data, mm->end_data,
 		mm->start_brk, mm->brk, mm->start_stack,
 		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
-- 
cgit v1.2.3


From 95b86d1c91ad3b19f882d9e70aa37c8e99e8dc17 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Thu, 7 Feb 2019 01:31:27 -0500
Subject: RDMA/bnxt_re: Update kernel user abi to pass chip context

User space verbs provider library would need chip context.  Changing the
ABI to add chip version details in structure.  Furthermore, changing the
kernel driver ucontext allocation code to initialize the abi structure
with appropriate values.

As suggested by community, appended the new fields at the bottom of the
ABI structure and retaining to older fields as those were in the older
versions.

Keeping the ABI version at 1 and adding a new field in the ucontext
response structure to hold the component mask.  The user space library
should check pre-defined flags to figure out if a certain feature is
supported on not.

Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 17 ++++++++++++++---
 include/uapi/rdma/bnxt_re-abi.h          | 11 +++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 08c1725f371a..1d7469e23cde 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -3692,9 +3692,10 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
 					   struct ib_udata *udata)
 {
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
 	struct bnxt_re_uctx_resp resp;
 	struct bnxt_re_ucontext *uctx;
-	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
+	u32 chip_met_rev_num = 0;
 	int rc;
 
 	dev_dbg(rdev_to_dev(rdev), "ABI version requested %d",
@@ -3719,14 +3720,24 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
 	}
 	spin_lock_init(&uctx->sh_lock);
 
-	resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/
+	resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_CCTX;
+	chip_met_rev_num = rdev->chip_ctx.chip_num;
+	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) <<
+			     BNXT_RE_CHIP_ID0_CHIP_REV_SFT;
+	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) <<
+			     BNXT_RE_CHIP_ID0_CHIP_MET_SFT;
+	resp.chip_id0 = chip_met_rev_num;
+	/* Future extension of chip info */
+	resp.chip_id1 = 0;
+	/*Temp, Use idr_alloc instead */
+	resp.dev_id = rdev->en_dev->pdev->devfn;
 	resp.max_qp = rdev->qplib_ctx.qpc_count;
 	resp.pg_size = PAGE_SIZE;
 	resp.cqe_sz = sizeof(struct cq_base);
 	resp.max_cqd = dev_attr->max_cq_wqes;
 	resp.rsvd    = 0;
 
-	rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
 	if (rc) {
 		dev_err(rdev_to_dev(rdev), "Failed to copy user context");
 		rc = -EFAULT;
diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h
index a7a6111e50c7..dc52e3cf574c 100644
--- a/include/uapi/rdma/bnxt_re-abi.h
+++ b/include/uapi/rdma/bnxt_re-abi.h
@@ -44,6 +44,14 @@
 
 #define BNXT_RE_ABI_VERSION	1
 
+#define BNXT_RE_CHIP_ID0_CHIP_NUM_SFT		0x00
+#define BNXT_RE_CHIP_ID0_CHIP_REV_SFT		0x10
+#define BNXT_RE_CHIP_ID0_CHIP_MET_SFT		0x18
+
+enum {
+	BNXT_RE_UCNTX_CMASK_HAVE_CCTX = 0x1ULL
+};
+
 struct bnxt_re_uctx_resp {
 	__u32 dev_id;
 	__u32 max_qp;
@@ -51,6 +59,9 @@ struct bnxt_re_uctx_resp {
 	__u32 cqe_sz;
 	__u32 max_cqd;
 	__u32 rsvd;
+	__aligned_u64 comp_mask;
+	__u32 chip_id0;
+	__u32 chip_id1;
 };
 
 /*
-- 
cgit v1.2.3


From 64e21add8cd9717f042b523f35ea831eab14261b Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea@microchip.com>
Date: Thu, 7 Feb 2019 09:25:05 +0000
Subject: pinctrl: at91: add slewrate support for SAM9X60

Add slew rate support for SAM9X60 pin controller.

Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Acked-by: Ludovic Desroches <ludovic.desroches@microchip.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinctrl-at91.c     | 48 ++++++++++++++++++++++++++++++++++++++
 drivers/pinctrl/pinctrl-at91.h     |  1 +
 include/dt-bindings/pinctrl/at91.h |  4 ++++
 3 files changed, 53 insertions(+)

(limited to 'include')

diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c
index 128003597f43..cb7c432769a5 100644
--- a/drivers/pinctrl/pinctrl-at91.c
+++ b/drivers/pinctrl/pinctrl-at91.c
@@ -59,6 +59,9 @@ static int gpio_banks;
 #define OUTPUT		(1 << 7)
 #define OUTPUT_VAL_SHIFT	8
 #define OUTPUT_VAL	(0x1 << OUTPUT_VAL_SHIFT)
+#define SLEWRATE_SHIFT	9
+#define SLEWRATE_MASK	0x1
+#define SLEWRATE	(SLEWRATE_MASK << SLEWRATE_SHIFT)
 #define DEBOUNCE	(1 << 16)
 #define DEBOUNCE_VAL_SHIFT	17
 #define DEBOUNCE_VAL	(0x3fff << DEBOUNCE_VAL_SHIFT)
@@ -82,6 +85,13 @@ enum drive_strength_bit {
 #define DRIVE_STRENGTH_BIT_MSK(name)	(DRIVE_STRENGTH_BIT_##name << \
 					 DRIVE_STRENGTH_SHIFT)
 
+enum slewrate_bit {
+	SLEWRATE_BIT_DIS,
+	SLEWRATE_BIT_ENA,
+};
+
+#define SLEWRATE_BIT_MSK(name)		(SLEWRATE_BIT_##name << SLEWRATE_SHIFT)
+
 /**
  * struct at91_pmx_func - describes AT91 pinmux functions
  * @name: the name of this specific function
@@ -171,6 +181,8 @@ struct at91_pinctrl_mux_ops {
 	unsigned (*get_drivestrength)(void __iomem *pio, unsigned pin);
 	void (*set_drivestrength)(void __iomem *pio, unsigned pin,
 					u32 strength);
+	unsigned (*get_slewrate)(void __iomem *pio, unsigned pin);
+	void (*set_slewrate)(void __iomem *pio, unsigned pin, u32 slewrate);
 	/* irq */
 	int (*irq_type)(struct irq_data *d, unsigned type);
 };
@@ -585,6 +597,16 @@ static unsigned at91_mux_sam9x60_get_drivestrength(void __iomem *pio,
 	return DRIVE_STRENGTH_BIT_LOW;
 }
 
+static unsigned at91_mux_sam9x60_get_slewrate(void __iomem *pio, unsigned pin)
+{
+	unsigned tmp = readl_relaxed(pio + SAM9X60_PIO_SLEWR);
+
+	if ((tmp & BIT(pin)))
+		return SLEWRATE_BIT_ENA;
+
+	return SLEWRATE_BIT_DIS;
+}
+
 static void set_drive_strength(void __iomem *reg, unsigned pin, u32 strength)
 {
 	unsigned tmp = readl_relaxed(reg);
@@ -643,6 +665,24 @@ static void at91_mux_sam9x60_set_drivestrength(void __iomem *pio, unsigned pin,
 	writel_relaxed(tmp, pio + SAM9X60_PIO_DRIVER1);
 }
 
+static void at91_mux_sam9x60_set_slewrate(void __iomem *pio, unsigned pin,
+					  u32 setting)
+{
+	unsigned int tmp;
+
+	if (setting < SLEWRATE_BIT_DIS || setting > SLEWRATE_BIT_ENA)
+		return;
+
+	tmp = readl_relaxed(pio + SAM9X60_PIO_SLEWR);
+
+	if (setting == SLEWRATE_BIT_DIS)
+		tmp &= ~BIT(pin);
+	else
+		tmp |= BIT(pin);
+
+	writel_relaxed(tmp, pio + SAM9X60_PIO_SLEWR);
+}
+
 static struct at91_pinctrl_mux_ops at91rm9200_ops = {
 	.get_periph	= at91_mux_get_periph,
 	.mux_A_periph	= at91_mux_set_A_periph,
@@ -687,6 +727,8 @@ static const struct at91_pinctrl_mux_ops sam9x60_ops = {
 	.disable_schmitt_trig = at91_mux_pio3_disable_schmitt_trig,
 	.get_drivestrength = at91_mux_sam9x60_get_drivestrength,
 	.set_drivestrength = at91_mux_sam9x60_set_drivestrength,
+	.get_slewrate   = at91_mux_sam9x60_get_slewrate,
+	.set_slewrate   = at91_mux_sam9x60_set_slewrate,
 	.irq_type	= alt_gpio_irq_type,
 
 };
@@ -950,6 +992,8 @@ static int at91_pinconf_get(struct pinctrl_dev *pctldev,
 	if (info->ops->get_drivestrength)
 		*config |= (info->ops->get_drivestrength(pio, pin)
 				<< DRIVE_STRENGTH_SHIFT);
+	if (info->ops->get_slewrate)
+		*config |= (info->ops->get_slewrate(pio, pin) << SLEWRATE_SHIFT);
 	if (at91_mux_get_output(pio, pin, &out))
 		*config |= OUTPUT | (out << OUTPUT_VAL_SHIFT);
 
@@ -1001,6 +1045,9 @@ static int at91_pinconf_set(struct pinctrl_dev *pctldev,
 			info->ops->set_drivestrength(pio, pin,
 				(config & DRIVE_STRENGTH)
 					>> DRIVE_STRENGTH_SHIFT);
+		if (info->ops->set_slewrate)
+			info->ops->set_slewrate(pio, pin,
+				(config & SLEWRATE) >> SLEWRATE_SHIFT);
 
 	} /* for each config */
 
@@ -1044,6 +1091,7 @@ static void at91_pinconf_dbg_show(struct pinctrl_dev *pctldev,
 			     DRIVE_STRENGTH_MED);
 	DBG_SHOW_FLAG_MASKED(DRIVE_STRENGTH, DRIVE_STRENGTH_BIT_MSK(HI),
 			     DRIVE_STRENGTH_HI);
+	DBG_SHOW_FLAG(SLEWRATE);
 	DBG_SHOW_FLAG(DEBOUNCE);
 	if (config & DEBOUNCE) {
 		val = config >> DEBOUNCE_VAL_SHIFT;
diff --git a/drivers/pinctrl/pinctrl-at91.h b/drivers/pinctrl/pinctrl-at91.h
index 19fc27e66bfd..223620f14b05 100644
--- a/drivers/pinctrl/pinctrl-at91.h
+++ b/drivers/pinctrl/pinctrl-at91.h
@@ -69,6 +69,7 @@
 #define AT91SAM9X5_PIO_DRIVER1	0x114  /*PIO Driver 1 register offset*/
 #define AT91SAM9X5_PIO_DRIVER2	0x118  /*PIO Driver 2 register offset*/
 
+#define SAM9X60_PIO_SLEWR	0x110	/* PIO Slew Rate Control Register */
 #define SAM9X60_PIO_DRIVER1	0x118	/* PIO Driver 1 register offset */
 
 #endif
diff --git a/include/dt-bindings/pinctrl/at91.h b/include/dt-bindings/pinctrl/at91.h
index eb81867eac77..8dc10e00c627 100644
--- a/include/dt-bindings/pinctrl/at91.h
+++ b/include/dt-bindings/pinctrl/at91.h
@@ -17,6 +17,7 @@
 #define AT91_PINCTRL_DIS_SCHMIT		(1 << 4)
 #define AT91_PINCTRL_OUTPUT		(1 << 7)
 #define AT91_PINCTRL_OUTPUT_VAL(x)	((x & 0x1) << 8)
+#define AT91_PINCTRL_SLEWRATE		(1 << 9)
 #define AT91_PINCTRL_DEBOUNCE		(1 << 16)
 #define AT91_PINCTRL_DEBOUNCE_VAL(x)	(x << 17)
 
@@ -27,6 +28,9 @@
 #define AT91_PINCTRL_DRIVE_STRENGTH_MED			(0x2 << 5)
 #define AT91_PINCTRL_DRIVE_STRENGTH_HI			(0x3 << 5)
 
+#define AT91_PINCTRL_SLEWRATE_DIS	(0x0 << 9)
+#define AT91_PINCTRL_SLEWRATE_ENA	(0x1 << 9)
+
 #define AT91_PIOA	0
 #define AT91_PIOB	1
 #define AT91_PIOC	2
-- 
cgit v1.2.3


From f3d5e4f18dba18d7c2303dda68b9dbcf5ccc05cd Mon Sep 17 00:00:00 2001
From: Alexander Shiyan <shc_work@mail.ru>
Date: Sat, 19 Jan 2019 07:52:01 +0300
Subject: ata: pata_of_platform: Allow to use 16-bit wide data transfer

In some cases, the system bus can be configured for 16-bit mode,
in this case using read/write functions for 32-bit values
results in two cycles of 16 bits each, which is wrong.
This patch adds the devicetree flag to switch the driver to
use 16-bit mode for I/O transfers.

Acked-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Alexander Shiyan <shc_work@mail.ru>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ata/pata_of_platform.c |  6 +++++-
 drivers/ata/pata_platform.c    | 22 ++++++++++++----------
 include/linux/ata_platform.h   |  3 ++-
 3 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/pata_of_platform.c b/drivers/ata/pata_of_platform.c
index 01161c1aef4d..7a0b1759e5f0 100644
--- a/drivers/ata/pata_of_platform.c
+++ b/drivers/ata/pata_of_platform.c
@@ -32,6 +32,7 @@ static int pata_of_platform_probe(struct platform_device *ofdev)
 	unsigned int reg_shift = 0;
 	int pio_mode = 0;
 	int pio_mask;
+	bool use16bit;
 
 	ret = of_address_to_resource(dn, 0, &io_res);
 	if (ret) {
@@ -60,11 +61,14 @@ static int pata_of_platform_probe(struct platform_device *ofdev)
 		dev_info(&ofdev->dev, "pio-mode unspecified, assuming PIO0\n");
 	}
 
+	use16bit = of_property_read_bool(dn, "ata-generic,use16bit");
+
 	pio_mask = 1 << pio_mode;
 	pio_mask |= (1 << pio_mode) - 1;
 
 	return __pata_platform_probe(&ofdev->dev, &io_res, &ctl_res, irq_res,
-				     reg_shift, pio_mask, &pata_platform_sht);
+				     reg_shift, pio_mask, &pata_platform_sht,
+				     use16bit);
 }
 
 static const struct of_device_id pata_of_platform_match[] = {
diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c
index d6f8f5406442..31cd0f39b0a7 100644
--- a/drivers/ata/pata_platform.c
+++ b/drivers/ata/pata_platform.c
@@ -47,13 +47,6 @@ static struct scsi_host_template pata_platform_sht = {
 	ATA_PIO_SHT(DRV_NAME),
 };
 
-static struct ata_port_operations pata_platform_port_ops = {
-	.inherits		= &ata_sff_port_ops,
-	.sff_data_xfer		= ata_sff_data_xfer32,
-	.cable_detect		= ata_cable_unknown,
-	.set_mode		= pata_platform_set_mode,
-};
-
 static void pata_platform_setup_port(struct ata_ioports *ioaddr,
 				     unsigned int shift)
 {
@@ -79,6 +72,7 @@ static void pata_platform_setup_port(struct ata_ioports *ioaddr,
  *	@ioport_shift: I/O port shift
  *	@__pio_mask: PIO mask
  *	@sht: scsi_host_template to use when registering
+ *	@use16bit: Flag to indicate 16-bit IO instead of 32-bit
  *
  *	Register a platform bus IDE interface. Such interfaces are PIO and we
  *	assume do not support IRQ sharing.
@@ -101,7 +95,7 @@ static void pata_platform_setup_port(struct ata_ioports *ioaddr,
 int __pata_platform_probe(struct device *dev, struct resource *io_res,
 			  struct resource *ctl_res, struct resource *irq_res,
 			  unsigned int ioport_shift, int __pio_mask,
-			  struct scsi_host_template *sht)
+			  struct scsi_host_template *sht, bool use16bit)
 {
 	struct ata_host *host;
 	struct ata_port *ap;
@@ -131,7 +125,15 @@ int __pata_platform_probe(struct device *dev, struct resource *io_res,
 		return -ENOMEM;
 	ap = host->ports[0];
 
-	ap->ops = &pata_platform_port_ops;
+	ap->ops = devm_kzalloc(dev, sizeof(*ap->ops), GFP_KERNEL);
+	ap->ops->inherits = &ata_sff_port_ops;
+	ap->ops->cable_detect = ata_cable_unknown;
+	ap->ops->set_mode = pata_platform_set_mode;
+	if (use16bit)
+		ap->ops->sff_data_xfer = ata_sff_data_xfer;
+	else
+		ap->ops->sff_data_xfer = ata_sff_data_xfer32;
+
 	ap->pio_mask = __pio_mask;
 	ap->flags |= ATA_FLAG_SLAVE_POSS;
 
@@ -218,7 +220,7 @@ static int pata_platform_probe(struct platform_device *pdev)
 
 	return __pata_platform_probe(&pdev->dev, io_res, ctl_res, irq_res,
 				     pp_info ? pp_info->ioport_shift : 0,
-				     pio_mask, &pata_platform_sht);
+				     pio_mask, &pata_platform_sht, false);
 }
 
 static struct platform_driver pata_platform_driver = {
diff --git a/include/linux/ata_platform.h b/include/linux/ata_platform.h
index ff2120215dec..9cafec92282d 100644
--- a/include/linux/ata_platform.h
+++ b/include/linux/ata_platform.h
@@ -19,7 +19,8 @@ extern int __pata_platform_probe(struct device *dev,
 				 struct resource *irq_res,
 				 unsigned int ioport_shift,
 				 int __pio_mask,
-				 struct scsi_host_template *sht);
+				 struct scsi_host_template *sht,
+				 bool use16bit);
 
 /*
  * Marvell SATA private data
-- 
cgit v1.2.3


From ddd065e423c1bad6c573297455ef0a9755ef12d4 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Feb 2019 09:54:38 -0700
Subject: genirq/msi: Clean up usage of __u8/__u16 types

The double underscore types are meant for compatibility in userspace
headers which does not apply here. Therefore, change to use the standard
no-underscore types.

The origin of the double underscore types dates back to before the git era
so I was not able to find a commit to see the original justification.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/msi.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 784fb52b9900..7e9b81c3b50d 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -83,12 +83,12 @@ struct msi_desc {
 		struct {
 			u32 masked;
 			struct {
-				__u8	is_msix		: 1;
-				__u8	multiple	: 3;
-				__u8	multi_cap	: 3;
-				__u8	maskbit		: 1;
-				__u8	is_64		: 1;
-				__u16	entry_nr;
+				u8	is_msix		: 1;
+				u8	multiple	: 3;
+				u8	multi_cap	: 3;
+				u8	maskbit		: 1;
+				u8	is_64		: 1;
+				u16	entry_nr;
 				unsigned default_irq;
 			} msi_attrib;
 			union {
-- 
cgit v1.2.3


From 2e5a662de36a92a95b5939273468b01785dc41ec Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 6 Feb 2019 08:16:51 +0100
Subject: i2c: cbus-gpio: Switch to use GPIO descriptors

This augments the CBUS GPIO I2C driver to use GPIO
descriptors for clock, sel and data. We drop the platform
data that was only used for carrying GPIO numbers and
use machine descriptor tables instead.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Tested-by: Aaro Koskinen <aaro.koskinen@iki.fi>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 arch/arm/mach-omap1/board-nokia770.c        | 18 ++++---
 drivers/i2c/busses/i2c-cbus-gpio.c          | 80 +++++++++++------------------
 include/linux/platform_data/i2c-cbus-gpio.h | 27 ----------
 3 files changed, 40 insertions(+), 85 deletions(-)
 delete mode 100644 include/linux/platform_data/i2c-cbus-gpio.h

(limited to 'include')

diff --git a/arch/arm/mach-omap1/board-nokia770.c b/arch/arm/mach-omap1/board-nokia770.c
index eb41db78cd47..10848f573d37 100644
--- a/arch/arm/mach-omap1/board-nokia770.c
+++ b/arch/arm/mach-omap1/board-nokia770.c
@@ -10,6 +10,7 @@
 #include <linux/clkdev.h>
 #include <linux/irq.h>
 #include <linux/gpio.h>
+#include <linux/gpio/machine.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
@@ -25,7 +26,6 @@
 #include <linux/platform_data/keypad-omap.h>
 #include <linux/platform_data/lcd-mipid.h>
 #include <linux/platform_data/gpio-omap.h>
-#include <linux/platform_data/i2c-cbus-gpio.h>
 
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
@@ -217,18 +217,19 @@ static inline void nokia770_mmc_init(void)
 #endif
 
 #if IS_ENABLED(CONFIG_I2C_CBUS_GPIO)
-static struct i2c_cbus_platform_data nokia770_cbus_data = {
-	.clk_gpio = OMAP_MPUIO(9),
-	.dat_gpio = OMAP_MPUIO(10),
-	.sel_gpio = OMAP_MPUIO(11),
+static struct gpiod_lookup_table nokia770_cbus_gpio_table = {
+	.dev_id = "i2c-cbus-gpio.2",
+	.table = {
+		GPIO_LOOKUP_IDX("mpuio", 9, NULL, 0, 0), /* clk */
+		GPIO_LOOKUP_IDX("mpuio", 10, NULL, 1, 0), /* dat */
+		GPIO_LOOKUP_IDX("mpuio", 11, NULL, 2, 0), /* sel */
+		{ },
+	},
 };
 
 static struct platform_device nokia770_cbus_device = {
 	.name   = "i2c-cbus-gpio",
 	.id     = 2,
-	.dev    = {
-		.platform_data = &nokia770_cbus_data,
-	},
 };
 
 static struct i2c_board_info nokia770_i2c_board_info_2[] __initdata = {
@@ -257,6 +258,7 @@ static void __init nokia770_cbus_init(void)
 	nokia770_i2c_board_info_2[1].irq = gpio_to_irq(tahvo_irq_gpio);
 	i2c_register_board_info(2, nokia770_i2c_board_info_2,
 				ARRAY_SIZE(nokia770_i2c_board_info_2));
+	gpiod_add_lookup_table(&nokia770_cbus_gpio_table);
 	platform_device_register(&nokia770_cbus_device);
 }
 #else /* CONFIG_I2C_CBUS_GPIO */
diff --git a/drivers/i2c/busses/i2c-cbus-gpio.c b/drivers/i2c/busses/i2c-cbus-gpio.c
index b4f91e48948a..72df563477b1 100644
--- a/drivers/i2c/busses/i2c-cbus-gpio.c
+++ b/drivers/i2c/busses/i2c-cbus-gpio.c
@@ -18,16 +18,14 @@
 
 #include <linux/io.h>
 #include <linux/i2c.h>
-#include <linux/gpio.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/of_gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
-#include <linux/platform_data/i2c-cbus-gpio.h>
 
 /*
  * Bit counts are derived from Nokia implementation. These should be checked
@@ -39,9 +37,9 @@
 struct cbus_host {
 	spinlock_t	lock;		/* host lock */
 	struct device	*dev;
-	int		clk_gpio;
-	int		dat_gpio;
-	int		sel_gpio;
+	struct gpio_desc *clk;
+	struct gpio_desc *dat;
+	struct gpio_desc *sel;
 };
 
 /**
@@ -51,9 +49,9 @@ struct cbus_host {
  */
 static void cbus_send_bit(struct cbus_host *host, unsigned bit)
 {
-	gpio_set_value(host->dat_gpio, bit ? 1 : 0);
-	gpio_set_value(host->clk_gpio, 1);
-	gpio_set_value(host->clk_gpio, 0);
+	gpiod_set_value(host->dat, bit ? 1 : 0);
+	gpiod_set_value(host->clk, 1);
+	gpiod_set_value(host->clk, 0);
 }
 
 /**
@@ -78,9 +76,9 @@ static int cbus_receive_bit(struct cbus_host *host)
 {
 	int ret;
 
-	gpio_set_value(host->clk_gpio, 1);
-	ret = gpio_get_value(host->dat_gpio);
-	gpio_set_value(host->clk_gpio, 0);
+	gpiod_set_value(host->clk, 1);
+	ret = gpiod_get_value(host->dat);
+	gpiod_set_value(host->clk, 0);
 	return ret;
 }
 
@@ -123,10 +121,10 @@ static int cbus_transfer(struct cbus_host *host, char rw, unsigned dev,
 	spin_lock_irqsave(&host->lock, flags);
 
 	/* Reset state and start of transfer, SEL stays down during transfer */
-	gpio_set_value(host->sel_gpio, 0);
+	gpiod_set_value(host->sel, 0);
 
 	/* Set the DAT pin to output */
-	gpio_direction_output(host->dat_gpio, 1);
+	gpiod_direction_output(host->dat, 1);
 
 	/* Send the device address */
 	cbus_send_data(host, dev, CBUS_ADDR_BITS);
@@ -141,12 +139,12 @@ static int cbus_transfer(struct cbus_host *host, char rw, unsigned dev,
 		cbus_send_data(host, data, 16);
 		ret = 0;
 	} else {
-		ret = gpio_direction_input(host->dat_gpio);
+		ret = gpiod_direction_input(host->dat);
 		if (ret) {
 			dev_dbg(host->dev, "failed setting direction\n");
 			goto out;
 		}
-		gpio_set_value(host->clk_gpio, 1);
+		gpiod_set_value(host->clk, 1);
 
 		ret = cbus_receive_word(host);
 		if (ret < 0) {
@@ -156,9 +154,9 @@ static int cbus_transfer(struct cbus_host *host, char rw, unsigned dev,
 	}
 
 	/* Indicate end of transfer, SEL goes up until next transfer */
-	gpio_set_value(host->sel_gpio, 1);
-	gpio_set_value(host->clk_gpio, 1);
-	gpio_set_value(host->clk_gpio, 0);
+	gpiod_set_value(host->sel, 1);
+	gpiod_set_value(host->clk, 1);
+	gpiod_set_value(host->clk, 0);
 
 out:
 	spin_unlock_irqrestore(&host->lock, flags);
@@ -214,7 +212,6 @@ static int cbus_i2c_probe(struct platform_device *pdev)
 {
 	struct i2c_adapter *adapter;
 	struct cbus_host *chost;
-	int ret;
 
 	adapter = devm_kzalloc(&pdev->dev, sizeof(struct i2c_adapter),
 			       GFP_KERNEL);
@@ -225,22 +222,20 @@ static int cbus_i2c_probe(struct platform_device *pdev)
 	if (!chost)
 		return -ENOMEM;
 
-	if (pdev->dev.of_node) {
-		struct device_node *dnode = pdev->dev.of_node;
-		if (of_gpio_count(dnode) != 3)
-			return -ENODEV;
-		chost->clk_gpio = of_get_gpio(dnode, 0);
-		chost->dat_gpio = of_get_gpio(dnode, 1);
-		chost->sel_gpio = of_get_gpio(dnode, 2);
-	} else if (dev_get_platdata(&pdev->dev)) {
-		struct i2c_cbus_platform_data *pdata =
-			dev_get_platdata(&pdev->dev);
-		chost->clk_gpio = pdata->clk_gpio;
-		chost->dat_gpio = pdata->dat_gpio;
-		chost->sel_gpio = pdata->sel_gpio;
-	} else {
+	if (gpiod_count(&pdev->dev, NULL) != 3)
 		return -ENODEV;
-	}
+	chost->clk = devm_gpiod_get_index(&pdev->dev, NULL, 0, GPIOD_OUT_LOW);
+	if (IS_ERR(chost->clk))
+		return PTR_ERR(chost->clk);
+	chost->dat = devm_gpiod_get_index(&pdev->dev, NULL, 1, GPIOD_IN);
+	if (IS_ERR(chost->dat))
+		return PTR_ERR(chost->dat);
+	chost->sel = devm_gpiod_get_index(&pdev->dev, NULL, 2, GPIOD_OUT_HIGH);
+	if (IS_ERR(chost->sel))
+		return PTR_ERR(chost->sel);
+	gpiod_set_consumer_name(chost->clk, "CBUS clk");
+	gpiod_set_consumer_name(chost->dat, "CBUS dat");
+	gpiod_set_consumer_name(chost->sel, "CBUS sel");
 
 	adapter->owner		= THIS_MODULE;
 	adapter->class		= I2C_CLASS_HWMON;
@@ -254,21 +249,6 @@ static int cbus_i2c_probe(struct platform_device *pdev)
 	spin_lock_init(&chost->lock);
 	chost->dev = &pdev->dev;
 
-	ret = devm_gpio_request_one(&pdev->dev, chost->clk_gpio,
-				    GPIOF_OUT_INIT_LOW, "CBUS clk");
-	if (ret)
-		return ret;
-
-	ret = devm_gpio_request_one(&pdev->dev, chost->dat_gpio, GPIOF_IN,
-				    "CBUS data");
-	if (ret)
-		return ret;
-
-	ret = devm_gpio_request_one(&pdev->dev, chost->sel_gpio,
-				    GPIOF_OUT_INIT_HIGH, "CBUS sel");
-	if (ret)
-		return ret;
-
 	i2c_set_adapdata(adapter, chost);
 	platform_set_drvdata(pdev, adapter);
 
diff --git a/include/linux/platform_data/i2c-cbus-gpio.h b/include/linux/platform_data/i2c-cbus-gpio.h
deleted file mode 100644
index 6faa992a9502..000000000000
--- a/include/linux/platform_data/i2c-cbus-gpio.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * i2c-cbus-gpio.h - CBUS I2C platform_data definition
- *
- * Copyright (C) 2004-2009 Nokia Corporation
- *
- * Written by Felipe Balbi and Aaro Koskinen.
- *
- * This file is subject to the terms and conditions of the GNU General
- * Public License. See the file "COPYING" in the main directory of this
- * archive for more details.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#ifndef __INCLUDE_LINUX_I2C_CBUS_GPIO_H
-#define __INCLUDE_LINUX_I2C_CBUS_GPIO_H
-
-struct i2c_cbus_platform_data {
-	int dat_gpio;
-	int clk_gpio;
-	int sel_gpio;
-};
-
-#endif /* __INCLUDE_LINUX_I2C_CBUS_GPIO_H */
-- 
cgit v1.2.3


From 6f4e626fb0cc93d50b49b79c2ee33bd769ee57f0 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Thu, 7 Feb 2019 09:07:20 -0700
Subject: scsi: ata: Use unsigned int for cmd's type in ioctls in
 scsi_host_template

Clang warns several times in the scsi subsystem (trimmed for brevity):

drivers/scsi/hpsa.c:6209:7: warning: overflow converting case value to
switch condition type (2147762695 to 18446744071562347015) [-Wswitch]
        case CCISS_GETBUSTYPES:
             ^
drivers/scsi/hpsa.c:6208:7: warning: overflow converting case value to
switch condition type (2147762694 to 18446744071562347014) [-Wswitch]
        case CCISS_GETHEARTBEAT:
             ^

The root cause is that the _IOC macro can generate really large numbers,
which don't fit into type 'int', which is used for the cmd parameter in
the ioctls in scsi_host_template. My research into how GCC and Clang are
handling this at a low level didn't prove fruitful. However, looking at
the rest of the kernel tree, all ioctls use an 'unsigned int' for the
cmd parameter, which will fit all of the _IOC values in the scsi/ata
subsystems.

Make that change because none of the ioctls expect a negative value for
any command, it brings the ioctls inline with the reset of the kernel,
and it removes ambiguity, which is never good when dealing with compilers.

Link: https://github.com/ClangBuiltLinux/linux/issues/85
Link: https://github.com/ClangBuiltLinux/linux/issues/154
Link: https://github.com/ClangBuiltLinux/linux/issues/157
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Acked-by: Bradley Grove <bgrove@attotech.com>
Acked-by: Don Brace <don.brace@microsemi.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ata/libata-scsi.c             |  5 +++--
 drivers/scsi/aacraid/aachba.c         |  2 +-
 drivers/scsi/aacraid/aacraid.h        |  4 ++--
 drivers/scsi/aacraid/commctrl.c       |  2 +-
 drivers/scsi/aacraid/linit.c          |  6 ++++--
 drivers/scsi/cxlflash/common.h        |  3 ++-
 drivers/scsi/cxlflash/main.c          |  2 +-
 drivers/scsi/cxlflash/superpipe.c     | 12 +++++-------
 drivers/scsi/esas2r/esas2r.h          |  4 ++--
 drivers/scsi/esas2r/esas2r_ioctl.c    | 16 +++++++---------
 drivers/scsi/esas2r/esas2r_main.c     |  2 +-
 drivers/scsi/hpsa.c                   | 15 +++++++++------
 drivers/scsi/ipr.c                    |  3 ++-
 drivers/scsi/libsas/sas_scsi_host.c   |  2 +-
 drivers/scsi/scsi_debug.c             |  3 ++-
 drivers/scsi/smartpqi/smartpqi_init.c |  3 ++-
 include/linux/libata.h                |  5 +++--
 include/scsi/libsas.h                 |  3 ++-
 include/scsi/scsi_host.h              |  6 ++++--
 19 files changed, 54 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 3d4887d0e84a..6291f1dbf342 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -778,7 +778,7 @@ static int ata_ioc32(struct ata_port *ap)
 }
 
 int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *scsidev,
-		     int cmd, void __user *arg)
+		     unsigned int cmd, void __user *arg)
 {
 	unsigned long val;
 	int rc = -EINVAL;
@@ -829,7 +829,8 @@ int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *scsidev,
 }
 EXPORT_SYMBOL_GPL(ata_sas_scsi_ioctl);
 
-int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg)
+int ata_scsi_ioctl(struct scsi_device *scsidev, unsigned int cmd,
+		   void __user *arg)
 {
 	return ata_sas_scsi_ioctl(ata_shost_to_port(scsidev->host),
 				scsidev, cmd, arg);
diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index 75ab5ff6b78c..6085aa087a2f 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -3455,7 +3455,7 @@ static int delete_disk(struct aac_dev *dev, void __user *arg)
 	}
 }
 
-int aac_dev_ioctl(struct aac_dev *dev, int cmd, void __user *arg)
+int aac_dev_ioctl(struct aac_dev *dev, unsigned int cmd, void __user *arg)
 {
 	switch (cmd) {
 	case FSACTL_QUERY_DISK:
diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index 3291d1c16864..1df5171594b8 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -2706,12 +2706,12 @@ void aac_set_intx_mode(struct aac_dev *dev);
 int aac_get_config_status(struct aac_dev *dev, int commit_flag);
 int aac_get_containers(struct aac_dev *dev);
 int aac_scsi_cmd(struct scsi_cmnd *cmd);
-int aac_dev_ioctl(struct aac_dev *dev, int cmd, void __user *arg);
+int aac_dev_ioctl(struct aac_dev *dev, unsigned int cmd, void __user *arg);
 #ifndef shost_to_class
 #define shost_to_class(shost) &shost->shost_dev
 #endif
 ssize_t aac_get_serial_number(struct device *dev, char *buf);
-int aac_do_ioctl(struct aac_dev * dev, int cmd, void __user *arg);
+int aac_do_ioctl(struct aac_dev *dev, unsigned int cmd, void __user *arg);
 int aac_rx_init(struct aac_dev *dev);
 int aac_rkt_init(struct aac_dev *dev);
 int aac_nark_init(struct aac_dev *dev);
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index e2899ff7913e..f0ff40332753 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -1060,7 +1060,7 @@ static int aac_send_reset_adapter(struct aac_dev *dev, void __user *arg)
 	return retval;
 }
 
-int aac_do_ioctl(struct aac_dev * dev, int cmd, void __user *arg)
+int aac_do_ioctl(struct aac_dev *dev, unsigned int cmd, void __user *arg)
 {
 	int status;
 
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index dd701d5243b1..22ecacffeca6 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -616,7 +616,8 @@ static struct device_attribute *aac_dev_attrs[] = {
 	NULL,
 };
 
-static int aac_ioctl(struct scsi_device *sdev, int cmd, void __user * arg)
+static int aac_ioctl(struct scsi_device *sdev, unsigned int cmd,
+		     void __user *arg)
 {
 	struct aac_dev *dev = (struct aac_dev *)sdev->host->hostdata;
 	if (!capable(CAP_SYS_RAWIO))
@@ -1205,7 +1206,8 @@ static long aac_compat_do_ioctl(struct aac_dev *dev, unsigned cmd, unsigned long
 	return ret;
 }
 
-static int aac_compat_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
+static int aac_compat_ioctl(struct scsi_device *sdev, unsigned int cmd,
+			    void __user *arg)
 {
 	struct aac_dev *dev = (struct aac_dev *)sdev->host->hostdata;
 	if (!capable(CAP_SYS_RAWIO))
diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
index 8908a20065c8..4d90106fcb37 100644
--- a/drivers/scsi/cxlflash/common.h
+++ b/drivers/scsi/cxlflash/common.h
@@ -334,7 +334,8 @@ int cxlflash_afu_sync(struct afu *afu, ctx_hndl_t c, res_hndl_t r, u8 mode);
 void cxlflash_list_init(void);
 void cxlflash_term_global_luns(void);
 void cxlflash_free_errpage(void);
-int cxlflash_ioctl(struct scsi_device *sdev, int cmd, void __user *arg);
+int cxlflash_ioctl(struct scsi_device *sdev, unsigned int cmd,
+		   void __user *arg);
 void cxlflash_stop_term_user_contexts(struct cxlflash_cfg *cfg);
 int cxlflash_mark_contexts_error(struct cxlflash_cfg *cfg);
 void cxlflash_term_local_luns(struct cxlflash_cfg *cfg);
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index bfa13e3b191c..a0ea2dea7518 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -3282,7 +3282,7 @@ static int cxlflash_chr_open(struct inode *inode, struct file *file)
  *
  * Return: A string identifying the decoded host ioctl.
  */
-static char *decode_hioctl(int cmd)
+static char *decode_hioctl(unsigned int cmd)
 {
 	switch (cmd) {
 	case HT_CXLFLASH_LUN_PROVISION:
diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c
index acac6152f50b..1a94a469051e 100644
--- a/drivers/scsi/cxlflash/superpipe.c
+++ b/drivers/scsi/cxlflash/superpipe.c
@@ -1924,7 +1924,7 @@ out:
  *
  * Return: A string identifying the decoded ioctl.
  */
-static char *decode_ioctl(int cmd)
+static char *decode_ioctl(unsigned int cmd)
 {
 	switch (cmd) {
 	case DK_CXLFLASH_ATTACH:
@@ -2051,7 +2051,7 @@ err1:
  *
  * Return: 0 on success, -errno on failure
  */
-static int ioctl_common(struct scsi_device *sdev, int cmd)
+static int ioctl_common(struct scsi_device *sdev, unsigned int cmd)
 {
 	struct cxlflash_cfg *cfg = shost_priv(sdev->host);
 	struct device *dev = &cfg->dev->dev;
@@ -2096,7 +2096,7 @@ out:
  *
  * Return: 0 on success, -errno on failure
  */
-int cxlflash_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
+int cxlflash_ioctl(struct scsi_device *sdev, unsigned int cmd, void __user *arg)
 {
 	typedef int (*sioctl) (struct scsi_device *, void *);
 
@@ -2179,8 +2179,7 @@ int cxlflash_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
 	}
 
 	if (unlikely(copy_from_user(&buf, arg, size))) {
-		dev_err(dev, "%s: copy_from_user() fail "
-			"size=%lu cmd=%d (%s) arg=%p\n",
+		dev_err(dev, "%s: copy_from_user() fail size=%lu cmd=%u (%s) arg=%p\n",
 			__func__, size, cmd, decode_ioctl(cmd), arg);
 		rc = -EFAULT;
 		goto cxlflash_ioctl_exit;
@@ -2203,8 +2202,7 @@ int cxlflash_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
 	rc = do_ioctl(sdev, (void *)&buf);
 	if (likely(!rc))
 		if (unlikely(copy_to_user(arg, &buf, size))) {
-			dev_err(dev, "%s: copy_to_user() fail "
-				"size=%lu cmd=%d (%s) arg=%p\n",
+			dev_err(dev, "%s: copy_to_user() fail size=%lu cmd=%u (%s) arg=%p\n",
 				__func__, size, cmd, decode_ioctl(cmd), arg);
 			rc = -EFAULT;
 		}
diff --git a/drivers/scsi/esas2r/esas2r.h b/drivers/scsi/esas2r/esas2r.h
index 858c3b33db78..7f43b95f4e94 100644
--- a/drivers/scsi/esas2r/esas2r.h
+++ b/drivers/scsi/esas2r/esas2r.h
@@ -965,8 +965,8 @@ struct esas2r_adapter {
 const char *esas2r_info(struct Scsi_Host *);
 int esas2r_write_params(struct esas2r_adapter *a, struct esas2r_request *rq,
 			struct esas2r_sas_nvram *data);
-int esas2r_ioctl_handler(void *hostdata, int cmd, void __user *arg);
-int esas2r_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
+int esas2r_ioctl_handler(void *hostdata, unsigned int cmd, void __user *arg);
+int esas2r_ioctl(struct scsi_device *dev, unsigned int cmd, void __user *arg);
 u8 handle_hba_ioctl(struct esas2r_adapter *a,
 		    struct atto_ioctl *ioctl_hba);
 int esas2r_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd);
diff --git a/drivers/scsi/esas2r/esas2r_ioctl.c b/drivers/scsi/esas2r/esas2r_ioctl.c
index 34bcc8c04ff4..3d130523c288 100644
--- a/drivers/scsi/esas2r/esas2r_ioctl.c
+++ b/drivers/scsi/esas2r/esas2r_ioctl.c
@@ -1274,7 +1274,7 @@ int esas2r_write_params(struct esas2r_adapter *a, struct esas2r_request *rq,
 
 
 /* This function only cares about ATTO-specific ioctls (atto_express_ioctl) */
-int esas2r_ioctl_handler(void *hostdata, int cmd, void __user *arg)
+int esas2r_ioctl_handler(void *hostdata, unsigned int cmd, void __user *arg)
 {
 	struct atto_express_ioctl *ioctl = NULL;
 	struct esas2r_adapter *a;
@@ -1292,9 +1292,8 @@ int esas2r_ioctl_handler(void *hostdata, int cmd, void __user *arg)
 	ioctl = memdup_user(arg, sizeof(struct atto_express_ioctl));
 	if (IS_ERR(ioctl)) {
 		esas2r_log(ESAS2R_LOG_WARN,
-			   "ioctl_handler access_ok failed for cmd %d, "
-			   "address %p", cmd,
-			   arg);
+			   "ioctl_handler access_ok failed for cmd %u, address %p",
+			   cmd, arg);
 		return PTR_ERR(ioctl);
 	}
 
@@ -1493,7 +1492,7 @@ int esas2r_ioctl_handler(void *hostdata, int cmd, void __user *arg)
 ioctl_done:
 
 	if (err < 0) {
-		esas2r_log(ESAS2R_LOG_WARN, "err %d on ioctl cmd %d", err,
+		esas2r_log(ESAS2R_LOG_WARN, "err %d on ioctl cmd %u", err,
 			   cmd);
 
 		switch (err) {
@@ -1518,9 +1517,8 @@ ioctl_done:
 	err = __copy_to_user(arg, ioctl, sizeof(struct atto_express_ioctl));
 	if (err != 0) {
 		esas2r_log(ESAS2R_LOG_WARN,
-			   "ioctl_handler copy_to_user didn't copy "
-			   "everything (err %d, cmd %d)", err,
-			   cmd);
+			   "ioctl_handler copy_to_user didn't copy everything (err %d, cmd %u)",
+			   err, cmd);
 		kfree(ioctl);
 
 		return -EFAULT;
@@ -1531,7 +1529,7 @@ ioctl_done:
 	return 0;
 }
 
-int esas2r_ioctl(struct scsi_device *sd, int cmd, void __user *arg)
+int esas2r_ioctl(struct scsi_device *sd, unsigned int cmd, void __user *arg)
 {
 	return esas2r_ioctl_handler(sd->host->hostdata, cmd, arg);
 }
diff --git a/drivers/scsi/esas2r/esas2r_main.c b/drivers/scsi/esas2r/esas2r_main.c
index 64397d441bae..fdbda5c05aa0 100644
--- a/drivers/scsi/esas2r/esas2r_main.c
+++ b/drivers/scsi/esas2r/esas2r_main.c
@@ -623,7 +623,7 @@ static int esas2r_proc_major;
 long esas2r_proc_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
 {
 	return esas2r_ioctl_handler(esas2r_proc_host->hostdata,
-				    (int)cmd, (void __user *)arg);
+				    cmd, (void __user *)arg);
 }
 
 static void __exit esas2r_exit(void)
diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 5284444fdd10..f044e7d10d63 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -251,10 +251,11 @@ static int number_of_controllers;
 
 static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id);
 static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id);
-static int hpsa_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
+static int hpsa_ioctl(struct scsi_device *dev, unsigned int cmd,
+		      void __user *arg);
 
 #ifdef CONFIG_COMPAT
-static int hpsa_compat_ioctl(struct scsi_device *dev, int cmd,
+static int hpsa_compat_ioctl(struct scsi_device *dev, unsigned int cmd,
 	void __user *arg);
 #endif
 
@@ -6127,7 +6128,7 @@ static void cmd_free(struct ctlr_info *h, struct CommandList *c)
 
 #ifdef CONFIG_COMPAT
 
-static int hpsa_ioctl32_passthru(struct scsi_device *dev, int cmd,
+static int hpsa_ioctl32_passthru(struct scsi_device *dev, unsigned int cmd,
 	void __user *arg)
 {
 	IOCTL32_Command_struct __user *arg32 =
@@ -6164,7 +6165,7 @@ static int hpsa_ioctl32_passthru(struct scsi_device *dev, int cmd,
 }
 
 static int hpsa_ioctl32_big_passthru(struct scsi_device *dev,
-	int cmd, void __user *arg)
+	unsigned int cmd, void __user *arg)
 {
 	BIG_IOCTL32_Command_struct __user *arg32 =
 	    (BIG_IOCTL32_Command_struct __user *) arg;
@@ -6201,7 +6202,8 @@ static int hpsa_ioctl32_big_passthru(struct scsi_device *dev,
 	return err;
 }
 
-static int hpsa_compat_ioctl(struct scsi_device *dev, int cmd, void __user *arg)
+static int hpsa_compat_ioctl(struct scsi_device *dev, unsigned int cmd,
+			     void __user *arg)
 {
 	switch (cmd) {
 	case CCISS_GETPCIINFO:
@@ -6521,7 +6523,8 @@ static void check_ioctl_unit_attention(struct ctlr_info *h,
 /*
  * ioctl
  */
-static int hpsa_ioctl(struct scsi_device *dev, int cmd, void __user *arg)
+static int hpsa_ioctl(struct scsi_device *dev, unsigned int cmd,
+		      void __user *arg)
 {
 	struct ctlr_info *h;
 	void __user *argp = (void __user *)arg;
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index d1b4025a4503..6d053e220153 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -6696,7 +6696,8 @@ err_nodev:
  * Return value:
  * 	0 on success / other on failure
  **/
-static int ipr_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
+static int ipr_ioctl(struct scsi_device *sdev, unsigned int cmd,
+		     void __user *arg)
 {
 	struct ipr_resource_entry *res;
 
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index c43a00a9d819..b775445892af 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -799,7 +799,7 @@ out:
 		  shost->host_failed, tries);
 }
 
-int sas_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
+int sas_ioctl(struct scsi_device *sdev, unsigned int cmd, void __user *arg)
 {
 	struct domain_device *dev = sdev_to_domain_dev(sdev);
 
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 8044bb08455d..c5014e9f4a50 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -832,7 +832,8 @@ static void mk_sense_invalid_opcode(struct scsi_cmnd *scp)
 	mk_sense_buffer(scp, ILLEGAL_REQUEST, INVALID_OPCODE, 0);
 }
 
-static int scsi_debug_ioctl(struct scsi_device *dev, int cmd, void __user *arg)
+static int scsi_debug_ioctl(struct scsi_device *dev, unsigned int cmd,
+			    void __user *arg)
 {
 	if (sdebug_verbose) {
 		if (0x1261 == cmd)
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index e2fa3f476227..f6eaea2eadd1 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -6043,7 +6043,8 @@ out:
 	return rc;
 }
 
-static int pqi_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
+static int pqi_ioctl(struct scsi_device *sdev, unsigned int cmd,
+		     void __user *arg)
 {
 	int rc;
 	struct pqi_ctrl_info *ctrl_info;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 68133842e6d7..c9419c05a90a 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1122,10 +1122,11 @@ extern int ata_host_activate(struct ata_host *host, int irq,
 extern void ata_host_detach(struct ata_host *host);
 extern void ata_host_init(struct ata_host *, struct device *, struct ata_port_operations *);
 extern int ata_scsi_detect(struct scsi_host_template *sht);
-extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
+extern int ata_scsi_ioctl(struct scsi_device *dev, unsigned int cmd,
+			  void __user *arg);
 extern int ata_scsi_queuecmd(struct Scsi_Host *h, struct scsi_cmnd *cmd);
 extern int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *dev,
-			    int cmd, void __user *arg);
+			    unsigned int cmd, void __user *arg);
 extern void ata_sas_port_destroy(struct ata_port *);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index 857086cf7ebf..56b2dba7d911 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -707,7 +707,8 @@ int sas_eh_target_reset_handler(struct scsi_cmnd *cmd);
 
 extern void sas_target_destroy(struct scsi_target *);
 extern int sas_slave_alloc(struct scsi_device *);
-extern int sas_ioctl(struct scsi_device *sdev, int cmd, void __user *arg);
+extern int sas_ioctl(struct scsi_device *sdev, unsigned int cmd,
+		     void __user *arg);
 extern int sas_drain_work(struct sas_ha_struct *ha);
 
 extern void sas_ssp_task_response(struct device *dev, struct sas_task *task,
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 6ca954e9f752..4047d68d1b08 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -60,7 +60,8 @@ struct scsi_host_template {
 	 *
 	 * Status: OPTIONAL
 	 */
-	int (* ioctl)(struct scsi_device *dev, int cmd, void __user *arg);
+	int (*ioctl)(struct scsi_device *dev, unsigned int cmd,
+		     void __user *arg);
 
 
 #ifdef CONFIG_COMPAT
@@ -70,7 +71,8 @@ struct scsi_host_template {
 	 *
 	 * Status: OPTIONAL
 	 */
-	int (* compat_ioctl)(struct scsi_device *dev, int cmd, void __user *arg);
+	int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd,
+			    void __user *arg);
 #endif
 
 	/*
-- 
cgit v1.2.3


From 2c1619edef61a03cb516efaa81750784c3071d10 Mon Sep 17 00:00:00 2001
From: Danit Goldberg <danitg@mellanox.com>
Date: Thu, 24 Jan 2019 14:18:15 +0200
Subject: IB/cma: Define option to set ack timeout and pack tos_set

Define new option in 'rdma_set_option' to override calculated QP timeout
when requested to provide QP attributes to modify a QP.

At the same time, pack tos_set to be bitfield.

Signed-off-by: Danit Goldberg <danitg@mellanox.com>
Reviewed-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c      | 32 ++++++++++++++++++++++++++++++++
 drivers/infiniband/core/cma_priv.h |  4 +++-
 drivers/infiniband/core/ucma.c     |  7 +++++++
 include/rdma/rdma_cm.h             |  1 +
 include/uapi/rdma/rdma_user_cm.h   |  4 ++++
 5 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index e15546ae4d0f..83aa2ad0c27e 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -888,6 +888,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net,
 	id_priv->id.ps = ps;
 	id_priv->id.qp_type = qp_type;
 	id_priv->tos_set = false;
+	id_priv->timeout_set = false;
 	id_priv->gid_type = IB_GID_TYPE_IB;
 	spin_lock_init(&id_priv->lock);
 	mutex_init(&id_priv->qp_mutex);
@@ -1130,6 +1131,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
 	} else
 		ret = -ENOSYS;
 
+	if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set)
+		qp_attr->timeout = id_priv->timeout;
+
 	return ret;
 }
 EXPORT_SYMBOL(rdma_init_qp_attr);
@@ -2490,6 +2494,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos)
 }
 EXPORT_SYMBOL(rdma_set_service_type);
 
+/**
+ * rdma_set_ack_timeout() - Set the ack timeout of QP associated
+ *                          with a connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec.
+ *
+ * This function should be called before rdma_connect() on active side,
+ * and on passive side before rdma_accept(). It is applicable to primary
+ * path only. The timeout will affect the local side of the QP, it is not
+ * negotiated with remote side and zero disables the timer.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
+{
+	struct rdma_id_private *id_priv;
+
+	if (id->qp_type != IB_QPT_RC)
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->timeout = timeout;
+	id_priv->timeout_set = true;
+
+	return 0;
+}
+EXPORT_SYMBOL(rdma_set_ack_timeout);
+
 static void cma_query_handler(int status, struct sa_path_rec *path_rec,
 			      void *context)
 {
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index cf47c69436a7..ca7307277518 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -84,9 +84,11 @@ struct rdma_id_private {
 	u32			options;
 	u8			srq;
 	u8			tos;
-	bool			tos_set;
+	u8			tos_set:1;
+	u8                      timeout_set:1;
 	u8			reuseaddr;
 	u8			afonly;
+	u8			timeout;
 	enum ib_gid_type	gid_type;
 
 	/*
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 01d68ed46c1b..7468b26b8a01 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
 		}
 		ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
 		break;
+	case RDMA_OPTION_ID_ACK_TIMEOUT:
+		if (optlen != sizeof(u8)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval));
+		break;
 	default:
 		ret = -ENOSYS;
 	}
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 60987a5903b7..71f48cfdc24c 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -374,6 +374,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse);
  */
 int rdma_set_afonly(struct rdma_cm_id *id, int afonly);
 
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout);
  /**
  * rdma_get_service_id - Return the IB service ID for a specified address.
  * @id: Communication identifier associated with the address.
diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h
index 0d1e78ebad05..e42940a215a3 100644
--- a/include/uapi/rdma/rdma_user_cm.h
+++ b/include/uapi/rdma/rdma_user_cm.h
@@ -300,6 +300,10 @@ enum {
 	RDMA_OPTION_ID_TOS	 = 0,
 	RDMA_OPTION_ID_REUSEADDR = 1,
 	RDMA_OPTION_ID_AFONLY	 = 2,
+	RDMA_OPTION_ID_ACK_TIMEOUT = 3
+};
+
+enum {
 	RDMA_OPTION_IB_PATH	 = 1
 };
 
-- 
cgit v1.2.3


From 926ba19b3574f6a80823a42484877ed65e91da9c Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Fri, 1 Feb 2019 12:44:32 -0800
Subject: RDMA/iwcm: add tos_set bool to iw_cm struct

This allows drivers to know the tos was actively set by the application.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 2 ++
 include/rdma/iw_cm.h          | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index e761ddd09aed..c43512752b8a 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2414,6 +2414,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
 		return PTR_ERR(id);
 
 	id->tos = id_priv->tos;
+	id->tos_set = id_priv->tos_set;
 	id_priv->cm_id.iw = id;
 
 	memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
@@ -3843,6 +3844,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
 		return PTR_ERR(cm_id);
 
 	cm_id->tos = id_priv->tos;
+	cm_id->tos_set = id_priv->tos_set;
 	id_priv->cm_id.iw = cm_id;
 
 	memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h
index 48512abd3162..0e1f02815643 100644
--- a/include/rdma/iw_cm.h
+++ b/include/rdma/iw_cm.h
@@ -94,7 +94,8 @@ struct iw_cm_id {
 	void (*add_ref)(struct iw_cm_id *);
 	void (*rem_ref)(struct iw_cm_id *);
 	u8  tos;
-	bool mapped;
+	bool tos_set:1;
+	bool mapped:1;
 };
 
 struct iw_cm_conn_param {
-- 
cgit v1.2.3


From 805b754d492f6227e1646001bdf85ad4bb819e55 Mon Sep 17 00:00:00 2001
From: Daniel Jurgens <danielj@mellanox.com>
Date: Sat, 2 Feb 2019 11:09:44 +0200
Subject: IB/core: Eliminate a hole in MAD agent struct

Move the security related fields above the u8s to eliminate a hole in the
struct.

pahole before:
struct ib_mad_agent {
...
u32                        hi_tid;               /*    48     4 */
u32                        flags;                /*    52     4 */
u8                         port_num;             /*    56     1 */
u8                         rmpp_version;         /*    57     1 */

/* XXX 6 bytes hole, try to pack */

/* --- cacheline 1 boundary (64 bytes) --- */
void *                     security;             /*    64     8 */
bool                       smp_allowed;          /*    72     1 */
bool                       lsm_nb_reg;           /*    73     1 */

/* XXX 6 bytes hole, try to pack */

struct notifier_block      lsm_nb;               /*    80    24 */

/* XXX last struct has 4 bytes of padding */

/* size: 104, cachelines: 2, members: 14 */
...
};

pahole after:
struct ib_mad_agent {
...
u32                        hi_tid;               /*    48     4 */
u32                        flags;                /*    52     4 */
void *                     security;             /*    56     8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct notifier_block      lsm_nb;               /*    64    24 */

/* XXX last struct has 4 bytes of padding */

u8                         port_num;             /*    88     1 */
u8                         rmpp_version;         /*    89     1 */
bool                       smp_allowed;          /*    90     1 */
bool                       lsm_nb_reg;           /*    91     1 */

/* size: 96, cachelines: 2, members: 14 */
...
};

Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/rdma/ib_mad.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index fdef558e3a2d..1c0b914f199d 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -616,12 +616,12 @@ struct ib_mad_agent {
 	void			*context;
 	u32			hi_tid;
 	u32			flags;
+	void			*security;
+	struct notifier_block   lsm_nb;
 	u8			port_num;
 	u8			rmpp_version;
-	void			*security;
 	bool			smp_allowed;
 	bool			lsm_nb_reg;
-	struct notifier_block   lsm_nb;
 };
 
 /**
-- 
cgit v1.2.3


From c66f67414c1f88554485bb2a0abf8b5c0d741de7 Mon Sep 17 00:00:00 2001
From: Daniel Jurgens <danielj@mellanox.com>
Date: Sat, 2 Feb 2019 11:09:45 +0200
Subject: IB/core: Don't register each MAD agent for LSM notifier

When creating many MAD agents in a short period of time, receive packet
processing can be delayed long enough to cause timeouts while new agents
are being added to the atomic notifier chain with IRQs disabled.  Notifier
chain registration and unregstration is an O(n) operation. With large
numbers of MAD agents being created and destroyed simultaneously the CPUs
spend too much time with interrupts disabled.

Instead of each MAD agent registering for it's own LSM notification,
maintain a list of agents internally and register once, this registration
already existed for handling the PKeys. This list is write mostly, so a
normal spin lock is used vs a read/write lock. All MAD agents must be
checked, so a single list is used instead of breaking them down per
device.

Notifier calls are done under rcu_read_lock, so there isn't a risk of
similar packet timeouts while checking the MAD agents security settings
when notified.

Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/core_priv.h |  5 ++++
 drivers/infiniband/core/device.c    |  1 +
 drivers/infiniband/core/security.c  | 50 +++++++++++++++++++++----------------
 include/rdma/ib_mad.h               |  3 +--
 4 files changed, 35 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index bcb3e3029a9b..d053110207eb 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -202,6 +202,7 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
 				enum ib_qp_type qp_type);
 void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent);
 int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index);
+void ib_mad_agent_security_change(void);
 #else
 static inline void ib_security_destroy_port_pkey_list(struct ib_device *device)
 {
@@ -267,6 +268,10 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
 {
 	return 0;
 }
+
+static inline void ib_mad_agent_security_change(void)
+{
+}
 #endif
 
 struct ib_device *ib_device_get_by_index(u32 ifindex);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 55221990d946..32cd35c9b21e 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -452,6 +452,7 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
 		return NOTIFY_DONE;
 
 	schedule_work(&ib_policy_change_work);
+	ib_mad_agent_security_change();
 
 	return NOTIFY_OK;
 }
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index 7662e9347238..a70d2ba312ed 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -39,6 +39,10 @@
 #include "core_priv.h"
 #include "mad_priv.h"
 
+static LIST_HEAD(mad_agent_list);
+/* Lock to protect mad_agent_list */
+static DEFINE_SPINLOCK(mad_agent_list_lock);
+
 static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp)
 {
 	struct pkey_index_qp_list *pkey = NULL;
@@ -676,19 +680,18 @@ static int ib_security_pkey_access(struct ib_device *dev,
 	return security_ib_pkey_access(sec, subnet_prefix, pkey);
 }
 
-static int ib_mad_agent_security_change(struct notifier_block *nb,
-					unsigned long event,
-					void *data)
+void ib_mad_agent_security_change(void)
 {
-	struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb);
-
-	if (event != LSM_POLICY_CHANGE)
-		return NOTIFY_DONE;
-
-	ag->smp_allowed = !security_ib_endport_manage_subnet(
-		ag->security, dev_name(&ag->device->dev), ag->port_num);
-
-	return NOTIFY_OK;
+	struct ib_mad_agent *ag;
+
+	spin_lock(&mad_agent_list_lock);
+	list_for_each_entry(ag,
+			    &mad_agent_list,
+			    mad_agent_sec_list)
+		WRITE_ONCE(ag->smp_allowed,
+			   !security_ib_endport_manage_subnet(ag->security,
+				dev_name(&ag->device->dev), ag->port_num));
+	spin_unlock(&mad_agent_list_lock);
 }
 
 int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
@@ -699,6 +702,8 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
 	if (!rdma_protocol_ib(agent->device, agent->port_num))
 		return 0;
 
+	INIT_LIST_HEAD(&agent->mad_agent_sec_list);
+
 	ret = security_ib_alloc_security(&agent->security);
 	if (ret)
 		return ret;
@@ -706,22 +711,20 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
 	if (qp_type != IB_QPT_SMI)
 		return 0;
 
+	spin_lock(&mad_agent_list_lock);
 	ret = security_ib_endport_manage_subnet(agent->security,
 						dev_name(&agent->device->dev),
 						agent->port_num);
 	if (ret)
 		goto free_security;
 
-	agent->lsm_nb.notifier_call = ib_mad_agent_security_change;
-	ret = register_lsm_notifier(&agent->lsm_nb);
-	if (ret)
-		goto free_security;
-
-	agent->smp_allowed = true;
-	agent->lsm_nb_reg = true;
+	WRITE_ONCE(agent->smp_allowed, true);
+	list_add(&agent->mad_agent_sec_list, &mad_agent_list);
+	spin_unlock(&mad_agent_list_lock);
 	return 0;
 
 free_security:
+	spin_unlock(&mad_agent_list_lock);
 	security_ib_free_security(agent->security);
 	return ret;
 }
@@ -731,8 +734,11 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
 	if (!rdma_protocol_ib(agent->device, agent->port_num))
 		return;
 
-	if (agent->lsm_nb_reg)
-		unregister_lsm_notifier(&agent->lsm_nb);
+	if (agent->qp->qp_type == IB_QPT_SMI) {
+		spin_lock(&mad_agent_list_lock);
+		list_del(&agent->mad_agent_sec_list);
+		spin_unlock(&mad_agent_list_lock);
+	}
 
 	security_ib_free_security(agent->security);
 }
@@ -743,7 +749,7 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
 		return 0;
 
 	if (map->agent.qp->qp_type == IB_QPT_SMI) {
-		if (!map->agent.smp_allowed)
+		if (!READ_ONCE(map->agent.smp_allowed))
 			return -EACCES;
 		return 0;
 	}
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 1c0b914f199d..79ba8219e7dc 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -617,11 +617,10 @@ struct ib_mad_agent {
 	u32			hi_tid;
 	u32			flags;
 	void			*security;
-	struct notifier_block   lsm_nb;
+	struct list_head	mad_agent_sec_list;
 	u8			port_num;
 	u8			rmpp_version;
 	bool			smp_allowed;
-	bool			lsm_nb_reg;
 };
 
 /**
-- 
cgit v1.2.3


From 30471d4b20335d9bd9ae9b2382a1e1e97d18d86d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Sun, 3 Feb 2019 14:55:50 +0200
Subject: RDMA/core: Share driver structure size with core

Add new macros to be used in drivers while registering ops structure and
IB/core while calling allocation routines, so drivers won't need to
perform kzalloc/kfree in their paths.

The change in allocation stage allows us to initialize common fields prior
to calling to drivers (e.g. restrack).

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c |  2 ++
 include/rdma/ib_verbs.h          | 13 +++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 32cd35c9b21e..d806a5c7b202 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1228,6 +1228,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 				(ptr)->name = ops->name;                       \
 	} while (0)
 
+#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
+
 	SET_DEVICE_OP(dev_ops, add_gid);
 	SET_DEVICE_OP(dev_ops, advise_mr);
 	SET_DEVICE_OP(dev_ops, alloc_dm);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 2e1f1e885ee5..e29eae4aec84 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2264,6 +2264,19 @@ struct ib_counters_read_attr {
 
 struct uverbs_attr_bundle;
 
+#define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member)                      \
+	.size_##ib_struct =                                                    \
+		(sizeof(struct drv_struct) +                                   \
+		 BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) +      \
+		 BUILD_BUG_ON_ZERO(                                            \
+			 !__same_type(((struct drv_struct *)NULL)->member,     \
+				      struct ib_struct)))
+
+#define rdma_zalloc_drv_obj(ib_dev, ib_type)                                   \
+	((struct ib_type *)kzalloc(ib_dev->ops.size_##ib_type, GFP_KERNEL))
+
+#define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct
+
 /**
  * struct ib_device_ops - InfiniBand device operations
  * This structure defines all the InfiniBand device operations, providers will
-- 
cgit v1.2.3


From 21a428a019c9a6d133e745b529b9bf18c1187e70 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Sun, 3 Feb 2019 14:55:51 +0200
Subject: RDMA: Handle PD allocations by IB/core

The PD allocations in IB/core allows us to simplify drivers and their
error flows in their .alloc_pd() paths. The changes in .alloc_pd() go hand
in had with relevant update in .dealloc_pd().

We will use this opportunity and convert .dealloc_pd() to don't fail, as
it was suggested a long time ago, failures are not happening as we have
never seen a WARN_ON print.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c                |  2 +
 drivers/infiniband/core/uverbs_cmd.c            | 15 ++--
 drivers/infiniband/core/uverbs_std_types.c      |  2 +-
 drivers/infiniband/core/verbs.c                 | 27 +++++---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c        | 37 ++++------
 drivers/infiniband/hw/bnxt_re/ib_verbs.h        |  9 ++-
 drivers/infiniband/hw/bnxt_re/main.c            |  1 +
 drivers/infiniband/hw/cxgb3/iwch_provider.c     | 25 +++----
 drivers/infiniband/hw/cxgb4/provider.c          | 25 +++----
 drivers/infiniband/hw/hns/hns_roce_device.h     |  7 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c      | 27 +++++---
 drivers/infiniband/hw/hns/hns_roce_main.c       |  1 +
 drivers/infiniband/hw/hns/hns_roce_pd.c         | 25 +++----
 drivers/infiniband/hw/i40iw/i40iw_utils.c       |  1 -
 drivers/infiniband/hw/i40iw/i40iw_verbs.c       | 32 ++++-----
 drivers/infiniband/hw/mlx4/main.c               | 36 ++++------
 drivers/infiniband/hw/mlx5/main.c               | 48 +++++++------
 drivers/infiniband/hw/mthca/mthca_provider.c    | 29 +++-----
 drivers/infiniband/hw/nes/nes_verbs.c           | 32 +++------
 drivers/infiniband/hw/ocrdma/ocrdma_main.c      |  1 +
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c     | 92 +++++++++++--------------
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h     |  6 +-
 drivers/infiniband/hw/qedr/main.c               |  1 +
 drivers/infiniband/hw/qedr/verbs.c              | 34 +++------
 drivers/infiniband/hw/qedr/verbs.h              |  6 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c     |  1 +
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c    | 26 ++-----
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h    |  7 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c  |  1 +
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 43 ++++--------
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h |  7 +-
 drivers/infiniband/sw/rdmavt/pd.c               | 29 +++-----
 drivers/infiniband/sw/rdmavt/pd.h               |  7 +-
 drivers/infiniband/sw/rdmavt/vt.c               |  1 +
 drivers/infiniband/sw/rxe/rxe_pool.c            | 60 +++++++++++++---
 drivers/infiniband/sw/rxe/rxe_pool.h            |  4 ++
 drivers/infiniband/sw/rxe/rxe_verbs.c           | 16 ++---
 drivers/infiniband/sw/rxe/rxe_verbs.h           |  2 +-
 include/rdma/ib_verbs.h                         |  9 +--
 39 files changed, 325 insertions(+), 409 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index d806a5c7b202..57e1e177921e 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1319,6 +1319,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, set_vf_guid);
 	SET_DEVICE_OP(dev_ops, set_vf_link_state);
 	SET_DEVICE_OP(dev_ops, unmap_fmr);
+
+	SET_OBJ_SIZE(dev_ops, ib_pd);
 }
 EXPORT_SYMBOL(ib_set_device_ops);
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index aa260cafbd85..5ac143f22df0 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -407,9 +407,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
-	pd = ib_dev->ops.alloc_pd(ib_dev, uobj->context, &attrs->driver_udata);
-	if (IS_ERR(pd)) {
-		ret = PTR_ERR(pd);
+	pd = rdma_zalloc_drv_obj(ib_dev, ib_pd);
+	if (!pd) {
+		ret = -ENOMEM;
 		goto err;
 	}
 
@@ -417,11 +417,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
 	pd->uobject = uobj;
 	pd->__internal_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
+	pd->res.type = RDMA_RESTRACK_PD;
+
+	ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata);
+	if (ret)
+		goto err_alloc;
 
 	uobj->object = pd;
 	memset(&resp, 0, sizeof resp);
 	resp.pd_handle = uobj->id;
-	pd->res.type = RDMA_RESTRACK_PD;
 	rdma_restrack_uadd(&pd->res);
 
 	ret = uverbs_response(attrs, &resp, sizeof(resp));
@@ -432,7 +436,8 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
 
 err_copy:
 	ib_dealloc_pd(pd);
-
+err_alloc:
+	kfree(pd);
 err:
 	uobj_alloc_abort(uobj);
 	return ret;
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index cbc72312eb41..f224cb727224 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -188,7 +188,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject,
 	if (ret)
 		return ret;
 
-	ib_dealloc_pd((struct ib_pd *)uobject->object);
+	ib_dealloc_pd(pd);
 	return 0;
 }
 
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3220fb42ecce..de5d895a5054 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -254,10 +254,11 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 {
 	struct ib_pd *pd;
 	int mr_access_flags = 0;
+	int ret;
 
-	pd = device->ops.alloc_pd(device, NULL, NULL);
-	if (IS_ERR(pd))
-		return pd;
+	pd = rdma_zalloc_drv_obj(device, ib_pd);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
 
 	pd->device = device;
 	pd->uobject = NULL;
@@ -265,6 +266,16 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 	atomic_set(&pd->usecnt, 0);
 	pd->flags = flags;
 
+	pd->res.type = RDMA_RESTRACK_PD;
+	rdma_restrack_set_task(&pd->res, caller);
+
+	ret = device->ops.alloc_pd(pd, NULL, NULL);
+	if (ret) {
+		kfree(pd);
+		return ERR_PTR(ret);
+	}
+	rdma_restrack_kadd(&pd->res);
+
 	if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
 		pd->local_dma_lkey = device->local_dma_lkey;
 	else
@@ -275,10 +286,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 		mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
 	}
 
-	pd->res.type = RDMA_RESTRACK_PD;
-	rdma_restrack_set_task(&pd->res, caller);
-	rdma_restrack_kadd(&pd->res);
-
 	if (mr_access_flags) {
 		struct ib_mr *mr;
 
@@ -329,10 +336,8 @@ void ib_dealloc_pd(struct ib_pd *pd)
 	WARN_ON(atomic_read(&pd->usecnt));
 
 	rdma_restrack_del(&pd->res);
-	/* Making delalloc_pd a void return is a WIP, no driver should return
-	   an error here. */
-	ret = pd->device->ops.dealloc_pd(pd);
-	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
+	pd->device->ops.dealloc_pd(pd);
+	kfree(pd);
 }
 EXPORT_SYMBOL(ib_dealloc_pd);
 
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 1d7469e23cde..1606571af63d 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -563,41 +563,29 @@ fail:
 }
 
 /* Protection Domains */
-int bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
+void bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
 {
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
 	struct bnxt_re_dev *rdev = pd->rdev;
-	int rc;
 
 	bnxt_re_destroy_fence_mr(pd);
 
-	if (pd->qplib_pd.id) {
-		rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res,
-					   &rdev->qplib_res.pd_tbl,
-					   &pd->qplib_pd);
-		if (rc)
-			dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD");
-	}
-
-	kfree(pd);
-	return 0;
+	if (pd->qplib_pd.id)
+		bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
+				      &pd->qplib_pd);
 }
 
-struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
-			       struct ib_ucontext *ucontext,
-			       struct ib_udata *udata)
+int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *ucontext,
+		     struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibpd->device;
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
 	struct bnxt_re_ucontext *ucntx = container_of(ucontext,
 						      struct bnxt_re_ucontext,
 						      ib_uctx);
-	struct bnxt_re_pd *pd;
+	struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd);
 	int rc;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	pd->rdev = rdev;
 	if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) {
 		dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD");
@@ -637,13 +625,12 @@ struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
 		if (bnxt_re_create_fence_mr(pd))
 			dev_warn(rdev_to_dev(rdev),
 				 "Failed to create Fence-MR\n");
-	return &pd->ib_pd;
+	return 0;
 dbfail:
-	(void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
-				    &pd->qplib_pd);
+	bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
+			      &pd->qplib_pd);
 fail:
-	kfree(pd);
-	return ERR_PTR(rc);
+	return rc;
 }
 
 /* Address Handles */
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index c4af72604b4f..c7cca803cfa3 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -56,8 +56,8 @@ struct bnxt_re_fence_data {
 };
 
 struct bnxt_re_pd {
+	struct ib_pd            ib_pd;
 	struct bnxt_re_dev	*rdev;
-	struct ib_pd		ib_pd;
 	struct bnxt_qplib_pd	qplib_pd;
 	struct bnxt_re_fence_data fence;
 };
@@ -163,10 +163,9 @@ int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num,
 		      int index, union ib_gid *gid);
 enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
 					    u8 port_num);
-struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
-			       struct ib_ucontext *context,
-			       struct ib_udata *udata);
-int bnxt_re_dealloc_pd(struct ib_pd *pd);
+int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+		     struct ib_udata *udata);
+void bnxt_re_dealloc_pd(struct ib_pd *pd);
 struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd,
 				struct rdma_ah_attr *ah_attr,
 				u32 flags,
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 0d40a930c192..0a89ef6e5754 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -637,6 +637,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
 	.query_srq = bnxt_re_query_srq,
 	.reg_user_mr = bnxt_re_reg_user_mr,
 	.req_notify_cq = bnxt_re_req_notify_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
 };
 
 static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 804c1fc7bfc1..4cc9a6ae2139 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -370,7 +370,7 @@ static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 	return ret;
 }
 
-static int iwch_deallocate_pd(struct ib_pd *pd)
+static void iwch_deallocate_pd(struct ib_pd *pd)
 {
 	struct iwch_dev *rhp;
 	struct iwch_pd *php;
@@ -379,15 +379,13 @@ static int iwch_deallocate_pd(struct ib_pd *pd)
 	rhp = php->rhp;
 	pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
 	cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
-	kfree(php);
-	return 0;
 }
 
-static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
-			       struct ib_ucontext *context,
-			       struct ib_udata *udata)
+static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
+			    struct ib_udata *udata)
 {
-	struct iwch_pd *php;
+	struct iwch_pd *php = to_iwch_pd(pd);
+	struct ib_device *ibdev = pd->device;
 	u32 pdid;
 	struct iwch_dev *rhp;
 
@@ -395,12 +393,8 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
 	rhp = (struct iwch_dev *) ibdev;
 	pdid = cxio_hal_get_pdid(rhp->rdev.rscp);
 	if (!pdid)
-		return ERR_PTR(-EINVAL);
-	php = kzalloc(sizeof(*php), GFP_KERNEL);
-	if (!php) {
-		cxio_hal_put_pdid(rhp->rdev.rscp, pdid);
-		return ERR_PTR(-ENOMEM);
-	}
+		return -EINVAL;
+
 	php->pdid = pdid;
 	php->rhp = rhp;
 	if (context) {
@@ -408,11 +402,11 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
 
 		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
 			iwch_deallocate_pd(&php->ibpd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 	pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
-	return &php->ibpd;
+	return 0;
 }
 
 static int iwch_dereg_mr(struct ib_mr *ib_mr)
@@ -1350,6 +1344,7 @@ static const struct ib_device_ops iwch_dev_ops = {
 	.reg_user_mr = iwch_reg_user_mr,
 	.req_notify_cq = iwch_arm_cq,
 	.resize_cq = iwch_resize_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd),
 };
 
 int iwch_register_device(struct iwch_dev *dev)
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index f59bf7e5a589..680b5e98491d 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -209,7 +209,7 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 	return ret;
 }
 
-static int c4iw_deallocate_pd(struct ib_pd *pd)
+static void c4iw_deallocate_pd(struct ib_pd *pd)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
@@ -221,15 +221,13 @@ static int c4iw_deallocate_pd(struct ib_pd *pd)
 	mutex_lock(&rhp->rdev.stats.lock);
 	rhp->rdev.stats.pd.cur--;
 	mutex_unlock(&rhp->rdev.stats.lock);
-	kfree(php);
-	return 0;
 }
 
-static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
-				      struct ib_ucontext *context,
-				      struct ib_udata *udata)
+static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
+			    struct ib_udata *udata)
 {
-	struct c4iw_pd *php;
+	struct c4iw_pd *php = to_c4iw_pd(pd);
+	struct ib_device *ibdev = pd->device;
 	u32 pdid;
 	struct c4iw_dev *rhp;
 
@@ -237,12 +235,8 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
 	rhp = (struct c4iw_dev *) ibdev;
 	pdid =  c4iw_get_resource(&rhp->rdev.resource.pdid_table);
 	if (!pdid)
-		return ERR_PTR(-EINVAL);
-	php = kzalloc(sizeof(*php), GFP_KERNEL);
-	if (!php) {
-		c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid);
-		return ERR_PTR(-ENOMEM);
-	}
+		return -EINVAL;
+
 	php->pdid = pdid;
 	php->rhp = rhp;
 	if (context) {
@@ -250,7 +244,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
 
 		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
 			c4iw_deallocate_pd(&php->ibpd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 	mutex_lock(&rhp->rdev.stats.lock);
@@ -259,7 +253,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
 		rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur;
 	mutex_unlock(&rhp->rdev.stats.lock);
 	pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php);
-	return &php->ibpd;
+	return 0;
 }
 
 static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -570,6 +564,7 @@ static const struct ib_device_ops c4iw_dev_ops = {
 	.query_qp = c4iw_ib_query_qp,
 	.reg_user_mr = c4iw_reg_user_mr,
 	.req_notify_cq = c4iw_arm_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
 };
 
 void c4iw_register_device(struct work_struct *work)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 8ca8d74dfb6a..9ee86daf1700 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1114,10 +1114,9 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *pd,
 int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
 int hns_roce_destroy_ah(struct ib_ah *ah, u32 flags);
 
-struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev,
-				struct ib_ucontext *context,
-				struct ib_udata *udata);
-int hns_roce_dealloc_pd(struct ib_pd *pd);
+int hns_roce_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+		      struct ib_udata *udata);
+void hns_roce_dealloc_pd(struct ib_pd *pd);
 
 struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index fa08c22aad66..a18b88c95995 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -711,13 +711,14 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 	struct ib_qp_attr attr = { 0 };
 	struct hns_roce_v1_priv *priv;
 	struct hns_roce_qp *hr_qp;
+	struct ib_device *ibdev;
 	struct ib_cq *cq;
 	struct ib_pd *pd;
 	union ib_gid dgid;
 	u64 subnet_prefix;
 	int attr_mask = 0;
+	int ret = -ENOMEM;
 	int i, j;
-	int ret;
 	u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 };
 	u8 phy_port;
 	u8 port = 0;
@@ -742,12 +743,16 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 	free_mr->mr_free_cq->ib_cq.cq_context		= NULL;
 	atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0);
 
-	pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL);
-	if (IS_ERR(pd)) {
-		dev_err(dev, "Create pd for reserved loop qp failed!");
-		ret = -ENOMEM;
+	ibdev = &hr_dev->ib_dev;
+	pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
+	if (pd)
+		goto alloc_mem_failed;
+
+	pd->device  = ibdev;
+	ret = hns_roce_alloc_pd(pd, NULL, NULL);
+	if (ret)
 		goto alloc_pd_failed;
-	}
+
 	free_mr->mr_free_pd = to_hr_pd(pd);
 	free_mr->mr_free_pd->ibpd.device  = &hr_dev->ib_dev;
 	free_mr->mr_free_pd->ibpd.uobject = NULL;
@@ -854,10 +859,12 @@ create_lp_qp_failed:
 			dev_err(dev, "Destroy qp %d for mr free failed!\n", i);
 	}
 
-	if (hns_roce_dealloc_pd(pd))
-		dev_err(dev, "Destroy pd for create_lp_qp failed!\n");
+	hns_roce_dealloc_pd(pd);
 
 alloc_pd_failed:
+	kfree(pd);
+
+alloc_mem_failed:
 	if (hns_roce_ib_destroy_cq(cq))
 		dev_err(dev, "Destroy cq for create_lp_qp failed!\n");
 
@@ -891,9 +898,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev)
 	if (ret)
 		dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret);
 
-	ret = hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd);
-	if (ret)
-		dev_err(dev, "Destroy pd for mr_free failed(%d)!\n", ret);
+	hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd);
 }
 
 static int hns_roce_db_init(struct hns_roce_dev *hr_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 67a8c4333f4f..ccf10622586c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -472,6 +472,7 @@ static const struct ib_device_ops hns_roce_dev_ops = {
 	.query_pkey = hns_roce_query_pkey,
 	.query_port = hns_roce_query_port,
 	.reg_user_mr = hns_roce_reg_user_mr,
+	INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd),
 };
 
 static const struct ib_device_ops hns_roce_dev_mr_ops = {
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index 4a29b2cb9bab..b9b97c5e97e6 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -57,24 +57,19 @@ void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev)
 	hns_roce_bitmap_cleanup(&hr_dev->pd_bitmap);
 }
 
-struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev,
-				struct ib_ucontext *context,
-				struct ib_udata *udata)
+int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+		      struct ib_udata *udata)
 {
+	struct ib_device *ib_dev = ibpd->device;
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_pd *pd;
+	struct hns_roce_pd *pd = to_hr_pd(ibpd);
 	int ret;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	ret = hns_roce_pd_alloc(to_hr_dev(ib_dev), &pd->pdn);
 	if (ret) {
-		kfree(pd);
 		dev_err(dev, "[alloc_pd]hns_roce_pd_alloc failed!\n");
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	if (context) {
@@ -83,21 +78,17 @@ struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev,
 		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
 			hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn);
 			dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n");
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 
-	return &pd->ibpd;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(hns_roce_alloc_pd);
 
-int hns_roce_dealloc_pd(struct ib_pd *pd)
+void hns_roce_dealloc_pd(struct ib_pd *pd)
 {
 	hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn);
-	kfree(to_hr_pd(pd));
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(hns_roce_dealloc_pd);
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index 59e978141ad4..c5a881172524 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -601,7 +601,6 @@ void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev)
 	if (!atomic_dec_and_test(&iwpd->usecount))
 		return;
 	i40iw_free_resource(iwdev, iwdev->allocated_pds, iwpd->sc_pd.pd_id);
-	kfree(iwpd);
 }
 
 /**
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index d4ab46dd9e6c..28449ad57b37 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -312,16 +312,15 @@ static void i40iw_dealloc_push_page(struct i40iw_device *iwdev, struct i40iw_sc_
 
 /**
  * i40iw_alloc_pd - allocate protection domain
- * @ibdev: device pointer from stack
+ * @pd: PD pointer
  * @context: user context created during alloc
  * @udata: user data
  */
-static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev,
-				    struct ib_ucontext *context,
-				    struct ib_udata *udata)
+static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+			  struct ib_udata *udata)
 {
-	struct i40iw_pd *iwpd;
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
+	struct i40iw_pd *iwpd = to_iwpd(pd);
+	struct i40iw_device *iwdev = to_iwdev(pd->device);
 	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
 	struct i40iw_alloc_pd_resp uresp;
 	struct i40iw_sc_pd *sc_pd;
@@ -330,19 +329,13 @@ static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev,
 	int err;
 
 	if (iwdev->closing)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	err = i40iw_alloc_resource(iwdev, iwdev->allocated_pds,
 				   iwdev->max_pd, &pd_id, &iwdev->next_pd);
 	if (err) {
 		i40iw_pr_err("alloc resource failed\n");
-		return ERR_PTR(err);
-	}
-
-	iwpd = kzalloc(sizeof(*iwpd), GFP_KERNEL);
-	if (!iwpd) {
-		err = -ENOMEM;
-		goto free_res;
+		return err;
 	}
 
 	sc_pd = &iwpd->sc_pd;
@@ -361,25 +354,23 @@ static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev,
 	}
 
 	i40iw_add_pdusecount(iwpd);
-	return &iwpd->ibpd;
+	return 0;
+
 error:
-	kfree(iwpd);
-free_res:
 	i40iw_free_resource(iwdev, iwdev->allocated_pds, pd_id);
-	return ERR_PTR(err);
+	return err;
 }
 
 /**
  * i40iw_dealloc_pd - deallocate pd
  * @ibpd: ptr of pd to be deallocated
  */
-static int i40iw_dealloc_pd(struct ib_pd *ibpd)
+static void i40iw_dealloc_pd(struct ib_pd *ibpd)
 {
 	struct i40iw_pd *iwpd = to_iwpd(ibpd);
 	struct i40iw_device *iwdev = to_iwdev(ibpd->device);
 
 	i40iw_rem_pdusecount(iwpd, iwdev);
-	return 0;
 }
 
 /**
@@ -2750,6 +2741,7 @@ static const struct ib_device_ops i40iw_dev_ops = {
 	.query_qp = i40iw_query_qp,
 	.reg_user_mr = i40iw_reg_user_mr,
 	.req_notify_cq = i40iw_req_notify_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd),
 };
 
 /**
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index d66002a31000..c0f6aea7ed7c 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1186,38 +1186,27 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 	}
 }
 
-static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
-				      struct ib_ucontext *context,
-				      struct ib_udata *udata)
+static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+			    struct ib_udata *udata)
 {
-	struct mlx4_ib_pd *pd;
+	struct mlx4_ib_pd *pd = to_mpd(ibpd);
+	struct ib_device *ibdev = ibpd->device;
 	int err;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
-	if (err) {
-		kfree(pd);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
-	if (context)
-		if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
-			mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
-		}
-	return &pd->ibpd;
+	if (context && ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) {
+		mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+		return -EFAULT;
+	}
+	return 0;
 }
 
-static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+static void mlx4_ib_dealloc_pd(struct ib_pd *pd)
 {
 	mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
-	kfree(pd);
-
-	return 0;
 }
 
 static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
@@ -2580,6 +2569,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = {
 	.req_notify_cq = mlx4_ib_arm_cq,
 	.rereg_user_mr = mlx4_ib_rereg_user_mr,
 	.resize_cq = mlx4_ib_resize_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd),
 };
 
 static const struct ib_device_ops mlx4_ib_dev_wq_ops = {
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 76d6c2557d0c..f9cddc6f2ab6 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2280,30 +2280,24 @@ int mlx5_ib_dealloc_dm(struct ib_dm *ibdm)
 	return 0;
 }
 
-static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
-				      struct ib_ucontext *context,
-				      struct ib_udata *udata)
+static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+			    struct ib_udata *udata)
 {
+	struct mlx5_ib_pd *pd = to_mpd(ibpd);
+	struct ib_device *ibdev = ibpd->device;
 	struct mlx5_ib_alloc_pd_resp resp;
-	struct mlx5_ib_pd *pd;
 	int err;
 	u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
 	u16 uid = 0;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	uid = context ? to_mucontext(context)->devx_uid : 0;
 	MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
 	MLX5_SET(alloc_pd_in, in, uid, uid);
 	err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
 			    out, sizeof(out));
-	if (err) {
-		kfree(pd);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
 	pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
 	pd->uid = uid;
@@ -2311,23 +2305,19 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
 		resp.pdn = pd->pdn;
 		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
 			mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 
-	return &pd->ibpd;
+	return 0;
 }
 
-static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
+static void mlx5_ib_dealloc_pd(struct ib_pd *pd)
 {
 	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
 	struct mlx5_ib_pd *mpd = to_mpd(pd);
 
 	mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
-	kfree(mpd);
-
-	return 0;
 }
 
 enum {
@@ -4680,23 +4670,28 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
 {
 	struct ib_srq_init_attr attr;
 	struct mlx5_ib_dev *dev;
+	struct ib_device *ibdev;
 	struct ib_cq_init_attr cq_attr = {.cqe = 1};
 	int port;
 	int ret = 0;
 
 	dev = container_of(devr, struct mlx5_ib_dev, devr);
+	ibdev = &dev->ib_dev;
 
 	mutex_init(&devr->mutex);
 
-	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
-	if (IS_ERR(devr->p0)) {
-		ret = PTR_ERR(devr->p0);
-		goto error0;
-	}
-	devr->p0->device  = &dev->ib_dev;
+	devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
+	if (!devr->p0)
+		return -ENOMEM;
+
+	devr->p0->device  = ibdev;
 	devr->p0->uobject = NULL;
 	atomic_set(&devr->p0->usecnt, 0);
 
+	ret = mlx5_ib_alloc_pd(devr->p0, NULL, NULL);
+	if (ret)
+		goto error0;
+
 	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
 	if (IS_ERR(devr->c0)) {
 		ret = PTR_ERR(devr->c0);
@@ -4794,6 +4789,7 @@ error2:
 error1:
 	mlx5_ib_dealloc_pd(devr->p0);
 error0:
+	kfree(devr->p0);
 	return ret;
 }
 
@@ -4809,6 +4805,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr)
 	mlx5_ib_dealloc_xrcd(devr->x1);
 	mlx5_ib_destroy_cq(devr->c0);
 	mlx5_ib_dealloc_pd(devr->p0);
+	kfree(devr->p0);
 
 	/* Make sure no change P_Key work items are still executing */
 	for (port = 0; port < dev->num_ports; ++port)
@@ -5938,6 +5935,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
 	.req_notify_cq = mlx5_ib_arm_cq,
 	.rereg_user_mr = mlx5_ib_rereg_user_mr,
 	.resize_cq = mlx5_ib_resize_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
 };
 
 static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 1bb67562c8c8..2c754bc226f3 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -374,40 +374,30 @@ static int mthca_mmap_uar(struct ib_ucontext *context,
 	return 0;
 }
 
-static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
-				    struct ib_ucontext *context,
-				    struct ib_udata *udata)
+static int mthca_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+			  struct ib_udata *udata)
 {
-	struct mthca_pd *pd;
+	struct ib_device *ibdev = ibpd->device;
+	struct mthca_pd *pd = to_mpd(ibpd);
 	int err;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	err = mthca_pd_alloc(to_mdev(ibdev), !context, pd);
-	if (err) {
-		kfree(pd);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
 	if (context) {
 		if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) {
 			mthca_pd_free(to_mdev(ibdev), pd);
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 
-	return &pd->ibpd;
+	return 0;
 }
 
-static int mthca_dealloc_pd(struct ib_pd *pd)
+static void mthca_dealloc_pd(struct ib_pd *pd)
 {
 	mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
-	kfree(pd);
-
-	return 0;
 }
 
 static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
@@ -1228,6 +1218,7 @@ static const struct ib_device_ops mthca_dev_ops = {
 	.query_qp = mthca_query_qp,
 	.reg_user_mr = mthca_reg_user_mr,
 	.resize_cq = mthca_resize_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd),
 };
 
 static const struct ib_device_ops mthca_dev_arbel_srq_ops = {
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 6eb991d40035..f18b28ae4bd9 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -658,10 +658,11 @@ static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 /**
  * nes_alloc_pd
  */
-static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
-		struct ib_ucontext *context, struct ib_udata *udata)
+static int nes_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+			struct ib_udata *udata)
 {
-	struct nes_pd *nespd;
+	struct ib_device *ibdev = pd->device;
+	struct nes_pd *nespd = to_nespd(pd);
 	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
 	struct nes_device *nesdev = nesvnic->nesdev;
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
@@ -676,15 +677,8 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
 
 	err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
 			nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD);
-	if (err) {
-		return ERR_PTR(err);
-	}
-
-	nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL);
-	if (!nespd) {
-		nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (err)
+		return err;
 
 	nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n",
 			nespd, dev_name(&nesvnic->nesibdev->ibdev.dev));
@@ -700,16 +694,14 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
 		if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) {
 			nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n");
 			nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
-			kfree(nespd);
-			return ERR_PTR(-ENOMEM);
+			return -ENOMEM;
 		}
 
 		uresp.pd_id = nespd->pd_id;
 		uresp.mmap_db_index = nespd->mmap_db_index;
 		if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) {
 			nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
-			kfree(nespd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 
 		set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
@@ -718,14 +710,14 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
 	}
 
 	nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd);
-	return &nespd->ibpd;
+	return 0;
 }
 
 
 /**
  * nes_dealloc_pd
  */
-static int nes_dealloc_pd(struct ib_pd *ibpd)
+static void nes_dealloc_pd(struct ib_pd *ibpd)
 {
 	struct nes_ucontext *nesucontext;
 	struct nes_pd *nespd = to_nespd(ibpd);
@@ -748,9 +740,6 @@ static int nes_dealloc_pd(struct ib_pd *ibpd)
 			nespd->pd_id, nespd);
 	nes_free_resource(nesadapter, nesadapter->allocated_pds,
 			(nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12));
-	kfree(nespd);
-
-	return 0;
 }
 
 
@@ -3658,6 +3647,7 @@ static const struct ib_device_ops nes_dev_ops = {
 	.query_qp = nes_query_qp,
 	.reg_user_mr = nes_reg_user_mr,
 	.req_notify_cq = nes_req_notify_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd),
 };
 
 /**
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 88970a6bb555..0de83c92691f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -179,6 +179,7 @@ static const struct ib_device_ops ocrdma_dev_ops = {
 	.reg_user_mr = ocrdma_reg_user_mr,
 	.req_notify_cq = ocrdma_arm_cq,
 	.resize_cq = ocrdma_resize_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd),
 };
 
 static const struct ib_device_ops ocrdma_dev_srq_ops = {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 2a62936bef4d..980ba97188ff 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -367,17 +367,12 @@ static int ocrdma_get_pd_num(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
 	return status;
 }
 
-static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,
-					  struct ocrdma_ucontext *uctx,
-					  struct ib_udata *udata)
+static int _ocrdma_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
+			    struct ocrdma_ucontext *uctx,
+			    struct ib_udata *udata)
 {
-	struct ocrdma_pd *pd = NULL;
 	int status;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	if (udata && uctx && dev->attr.max_dpp_pds) {
 		pd->dpp_enabled =
 			ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R;
@@ -386,15 +381,8 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,
 					   dev->attr.wqe_size) : 0;
 	}
 
-	if (dev->pd_mgr->pd_prealloc_valid) {
-		status = ocrdma_get_pd_num(dev, pd);
-		if (status == 0) {
-			return pd;
-		} else {
-			kfree(pd);
-			return ERR_PTR(status);
-		}
-	}
+	if (dev->pd_mgr->pd_prealloc_valid)
+		return ocrdma_get_pd_num(dev, pd);
 
 retry:
 	status = ocrdma_mbx_alloc_pd(dev, pd);
@@ -403,13 +391,11 @@ retry:
 			pd->dpp_enabled = false;
 			pd->num_dpp_qp = 0;
 			goto retry;
-		} else {
-			kfree(pd);
-			return ERR_PTR(status);
 		}
+		return status;
 	}
 
-	return pd;
+	return 0;
 }
 
 static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx,
@@ -418,30 +404,33 @@ static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx,
 	return (uctx->cntxt_pd == pd);
 }
 
-static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev,
+static void _ocrdma_dealloc_pd(struct ocrdma_dev *dev,
 			      struct ocrdma_pd *pd)
 {
-	int status;
-
 	if (dev->pd_mgr->pd_prealloc_valid)
-		status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled);
+		ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled);
 	else
-		status = ocrdma_mbx_dealloc_pd(dev, pd);
-
-	kfree(pd);
-	return status;
+		ocrdma_mbx_dealloc_pd(dev, pd);
 }
 
 static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev,
 				    struct ocrdma_ucontext *uctx,
 				    struct ib_udata *udata)
 {
-	int status = 0;
+	struct ib_device *ibdev = &dev->ibdev;
+	struct ib_pd *pd;
+	int status;
+
+	pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
+	if (!pd)
+		return -ENOMEM;
+
+	pd->device  = ibdev;
+	uctx->cntxt_pd = get_ocrdma_pd(pd);
 
-	uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata);
-	if (IS_ERR(uctx->cntxt_pd)) {
-		status = PTR_ERR(uctx->cntxt_pd);
-		uctx->cntxt_pd = NULL;
+	status = _ocrdma_alloc_pd(dev, uctx->cntxt_pd, uctx, udata);
+	if (status) {
+		kfree(uctx->cntxt_pd);
 		goto err;
 	}
 
@@ -460,6 +449,7 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
 		pr_err("%s(%d) Freeing in use pdid=0x%x.\n",
 		       __func__, dev->id, pd->id);
 	}
+	kfree(uctx->cntxt_pd);
 	uctx->cntxt_pd = NULL;
 	(void)_ocrdma_dealloc_pd(dev, pd);
 	return 0;
@@ -537,6 +527,7 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev,
 	return &ctx->ibucontext;
 
 cpy_err:
+	ocrdma_dealloc_ucontext_pd(ctx);
 pd_err:
 	ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len);
 map_err:
@@ -658,10 +649,10 @@ dpp_map_err:
 	return status;
 }
 
-struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev,
-			      struct ib_ucontext *context,
-			      struct ib_udata *udata)
+int ocrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+		    struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibpd->device;
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
 	struct ocrdma_pd *pd;
 	struct ocrdma_ucontext *uctx = NULL;
@@ -677,11 +668,10 @@ struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev,
 		}
 	}
 
-	pd = _ocrdma_alloc_pd(dev, uctx, udata);
-	if (IS_ERR(pd)) {
-		status = PTR_ERR(pd);
+	pd = get_ocrdma_pd(ibpd);
+	status = _ocrdma_alloc_pd(dev, pd, uctx, udata);
+	if (status)
 		goto exit;
-	}
 
 pd_mapping:
 	if (udata && context) {
@@ -689,25 +679,22 @@ pd_mapping:
 		if (status)
 			goto err;
 	}
-	return &pd->ibpd;
+	return 0;
 
 err:
-	if (is_uctx_pd) {
+	if (is_uctx_pd)
 		ocrdma_release_ucontext_pd(uctx);
-	} else {
-		if (_ocrdma_dealloc_pd(dev, pd))
-			pr_err("%s: _ocrdma_dealloc_pd() failed\n", __func__);
-	}
+	else
+		_ocrdma_dealloc_pd(dev, pd);
 exit:
-	return ERR_PTR(status);
+	return status;
 }
 
-int ocrdma_dealloc_pd(struct ib_pd *ibpd)
+void ocrdma_dealloc_pd(struct ib_pd *ibpd)
 {
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 	struct ocrdma_ucontext *uctx = NULL;
-	int status = 0;
 	u64 usr_db;
 
 	uctx = pd->uctx;
@@ -721,11 +708,10 @@ int ocrdma_dealloc_pd(struct ib_pd *ibpd)
 
 		if (is_ucontext_pd(uctx, pd)) {
 			ocrdma_release_ucontext_pd(uctx);
-			return status;
+			return;
 		}
 	}
-	status = _ocrdma_dealloc_pd(dev, pd);
-	return status;
+	_ocrdma_dealloc_pd(dev, pd);
 }
 
 static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index b69cfdce7970..1fd66721c930 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -70,9 +70,9 @@ int ocrdma_dealloc_ucontext(struct ib_ucontext *);
 
 int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 
-struct ib_pd *ocrdma_alloc_pd(struct ib_device *,
-			      struct ib_ucontext *, struct ib_udata *);
-int ocrdma_dealloc_pd(struct ib_pd *pd);
+int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx,
+		    struct ib_udata *udata);
+void ocrdma_dealloc_pd(struct ib_pd *pd);
 
 struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
 			       const struct ib_cq_init_attr *attr,
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 878e9e23652b..44ce4989dcef 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -239,6 +239,7 @@ static const struct ib_device_ops qedr_dev_ops = {
 	.reg_user_mr = qedr_reg_user_mr,
 	.req_notify_cq = qedr_arm_cq,
 	.resize_cq = qedr_resize_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd),
 };
 
 static int qedr_register_device(struct qedr_dev *dev)
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 989f08633fbe..a06d2258394a 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -450,11 +450,12 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 				  vma->vm_page_prot);
 }
 
-struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev,
-			    struct ib_ucontext *context, struct ib_udata *udata)
+int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+		  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibpd->device;
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
-	struct qedr_pd *pd;
+	struct qedr_pd *pd = get_qedr_pd(ibpd);
 	u16 pd_id;
 	int rc;
 
@@ -463,16 +464,12 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev,
 
 	if (!dev->rdma_ctx) {
 		DP_ERR(dev, "invalid RDMA context\n");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	rc = dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id);
 	if (rc)
-		goto err;
+		return rc;
 
 	pd->pd_id = pd_id;
 
@@ -485,36 +482,23 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev,
 		if (rc) {
 			DP_ERR(dev, "copy error pd_id=0x%x.\n", pd_id);
 			dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd_id);
-			goto err;
+			return rc;
 		}
 
 		pd->uctx = get_qedr_ucontext(context);
 		pd->uctx->pd = pd;
 	}
 
-	return &pd->ibpd;
-
-err:
-	kfree(pd);
-	return ERR_PTR(rc);
+	return 0;
 }
 
-int qedr_dealloc_pd(struct ib_pd *ibpd)
+void qedr_dealloc_pd(struct ib_pd *ibpd)
 {
 	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
 	struct qedr_pd *pd = get_qedr_pd(ibpd);
 
-	if (!pd) {
-		pr_err("Invalid PD received in dealloc_pd\n");
-		return -EINVAL;
-	}
-
 	DP_DEBUG(dev, QEDR_MSG_INIT, "Deallocating PD %d\n", pd->pd_id);
 	dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd->pd_id);
-
-	kfree(pd);
-
-	return 0;
 }
 
 static void qedr_free_pbl(struct qedr_dev *dev,
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 1852b7012bf4..97a6ff3f9afb 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -47,9 +47,9 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *);
 int qedr_dealloc_ucontext(struct ib_ucontext *);
 
 int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
-struct ib_pd *qedr_alloc_pd(struct ib_device *,
-			    struct ib_ucontext *, struct ib_udata *);
-int qedr_dealloc_pd(struct ib_pd *pd);
+int qedr_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx,
+		  struct ib_udata *udata);
+void qedr_dealloc_pd(struct ib_pd *pd);
 
 struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
 			     const struct ib_cq_init_attr *attr,
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index 1ec155823716..256ad2f236c8 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -352,6 +352,7 @@ static const struct ib_device_ops usnic_dev_ops = {
 	.query_port = usnic_ib_query_port,
 	.query_qp = usnic_ib_query_qp,
 	.reg_user_mr = usnic_ib_reg_mr,
+	INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd),
 };
 
 /* Start of PF discovery section */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 9dea18106247..0ced89b51448 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -456,37 +456,23 @@ int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
 	return 0;
 }
 
-struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
-					struct ib_ucontext *context,
-					struct ib_udata *udata)
+int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+		      struct ib_udata *udata)
 {
-	struct usnic_ib_pd *pd;
+	struct usnic_ib_pd *pd = to_upd(ibpd);
 	void *umem_pd;
 
-	usnic_dbg("\n");
-
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	umem_pd = pd->umem_pd = usnic_uiom_alloc_pd();
 	if (IS_ERR_OR_NULL(umem_pd)) {
-		kfree(pd);
-		return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM);
+		return umem_pd ? PTR_ERR(umem_pd) : -ENOMEM;
 	}
 
-	usnic_info("domain 0x%p allocated for context 0x%p and device %s\n",
-		   pd, context, dev_name(&ibdev->dev));
-	return &pd->ibpd;
+	return 0;
 }
 
-int usnic_ib_dealloc_pd(struct ib_pd *pd)
+void usnic_ib_dealloc_pd(struct ib_pd *pd)
 {
-	usnic_info("freeing domain 0x%p\n", pd);
-
 	usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd);
-	kfree(pd);
-	return 0;
 }
 
 struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 99a6d81c2bcd..44a9d2f82bf5 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -51,10 +51,9 @@ int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num);
 int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
 				u16 *pkey);
-struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
-				struct ib_ucontext *context,
-				struct ib_udata *udata);
-int usnic_ib_dealloc_pd(struct ib_pd *pd);
+int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+		      struct ib_udata *udata);
+void usnic_ib_dealloc_pd(struct ib_pd *pd);
 struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
 					struct ib_qp_init_attr *init_attr,
 					struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index e582beaf9430..47e653d2495c 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -195,6 +195,7 @@ static const struct ib_device_ops pvrdma_dev_ops = {
 	.query_qp = pvrdma_query_qp,
 	.reg_user_mr = pvrdma_reg_user_mr,
 	.req_notify_cq = pvrdma_req_notify_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd),
 };
 
 static const struct ib_device_ops pvrdma_dev_srq_ops = {
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
index fafb2add3b44..f44220f72e05 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
@@ -438,37 +438,29 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
 
 /**
  * pvrdma_alloc_pd - allocate protection domain
- * @ibdev: the IB device
+ * @ibpd: PD pointer
  * @context: user context
  * @udata: user data
  *
  * @return: the ib_pd protection domain pointer on success, otherwise errno.
  */
-struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
-			      struct ib_ucontext *context,
-			      struct ib_udata *udata)
+int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+		    struct ib_udata *udata)
 {
-	struct pvrdma_pd *pd;
+	struct ib_device *ibdev = ibpd->device;
+	struct pvrdma_pd *pd = to_vpd(ibpd);
 	struct pvrdma_dev *dev = to_vdev(ibdev);
-	union pvrdma_cmd_req req;
-	union pvrdma_cmd_resp rsp;
+	union pvrdma_cmd_req req = {};
+	union pvrdma_cmd_resp rsp = {};
 	struct pvrdma_cmd_create_pd *cmd = &req.create_pd;
 	struct pvrdma_cmd_create_pd_resp *resp = &rsp.create_pd_resp;
 	struct pvrdma_alloc_pd_resp pd_resp = {0};
 	int ret;
-	void *ptr;
 
 	/* Check allowed max pds */
 	if (!atomic_add_unless(&dev->num_pds, 1, dev->dsr->caps.max_pd))
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd) {
-		ptr = ERR_PTR(-ENOMEM);
-		goto err;
-	}
-
-	memset(cmd, 0, sizeof(*cmd));
 	cmd->hdr.cmd = PVRDMA_CMD_CREATE_PD;
 	cmd->ctx_handle = (context) ? to_vucontext(context)->ctx_handle : 0;
 	ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_PD_RESP);
@@ -476,8 +468,7 @@ struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
 		dev_warn(&dev->pdev->dev,
 			 "failed to allocate protection domain, error: %d\n",
 			 ret);
-		ptr = ERR_PTR(ret);
-		goto freepd;
+		goto err;
 	}
 
 	pd->privileged = !context;
@@ -490,18 +481,16 @@ struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
 			dev_warn(&dev->pdev->dev,
 				 "failed to copy back protection domain\n");
 			pvrdma_dealloc_pd(&pd->ibpd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 
 	/* u32 pd handle */
-	return &pd->ibpd;
+	return 0;
 
-freepd:
-	kfree(pd);
 err:
 	atomic_dec(&dev->num_pds);
-	return ptr;
+	return ret;
 }
 
 /**
@@ -510,14 +499,13 @@ err:
  *
  * @return: 0 on success, otherwise errno.
  */
-int pvrdma_dealloc_pd(struct ib_pd *pd)
+void pvrdma_dealloc_pd(struct ib_pd *pd)
 {
 	struct pvrdma_dev *dev = to_vdev(pd->device);
-	union pvrdma_cmd_req req;
+	union pvrdma_cmd_req req = {};
 	struct pvrdma_cmd_destroy_pd *cmd = &req.destroy_pd;
 	int ret;
 
-	memset(cmd, 0, sizeof(*cmd));
 	cmd->hdr.cmd = PVRDMA_CMD_DESTROY_PD;
 	cmd->pd_handle = to_vpd(pd)->pd_handle;
 
@@ -527,10 +515,7 @@ int pvrdma_dealloc_pd(struct ib_pd *pd)
 			 "could not dealloc protection domain, error: %d\n",
 			 ret);
 
-	kfree(to_vpd(pd));
 	atomic_dec(&dev->num_pds);
-
-	return 0;
 }
 
 /**
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index f7f758d60110..ed91baad1ffa 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -399,10 +399,9 @@ int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
 					  struct ib_udata *udata);
 int pvrdma_dealloc_ucontext(struct ib_ucontext *context);
-struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
-			      struct ib_ucontext *context,
-			      struct ib_udata *udata);
-int pvrdma_dealloc_pd(struct ib_pd *ibpd);
+int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+		    struct ib_udata *udata);
+void pvrdma_dealloc_pd(struct ib_pd *ibpd);
 struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				 u64 virt_addr, int access_flags,
diff --git a/drivers/infiniband/sw/rdmavt/pd.c b/drivers/infiniband/sw/rdmavt/pd.c
index dcc1870b8d23..6033054b22fa 100644
--- a/drivers/infiniband/sw/rdmavt/pd.c
+++ b/drivers/infiniband/sw/rdmavt/pd.c
@@ -50,7 +50,7 @@
 
 /**
  * rvt_alloc_pd - allocate a protection domain
- * @ibdev: ib device
+ * @ibpd: PD
  * @context: optional user context
  * @udata: optional user data
  *
@@ -58,19 +58,14 @@
  *
  * Return: 0 on success
  */
-struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
-			   struct ib_ucontext *context,
-			   struct ib_udata *udata)
+int rvt_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+		 struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibpd->device;
 	struct rvt_dev_info *dev = ib_to_rvt(ibdev);
-	struct rvt_pd *pd;
-	struct ib_pd *ret;
+	struct rvt_pd *pd = ibpd_to_rvtpd(ibpd);
+	int ret = 0;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
 	/*
 	 * While we could continue allocating protecetion domains, being
 	 * constrained only by system resources. The IBTA spec defines that
@@ -81,8 +76,7 @@ struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
 	spin_lock(&dev->n_pds_lock);
 	if (dev->n_pds_allocated == dev->dparms.props.max_pd) {
 		spin_unlock(&dev->n_pds_lock);
-		kfree(pd);
-		ret = ERR_PTR(-ENOMEM);
+		ret = -ENOMEM;
 		goto bail;
 	}
 
@@ -92,8 +86,6 @@ struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
 	/* ib_alloc_pd() will initialize pd->ibpd. */
 	pd->user = !!udata;
 
-	ret = &pd->ibpd;
-
 bail:
 	return ret;
 }
@@ -104,16 +96,11 @@ bail:
  *
  * Return: always 0
  */
-int rvt_dealloc_pd(struct ib_pd *ibpd)
+void rvt_dealloc_pd(struct ib_pd *ibpd)
 {
-	struct rvt_pd *pd = ibpd_to_rvtpd(ibpd);
 	struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
 
 	spin_lock(&dev->n_pds_lock);
 	dev->n_pds_allocated--;
 	spin_unlock(&dev->n_pds_lock);
-
-	kfree(pd);
-
-	return 0;
 }
diff --git a/drivers/infiniband/sw/rdmavt/pd.h b/drivers/infiniband/sw/rdmavt/pd.h
index 1892ca4a9746..7a887e4a45e7 100644
--- a/drivers/infiniband/sw/rdmavt/pd.h
+++ b/drivers/infiniband/sw/rdmavt/pd.h
@@ -50,9 +50,8 @@
 
 #include <rdma/rdma_vt.h>
 
-struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
-			   struct ib_ucontext *context,
-			   struct ib_udata *udata);
-int rvt_dealloc_pd(struct ib_pd *ibpd);
+int rvt_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+		 struct ib_udata *udata);
+void rvt_dealloc_pd(struct ib_pd *ibpd);
 
 #endif          /* DEF_RDMAVTPD_H */
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index b3f0c5578925..a19832c73d5a 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -436,6 +436,7 @@ static const struct ib_device_ops rvt_dev_ops = {
 	.req_notify_cq = rvt_req_notify_cq,
 	.resize_cq = rvt_resize_cq,
 	.unmap_fmr = rvt_unmap_fmr,
+	INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
 };
 
 static noinline int check_support(struct rvt_dev_info *rdi, int verb)
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index b5c91df22047..cd3f14629ba8 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -46,6 +46,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
 	[RXE_TYPE_PD] = {
 		.name		= "rxe-pd",
 		.size		= sizeof(struct rxe_pd),
+		.flags		= RXE_POOL_NO_ALLOC,
 	},
 	[RXE_TYPE_AH] = {
 		.name		= "rxe-ah",
@@ -119,8 +120,10 @@ static void rxe_cache_clean(size_t cnt)
 
 	for (i = 0; i < cnt; i++) {
 		type = &rxe_type_info[i];
-		kmem_cache_destroy(type->cache);
-		type->cache = NULL;
+		if (!(type->flags & RXE_POOL_NO_ALLOC)) {
+			kmem_cache_destroy(type->cache);
+			type->cache = NULL;
+		}
 	}
 }
 
@@ -134,14 +137,17 @@ int rxe_cache_init(void)
 	for (i = 0; i < RXE_NUM_TYPES; i++) {
 		type = &rxe_type_info[i];
 		size = ALIGN(type->size, RXE_POOL_ALIGN);
-		type->cache = kmem_cache_create(type->name, size,
-				RXE_POOL_ALIGN,
-				RXE_POOL_CACHE_FLAGS, NULL);
-		if (!type->cache) {
-			pr_err("Unable to init kmem cache for %s\n",
-			       type->name);
-			err = -ENOMEM;
-			goto err1;
+		if (!(type->flags & RXE_POOL_NO_ALLOC)) {
+			type->cache =
+				kmem_cache_create(type->name, size,
+						  RXE_POOL_ALIGN,
+						  RXE_POOL_CACHE_FLAGS, NULL);
+			if (!type->cache) {
+				pr_err("Unable to init kmem cache for %s\n",
+				       type->name);
+				err = -ENOMEM;
+				goto err1;
+			}
 		}
 	}
 
@@ -415,6 +421,37 @@ out_put_pool:
 	return NULL;
 }
 
+int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
+{
+	unsigned long flags;
+
+	might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
+
+	read_lock_irqsave(&pool->pool_lock, flags);
+	if (pool->state != RXE_POOL_STATE_VALID) {
+		read_unlock_irqrestore(&pool->pool_lock, flags);
+		return -EINVAL;
+	}
+	kref_get(&pool->ref_cnt);
+	read_unlock_irqrestore(&pool->pool_lock, flags);
+
+	kref_get(&pool->rxe->ref_cnt);
+
+	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
+		goto out_put_pool;
+
+	elem->pool = pool;
+	kref_init(&elem->ref_cnt);
+
+	return 0;
+
+out_put_pool:
+	atomic_dec(&pool->num_elem);
+	rxe_dev_put(pool->rxe);
+	rxe_pool_put(pool);
+	return -EINVAL;
+}
+
 void rxe_elem_release(struct kref *kref)
 {
 	struct rxe_pool_entry *elem =
@@ -424,7 +461,8 @@ void rxe_elem_release(struct kref *kref)
 	if (pool->cleanup)
 		pool->cleanup(elem);
 
-	kmem_cache_free(pool_cache(pool), elem);
+	if (!(pool->flags & RXE_POOL_NO_ALLOC))
+		kmem_cache_free(pool_cache(pool), elem);
 	atomic_dec(&pool->num_elem);
 	rxe_dev_put(pool->rxe);
 	rxe_pool_put(pool);
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index 72968c29e01f..2f2cff1cbe43 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -41,6 +41,7 @@ enum rxe_pool_flags {
 	RXE_POOL_ATOMIC		= BIT(0),
 	RXE_POOL_INDEX		= BIT(1),
 	RXE_POOL_KEY		= BIT(2),
+	RXE_POOL_NO_ALLOC	= BIT(4),
 };
 
 enum rxe_elem_type {
@@ -131,6 +132,9 @@ void rxe_pool_cleanup(struct rxe_pool *pool);
 /* allocate an object from pool */
 void *rxe_alloc(struct rxe_pool *pool);
 
+/* connect already allocated object to pool */
+int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem);
+
 /* assign an index to an indexed object and insert object into
  *  pool's rb tree
  */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index cc5a05124ece..051c3930e808 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -191,23 +191,20 @@ static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
 	return 0;
 }
 
-static struct ib_pd *rxe_alloc_pd(struct ib_device *dev,
-				  struct ib_ucontext *context,
-				  struct ib_udata *udata)
+static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+			struct ib_udata *udata)
 {
-	struct rxe_dev *rxe = to_rdev(dev);
-	struct rxe_pd *pd;
+	struct rxe_dev *rxe = to_rdev(ibpd->device);
+	struct rxe_pd *pd = to_rpd(ibpd);
 
-	pd = rxe_alloc(&rxe->pd_pool);
-	return pd ? &pd->ibpd : ERR_PTR(-ENOMEM);
+	return rxe_add_to_pool(&rxe->pd_pool, &pd->pelem);
 }
 
-static int rxe_dealloc_pd(struct ib_pd *ibpd)
+static void rxe_dealloc_pd(struct ib_pd *ibpd)
 {
 	struct rxe_pd *pd = to_rpd(ibpd);
 
 	rxe_drop_ref(pd);
-	return 0;
 }
 
 static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd,
@@ -1183,6 +1180,7 @@ static const struct ib_device_ops rxe_dev_ops = {
 	.reg_user_mr = rxe_reg_user_mr,
 	.req_notify_cq = rxe_req_notify_cq,
 	.resize_cq = rxe_resize_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
 };
 
 int rxe_register_device(struct rxe_dev *rxe)
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 74e04801d34d..70839d3f55d9 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -66,8 +66,8 @@ struct rxe_ucontext {
 };
 
 struct rxe_pd {
+	struct ib_pd            ibpd;
 	struct rxe_pool_entry	pelem;
-	struct ib_pd		ibpd;
 };
 
 struct rxe_ah {
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e29eae4aec84..854d7816787c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2385,10 +2385,9 @@ struct ib_device_ops {
 	int (*dealloc_ucontext)(struct ib_ucontext *context);
 	int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
 	void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
-	struct ib_pd *(*alloc_pd)(struct ib_device *device,
-				  struct ib_ucontext *context,
-				  struct ib_udata *udata);
-	int (*dealloc_pd)(struct ib_pd *pd);
+	int (*alloc_pd)(struct ib_pd *pd, struct ib_ucontext *context,
+			struct ib_udata *udata);
+	void (*dealloc_pd)(struct ib_pd *pd);
 	struct ib_ah *(*create_ah)(struct ib_pd *pd,
 				   struct rdma_ah_attr *ah_attr, u32 flags,
 				   struct ib_udata *udata);
@@ -2530,6 +2529,8 @@ struct ib_device_ops {
 	 */
 	int (*fill_res_entry)(struct sk_buff *msg,
 			      struct rdma_restrack_entry *entry);
+
+	DECLARE_RDMA_OBJ_SIZE(ib_pd);
 };
 
 struct ib_device {
-- 
cgit v1.2.3


From 652432f33c01b2edaa5b2550b423cd894b1c7b9a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 6 Feb 2019 22:41:50 -0700
Subject: RDMA/device: Get rid of reg_state

This really has no purpose anymore, refcount can be used to tell if the
device is still registered. Keeping it around just invites mis-use.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
---
 drivers/infiniband/core/device.c | 8 ++------
 include/rdma/ib_verbs.h          | 6 ------
 2 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 872662a84b16..1c54ded776d0 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -243,7 +243,7 @@ static void ib_device_release(struct device *device)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 
-	WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
+	WARN_ON(refcount_read(&dev->refcount));
 	ib_cache_release_one(dev);
 	ib_security_release_port_pkey_list(dev);
 	kfree(dev->port_pkey_list);
@@ -316,8 +316,7 @@ EXPORT_SYMBOL(_ib_alloc_device);
 void ib_dealloc_device(struct ib_device *device)
 {
 	WARN_ON(!list_empty(&device->client_data_list));
-	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
-		device->reg_state != IB_DEV_UNINITIALIZED);
+	WARN_ON(refcount_read(&device->refcount));
 	rdma_restrack_clean(device);
 	put_device(&device->dev);
 }
@@ -602,7 +601,6 @@ int ib_register_device(struct ib_device *device, const char *name)
 	}
 
 	refcount_set(&device->refcount, 1);
-	device->reg_state = IB_DEV_REGISTERED;
 
 	list_for_each_entry(client, &client_list, list)
 		if (!add_client_context(device, client) && client->add)
@@ -673,8 +671,6 @@ void ib_unregister_device(struct ib_device *device)
 	}
 	write_unlock_irqrestore(&device->client_data_lock, flags);
 	up_write(&lists_rwsem);
-
-	device->reg_state = IB_DEV_UNREGISTERED;
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 854d7816787c..d8ba987e8b29 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2572,12 +2572,6 @@ struct ib_device {
 	struct kobject			*ports_kobj;
 	struct list_head             port_list;
 
-	enum {
-		IB_DEV_UNINITIALIZED,
-		IB_DEV_REGISTERED,
-		IB_DEV_UNREGISTERED
-	}                            reg_state;
-
 	int			     uverbs_abi_ver;
 	u64			     uverbs_cmd_mask;
 	u64			     uverbs_ex_cmd_mask;
-- 
cgit v1.2.3


From e59178d895afa29b671323f8265a1e50afe989e5 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 6 Feb 2019 22:41:52 -0700
Subject: RDMA/devices: Use xarray to store the clients

This gives each client a unique ID and will let us move client_data to use
xarray, and revise the locking scheme.

clients have to be add/removed in strict FIFO/LIFO order as they
interdepend. To support this the client_ids are assigned to increase in
FIFO order. The existing linked list is kept to support reverse iteration
until xarray can get a reverse iteration API.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
---
 drivers/infiniband/core/device.c | 50 ++++++++++++++++++++++++++++++++++++----
 include/rdma/ib_verbs.h          |  3 ++-
 2 files changed, 47 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 3a80f96c2919..f87d85659359 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -65,15 +65,17 @@ struct workqueue_struct *ib_comp_unbound_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
-/* The device_list and client_list contain devices and clients after their
+/* The device_list and clients contain devices and clients after their
  * registration has completed, and the devices and clients are removed
  * during unregistration. */
 static LIST_HEAD(device_list);
 static LIST_HEAD(client_list);
+#define CLIENT_REGISTERED XA_MARK_1
+static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
 
 /*
  * device_mutex and lists_rwsem protect access to both device_list and
- * client_list.  device_mutex protects writer access by device and client
+ * clients.  device_mutex protects writer access by device and client
  * registration / de-registration.  lists_rwsem protects reader access to
  * these lists.  Iterators of these lists must lock it for read, while updates
  * to the lists must be done with a write lock. A special case is when the
@@ -564,6 +566,7 @@ int ib_register_device(struct ib_device *device, const char *name)
 {
 	int ret;
 	struct ib_client *client;
+	unsigned long index;
 
 	setup_dma_device(device);
 
@@ -608,7 +611,7 @@ int ib_register_device(struct ib_device *device, const char *name)
 
 	refcount_set(&device->refcount, 1);
 
-	list_for_each_entry(client, &client_list, list)
+	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED)
 		if (!add_client_context(device, client) && client->add)
 			client->add(device);
 
@@ -680,6 +683,32 @@ void ib_unregister_device(struct ib_device *device)
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
+static int assign_client_id(struct ib_client *client)
+{
+	int ret;
+
+	/*
+	 * The add/remove callbacks must be called in FIFO/LIFO order. To
+	 * achieve this we assign client_ids so they are sorted in
+	 * registration order, and retain a linked list we can reverse iterate
+	 * to get the LIFO order. The extra linked list can go away if xarray
+	 * learns to reverse iterate.
+	 */
+	if (list_empty(&client_list))
+		client->client_id = 0;
+	else
+		client->client_id =
+			list_last_entry(&client_list, struct ib_client, list)
+				->client_id;
+	ret = xa_alloc(&clients, &client->client_id, INT_MAX, client,
+		       GFP_KERNEL);
+	if (ret)
+		goto out;
+
+out:
+	return ret;
+}
+
 /**
  * ib_register_client - Register an IB client
  * @client:Client to register
@@ -696,15 +725,21 @@ EXPORT_SYMBOL(ib_unregister_device);
 int ib_register_client(struct ib_client *client)
 {
 	struct ib_device *device;
+	int ret;
 
 	mutex_lock(&device_mutex);
+	ret = assign_client_id(client);
+	if (ret) {
+		mutex_unlock(&device_mutex);
+		return ret;
+	}
 
 	list_for_each_entry(device, &device_list, core_list)
 		if (!add_client_context(device, client) && client->add)
 			client->add(device);
 
 	down_write(&lists_rwsem);
-	list_add_tail(&client->list, &client_list);
+	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
 	up_write(&lists_rwsem);
 
 	mutex_unlock(&device_mutex);
@@ -729,7 +764,7 @@ void ib_unregister_client(struct ib_client *client)
 	mutex_lock(&device_mutex);
 
 	down_write(&lists_rwsem);
-	list_del(&client->list);
+	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
 	up_write(&lists_rwsem);
 
 	list_for_each_entry(device, &device_list, core_list) {
@@ -765,6 +800,10 @@ void ib_unregister_client(struct ib_client *client)
 		kfree(found_context);
 	}
 
+	down_write(&lists_rwsem);
+	list_del(&client->list);
+	xa_erase(&clients, client->client_id);
+	up_write(&lists_rwsem);
 	mutex_unlock(&device_mutex);
 }
 EXPORT_SYMBOL(ib_unregister_client);
@@ -1422,6 +1461,7 @@ static void __exit ib_core_cleanup(void)
 	destroy_workqueue(ib_comp_wq);
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
+	WARN_ON(!xa_empty(&clients));
 }
 
 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d8ba987e8b29..cc15820513cd 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2610,7 +2610,7 @@ struct ib_device {
 };
 
 struct ib_client {
-	char  *name;
+	const char *name;
 	void (*add)   (struct ib_device *);
 	void (*remove)(struct ib_device *, void *client_data);
 
@@ -2637,6 +2637,7 @@ struct ib_client {
 			const struct sockaddr *addr,
 			void *client_data);
 	struct list_head list;
+	u32 client_id;
 
 	/* kverbs are not required by the client */
 	u8 no_kverbs_req:1;
-- 
cgit v1.2.3


From 0df91bb67334eebaf73d4ba32567e16d55f4f116 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 6 Feb 2019 22:41:53 -0700
Subject: RDMA/devices: Use xarray to store the client_data

Now that we have a small ID for each client we can use xarray instead of
linearly searching linked lists for client data. This will give much
faster and scalable client data lookup, and will lets us revise the
locking scheme.

Since xarray can store 'going_down' using a mark just entirely eliminate
the struct ib_client_data and directly store the client_data value in the
xarray. However this does require a special iterator as we must still
iterate over any NULL client_data values.

Also eliminate the client_data_lock in favour of internal xarray locking.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c | 348 +++++++++++++++++++--------------------
 include/rdma/ib_verbs.h          |  23 ++-
 2 files changed, 186 insertions(+), 185 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f87d85659359..5096593b99e9 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -51,30 +51,72 @@ MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("core kernel InfiniBand API");
 MODULE_LICENSE("Dual BSD/GPL");
 
-struct ib_client_data {
-	struct list_head  list;
-	struct ib_client *client;
-	void *            data;
-	/* The device or client is going down. Do not call client or device
-	 * callbacks other than remove(). */
-	bool		  going_down;
-};
-
 struct workqueue_struct *ib_comp_wq;
 struct workqueue_struct *ib_comp_unbound_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
-/* The device_list and clients contain devices and clients after their
- * registration has completed, and the devices and clients are removed
- * during unregistration. */
-static LIST_HEAD(device_list);
+/*
+ * devices contains devices that have had their names assigned. The
+ * devices may not be registered. Users that care about the registration
+ * status need to call ib_device_try_get() on the device to ensure it is
+ * registered, and keep it registered, for the required duration.
+ *
+ */
+static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
+
+/*
+ * Note that if the *rwsem is held and the *_REGISTERED mark is seen then the
+ * object is guaranteed to be and remain registered for the duration of the
+ * lock.
+ */
+#define DEVICE_REGISTERED XA_MARK_1
+
 static LIST_HEAD(client_list);
 #define CLIENT_REGISTERED XA_MARK_1
 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
 
 /*
- * device_mutex and lists_rwsem protect access to both device_list and
+ * If client_data is registered then the corresponding client must also still
+ * be registered.
+ */
+#define CLIENT_DATA_REGISTERED XA_MARK_1
+/*
+ * xarray has this behavior where it won't iterate over NULL values stored in
+ * allocated arrays.  So we need our own iterator to see all values stored in
+ * the array. This does the same thing as xa_for_each except that it also
+ * returns NULL valued entries if the array is allocating. Simplified to only
+ * work on simple xarrays.
+ */
+static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
+			     xa_mark_t filter)
+{
+	XA_STATE(xas, xa, *indexp);
+	void *entry;
+
+	rcu_read_lock();
+	do {
+		entry = xas_find_marked(&xas, ULONG_MAX, filter);
+		if (xa_is_zero(entry))
+			break;
+	} while (xas_retry(&xas, entry));
+	rcu_read_unlock();
+
+	if (entry) {
+		*indexp = xas.xa_index;
+		if (xa_is_zero(entry))
+			return NULL;
+		return entry;
+	}
+	return XA_ERROR(-ENOENT);
+}
+#define xan_for_each_marked(xa, index, entry, filter)                          \
+	for (index = 0, entry = xan_find_marked(xa, &(index), filter);         \
+	     !xa_is_err(entry);                                                \
+	     (index)++, entry = xan_find_marked(xa, &(index), filter))
+
+/*
+ * device_mutex and lists_rwsem protect access to both devices and
  * clients.  device_mutex protects writer access by device and client
  * registration / de-registration.  lists_rwsem protects reader access to
  * these lists.  Iterators of these lists must lock it for read, while updates
@@ -135,17 +177,6 @@ static int ib_device_check_mandatory(struct ib_device *device)
 	return 0;
 }
 
-static struct ib_device *__ib_device_get_by_index(u32 index)
-{
-	struct ib_device *device;
-
-	list_for_each_entry(device, &device_list, core_list)
-		if (device->index == index)
-			return device;
-
-	return NULL;
-}
-
 /*
  * Caller must perform ib_device_put() to return the device reference count
  * when ib_device_get_by_index() returns valid device pointer.
@@ -155,7 +186,7 @@ struct ib_device *ib_device_get_by_index(u32 index)
 	struct ib_device *device;
 
 	down_read(&lists_rwsem);
-	device = __ib_device_get_by_index(index);
+	device = xa_load(&devices, index);
 	if (device) {
 		if (!ib_device_try_get(device))
 			device = NULL;
@@ -181,8 +212,9 @@ EXPORT_SYMBOL(ib_device_put);
 static struct ib_device *__ib_device_get_by_name(const char *name)
 {
 	struct ib_device *device;
+	unsigned long index;
 
-	list_for_each_entry(device, &device_list, core_list)
+	xa_for_each (&devices, index, device)
 		if (!strcmp(name, dev_name(&device->dev)))
 			return device;
 
@@ -216,12 +248,13 @@ out:
 static int alloc_name(struct ib_device *ibdev, const char *name)
 {
 	struct ib_device *device;
+	unsigned long index;
 	struct ida inuse;
 	int rc;
 	int i;
 
 	ida_init(&inuse);
-	list_for_each_entry(device, &device_list, core_list) {
+	xa_for_each (&devices, index, device) {
 		char buf[IB_DEVICE_NAME_MAX];
 
 		if (sscanf(dev_name(&device->dev), name, &i) != 1)
@@ -256,6 +289,7 @@ static void ib_device_release(struct device *device)
 	ib_security_release_port_pkey_list(dev);
 	kfree(dev->port_pkey_list);
 	kfree(dev->port_immutable);
+	xa_destroy(&dev->client_data);
 	kfree(dev);
 }
 
@@ -306,8 +340,11 @@ struct ib_device *_ib_alloc_device(size_t size)
 
 	INIT_LIST_HEAD(&device->event_handler_list);
 	spin_lock_init(&device->event_handler_lock);
-	rwlock_init(&device->client_data_lock);
-	INIT_LIST_HEAD(&device->client_data_list);
+	/*
+	 * client_data needs to be alloc because we don't want our mark to be
+	 * destroyed if the user stores NULL in the client data.
+	 */
+	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
 	INIT_LIST_HEAD(&device->port_list);
 	init_completion(&device->unreg_completion);
 
@@ -323,7 +360,7 @@ EXPORT_SYMBOL(_ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
-	WARN_ON(!list_empty(&device->client_data_list));
+	WARN_ON(!xa_empty(&device->client_data));
 	WARN_ON(refcount_read(&device->refcount));
 	rdma_restrack_clean(device);
 	put_device(&device->dev);
@@ -332,26 +369,20 @@ EXPORT_SYMBOL(ib_dealloc_device);
 
 static int add_client_context(struct ib_device *device, struct ib_client *client)
 {
-	struct ib_client_data *context;
+	void *entry;
 
 	if (!device->kverbs_provider && !client->no_kverbs_req)
 		return -EOPNOTSUPP;
 
-	context = kmalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return -ENOMEM;
-
-	context->client = client;
-	context->data   = NULL;
-	context->going_down = false;
-
 	down_write(&lists_rwsem);
-	write_lock_irq(&device->client_data_lock);
-	list_add(&context->list, &device->client_data_list);
-	write_unlock_irq(&device->client_data_lock);
+	entry = xa_store(&device->client_data, client->client_id, NULL,
+			 GFP_KERNEL);
+	if (!xa_is_err(entry))
+		xa_set_mark(&device->client_data, client->client_id,
+			    CLIENT_DATA_REGISTERED);
 	up_write(&lists_rwsem);
 
-	return 0;
+	return xa_err(entry);
 }
 
 static int verify_immutable(const struct ib_device *dev, u8 port)
@@ -428,9 +459,10 @@ static int setup_port_pkey_list(struct ib_device *device)
 static void ib_policy_change_task(struct work_struct *work)
 {
 	struct ib_device *dev;
+	unsigned long index;
 
 	down_read(&lists_rwsem);
-	list_for_each_entry(dev, &device_list, core_list) {
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
 		int i;
 
 		for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
@@ -461,28 +493,48 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
 	return NOTIFY_OK;
 }
 
-/**
- *	__dev_new_index	-	allocate an device index
- *
- *	Returns a suitable unique value for a new device interface
- *	number.  It assumes that there are less than 2^32-1 ib devices
- *	will be present in the system.
+/*
+ * Assign the unique string device name and the unique device index.
  */
-static u32 __dev_new_index(void)
+static int assign_name(struct ib_device *device, const char *name)
 {
-	/*
-	 * The device index to allow stable naming.
-	 * Similar to struct net -> ifindex.
-	 */
-	static u32 index;
+	static u32 last_id;
+	int ret;
 
-	for (;;) {
-		if (!(++index))
-			index = 1;
+	/* Assign a unique name to the device */
+	if (strchr(name, '%'))
+		ret = alloc_name(device, name);
+	else
+		ret = dev_set_name(&device->dev, name);
+	if (ret)
+		goto out;
+
+	if (__ib_device_get_by_name(dev_name(&device->dev))) {
+		ret = -ENFILE;
+		goto out;
+	}
+	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
 
-		if (!__ib_device_get_by_index(index))
-			return index;
+	/* Cyclically allocate a user visible ID for the device */
+	device->index = last_id;
+	ret = xa_alloc(&devices, &device->index, INT_MAX, device, GFP_KERNEL);
+	if (ret == -ENOSPC) {
+		device->index = 0;
+		ret = xa_alloc(&devices, &device->index, INT_MAX, device,
+			       GFP_KERNEL);
 	}
+	if (ret)
+		goto out;
+	last_id = device->index + 1;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static void release_name(struct ib_device *device)
+{
+	xa_erase(&devices, device->index);
 }
 
 static void setup_dma_device(struct ib_device *device)
@@ -572,34 +624,21 @@ int ib_register_device(struct ib_device *device, const char *name)
 
 	mutex_lock(&device_mutex);
 
-	if (strchr(name, '%')) {
-		ret = alloc_name(device, name);
-		if (ret)
-			goto out;
-	} else {
-		ret = dev_set_name(&device->dev, name);
-		if (ret)
-			goto out;
-	}
-	if (__ib_device_get_by_name(dev_name(&device->dev))) {
-		ret = -ENFILE;
+	ret = assign_name(device, name);
+	if (ret)
 		goto out;
-	}
-	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
 
 	ret = setup_device(device);
 	if (ret)
-		goto out;
+		goto out_name;
 
 	ret = ib_cache_setup_one(device);
 	if (ret) {
 		dev_warn(&device->dev,
 			 "Couldn't set up InfiniBand P_Key/GID cache\n");
-		goto out;
+		goto out_name;
 	}
 
-	device->index = __dev_new_index();
-
 	ib_device_register_rdmacg(device);
 
 	ret = ib_device_register_sysfs(device);
@@ -616,7 +655,7 @@ int ib_register_device(struct ib_device *device, const char *name)
 			client->add(device);
 
 	down_write(&lists_rwsem);
-	list_add_tail(&device->core_list, &device_list);
+	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
 	up_write(&lists_rwsem);
 	mutex_unlock(&device_mutex);
 	return 0;
@@ -624,6 +663,8 @@ int ib_register_device(struct ib_device *device, const char *name)
 cg_cleanup:
 	ib_device_unregister_rdmacg(device);
 	ib_cache_cleanup_one(device);
+out_name:
+	release_name(device);
 out:
 	mutex_unlock(&device_mutex);
 	return ret;
@@ -638,8 +679,8 @@ EXPORT_SYMBOL(ib_register_device);
  */
 void ib_unregister_device(struct ib_device *device)
 {
-	struct ib_client_data *context, *tmp;
-	unsigned long flags;
+	struct ib_client *client;
+	unsigned long index;
 
 	/*
 	 * Wait for all netlink command callers to finish working on the
@@ -651,34 +692,31 @@ void ib_unregister_device(struct ib_device *device)
 	mutex_lock(&device_mutex);
 
 	down_write(&lists_rwsem);
-	list_del(&device->core_list);
-	write_lock_irq(&device->client_data_lock);
-	list_for_each_entry(context, &device->client_data_list, list)
-		context->going_down = true;
-	write_unlock_irq(&device->client_data_lock);
+	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
+	xa_for_each (&clients, index, client)
+		xa_clear_mark(&device->client_data, index,
+			      CLIENT_DATA_REGISTERED);
 	downgrade_write(&lists_rwsem);
 
-	list_for_each_entry(context, &device->client_data_list, list) {
-		if (context->client->remove)
-			context->client->remove(device, context->data);
-	}
+	list_for_each_entry_reverse(client, &client_list, list)
+		if (xa_get_mark(&device->client_data, client->client_id,
+				CLIENT_DATA_REGISTERED) &&
+		    client->remove)
+			client->remove(device, xa_load(&device->client_data,
+						       client->client_id));
 	up_read(&lists_rwsem);
 
 	ib_device_unregister_sysfs(device);
 	ib_device_unregister_rdmacg(device);
 
+	release_name(device);
+
 	mutex_unlock(&device_mutex);
 
 	ib_cache_cleanup_one(device);
 
 	down_write(&lists_rwsem);
-	write_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry_safe(context, tmp, &device->client_data_list,
-				 list) {
-		list_del(&context->list);
-		kfree(context);
-	}
-	write_unlock_irqrestore(&device->client_data_lock, flags);
+	xa_destroy(&device->client_data);
 	up_write(&lists_rwsem);
 }
 EXPORT_SYMBOL(ib_unregister_device);
@@ -725,6 +763,7 @@ out:
 int ib_register_client(struct ib_client *client)
 {
 	struct ib_device *device;
+	unsigned long index;
 	int ret;
 
 	mutex_lock(&device_mutex);
@@ -734,7 +773,7 @@ int ib_register_client(struct ib_client *client)
 		return ret;
 	}
 
-	list_for_each_entry(device, &device_list, core_list)
+	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED)
 		if (!add_client_context(device, client) && client->add)
 			client->add(device);
 
@@ -758,8 +797,8 @@ EXPORT_SYMBOL(ib_register_client);
  */
 void ib_unregister_client(struct ib_client *client)
 {
-	struct ib_client_data *context;
 	struct ib_device *device;
+	unsigned long index;
 
 	mutex_lock(&device_mutex);
 
@@ -767,37 +806,19 @@ void ib_unregister_client(struct ib_client *client)
 	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
 	up_write(&lists_rwsem);
 
-	list_for_each_entry(device, &device_list, core_list) {
-		struct ib_client_data *found_context = NULL;
-
+	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
 		down_write(&lists_rwsem);
-		write_lock_irq(&device->client_data_lock);
-		list_for_each_entry(context, &device->client_data_list, list)
-			if (context->client == client) {
-				context->going_down = true;
-				found_context = context;
-				break;
-			}
-		write_unlock_irq(&device->client_data_lock);
+		xa_clear_mark(&device->client_data, client->client_id,
+			      CLIENT_DATA_REGISTERED);
 		up_write(&lists_rwsem);
 
 		if (client->remove)
-			client->remove(device, found_context ?
-					       found_context->data : NULL);
-
-		if (!found_context) {
-			dev_warn(&device->dev,
-				 "No client context found for %s\n",
-				 client->name);
-			continue;
-		}
+			client->remove(device, xa_load(&device->client_data,
+						       client->client_id));
 
 		down_write(&lists_rwsem);
-		write_lock_irq(&device->client_data_lock);
-		list_del(&found_context->list);
-		write_unlock_irq(&device->client_data_lock);
+		xa_erase(&device->client_data, client->client_id);
 		up_write(&lists_rwsem);
-		kfree(found_context);
 	}
 
 	down_write(&lists_rwsem);
@@ -808,59 +829,28 @@ void ib_unregister_client(struct ib_client *client)
 }
 EXPORT_SYMBOL(ib_unregister_client);
 
-/**
- * ib_get_client_data - Get IB client context
- * @device:Device to get context for
- * @client:Client to get context for
- *
- * ib_get_client_data() returns client context set with
- * ib_set_client_data().
- */
-void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
-{
-	struct ib_client_data *context;
-	void *ret = NULL;
-	unsigned long flags;
-
-	read_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry(context, &device->client_data_list, list)
-		if (context->client == client) {
-			ret = context->data;
-			break;
-		}
-	read_unlock_irqrestore(&device->client_data_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(ib_get_client_data);
-
 /**
  * ib_set_client_data - Set IB client context
  * @device:Device to set context for
  * @client:Client to set context for
  * @data:Context to set
  *
- * ib_set_client_data() sets client context that can be retrieved with
- * ib_get_client_data().
+ * ib_set_client_data() sets client context data that can be retrieved with
+ * ib_get_client_data(). This can only be called while the client is
+ * registered to the device, once the ib_client remove() callback returns this
+ * cannot be called.
  */
 void ib_set_client_data(struct ib_device *device, struct ib_client *client,
 			void *data)
 {
-	struct ib_client_data *context;
-	unsigned long flags;
-
-	write_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry(context, &device->client_data_list, list)
-		if (context->client == client) {
-			context->data = data;
-			goto out;
-		}
+	void *rc;
 
-	dev_warn(&device->dev, "No client context found for %s\n",
-		 client->name);
+	if (WARN_ON(IS_ERR(data)))
+		data = NULL;
 
-out:
-	write_unlock_irqrestore(&device->client_data_lock, flags);
+	rc = xa_store(&device->client_data, client->client_id, data,
+		      GFP_KERNEL);
+	WARN_ON(xa_is_err(rc));
 }
 EXPORT_SYMBOL(ib_set_client_data);
 
@@ -1018,9 +1008,10 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
 			      void *cookie)
 {
 	struct ib_device *dev;
+	unsigned long index;
 
 	down_read(&lists_rwsem);
-	list_for_each_entry(dev, &device_list, core_list)
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
 		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
 	up_read(&lists_rwsem);
 }
@@ -1034,12 +1025,13 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
 		     struct netlink_callback *cb)
 {
+	unsigned long index;
 	struct ib_device *dev;
 	unsigned int idx = 0;
 	int ret = 0;
 
 	down_read(&lists_rwsem);
-	list_for_each_entry(dev, &device_list, core_list) {
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
 		ret = nldev_cb(dev, skb, cb, idx);
 		if (ret)
 			break;
@@ -1212,26 +1204,25 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
 					    const struct sockaddr *addr)
 {
 	struct net_device *net_dev = NULL;
-	struct ib_client_data *context;
+	unsigned long index;
+	void *client_data;
 
 	if (!rdma_protocol_ib(dev, port))
 		return NULL;
 
 	down_read(&lists_rwsem);
 
-	list_for_each_entry(context, &dev->client_data_list, list) {
-		struct ib_client *client = context->client;
+	xan_for_each_marked (&dev->client_data, index, client_data,
+			     CLIENT_DATA_REGISTERED) {
+		struct ib_client *client = xa_load(&clients, index);
 
-		if (context->going_down)
+		if (!client || !client->get_net_dev_by_params)
 			continue;
 
-		if (client->get_net_dev_by_params) {
-			net_dev = client->get_net_dev_by_params(dev, port, pkey,
-								gid, addr,
-								context->data);
-			if (net_dev)
-				break;
-		}
+		net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
+							addr, client_data);
+		if (net_dev)
+			break;
 	}
 
 	up_read(&lists_rwsem);
@@ -1462,6 +1453,7 @@ static void __exit ib_core_cleanup(void)
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
 	WARN_ON(!xa_empty(&clients));
+	WARN_ON(!xa_empty(&devices));
 }
 
 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index cc15820513cd..8558f31ca46f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2542,12 +2542,7 @@ struct ib_device {
 	struct list_head              event_handler_list;
 	spinlock_t                    event_handler_lock;
 
-	rwlock_t			client_data_lock;
-	struct list_head              core_list;
-	/* Access to the client_data_list is protected by the client_data_lock
-	 * rwlock and the lists_rwsem read-write semaphore
-	 */
-	struct list_head              client_data_list;
+	struct xarray                 client_data;
 
 	struct ib_cache               cache;
 	/**
@@ -2660,7 +2655,21 @@ void ib_unregister_device(struct ib_device *device);
 int ib_register_client   (struct ib_client *client);
 void ib_unregister_client(struct ib_client *client);
 
-void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns the client context data set with
+ * ib_set_client_data(). This can only be called while the client is
+ * registered to the device, once the ib_client remove() callback returns this
+ * cannot be called.
+ */
+static inline void *ib_get_client_data(struct ib_device *device,
+				       struct ib_client *client)
+{
+	return xa_load(&device->client_data, client->client_id);
+}
 void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
 			 void *data);
 void ib_set_device_ops(struct ib_device *device,
-- 
cgit v1.2.3


From 921eab1143aadf976a42cac4605b4d35159b355d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 6 Feb 2019 22:41:54 -0700
Subject: RDMA/devices: Re-organize device.c locking

The locking here started out with a single lock that covered everything
and then has lately veered into crazy town.

The fundamental problem is that several places need to iterate over a
linked list, but also need to drop their locks to avoid deadlock during
client callbacks.

xarray's restartable iteration offers a simple solution to the
problem. Once all the lists are xarrays we can drop locks in the places
that need that and rely on xarray to provide consistency and locking for
the data structure.

The resulting simplification is that each of the three lists has a
dedicated rwsem that must be held when working with the list it
covers. One data structure is no longer covered by multiple locks.

The sleeping semaphore is selected because the read side generally needs
to be held over something sleeping, and using RCU reader locking in those
cases is overkill.

In the process this simplifies the entire registration/unregistration flow
to be the expected list of setups and the reversed list of matching
teardowns, and the registration lock 'refcount' can now be revised to be
released after the ULPs are removed, providing a very sane semantic for
this feature.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c | 361 ++++++++++++++++++++++++---------------
 include/rdma/ib_verbs.h          |   1 +
 2 files changed, 222 insertions(+), 140 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 5096593b99e9..3325be4f91a5 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -37,7 +37,6 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/init.h>
-#include <linux/mutex.h>
 #include <linux/netdevice.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
@@ -56,6 +55,29 @@ struct workqueue_struct *ib_comp_unbound_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
+/*
+ * Each of the three rwsem locks (devices, clients, client_data) protects the
+ * xarray of the same name. Specifically it allows the caller to assert that
+ * the MARK will/will not be changing under the lock, and for devices and
+ * clients, that the value in the xarray is still a valid pointer. Change of
+ * the MARK is linked to the object state, so holding the lock and testing the
+ * MARK also asserts that the contained object is in a certain state.
+ *
+ * This is used to build a two stage register/unregister flow where objects
+ * can continue to be in the xarray even though they are still in progress to
+ * register/unregister.
+ *
+ * The xarray itself provides additional locking, and restartable iteration,
+ * which is also relied on.
+ *
+ * Locks should not be nested, with the exception of client_data, which is
+ * allowed to nest under the read side of the other two locks.
+ *
+ * The devices_rwsem also protects the device name list, any change or
+ * assignment of device name must also hold the write side to guarantee unique
+ * names.
+ */
+
 /*
  * devices contains devices that have had their names assigned. The
  * devices may not be registered. Users that care about the registration
@@ -64,17 +86,13 @@ EXPORT_SYMBOL_GPL(ib_wq);
  *
  */
 static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
-
-/*
- * Note that if the *rwsem is held and the *_REGISTERED mark is seen then the
- * object is guaranteed to be and remain registered for the duration of the
- * lock.
- */
+static DECLARE_RWSEM(devices_rwsem);
 #define DEVICE_REGISTERED XA_MARK_1
 
 static LIST_HEAD(client_list);
 #define CLIENT_REGISTERED XA_MARK_1
 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(clients_rwsem);
 
 /*
  * If client_data is registered then the corresponding client must also still
@@ -115,20 +133,6 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
 	     !xa_is_err(entry);                                                \
 	     (index)++, entry = xan_find_marked(xa, &(index), filter))
 
-/*
- * device_mutex and lists_rwsem protect access to both devices and
- * clients.  device_mutex protects writer access by device and client
- * registration / de-registration.  lists_rwsem protects reader access to
- * these lists.  Iterators of these lists must lock it for read, while updates
- * to the lists must be done with a write lock. A special case is when the
- * device_mutex is locked. In this case locking the lists for read access is
- * not necessary as the device_mutex implies it.
- *
- * lists_rwsem also protects access to the client data list.
- */
-static DEFINE_MUTEX(device_mutex);
-static DECLARE_RWSEM(lists_rwsem);
-
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
 static void ib_policy_change_task(struct work_struct *work);
@@ -185,13 +189,13 @@ struct ib_device *ib_device_get_by_index(u32 index)
 {
 	struct ib_device *device;
 
-	down_read(&lists_rwsem);
+	down_read(&devices_rwsem);
 	device = xa_load(&devices, index);
 	if (device) {
 		if (!ib_device_try_get(device))
 			device = NULL;
 	}
-	up_read(&lists_rwsem);
+	up_read(&devices_rwsem);
 	return device;
 }
 
@@ -225,7 +229,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
 {
 	int ret;
 
-	mutex_lock(&device_mutex);
+	down_write(&devices_rwsem);
 	if (!strcmp(name, dev_name(&ibdev->dev))) {
 		ret = 0;
 		goto out;
@@ -241,7 +245,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
 		goto out;
 	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
 out:
-	mutex_unlock(&device_mutex);
+	up_write(&devices_rwsem);
 	return ret;
 }
 
@@ -253,6 +257,7 @@ static int alloc_name(struct ib_device *ibdev, const char *name)
 	int rc;
 	int i;
 
+	lockdep_assert_held_exclusive(&devices_rwsem);
 	ida_init(&inuse);
 	xa_for_each (&devices, index, device) {
 		char buf[IB_DEVICE_NAME_MAX];
@@ -345,6 +350,7 @@ struct ib_device *_ib_alloc_device(size_t size)
 	 * destroyed if the user stores NULL in the client data.
 	 */
 	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
+	init_rwsem(&device->client_data_rwsem);
 	INIT_LIST_HEAD(&device->port_list);
 	init_completion(&device->unreg_completion);
 
@@ -367,22 +373,86 @@ void ib_dealloc_device(struct ib_device *device)
 }
 EXPORT_SYMBOL(ib_dealloc_device);
 
-static int add_client_context(struct ib_device *device, struct ib_client *client)
+/*
+ * add_client_context() and remove_client_context() must be safe against
+ * parallel calls on the same device - registration/unregistration of both the
+ * device and client can be occurring in parallel.
+ *
+ * The routines need to be a fence, any caller must not return until the add
+ * or remove is fully completed.
+ */
+static int add_client_context(struct ib_device *device,
+			      struct ib_client *client)
 {
-	void *entry;
+	int ret = 0;
 
 	if (!device->kverbs_provider && !client->no_kverbs_req)
-		return -EOPNOTSUPP;
+		return 0;
+
+	down_write(&device->client_data_rwsem);
+	/*
+	 * Another caller to add_client_context got here first and has already
+	 * completely initialized context.
+	 */
+	if (xa_get_mark(&device->client_data, client->client_id,
+		    CLIENT_DATA_REGISTERED))
+		goto out;
+
+	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
+			      GFP_KERNEL));
+	if (ret)
+		goto out;
+	downgrade_write(&device->client_data_rwsem);
+	if (client->add)
+		client->add(device);
+
+	/* Readers shall not see a client until add has been completed */
+	xa_set_mark(&device->client_data, client->client_id,
+		    CLIENT_DATA_REGISTERED);
+	up_read(&device->client_data_rwsem);
+	return 0;
+
+out:
+	up_write(&device->client_data_rwsem);
+	return ret;
+}
+
+static void remove_client_context(struct ib_device *device,
+				  unsigned int client_id)
+{
+	struct ib_client *client;
+	void *client_data;
 
-	down_write(&lists_rwsem);
-	entry = xa_store(&device->client_data, client->client_id, NULL,
-			 GFP_KERNEL);
-	if (!xa_is_err(entry))
-		xa_set_mark(&device->client_data, client->client_id,
-			    CLIENT_DATA_REGISTERED);
-	up_write(&lists_rwsem);
+	down_write(&device->client_data_rwsem);
+	if (!xa_get_mark(&device->client_data, client_id,
+			 CLIENT_DATA_REGISTERED)) {
+		up_write(&device->client_data_rwsem);
+		return;
+	}
+	client_data = xa_load(&device->client_data, client_id);
+	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
+	client = xa_load(&clients, client_id);
+	downgrade_write(&device->client_data_rwsem);
 
-	return xa_err(entry);
+	/*
+	 * Notice we cannot be holding any exclusive locks when calling the
+	 * remove callback as the remove callback can recurse back into any
+	 * public functions in this module and thus try for any locks those
+	 * functions take.
+	 *
+	 * For this reason clients and drivers should not call the
+	 * unregistration functions will holdling any locks.
+	 *
+	 * It tempting to drop the client_data_rwsem too, but this is required
+	 * to ensure that unregister_client does not return until all clients
+	 * are completely unregistered, which is required to avoid module
+	 * unloading races.
+	 */
+	if (client->remove)
+		client->remove(device, client_data);
+
+	xa_erase(&device->client_data, client_id);
+	up_read(&device->client_data_rwsem);
 }
 
 static int verify_immutable(const struct ib_device *dev, u8 port)
@@ -461,7 +531,7 @@ static void ib_policy_change_task(struct work_struct *work)
 	struct ib_device *dev;
 	unsigned long index;
 
-	down_read(&lists_rwsem);
+	down_read(&devices_rwsem);
 	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
 		int i;
 
@@ -478,7 +548,7 @@ static void ib_policy_change_task(struct work_struct *work)
 				ib_security_cache_change(dev, i, sp);
 		}
 	}
-	up_read(&lists_rwsem);
+	up_read(&devices_rwsem);
 }
 
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
@@ -501,6 +571,7 @@ static int assign_name(struct ib_device *device, const char *name)
 	static u32 last_id;
 	int ret;
 
+	down_write(&devices_rwsem);
 	/* Assign a unique name to the device */
 	if (strchr(name, '%'))
 		ret = alloc_name(device, name);
@@ -528,13 +599,17 @@ static int assign_name(struct ib_device *device, const char *name)
 	last_id = device->index + 1;
 
 	ret = 0;
+
 out:
+	up_write(&devices_rwsem);
 	return ret;
 }
 
 static void release_name(struct ib_device *device)
 {
+	down_write(&devices_rwsem);
 	xa_erase(&devices, device->index);
+	up_write(&devices_rwsem);
 }
 
 static void setup_dma_device(struct ib_device *device)
@@ -572,11 +647,18 @@ static void setup_dma_device(struct ib_device *device)
 	}
 }
 
+/*
+ * setup_device() allocates memory and sets up data that requires calling the
+ * device ops, this is the only reason these actions are not done during
+ * ib_alloc_device. It is undone by ib_dealloc_device().
+ */
 static int setup_device(struct ib_device *device)
 {
 	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
 	int ret;
 
+	setup_dma_device(device);
+
 	ret = ib_device_check_mandatory(device);
 	if (ret)
 		return ret;
@@ -605,6 +687,54 @@ static int setup_device(struct ib_device *device)
 	return 0;
 }
 
+static void disable_device(struct ib_device *device)
+{
+	struct ib_client *client;
+
+	WARN_ON(!refcount_read(&device->refcount));
+
+	down_write(&devices_rwsem);
+	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
+	up_write(&devices_rwsem);
+
+	down_read(&clients_rwsem);
+	list_for_each_entry_reverse(client, &client_list, list)
+		remove_client_context(device, client->client_id);
+	up_read(&clients_rwsem);
+
+	/* Pairs with refcount_set in enable_device */
+	ib_device_put(device);
+	wait_for_completion(&device->unreg_completion);
+}
+
+/*
+ * An enabled device is visible to all clients and to all the public facing
+ * APIs that return a device pointer.
+ */
+static int enable_device(struct ib_device *device)
+{
+	struct ib_client *client;
+	unsigned long index;
+	int ret;
+
+	refcount_set(&device->refcount, 1);
+	down_write(&devices_rwsem);
+	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
+	up_write(&devices_rwsem);
+
+	down_read(&clients_rwsem);
+	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+		ret = add_client_context(device, client);
+		if (ret) {
+			up_read(&clients_rwsem);
+			disable_device(device);
+			return ret;
+		}
+	}
+	up_read(&clients_rwsem);
+	return 0;
+}
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device:Device to register
@@ -617,26 +747,20 @@ static int setup_device(struct ib_device *device)
 int ib_register_device(struct ib_device *device, const char *name)
 {
 	int ret;
-	struct ib_client *client;
-	unsigned long index;
-
-	setup_dma_device(device);
-
-	mutex_lock(&device_mutex);
 
 	ret = assign_name(device, name);
 	if (ret)
-		goto out;
+		return ret;
 
 	ret = setup_device(device);
 	if (ret)
-		goto out_name;
+		goto out;
 
 	ret = ib_cache_setup_one(device);
 	if (ret) {
 		dev_warn(&device->dev,
 			 "Couldn't set up InfiniBand P_Key/GID cache\n");
-		goto out_name;
+		goto out;
 	}
 
 	ib_device_register_rdmacg(device);
@@ -648,25 +772,19 @@ int ib_register_device(struct ib_device *device, const char *name)
 		goto cg_cleanup;
 	}
 
-	refcount_set(&device->refcount, 1);
-
-	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED)
-		if (!add_client_context(device, client) && client->add)
-			client->add(device);
+	ret = enable_device(device);
+	if (ret)
+		goto sysfs_cleanup;
 
-	down_write(&lists_rwsem);
-	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
-	up_write(&lists_rwsem);
-	mutex_unlock(&device_mutex);
 	return 0;
 
+sysfs_cleanup:
+	ib_device_unregister_sysfs(device);
 cg_cleanup:
 	ib_device_unregister_rdmacg(device);
 	ib_cache_cleanup_one(device);
-out_name:
-	release_name(device);
 out:
-	mutex_unlock(&device_mutex);
+	release_name(device);
 	return ret;
 }
 EXPORT_SYMBOL(ib_register_device);
@@ -679,45 +797,11 @@ EXPORT_SYMBOL(ib_register_device);
  */
 void ib_unregister_device(struct ib_device *device)
 {
-	struct ib_client *client;
-	unsigned long index;
-
-	/*
-	 * Wait for all netlink command callers to finish working on the
-	 * device.
-	 */
-	ib_device_put(device);
-	wait_for_completion(&device->unreg_completion);
-
-	mutex_lock(&device_mutex);
-
-	down_write(&lists_rwsem);
-	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
-	xa_for_each (&clients, index, client)
-		xa_clear_mark(&device->client_data, index,
-			      CLIENT_DATA_REGISTERED);
-	downgrade_write(&lists_rwsem);
-
-	list_for_each_entry_reverse(client, &client_list, list)
-		if (xa_get_mark(&device->client_data, client->client_id,
-				CLIENT_DATA_REGISTERED) &&
-		    client->remove)
-			client->remove(device, xa_load(&device->client_data,
-						       client->client_id));
-	up_read(&lists_rwsem);
-
+	disable_device(device);
 	ib_device_unregister_sysfs(device);
 	ib_device_unregister_rdmacg(device);
-
-	release_name(device);
-
-	mutex_unlock(&device_mutex);
-
 	ib_cache_cleanup_one(device);
-
-	down_write(&lists_rwsem);
-	xa_destroy(&device->client_data);
-	up_write(&lists_rwsem);
+	release_name(device);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
@@ -725,6 +809,7 @@ static int assign_client_id(struct ib_client *client)
 {
 	int ret;
 
+	down_write(&clients_rwsem);
 	/*
 	 * The add/remove callbacks must be called in FIFO/LIFO order. To
 	 * achieve this we assign client_ids so they are sorted in
@@ -743,7 +828,11 @@ static int assign_client_id(struct ib_client *client)
 	if (ret)
 		goto out;
 
+	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
+	list_add_tail(&client->list, &client_list);
+
 out:
+	up_write(&clients_rwsem);
 	return ret;
 }
 
@@ -766,23 +855,20 @@ int ib_register_client(struct ib_client *client)
 	unsigned long index;
 	int ret;
 
-	mutex_lock(&device_mutex);
 	ret = assign_client_id(client);
-	if (ret) {
-		mutex_unlock(&device_mutex);
+	if (ret)
 		return ret;
-	}
-
-	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED)
-		if (!add_client_context(device, client) && client->add)
-			client->add(device);
-
-	down_write(&lists_rwsem);
-	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
-	up_write(&lists_rwsem);
-
-	mutex_unlock(&device_mutex);
 
+	down_read(&devices_rwsem);
+	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
+		ret = add_client_context(device, client);
+		if (ret) {
+			up_read(&devices_rwsem);
+			ib_unregister_client(client);
+			return ret;
+		}
+	}
+	up_read(&devices_rwsem);
 	return 0;
 }
 EXPORT_SYMBOL(ib_register_client);
@@ -794,38 +880,31 @@ EXPORT_SYMBOL(ib_register_client);
  * Upper level users use ib_unregister_client() to remove their client
  * registration.  When ib_unregister_client() is called, the client
  * will receive a remove callback for each IB device still registered.
+ *
+ * This is a full fence, once it returns no client callbacks will be called,
+ * or are running in another thread.
  */
 void ib_unregister_client(struct ib_client *client)
 {
 	struct ib_device *device;
 	unsigned long index;
 
-	mutex_lock(&device_mutex);
-
-	down_write(&lists_rwsem);
+	down_write(&clients_rwsem);
 	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
-	up_write(&lists_rwsem);
-
-	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
-		down_write(&lists_rwsem);
-		xa_clear_mark(&device->client_data, client->client_id,
-			      CLIENT_DATA_REGISTERED);
-		up_write(&lists_rwsem);
-
-		if (client->remove)
-			client->remove(device, xa_load(&device->client_data,
-						       client->client_id));
-
-		down_write(&lists_rwsem);
-		xa_erase(&device->client_data, client->client_id);
-		up_write(&lists_rwsem);
-	}
+	up_write(&clients_rwsem);
+	/*
+	 * Every device still known must be serialized to make sure we are
+	 * done with the client callbacks before we return.
+	 */
+	down_read(&devices_rwsem);
+	xa_for_each (&devices, index, device)
+		remove_client_context(device, client->client_id);
+	up_read(&devices_rwsem);
 
-	down_write(&lists_rwsem);
+	down_write(&clients_rwsem);
 	list_del(&client->list);
 	xa_erase(&clients, client->client_id);
-	up_write(&lists_rwsem);
-	mutex_unlock(&device_mutex);
+	up_write(&clients_rwsem);
 }
 EXPORT_SYMBOL(ib_unregister_client);
 
@@ -1010,10 +1089,10 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
 	struct ib_device *dev;
 	unsigned long index;
 
-	down_read(&lists_rwsem);
+	down_read(&devices_rwsem);
 	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
 		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
-	up_read(&lists_rwsem);
+	up_read(&devices_rwsem);
 }
 
 /**
@@ -1030,15 +1109,14 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
 	unsigned int idx = 0;
 	int ret = 0;
 
-	down_read(&lists_rwsem);
+	down_read(&devices_rwsem);
 	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
 		ret = nldev_cb(dev, skb, cb, idx);
 		if (ret)
 			break;
 		idx++;
 	}
-
-	up_read(&lists_rwsem);
+	up_read(&devices_rwsem);
 	return ret;
 }
 
@@ -1196,6 +1274,7 @@ EXPORT_SYMBOL(ib_find_pkey);
  * @gid:	A GID that the net_dev uses to communicate.
  * @addr:	Contains the IP address that the request specified as its
  *		destination.
+ *
  */
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
 					    u8 port,
@@ -1210,8 +1289,11 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
 	if (!rdma_protocol_ib(dev, port))
 		return NULL;
 
-	down_read(&lists_rwsem);
-
+	/*
+	 * Holding the read side guarantees that the client will not become
+	 * unregistered while we are calling get_net_dev_by_params()
+	 */
+	down_read(&dev->client_data_rwsem);
 	xan_for_each_marked (&dev->client_data, index, client_data,
 			     CLIENT_DATA_REGISTERED) {
 		struct ib_client *client = xa_load(&clients, index);
@@ -1224,8 +1306,7 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
 		if (net_dev)
 			break;
 	}
-
-	up_read(&lists_rwsem);
+	up_read(&dev->client_data_rwsem);
 
 	return net_dev;
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 8558f31ca46f..135fab2c016c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2542,6 +2542,7 @@ struct ib_device {
 	struct list_head              event_handler_list;
 	spinlock_t                    event_handler_lock;
 
+	struct rw_semaphore	      client_data_rwsem;
 	struct xarray                 client_data;
 
 	struct ib_cache               cache;
-- 
cgit v1.2.3


From f818b82b80164014d7ee3df89bb110808778c796 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Fri, 8 Feb 2019 14:02:45 -0500
Subject: XArray: Mark xa_insert and xa_reserve as must_check

If the user doesn't care about the return value from xa_insert(), then
they should be using xa_store() instead.  The point of xa_reserve() is
to get the return value early before taking another lock, so this should
also be __must_check.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 include/linux/xarray.h | 23 ++++++++++++-----------
 lib/test_xarray.c      | 10 +++++-----
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 5ed6b462e754..687c150071a5 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -497,12 +497,13 @@ void *__xa_erase(struct xarray *, unsigned long index);
 void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
 void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
 		void *entry, gfp_t);
-int __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t);
+int __must_check __xa_insert(struct xarray *, unsigned long index,
+		void *entry, gfp_t);
 int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
 		struct xa_limit, gfp_t);
 int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
 		struct xa_limit, u32 *next, gfp_t);
-int __xa_reserve(struct xarray *, unsigned long index, gfp_t);
+int __must_check __xa_reserve(struct xarray *, unsigned long index, gfp_t);
 void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
 void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
 
@@ -704,8 +705,8 @@ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
  * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
  * -ENOMEM if memory could not be allocated.
  */
-static inline int xa_insert(struct xarray *xa, unsigned long index,
-		void *entry, gfp_t gfp)
+static inline int __must_check xa_insert(struct xarray *xa,
+		unsigned long index, void *entry, gfp_t gfp)
 {
 	int err;
 
@@ -733,8 +734,8 @@ static inline int xa_insert(struct xarray *xa, unsigned long index,
  * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
  * -ENOMEM if memory could not be allocated.
  */
-static inline int xa_insert_bh(struct xarray *xa, unsigned long index,
-		void *entry, gfp_t gfp)
+static inline int __must_check xa_insert_bh(struct xarray *xa,
+		unsigned long index, void *entry, gfp_t gfp)
 {
 	int err;
 
@@ -762,8 +763,8 @@ static inline int xa_insert_bh(struct xarray *xa, unsigned long index,
  * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
  * -ENOMEM if memory could not be allocated.
  */
-static inline int xa_insert_irq(struct xarray *xa, unsigned long index,
-		void *entry, gfp_t gfp)
+static inline int __must_check xa_insert_irq(struct xarray *xa,
+		unsigned long index, void *entry, gfp_t gfp)
 {
 	int err;
 
@@ -978,7 +979,7 @@ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
  * May sleep if the @gfp flags permit.
  * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
  */
-static inline
+static inline __must_check
 int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
 {
 	int ret;
@@ -1002,7 +1003,7 @@ int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
  * disabling softirqs.
  * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
  */
-static inline
+static inline __must_check
 int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
 {
 	int ret;
@@ -1026,7 +1027,7 @@ int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
  * disabling interrupts.
  * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
  */
-static inline
+static inline __must_check
 int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
 {
 	int ret;
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index eaf53f742c72..3eaa40ddc390 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -364,21 +364,21 @@ static noinline void check_reserve(struct xarray *xa)
 
 	/* An array with a reserved entry is not empty */
 	XA_BUG_ON(xa, !xa_empty(xa));
-	xa_reserve(xa, 12345678, GFP_KERNEL);
+	XA_BUG_ON(xa, xa_reserve(xa, 12345678, GFP_KERNEL) != 0);
 	XA_BUG_ON(xa, xa_empty(xa));
 	XA_BUG_ON(xa, xa_load(xa, 12345678));
 	xa_release(xa, 12345678);
 	XA_BUG_ON(xa, !xa_empty(xa));
 
 	/* Releasing a used entry does nothing */
-	xa_reserve(xa, 12345678, GFP_KERNEL);
+	XA_BUG_ON(xa, xa_reserve(xa, 12345678, GFP_KERNEL) != 0);
 	XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_NOWAIT) != NULL);
 	xa_release(xa, 12345678);
 	xa_erase_index(xa, 12345678);
 	XA_BUG_ON(xa, !xa_empty(xa));
 
 	/* cmpxchg sees a reserved entry as NULL */
-	xa_reserve(xa, 12345678, GFP_KERNEL);
+	XA_BUG_ON(xa, xa_reserve(xa, 12345678, GFP_KERNEL) != 0);
 	XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, NULL, xa_mk_value(12345678),
 				GFP_NOWAIT) != NULL);
 	xa_release(xa, 12345678);
@@ -386,7 +386,7 @@ static noinline void check_reserve(struct xarray *xa)
 	XA_BUG_ON(xa, !xa_empty(xa));
 
 	/* But xa_insert does not */
-	xa_reserve(xa, 12345678, GFP_KERNEL);
+	XA_BUG_ON(xa, xa_reserve(xa, 12345678, GFP_KERNEL) != 0);
 	XA_BUG_ON(xa, xa_insert(xa, 12345678, xa_mk_value(12345678), 0) !=
 			-EBUSY);
 	XA_BUG_ON(xa, xa_empty(xa));
@@ -395,7 +395,7 @@ static noinline void check_reserve(struct xarray *xa)
 
 	/* Can iterate through a reserved entry */
 	xa_store_index(xa, 5, GFP_KERNEL);
-	xa_reserve(xa, 6, GFP_KERNEL);
+	XA_BUG_ON(xa, xa_reserve(xa, 6, GFP_KERNEL) != 0);
 	xa_store_index(xa, 7, GFP_KERNEL);
 
 	xa_for_each(xa, index, entry) {
-- 
cgit v1.2.3


From d11a3998985b351aaab6bbdc23bc884bd5e815c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 9 Feb 2019 15:40:24 -0700
Subject: block: kill QUEUE_FLAG_FLUSH_NQ

We have various helpers for setting/clearing this flag, and also
a helper to check if the queue supports queueable flushes or not.
But nobody uses them anymore, kill it with fire.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c        | 1 -
 block/blk-settings.c          | 9 ---------
 drivers/ata/libata-scsi.c     | 2 --
 drivers/block/null_blk_main.c | 1 -
 include/linux/blkdev.h        | 7 -------
 5 files changed, 20 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f8120832ca7b..c782e81db627 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -132,7 +132,6 @@ static const char *const blk_queue_flag_name[] = {
 	QUEUE_FLAG_NAME(POLL),
 	QUEUE_FLAG_NAME(WC),
 	QUEUE_FLAG_NAME(FUA),
-	QUEUE_FLAG_NAME(FLUSH_NQ),
 	QUEUE_FLAG_NAME(DAX),
 	QUEUE_FLAG_NAME(STATS),
 	QUEUE_FLAG_NAME(POLL_STATS),
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 3e7038e475ee..6375afaedcec 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -799,15 +799,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
-void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
-{
-	if (queueable)
-		blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q);
-	else
-		blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q);
-}
-EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
-
 /**
  * blk_set_queue_depth - tell the block layer about the device queue depth
  * @q:		the request queue for the device
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 3d4887d0e84a..dfe66d00dd5b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1318,8 +1318,6 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
 		scsi_change_queue_depth(sdev, depth);
 	}
 
-	blk_queue_flush_queueable(q, false);
-
 	if (dev->flags & ATA_DFLAG_TRUSTED)
 		sdev->security_supported = 1;
 
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 62c9654b9ce8..83c38a6217d7 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -1678,7 +1678,6 @@ static int null_add_dev(struct nullb_device *dev)
 	if (dev->cache_size > 0) {
 		set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
 		blk_queue_write_cache(nullb->q, true, true);
-		blk_queue_flush_queueable(nullb->q, true);
 	}
 
 	if (dev->zoned) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 338604dff7d0..24ccab51085f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -592,7 +592,6 @@ struct request_queue {
 #define QUEUE_FLAG_POLL	       19	/* IO polling enabled if set */
 #define QUEUE_FLAG_WC	       20	/* Write back caching */
 #define QUEUE_FLAG_FUA	       21	/* device supports FUA writes */
-#define QUEUE_FLAG_FLUSH_NQ    22	/* flush not queueuable */
 #define QUEUE_FLAG_DAX         23	/* device supports DAX */
 #define QUEUE_FLAG_STATS       24	/* track IO start and completion times */
 #define QUEUE_FLAG_POLL_STATS  25	/* collecting stats for hybrid polling */
@@ -1069,7 +1068,6 @@ extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
-extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 
 /*
@@ -1446,11 +1444,6 @@ static inline unsigned int block_size(struct block_device *bdev)
 	return bdev->bd_block_size;
 }
 
-static inline bool queue_flush_queueable(struct request_queue *q)
-{
-	return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
-}
-
 typedef struct {struct page *v;} Sector;
 
 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
-- 
cgit v1.2.3


From eca7abf31abba2acac445ec6a1d3f94cf0cab918 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 9 Feb 2019 15:42:07 -0700
Subject: block: queue flag cleanup

We have QUEUE_FLAG_DEFAULT defined, but it's not used anymore since
the legacy IO stack is gone. Kill it.

Sanitize the queue flags in general, they use spaces (for some
reason), and the space is pretty sparse. With the flags renumbered,
we can more clearly see how many we have available.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 58 +++++++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 24ccab51085f..3603270cb82d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -572,37 +572,33 @@ struct request_queue {
 	u64			write_hints[BLK_MAX_WRITE_HINTS];
 };
 
-#define QUEUE_FLAG_STOPPED	1	/* queue is stopped */
-#define QUEUE_FLAG_DYING	2	/* queue being torn down */
-#define QUEUE_FLAG_BIDI		4	/* queue supports bidi requests */
-#define QUEUE_FLAG_NOMERGES     5	/* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP	6	/* complete on same CPU-group */
-#define QUEUE_FLAG_FAIL_IO	7	/* fake timeout */
-#define QUEUE_FLAG_NONROT	9	/* non-rotational device (SSD) */
-#define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
-#define QUEUE_FLAG_IO_STAT     10	/* do disk/partitions IO accounting */
-#define QUEUE_FLAG_DISCARD     11	/* supports DISCARD */
-#define QUEUE_FLAG_NOXMERGES   12	/* No extended merges */
-#define QUEUE_FLAG_ADD_RANDOM  13	/* Contributes to random pool */
-#define QUEUE_FLAG_SECERASE    14	/* supports secure erase */
-#define QUEUE_FLAG_SAME_FORCE  15	/* force complete on same CPU */
-#define QUEUE_FLAG_DEAD        16	/* queue tear-down finished */
-#define QUEUE_FLAG_INIT_DONE   17	/* queue is initialized */
-#define QUEUE_FLAG_NO_SG_MERGE 18	/* don't attempt to merge SG segments*/
-#define QUEUE_FLAG_POLL	       19	/* IO polling enabled if set */
-#define QUEUE_FLAG_WC	       20	/* Write back caching */
-#define QUEUE_FLAG_FUA	       21	/* device supports FUA writes */
-#define QUEUE_FLAG_DAX         23	/* device supports DAX */
-#define QUEUE_FLAG_STATS       24	/* track IO start and completion times */
-#define QUEUE_FLAG_POLL_STATS  25	/* collecting stats for hybrid polling */
-#define QUEUE_FLAG_REGISTERED  26	/* queue has been registered to a disk */
-#define QUEUE_FLAG_SCSI_PASSTHROUGH 27	/* queue supports SCSI commands */
-#define QUEUE_FLAG_QUIESCED    28	/* queue has been quiesced */
-#define QUEUE_FLAG_PCI_P2PDMA  29	/* device supports PCI p2p requests */
-
-#define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
-				 (1 << QUEUE_FLAG_SAME_COMP)	|	\
-				 (1 << QUEUE_FLAG_ADD_RANDOM))
+#define QUEUE_FLAG_STOPPED	0	/* queue is stopped */
+#define QUEUE_FLAG_DYING	1	/* queue being torn down */
+#define QUEUE_FLAG_BIDI		2	/* queue supports bidi requests */
+#define QUEUE_FLAG_NOMERGES     3	/* disable merge attempts */
+#define QUEUE_FLAG_SAME_COMP	4	/* complete on same CPU-group */
+#define QUEUE_FLAG_FAIL_IO	5	/* fake timeout */
+#define QUEUE_FLAG_NONROT	6	/* non-rotational device (SSD) */
+#define QUEUE_FLAG_VIRT		QUEUE_FLAG_NONROT /* paravirt device */
+#define QUEUE_FLAG_IO_STAT	7	/* do disk/partitions IO accounting */
+#define QUEUE_FLAG_DISCARD	8	/* supports DISCARD */
+#define QUEUE_FLAG_NOXMERGES	9	/* No extended merges */
+#define QUEUE_FLAG_ADD_RANDOM	10	/* Contributes to random pool */
+#define QUEUE_FLAG_SECERASE	11	/* supports secure erase */
+#define QUEUE_FLAG_SAME_FORCE	12	/* force complete on same CPU */
+#define QUEUE_FLAG_DEAD		13	/* queue tear-down finished */
+#define QUEUE_FLAG_INIT_DONE	14	/* queue is initialized */
+#define QUEUE_FLAG_NO_SG_MERGE	15	/* don't attempt to merge SG segments*/
+#define QUEUE_FLAG_POLL		16	/* IO polling enabled if set */
+#define QUEUE_FLAG_WC		17	/* Write back caching */
+#define QUEUE_FLAG_FUA		18	/* device supports FUA writes */
+#define QUEUE_FLAG_DAX		19	/* device supports DAX */
+#define QUEUE_FLAG_STATS	20	/* track IO start and completion times */
+#define QUEUE_FLAG_POLL_STATS	21	/* collecting stats for hybrid polling */
+#define QUEUE_FLAG_REGISTERED	22	/* queue has been registered to a disk */
+#define QUEUE_FLAG_SCSI_PASSTHROUGH 23	/* queue supports SCSI commands */
+#define QUEUE_FLAG_QUIESCED	24	/* queue has been quiesced */
+#define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_SAME_COMP))
-- 
cgit v1.2.3


From c65ea996595005be470fbfa16711deba414fd33b Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard@mvista.com>
Date: Tue, 23 Oct 2018 11:29:02 -0500
Subject: ipmi: Fix how the lower layers are told to watch for messages

The IPMI driver has a mechanism to tell the lower layers it needs
to watch for messages, commands, and watchdogs (so it doesn't
needlessly poll).  However, it needed some extensions, it needed
a way to tell what is being waited for so it could set the timeout
appropriately.

The update to the lower layer was also being done once a second
at best because it was done in the main timeout handler.  However,
if a command is sent and a response message is coming back,
it needed to be started immediately.  So modify the code to
update immediately if it needs to be enabled.  Disable is still
lazy.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
Tested-by: Kamlakant Patel <kamlakant.patel@cavium.com>
---
 drivers/char/ipmi/ipmi_msghandler.c | 119 +++++++++++++++++++++++++-----------
 drivers/char/ipmi/ipmi_si_intf.c    |   5 +-
 drivers/char/ipmi/ipmi_ssif.c       |  26 +++++---
 include/linux/ipmi_smi.h            |  36 ++++++++---
 4 files changed, 134 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index c518659b4d9f..2e008efa735f 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -529,9 +529,22 @@ struct ipmi_smi {
 	unsigned int     waiting_events_count; /* How many events in queue? */
 	char             delivering_events;
 	char             event_msg_printed;
+
+	/* How many users are waiting for events? */
 	atomic_t         event_waiters;
 	unsigned int     ticks_to_req_ev;
-	int              last_needs_timer;
+
+	/* How many users are waiting for commands? */
+	atomic_t         command_waiters;
+
+	/* How many users are waiting for watchdogs? */
+	atomic_t         watchdog_waiters;
+
+	/*
+	 * Tells what the lower layer has last been asked to watch for,
+	 * messages and/or watchdogs.  Protected by xmit_msgs_lock.
+	 */
+	unsigned int     last_watch_mask;
 
 	/*
 	 * The event receiver for my BMC, only really used at panic
@@ -1078,6 +1091,29 @@ static int intf_err_seq(struct ipmi_smi *intf,
 	return rv;
 }
 
+/* Must be called with xmit_msgs_lock held. */
+static void smi_tell_to_watch(struct ipmi_smi *intf,
+			      unsigned int flags,
+			      struct ipmi_smi_msg *smi_msg)
+{
+	if (flags & IPMI_WATCH_MASK_CHECK_MESSAGES) {
+		if (!smi_msg)
+			return;
+
+		if (!smi_msg->needs_response)
+			return;
+	}
+
+	if (!intf->handlers->set_need_watch)
+		return;
+
+	if ((intf->last_watch_mask & flags) == flags)
+		return;
+
+	intf->last_watch_mask |= flags;
+	intf->handlers->set_need_watch(intf->send_info,
+				       intf->last_watch_mask);
+}
 
 int ipmi_create_user(unsigned int          if_num,
 		     const struct ipmi_user_hndl *handler,
@@ -1141,8 +1177,9 @@ int ipmi_create_user(unsigned int          if_num,
 	spin_unlock_irqrestore(&intf->seq_lock, flags);
 	if (handler->ipmi_watchdog_pretimeout) {
 		/* User wants pretimeouts, so make sure to watch for them. */
-		if (atomic_inc_return(&intf->event_waiters) == 1)
-			need_waiter(intf);
+		if (atomic_inc_return(&intf->watchdog_waiters) == 1)
+			smi_tell_to_watch(intf, IPMI_WATCH_MASK_CHECK_WATCHDOG,
+					  NULL);
 	}
 	srcu_read_unlock(&ipmi_interfaces_srcu, index);
 	*user = new_user;
@@ -1214,7 +1251,7 @@ static void _ipmi_destroy_user(struct ipmi_user *user)
 		user->handler->shutdown(user->handler_data);
 
 	if (user->handler->ipmi_watchdog_pretimeout)
-		atomic_dec(&intf->event_waiters);
+		atomic_dec(&intf->watchdog_waiters);
 
 	if (user->gets_events)
 		atomic_dec(&intf->event_waiters);
@@ -1569,8 +1606,8 @@ int ipmi_register_for_cmd(struct ipmi_user *user,
 		goto out_unlock;
 	}
 
-	if (atomic_inc_return(&intf->event_waiters) == 1)
-		need_waiter(intf);
+	if (atomic_inc_return(&intf->command_waiters) == 1)
+		smi_tell_to_watch(intf, IPMI_WATCH_MASK_CHECK_COMMANDS, NULL);
 
 	list_add_rcu(&rcvr->link, &intf->cmd_rcvrs);
 
@@ -1620,7 +1657,7 @@ int ipmi_unregister_for_cmd(struct ipmi_user *user,
 	synchronize_rcu();
 	release_ipmi_user(user, index);
 	while (rcvrs) {
-		atomic_dec(&intf->event_waiters);
+		atomic_dec(&intf->command_waiters);
 		rcvr = rcvrs;
 		rcvrs = rcvr->next;
 		kfree(rcvr);
@@ -1737,22 +1774,21 @@ static struct ipmi_smi_msg *smi_add_send_msg(struct ipmi_smi *intf,
 	return smi_msg;
 }
 
-
 static void smi_send(struct ipmi_smi *intf,
 		     const struct ipmi_smi_handlers *handlers,
 		     struct ipmi_smi_msg *smi_msg, int priority)
 {
 	int run_to_completion = intf->run_to_completion;
+	unsigned long flags = 0;
 
-	if (run_to_completion) {
-		smi_msg = smi_add_send_msg(intf, smi_msg, priority);
-	} else {
-		unsigned long flags;
-
+	if (!run_to_completion)
 		spin_lock_irqsave(&intf->xmit_msgs_lock, flags);
-		smi_msg = smi_add_send_msg(intf, smi_msg, priority);
+	smi_msg = smi_add_send_msg(intf, smi_msg, priority);
+
+	smi_tell_to_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES, smi_msg);
+
+	if (!run_to_completion)
 		spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags);
-	}
 
 	if (smi_msg)
 		handlers->sender(intf->send_info, smi_msg);
@@ -1950,6 +1986,9 @@ static int i_ipmi_req_ipmb(struct ipmi_smi        *intf,
 				ipmb_seq, broadcast,
 				source_address, source_lun);
 
+		/* We will be getting a response in the BMC message queue. */
+		smi_msg->needs_response = true;
+
 		/*
 		 * Copy the message into the recv message data, so we
 		 * can retransmit it later if necessary.
@@ -2137,6 +2176,7 @@ static int i_ipmi_request(struct ipmi_user     *user,
 			goto out;
 		}
 	}
+	smi_msg->needs_response = false;
 
 	rcu_read_lock();
 	if (intf->in_shutdown) {
@@ -3351,6 +3391,8 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers,
 	INIT_LIST_HEAD(&intf->hp_xmit_msgs);
 	spin_lock_init(&intf->events_lock);
 	atomic_set(&intf->event_waiters, 0);
+	atomic_set(&intf->watchdog_waiters, 0);
+	atomic_set(&intf->command_waiters, 0);
 	intf->ticks_to_req_ev = IPMI_REQUEST_EV_TIME;
 	INIT_LIST_HEAD(&intf->waiting_events);
 	intf->waiting_events_count = 0;
@@ -4365,6 +4407,9 @@ static void smi_recv_tasklet(unsigned long val)
 			intf->curr_msg = newmsg;
 		}
 	}
+
+	smi_tell_to_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES, newmsg);
+
 	if (!run_to_completion)
 		spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags);
 	if (newmsg)
@@ -4492,7 +4537,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent,
 			      struct list_head *timeouts,
 			      unsigned long timeout_period,
 			      int slot, unsigned long *flags,
-			      unsigned int *waiting_msgs)
+			      unsigned int *watch_mask)
 {
 	struct ipmi_recv_msg *msg;
 
@@ -4504,7 +4549,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent,
 
 	if (timeout_period < ent->timeout) {
 		ent->timeout -= timeout_period;
-		(*waiting_msgs)++;
+		*watch_mask |= IPMI_WATCH_MASK_CHECK_MESSAGES;
 		return;
 	}
 
@@ -4523,7 +4568,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent,
 		struct ipmi_smi_msg *smi_msg;
 		/* More retries, send again. */
 
-		(*waiting_msgs)++;
+		*watch_mask |= IPMI_WATCH_MASK_CHECK_MESSAGES;
 
 		/*
 		 * Start with the max timer, set to normal timer after
@@ -4575,13 +4620,13 @@ static unsigned int ipmi_timeout_handler(struct ipmi_smi *intf,
 	struct ipmi_recv_msg *msg, *msg2;
 	unsigned long        flags;
 	int                  i;
-	unsigned int         waiting_msgs = 0;
+	unsigned int         watch_mask = 0;
 
 	if (!intf->bmc_registered) {
 		kref_get(&intf->refcount);
 		if (!schedule_work(&intf->bmc_reg_work)) {
 			kref_put(&intf->refcount, intf_free);
-			waiting_msgs++;
+			watch_mask |= IPMI_WATCH_MASK_INTERNAL;
 		}
 	}
 
@@ -4601,7 +4646,7 @@ static unsigned int ipmi_timeout_handler(struct ipmi_smi *intf,
 	for (i = 0; i < IPMI_IPMB_NUM_SEQ; i++)
 		check_msg_timeout(intf, &intf->seq_table[i],
 				  &timeouts, timeout_period, i,
-				  &flags, &waiting_msgs);
+				  &flags, &watch_mask);
 	spin_unlock_irqrestore(&intf->seq_lock, flags);
 
 	list_for_each_entry_safe(msg, msg2, &timeouts, link)
@@ -4632,7 +4677,7 @@ static unsigned int ipmi_timeout_handler(struct ipmi_smi *intf,
 
 	tasklet_schedule(&intf->recv_tasklet);
 
-	return waiting_msgs;
+	return watch_mask;
 }
 
 static void ipmi_request_event(struct ipmi_smi *intf)
@@ -4652,37 +4697,43 @@ static atomic_t stop_operation;
 static void ipmi_timeout(struct timer_list *unused)
 {
 	struct ipmi_smi *intf;
-	int nt = 0, index;
+	unsigned int watch_mask = 0;
+	int index;
+	unsigned long flags;
 
 	if (atomic_read(&stop_operation))
 		return;
 
 	index = srcu_read_lock(&ipmi_interfaces_srcu);
 	list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
-		int lnt = 0;
-
 		if (atomic_read(&intf->event_waiters)) {
 			intf->ticks_to_req_ev--;
 			if (intf->ticks_to_req_ev == 0) {
 				ipmi_request_event(intf);
 				intf->ticks_to_req_ev = IPMI_REQUEST_EV_TIME;
 			}
-			lnt++;
+			watch_mask |= IPMI_WATCH_MASK_INTERNAL;
 		}
 
-		lnt += ipmi_timeout_handler(intf, IPMI_TIMEOUT_TIME);
+		if (atomic_read(&intf->watchdog_waiters))
+			watch_mask |= IPMI_WATCH_MASK_CHECK_WATCHDOG;
 
-		lnt = !!lnt;
-		if (lnt != intf->last_needs_timer &&
-					intf->handlers->set_need_watch)
-			intf->handlers->set_need_watch(intf->send_info, lnt);
-		intf->last_needs_timer = lnt;
+		if (atomic_read(&intf->command_waiters))
+			watch_mask |= IPMI_WATCH_MASK_CHECK_COMMANDS;
+
+		watch_mask |= ipmi_timeout_handler(intf, IPMI_TIMEOUT_TIME);
 
-		nt += lnt;
+		spin_lock_irqsave(&intf->xmit_msgs_lock, flags);
+		if (watch_mask != intf->last_watch_mask &&
+					intf->handlers->set_need_watch)
+			intf->handlers->set_need_watch(intf->send_info,
+						       watch_mask);
+		intf->last_watch_mask = watch_mask;
+		spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags);
 	}
 	srcu_read_unlock(&ipmi_interfaces_srcu, index);
 
-	if (nt)
+	if (watch_mask)
 		mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES);
 }
 
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index f1b9fda6b9df..c81c84a723b6 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1060,10 +1060,13 @@ static void request_events(void *send_info)
 	atomic_set(&smi_info->req_events, 1);
 }
 
-static void set_need_watch(void *send_info, bool enable)
+static void set_need_watch(void *send_info, unsigned int watch_mask)
 {
 	struct smi_info *smi_info = send_info;
 	unsigned long flags;
+	int enable;
+
+	enable = !!(watch_mask & ~IPMI_WATCH_MASK_INTERNAL);
 
 	atomic_set(&smi_info->need_watch, enable);
 	spin_lock_irqsave(&smi_info->si_lock, flags);
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 1aacc1144d2a..a1219af32105 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -93,8 +93,8 @@
 /*
  * Timeout for the watch, only used for get flag timer.
  */
-#define SSIF_WATCH_TIMEOUT_MSEC	   100
-#define SSIF_WATCH_TIMEOUT_JIFFIES msecs_to_jiffies(SSIF_WATCH_TIMEOUT_MSEC)
+#define SSIF_WATCH_MSG_TIMEOUT		msecs_to_jiffies(10)
+#define SSIF_WATCH_WATCHDOG_TIMEOUT	msecs_to_jiffies(250)
 
 enum ssif_intf_state {
 	SSIF_NORMAL,
@@ -276,7 +276,7 @@ struct ssif_info {
 	struct timer_list retry_timer;
 	int retries_left;
 
-	bool need_watch;		/* Need to look for flags? */
+	long watch_timeout;		/* Timeout for flags check, 0 if off. */
 	struct timer_list watch_timer;	/* Flag fetch timer. */
 
 	/* Info from SSIF cmd */
@@ -578,9 +578,9 @@ static void watch_timeout(struct timer_list *t)
 		return;
 
 	flags = ipmi_ssif_lock_cond(ssif_info, &oflags);
-	if (ssif_info->need_watch) {
+	if (ssif_info->watch_timeout) {
 		mod_timer(&ssif_info->watch_timer,
-			  jiffies + SSIF_WATCH_TIMEOUT_JIFFIES);
+			  jiffies + ssif_info->watch_timeout);
 		if (SSIF_IDLE(ssif_info)) {
 			start_flag_fetch(ssif_info, flags); /* Releases lock */
 			return;
@@ -1121,17 +1121,23 @@ static void request_events(void *send_info)
  * Upper layer is changing the flag saying whether we need to request
  * flags periodically or not.
  */
-static void ssif_set_need_watch(void *send_info, bool enable)
+static void ssif_set_need_watch(void *send_info, unsigned int watch_mask)
 {
 	struct ssif_info *ssif_info = (struct ssif_info *) send_info;
 	unsigned long oflags, *flags;
+	long timeout = 0;
+
+	if (watch_mask & IPMI_WATCH_MASK_CHECK_MESSAGES)
+		timeout = SSIF_WATCH_MSG_TIMEOUT;
+	else if (watch_mask & ~IPMI_WATCH_MASK_INTERNAL)
+		timeout = SSIF_WATCH_WATCHDOG_TIMEOUT;
 
 	flags = ipmi_ssif_lock_cond(ssif_info, &oflags);
-	if (enable != ssif_info->need_watch) {
-		ssif_info->need_watch = enable;
-		if (ssif_info->need_watch)
+	if (timeout != ssif_info->watch_timeout) {
+		ssif_info->watch_timeout = timeout;
+		if (ssif_info->watch_timeout)
 			mod_timer(&ssif_info->watch_timer,
-				  jiffies + SSIF_WATCH_TIMEOUT_JIFFIES);
+				  jiffies + ssif_info->watch_timeout);
 	}
 	ipmi_ssif_unlock_cond(ssif_info, flags);
 }
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index 8c4e2ab696c3..da6abb06a5dc 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -30,6 +30,17 @@ struct device;
 /* Structure for the low-level drivers. */
 struct ipmi_smi;
 
+/*
+ * Flags for set_check_watch() below.  Tells if the SMI should be
+ * waiting for watchdog timeouts, commands and/or messages.  There is
+ * also an internal flag for the message handler, SMIs should ignore
+ * it.
+ */
+#define IPMI_WATCH_MASK_INTERNAL	(1 << 0)
+#define IPMI_WATCH_MASK_CHECK_MESSAGES	(1 << 1)
+#define IPMI_WATCH_MASK_CHECK_WATCHDOG	(1 << 2)
+#define IPMI_WATCH_MASK_CHECK_COMMANDS	(1 << 3)
+
 /*
  * Messages to/from the lower layer.  The smi interface will take one
  * of these to send. After the send has occurred and a response has
@@ -55,8 +66,16 @@ struct ipmi_smi_msg {
 	int           rsp_size;
 	unsigned char rsp[IPMI_MAX_MSG_LENGTH];
 
-	/* Will be called when the system is done with the message
-	   (presumably to free it). */
+	/*
+	 * There should be a response message coming back in the BMC
+	 * message queue.
+	 */
+	bool needs_response;
+
+	/*
+	 * Will be called when the system is done with the message
+	 * (presumably to free it).
+	 */
 	void (*done)(struct ipmi_smi_msg *msg);
 };
 
@@ -105,12 +124,15 @@ struct ipmi_smi_handlers {
 
 	/*
 	 * Called by the upper layer when some user requires that the
-	 * interface watch for events, received messages, watchdog
-	 * pretimeouts, or not.  Used by the SMI to know if it should
-	 * watch for these.  This may be NULL if the SMI does not
-	 * implement it.
+	 * interface watch for received messages and watchdog
+	 * pretimeouts (basically do a "Get Flags", or not.  Used by
+	 * the SMI to know if it should watch for these.  This may be
+	 * NULL if the SMI does not implement it.  watch_mask is from
+	 * IPMI_WATCH_MASK_xxx above.  The interface should run slower
+	 * timeouts for just watchdog checking or faster timeouts when
+	 * waiting for the message queue.
 	 */
-	void (*set_need_watch)(void *send_info, bool enable);
+	void (*set_need_watch)(void *send_info, unsigned int watch_mask);
 
 	/*
 	 * Called when flushing all pending messages.
-- 
cgit v1.2.3


From e1891cffd4c4896a899337a243273f0e23c028df Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard@mvista.com>
Date: Wed, 24 Oct 2018 15:17:04 -0500
Subject: ipmi: Make the smi watcher be disabled immediately when not needed

The code to tell the lower layer to enable or disable watching for
certain things was lazy in disabling, it waited until a timer tick
to see if a disable was necessary.  Not a really big deal, but it
could be improved.

Modify the code to enable and disable watching immediately and don't
do it from the background timer any more.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
Tested-by: Kamlakant Patel <kamlakant.patel@cavium.com>
---
 drivers/char/ipmi/ipmi_msghandler.c | 164 ++++++++++++++++++++----------------
 drivers/char/ipmi/ipmi_si_intf.c    |   2 +-
 drivers/char/ipmi/ipmi_ssif.c       |   2 +-
 include/linux/ipmi_smi.h            |  17 +---
 4 files changed, 96 insertions(+), 89 deletions(-)

(limited to 'include')

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 2e008efa735f..5bc84fbbeee5 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -534,15 +534,20 @@ struct ipmi_smi {
 	atomic_t         event_waiters;
 	unsigned int     ticks_to_req_ev;
 
+	spinlock_t       watch_lock; /* For dealing with watch stuff below. */
+
 	/* How many users are waiting for commands? */
-	atomic_t         command_waiters;
+	unsigned int     command_waiters;
 
 	/* How many users are waiting for watchdogs? */
-	atomic_t         watchdog_waiters;
+	unsigned int     watchdog_waiters;
+
+	/* How many users are waiting for message responses? */
+	unsigned int     response_waiters;
 
 	/*
 	 * Tells what the lower layer has last been asked to watch for,
-	 * messages and/or watchdogs.  Protected by xmit_msgs_lock.
+	 * messages and/or watchdogs.  Protected by watch_lock.
 	 */
 	unsigned int     last_watch_mask;
 
@@ -938,6 +943,64 @@ static void deliver_err_response(struct ipmi_smi *intf,
 	deliver_local_response(intf, msg);
 }
 
+static void smi_add_watch(struct ipmi_smi *intf, unsigned int flags)
+{
+	unsigned long iflags;
+
+	if (!intf->handlers->set_need_watch)
+		return;
+
+	spin_lock_irqsave(&intf->watch_lock, iflags);
+	if (flags & IPMI_WATCH_MASK_CHECK_MESSAGES)
+		intf->response_waiters++;
+
+	if (flags & IPMI_WATCH_MASK_CHECK_WATCHDOG)
+		intf->watchdog_waiters++;
+
+	if (flags & IPMI_WATCH_MASK_CHECK_COMMANDS)
+		intf->command_waiters++;
+
+	if ((intf->last_watch_mask & flags) != flags) {
+		intf->last_watch_mask |= flags;
+		intf->handlers->set_need_watch(intf->send_info,
+					       intf->last_watch_mask);
+	}
+	spin_unlock_irqrestore(&intf->watch_lock, iflags);
+}
+
+static void smi_remove_watch(struct ipmi_smi *intf, unsigned int flags)
+{
+	unsigned long iflags;
+
+	if (!intf->handlers->set_need_watch)
+		return;
+
+	spin_lock_irqsave(&intf->watch_lock, iflags);
+	if (flags & IPMI_WATCH_MASK_CHECK_MESSAGES)
+		intf->response_waiters--;
+
+	if (flags & IPMI_WATCH_MASK_CHECK_WATCHDOG)
+		intf->watchdog_waiters--;
+
+	if (flags & IPMI_WATCH_MASK_CHECK_COMMANDS)
+		intf->command_waiters--;
+
+	flags = 0;
+	if (intf->response_waiters)
+		flags |= IPMI_WATCH_MASK_CHECK_MESSAGES;
+	if (intf->watchdog_waiters)
+		flags |= IPMI_WATCH_MASK_CHECK_WATCHDOG;
+	if (intf->command_waiters)
+		flags |= IPMI_WATCH_MASK_CHECK_COMMANDS;
+
+	if (intf->last_watch_mask != flags) {
+		intf->last_watch_mask = flags;
+		intf->handlers->set_need_watch(intf->send_info,
+					       intf->last_watch_mask);
+	}
+	spin_unlock_irqrestore(&intf->watch_lock, iflags);
+}
+
 /*
  * Find the next sequence number not being used and add the given
  * message with the given timeout to the sequence table.  This must be
@@ -981,6 +1044,7 @@ static int intf_next_seq(struct ipmi_smi      *intf,
 		*seq = i;
 		*seqid = intf->seq_table[i].seqid;
 		intf->curr_seq = (i+1)%IPMI_IPMB_NUM_SEQ;
+		smi_add_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES);
 		need_waiter(intf);
 	} else {
 		rv = -EAGAIN;
@@ -1019,6 +1083,7 @@ static int intf_find_seq(struct ipmi_smi      *intf,
 				&& (ipmi_addr_equal(addr, &msg->addr))) {
 			*recv_msg = msg;
 			intf->seq_table[seq].inuse = 0;
+			smi_remove_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES);
 			rv = 0;
 		}
 	}
@@ -1080,6 +1145,7 @@ static int intf_err_seq(struct ipmi_smi *intf,
 		struct seq_table *ent = &intf->seq_table[seq];
 
 		ent->inuse = 0;
+		smi_remove_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES);
 		msg = ent->recv_msg;
 		rv = 0;
 	}
@@ -1091,30 +1157,6 @@ static int intf_err_seq(struct ipmi_smi *intf,
 	return rv;
 }
 
-/* Must be called with xmit_msgs_lock held. */
-static void smi_tell_to_watch(struct ipmi_smi *intf,
-			      unsigned int flags,
-			      struct ipmi_smi_msg *smi_msg)
-{
-	if (flags & IPMI_WATCH_MASK_CHECK_MESSAGES) {
-		if (!smi_msg)
-			return;
-
-		if (!smi_msg->needs_response)
-			return;
-	}
-
-	if (!intf->handlers->set_need_watch)
-		return;
-
-	if ((intf->last_watch_mask & flags) == flags)
-		return;
-
-	intf->last_watch_mask |= flags;
-	intf->handlers->set_need_watch(intf->send_info,
-				       intf->last_watch_mask);
-}
-
 int ipmi_create_user(unsigned int          if_num,
 		     const struct ipmi_user_hndl *handler,
 		     void                  *handler_data,
@@ -1175,12 +1217,9 @@ int ipmi_create_user(unsigned int          if_num,
 	spin_lock_irqsave(&intf->seq_lock, flags);
 	list_add_rcu(&new_user->link, &intf->users);
 	spin_unlock_irqrestore(&intf->seq_lock, flags);
-	if (handler->ipmi_watchdog_pretimeout) {
+	if (handler->ipmi_watchdog_pretimeout)
 		/* User wants pretimeouts, so make sure to watch for them. */
-		if (atomic_inc_return(&intf->watchdog_waiters) == 1)
-			smi_tell_to_watch(intf, IPMI_WATCH_MASK_CHECK_WATCHDOG,
-					  NULL);
-	}
+		smi_add_watch(intf, IPMI_WATCH_MASK_CHECK_WATCHDOG);
 	srcu_read_unlock(&ipmi_interfaces_srcu, index);
 	*user = new_user;
 	return 0;
@@ -1251,7 +1290,7 @@ static void _ipmi_destroy_user(struct ipmi_user *user)
 		user->handler->shutdown(user->handler_data);
 
 	if (user->handler->ipmi_watchdog_pretimeout)
-		atomic_dec(&intf->watchdog_waiters);
+		smi_remove_watch(intf, IPMI_WATCH_MASK_CHECK_WATCHDOG);
 
 	if (user->gets_events)
 		atomic_dec(&intf->event_waiters);
@@ -1264,6 +1303,7 @@ static void _ipmi_destroy_user(struct ipmi_user *user)
 		if (intf->seq_table[i].inuse
 		    && (intf->seq_table[i].recv_msg->user == user)) {
 			intf->seq_table[i].inuse = 0;
+			smi_remove_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES);
 			ipmi_free_recv_msg(intf->seq_table[i].recv_msg);
 		}
 	}
@@ -1606,8 +1646,7 @@ int ipmi_register_for_cmd(struct ipmi_user *user,
 		goto out_unlock;
 	}
 
-	if (atomic_inc_return(&intf->command_waiters) == 1)
-		smi_tell_to_watch(intf, IPMI_WATCH_MASK_CHECK_COMMANDS, NULL);
+	smi_add_watch(intf, IPMI_WATCH_MASK_CHECK_COMMANDS);
 
 	list_add_rcu(&rcvr->link, &intf->cmd_rcvrs);
 
@@ -1657,7 +1696,7 @@ int ipmi_unregister_for_cmd(struct ipmi_user *user,
 	synchronize_rcu();
 	release_ipmi_user(user, index);
 	while (rcvrs) {
-		atomic_dec(&intf->command_waiters);
+		smi_remove_watch(intf, IPMI_WATCH_MASK_CHECK_COMMANDS);
 		rcvr = rcvrs;
 		rcvrs = rcvr->next;
 		kfree(rcvr);
@@ -1785,8 +1824,6 @@ static void smi_send(struct ipmi_smi *intf,
 		spin_lock_irqsave(&intf->xmit_msgs_lock, flags);
 	smi_msg = smi_add_send_msg(intf, smi_msg, priority);
 
-	smi_tell_to_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES, smi_msg);
-
 	if (!run_to_completion)
 		spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags);
 
@@ -1986,9 +2023,6 @@ static int i_ipmi_req_ipmb(struct ipmi_smi        *intf,
 				ipmb_seq, broadcast,
 				source_address, source_lun);
 
-		/* We will be getting a response in the BMC message queue. */
-		smi_msg->needs_response = true;
-
 		/*
 		 * Copy the message into the recv message data, so we
 		 * can retransmit it later if necessary.
@@ -2176,7 +2210,6 @@ static int i_ipmi_request(struct ipmi_user     *user,
 			goto out;
 		}
 	}
-	smi_msg->needs_response = false;
 
 	rcu_read_lock();
 	if (intf->in_shutdown) {
@@ -3390,9 +3423,8 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers,
 	INIT_LIST_HEAD(&intf->xmit_msgs);
 	INIT_LIST_HEAD(&intf->hp_xmit_msgs);
 	spin_lock_init(&intf->events_lock);
+	spin_lock_init(&intf->watch_lock);
 	atomic_set(&intf->event_waiters, 0);
-	atomic_set(&intf->watchdog_waiters, 0);
-	atomic_set(&intf->command_waiters, 0);
 	intf->ticks_to_req_ev = IPMI_REQUEST_EV_TIME;
 	INIT_LIST_HEAD(&intf->waiting_events);
 	intf->waiting_events_count = 0;
@@ -4408,8 +4440,6 @@ static void smi_recv_tasklet(unsigned long val)
 		}
 	}
 
-	smi_tell_to_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES, newmsg);
-
 	if (!run_to_completion)
 		spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags);
 	if (newmsg)
@@ -4537,7 +4567,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent,
 			      struct list_head *timeouts,
 			      unsigned long timeout_period,
 			      int slot, unsigned long *flags,
-			      unsigned int *watch_mask)
+			      bool *need_timer)
 {
 	struct ipmi_recv_msg *msg;
 
@@ -4549,13 +4579,14 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent,
 
 	if (timeout_period < ent->timeout) {
 		ent->timeout -= timeout_period;
-		*watch_mask |= IPMI_WATCH_MASK_CHECK_MESSAGES;
+		*need_timer = true;
 		return;
 	}
 
 	if (ent->retries_left == 0) {
 		/* The message has used all its retries. */
 		ent->inuse = 0;
+		smi_remove_watch(intf, IPMI_WATCH_MASK_CHECK_MESSAGES);
 		msg = ent->recv_msg;
 		list_add_tail(&msg->link, timeouts);
 		if (ent->broadcast)
@@ -4568,7 +4599,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent,
 		struct ipmi_smi_msg *smi_msg;
 		/* More retries, send again. */
 
-		*watch_mask |= IPMI_WATCH_MASK_CHECK_MESSAGES;
+		*need_timer = true;
 
 		/*
 		 * Start with the max timer, set to normal timer after
@@ -4613,20 +4644,20 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent,
 	}
 }
 
-static unsigned int ipmi_timeout_handler(struct ipmi_smi *intf,
-					 unsigned long timeout_period)
+static bool ipmi_timeout_handler(struct ipmi_smi *intf,
+				 unsigned long timeout_period)
 {
 	struct list_head     timeouts;
 	struct ipmi_recv_msg *msg, *msg2;
 	unsigned long        flags;
 	int                  i;
-	unsigned int         watch_mask = 0;
+	bool                 need_timer = false;
 
 	if (!intf->bmc_registered) {
 		kref_get(&intf->refcount);
 		if (!schedule_work(&intf->bmc_reg_work)) {
 			kref_put(&intf->refcount, intf_free);
-			watch_mask |= IPMI_WATCH_MASK_INTERNAL;
+			need_timer = true;
 		}
 	}
 
@@ -4646,7 +4677,7 @@ static unsigned int ipmi_timeout_handler(struct ipmi_smi *intf,
 	for (i = 0; i < IPMI_IPMB_NUM_SEQ; i++)
 		check_msg_timeout(intf, &intf->seq_table[i],
 				  &timeouts, timeout_period, i,
-				  &flags, &watch_mask);
+				  &flags, &need_timer);
 	spin_unlock_irqrestore(&intf->seq_lock, flags);
 
 	list_for_each_entry_safe(msg, msg2, &timeouts, link)
@@ -4677,7 +4708,7 @@ static unsigned int ipmi_timeout_handler(struct ipmi_smi *intf,
 
 	tasklet_schedule(&intf->recv_tasklet);
 
-	return watch_mask;
+	return need_timer;
 }
 
 static void ipmi_request_event(struct ipmi_smi *intf)
@@ -4697,9 +4728,8 @@ static atomic_t stop_operation;
 static void ipmi_timeout(struct timer_list *unused)
 {
 	struct ipmi_smi *intf;
-	unsigned int watch_mask = 0;
+	bool need_timer = false;
 	int index;
-	unsigned long flags;
 
 	if (atomic_read(&stop_operation))
 		return;
@@ -4712,28 +4742,14 @@ static void ipmi_timeout(struct timer_list *unused)
 				ipmi_request_event(intf);
 				intf->ticks_to_req_ev = IPMI_REQUEST_EV_TIME;
 			}
-			watch_mask |= IPMI_WATCH_MASK_INTERNAL;
+			need_timer = true;
 		}
 
-		if (atomic_read(&intf->watchdog_waiters))
-			watch_mask |= IPMI_WATCH_MASK_CHECK_WATCHDOG;
-
-		if (atomic_read(&intf->command_waiters))
-			watch_mask |= IPMI_WATCH_MASK_CHECK_COMMANDS;
-
-		watch_mask |= ipmi_timeout_handler(intf, IPMI_TIMEOUT_TIME);
-
-		spin_lock_irqsave(&intf->xmit_msgs_lock, flags);
-		if (watch_mask != intf->last_watch_mask &&
-					intf->handlers->set_need_watch)
-			intf->handlers->set_need_watch(intf->send_info,
-						       watch_mask);
-		intf->last_watch_mask = watch_mask;
-		spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags);
+		need_timer |= ipmi_timeout_handler(intf, IPMI_TIMEOUT_TIME);
 	}
 	srcu_read_unlock(&ipmi_interfaces_srcu, index);
 
-	if (watch_mask)
+	if (need_timer)
 		mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES);
 }
 
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index c81c84a723b6..ae99d6a14789 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1066,7 +1066,7 @@ static void set_need_watch(void *send_info, unsigned int watch_mask)
 	unsigned long flags;
 	int enable;
 
-	enable = !!(watch_mask & ~IPMI_WATCH_MASK_INTERNAL);
+	enable = !!watch_mask;
 
 	atomic_set(&smi_info->need_watch, enable);
 	spin_lock_irqsave(&smi_info->si_lock, flags);
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index a1219af32105..e4abaa8e22bc 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -1129,7 +1129,7 @@ static void ssif_set_need_watch(void *send_info, unsigned int watch_mask)
 
 	if (watch_mask & IPMI_WATCH_MASK_CHECK_MESSAGES)
 		timeout = SSIF_WATCH_MSG_TIMEOUT;
-	else if (watch_mask & ~IPMI_WATCH_MASK_INTERNAL)
+	else if (watch_mask)
 		timeout = SSIF_WATCH_WATCHDOG_TIMEOUT;
 
 	flags = ipmi_ssif_lock_cond(ssif_info, &oflags);
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index da6abb06a5dc..4dc66157d872 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -32,14 +32,11 @@ struct ipmi_smi;
 
 /*
  * Flags for set_check_watch() below.  Tells if the SMI should be
- * waiting for watchdog timeouts, commands and/or messages.  There is
- * also an internal flag for the message handler, SMIs should ignore
- * it.
+ * waiting for watchdog timeouts, commands and/or messages.
  */
-#define IPMI_WATCH_MASK_INTERNAL	(1 << 0)
-#define IPMI_WATCH_MASK_CHECK_MESSAGES	(1 << 1)
-#define IPMI_WATCH_MASK_CHECK_WATCHDOG	(1 << 2)
-#define IPMI_WATCH_MASK_CHECK_COMMANDS	(1 << 3)
+#define IPMI_WATCH_MASK_CHECK_MESSAGES	(1 << 0)
+#define IPMI_WATCH_MASK_CHECK_WATCHDOG	(1 << 1)
+#define IPMI_WATCH_MASK_CHECK_COMMANDS	(1 << 2)
 
 /*
  * Messages to/from the lower layer.  The smi interface will take one
@@ -66,12 +63,6 @@ struct ipmi_smi_msg {
 	int           rsp_size;
 	unsigned char rsp[IPMI_MAX_MSG_LENGTH];
 
-	/*
-	 * There should be a response message coming back in the BMC
-	 * message queue.
-	 */
-	bool needs_response;
-
 	/*
 	 * Will be called when the system is done with the message
 	 * (presumably to free it).
-- 
cgit v1.2.3


From b77cf11f094136a9d7d0ee6a56cf49db1f412871 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 5 Feb 2019 10:37:31 -0600
Subject: iommu: Allow io-pgtable to be used outside of drivers/iommu/

Move io-pgtable.h to include/linux/ and export alloc_io_pgtable_ops
and free_io_pgtable_ops. This enables drivers outside drivers/iommu/ to
use the page table library. Specifically, some ARM Mali GPUs use the
ARM page table formats.

Cc: Will Deacon <will.deacon@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Rob Clark <robdclark@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: iommu@lists.linux-foundation.org
Cc: linux-mediatek@lists.infradead.org
Cc: linux-arm-msm@vger.kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm-smmu-v3.c        |   3 +-
 drivers/iommu/arm-smmu.c           |   2 +-
 drivers/iommu/io-pgtable-arm-v7s.c |   3 +-
 drivers/iommu/io-pgtable-arm.c     |   3 +-
 drivers/iommu/io-pgtable.c         |   5 +-
 drivers/iommu/io-pgtable.h         | 213 -------------------------------------
 drivers/iommu/ipmmu-vmsa.c         |   3 +-
 drivers/iommu/msm_iommu.c          |   2 +-
 drivers/iommu/mtk_iommu.h          |   3 +-
 drivers/iommu/qcom_iommu.c         |   2 +-
 include/linux/io-pgtable.h         | 213 +++++++++++++++++++++++++++++++++++++
 11 files changed, 224 insertions(+), 228 deletions(-)
 delete mode 100644 drivers/iommu/io-pgtable.h
 create mode 100644 include/linux/io-pgtable.h

(limited to 'include')

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 0d284029dc73..d3880010c6cf 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -18,6 +18,7 @@
 #include <linux/dma-iommu.h>
 #include <linux/err.h>
 #include <linux/interrupt.h>
+#include <linux/io-pgtable.h>
 #include <linux/iommu.h>
 #include <linux/iopoll.h>
 #include <linux/init.h>
@@ -32,8 +33,6 @@
 
 #include <linux/amba/bus.h>
 
-#include "io-pgtable.h"
-
 /* MMIO registers */
 #define ARM_SMMU_IDR0			0x0
 #define IDR0_ST_LVL			GENMASK(28, 27)
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index af18a7e7f917..045d93884164 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -39,6 +39,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/io-pgtable.h>
 #include <linux/iommu.h>
 #include <linux/iopoll.h>
 #include <linux/init.h>
@@ -56,7 +57,6 @@
 #include <linux/amba/bus.h>
 #include <linux/fsl/mc.h>
 
-#include "io-pgtable.h"
 #include "arm-smmu-regs.h"
 
 #define ARM_MMU500_ACTLR_CPRE		(1 << 1)
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index cec29bf45c9b..75a8273d1ae9 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -35,6 +35,7 @@
 #include <linux/atomic.h>
 #include <linux/dma-mapping.h>
 #include <linux/gfp.h>
+#include <linux/io-pgtable.h>
 #include <linux/iommu.h>
 #include <linux/kernel.h>
 #include <linux/kmemleak.h>
@@ -45,8 +46,6 @@
 
 #include <asm/barrier.h>
 
-#include "io-pgtable.h"
-
 /* Struct accessors */
 #define io_pgtable_to_data(x)						\
 	container_of((x), struct arm_v7s_io_pgtable, iop)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 237cacd4a62b..d3700ec15cbd 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -22,6 +22,7 @@
 
 #include <linux/atomic.h>
 #include <linux/bitops.h>
+#include <linux/io-pgtable.h>
 #include <linux/iommu.h>
 #include <linux/kernel.h>
 #include <linux/sizes.h>
@@ -31,8 +32,6 @@
 
 #include <asm/barrier.h>
 
-#include "io-pgtable.h"
-
 #define ARM_LPAE_MAX_ADDR_BITS		52
 #define ARM_LPAE_S2_MAX_CONCAT_PAGES	16
 #define ARM_LPAE_MAX_LEVELS		4
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 127558d83667..93f2880be6c6 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -19,11 +19,10 @@
  */
 
 #include <linux/bug.h>
+#include <linux/io-pgtable.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 
-#include "io-pgtable.h"
-
 static const struct io_pgtable_init_fns *
 io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
 #ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE
@@ -61,6 +60,7 @@ struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
 
 	return &iop->ops;
 }
+EXPORT_SYMBOL_GPL(alloc_io_pgtable_ops);
 
 /*
  * It is the IOMMU driver's responsibility to ensure that the page table
@@ -77,3 +77,4 @@ void free_io_pgtable_ops(struct io_pgtable_ops *ops)
 	io_pgtable_tlb_flush_all(iop);
 	io_pgtable_init_table[iop->fmt]->free(iop);
 }
+EXPORT_SYMBOL_GPL(free_io_pgtable_ops);
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
deleted file mode 100644
index 47d5ae559329..000000000000
--- a/drivers/iommu/io-pgtable.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __IO_PGTABLE_H
-#define __IO_PGTABLE_H
-#include <linux/bitops.h>
-
-/*
- * Public API for use by IOMMU drivers
- */
-enum io_pgtable_fmt {
-	ARM_32_LPAE_S1,
-	ARM_32_LPAE_S2,
-	ARM_64_LPAE_S1,
-	ARM_64_LPAE_S2,
-	ARM_V7S,
-	IO_PGTABLE_NUM_FMTS,
-};
-
-/**
- * struct iommu_gather_ops - IOMMU callbacks for TLB and page table management.
- *
- * @tlb_flush_all: Synchronously invalidate the entire TLB context.
- * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range.
- * @tlb_sync:      Ensure any queued TLB invalidation has taken effect, and
- *                 any corresponding page table updates are visible to the
- *                 IOMMU.
- *
- * Note that these can all be called in atomic context and must therefore
- * not block.
- */
-struct iommu_gather_ops {
-	void (*tlb_flush_all)(void *cookie);
-	void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule,
-			      bool leaf, void *cookie);
-	void (*tlb_sync)(void *cookie);
-};
-
-/**
- * struct io_pgtable_cfg - Configuration data for a set of page tables.
- *
- * @quirks:        A bitmap of hardware quirks that require some special
- *                 action by the low-level page table allocator.
- * @pgsize_bitmap: A bitmap of page sizes supported by this set of page
- *                 tables.
- * @ias:           Input address (iova) size, in bits.
- * @oas:           Output address (paddr) size, in bits.
- * @tlb:           TLB management callbacks for this set of tables.
- * @iommu_dev:     The device representing the DMA configuration for the
- *                 page table walker.
- */
-struct io_pgtable_cfg {
-	/*
-	 * IO_PGTABLE_QUIRK_ARM_NS: (ARM formats) Set NS and NSTABLE bits in
-	 *	stage 1 PTEs, for hardware which insists on validating them
-	 *	even in	non-secure state where they should normally be ignored.
-	 *
-	 * IO_PGTABLE_QUIRK_NO_PERMS: Ignore the IOMMU_READ, IOMMU_WRITE and
-	 *	IOMMU_NOEXEC flags and map everything with full access, for
-	 *	hardware which does not implement the permissions of a given
-	 *	format, and/or requires some format-specific default value.
-	 *
-	 * IO_PGTABLE_QUIRK_TLBI_ON_MAP: If the format forbids caching invalid
-	 *	(unmapped) entries but the hardware might do so anyway, perform
-	 *	TLB maintenance when mapping as well as when unmapping.
-	 *
-	 * IO_PGTABLE_QUIRK_ARM_MTK_4GB: (ARM v7s format) Set bit 9 in all
-	 *	PTEs, for Mediatek IOMMUs which treat it as a 33rd address bit
-	 *	when the SoC is in "4GB mode" and they can only access the high
-	 *	remap of DRAM (0x1_00000000 to 0x1_ffffffff).
-	 *
-	 * IO_PGTABLE_QUIRK_NO_DMA: Guarantees that the tables will only ever
-	 *	be accessed by a fully cache-coherent IOMMU or CPU (e.g. for a
-	 *	software-emulated IOMMU), such that pagetable updates need not
-	 *	be treated as explicit DMA data.
-	 *
-	 * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs
-	 *	on unmap, for DMA domains using the flush queue mechanism for
-	 *	delayed invalidation.
-	 */
-	#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
-	#define IO_PGTABLE_QUIRK_NO_PERMS	BIT(1)
-	#define IO_PGTABLE_QUIRK_TLBI_ON_MAP	BIT(2)
-	#define IO_PGTABLE_QUIRK_ARM_MTK_4GB	BIT(3)
-	#define IO_PGTABLE_QUIRK_NO_DMA		BIT(4)
-	#define IO_PGTABLE_QUIRK_NON_STRICT	BIT(5)
-	unsigned long			quirks;
-	unsigned long			pgsize_bitmap;
-	unsigned int			ias;
-	unsigned int			oas;
-	const struct iommu_gather_ops	*tlb;
-	struct device			*iommu_dev;
-
-	/* Low-level data specific to the table format */
-	union {
-		struct {
-			u64	ttbr[2];
-			u64	tcr;
-			u64	mair[2];
-		} arm_lpae_s1_cfg;
-
-		struct {
-			u64	vttbr;
-			u64	vtcr;
-		} arm_lpae_s2_cfg;
-
-		struct {
-			u32	ttbr[2];
-			u32	tcr;
-			u32	nmrr;
-			u32	prrr;
-		} arm_v7s_cfg;
-	};
-};
-
-/**
- * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers.
- *
- * @map:          Map a physically contiguous memory region.
- * @unmap:        Unmap a physically contiguous memory region.
- * @iova_to_phys: Translate iova to physical address.
- *
- * These functions map directly onto the iommu_ops member functions with
- * the same names.
- */
-struct io_pgtable_ops {
-	int (*map)(struct io_pgtable_ops *ops, unsigned long iova,
-		   phys_addr_t paddr, size_t size, int prot);
-	size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
-			size_t size);
-	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
-				    unsigned long iova);
-};
-
-/**
- * alloc_io_pgtable_ops() - Allocate a page table allocator for use by an IOMMU.
- *
- * @fmt:    The page table format.
- * @cfg:    The page table configuration. This will be modified to represent
- *          the configuration actually provided by the allocator (e.g. the
- *          pgsize_bitmap may be restricted).
- * @cookie: An opaque token provided by the IOMMU driver and passed back to
- *          the callback routines in cfg->tlb.
- */
-struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
-					    struct io_pgtable_cfg *cfg,
-					    void *cookie);
-
-/**
- * free_io_pgtable_ops() - Free an io_pgtable_ops structure. The caller
- *                         *must* ensure that the page table is no longer
- *                         live, but the TLB can be dirty.
- *
- * @ops: The ops returned from alloc_io_pgtable_ops.
- */
-void free_io_pgtable_ops(struct io_pgtable_ops *ops);
-
-
-/*
- * Internal structures for page table allocator implementations.
- */
-
-/**
- * struct io_pgtable - Internal structure describing a set of page tables.
- *
- * @fmt:    The page table format.
- * @cookie: An opaque token provided by the IOMMU driver and passed back to
- *          any callback routines.
- * @cfg:    A copy of the page table configuration.
- * @ops:    The page table operations in use for this set of page tables.
- */
-struct io_pgtable {
-	enum io_pgtable_fmt	fmt;
-	void			*cookie;
-	struct io_pgtable_cfg	cfg;
-	struct io_pgtable_ops	ops;
-};
-
-#define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
-
-static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
-{
-	iop->cfg.tlb->tlb_flush_all(iop->cookie);
-}
-
-static inline void io_pgtable_tlb_add_flush(struct io_pgtable *iop,
-		unsigned long iova, size_t size, size_t granule, bool leaf)
-{
-	iop->cfg.tlb->tlb_add_flush(iova, size, granule, leaf, iop->cookie);
-}
-
-static inline void io_pgtable_tlb_sync(struct io_pgtable *iop)
-{
-	iop->cfg.tlb->tlb_sync(iop->cookie);
-}
-
-/**
- * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
- *                              particular format.
- *
- * @alloc: Allocate a set of page tables described by cfg.
- * @free:  Free the page tables associated with iop.
- */
-struct io_pgtable_init_fns {
-	struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie);
-	void (*free)(struct io_pgtable *iop);
-};
-
-extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
-
-#endif /* __IO_PGTABLE_H */
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index 7a4529c61c19..9a380c10655e 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/io-pgtable.h>
 #include <linux/iommu.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
@@ -35,8 +36,6 @@
 #define arm_iommu_detach_device(...)	do {} while (0)
 #endif
 
-#include "io-pgtable.h"
-
 #define IPMMU_CTX_MAX 8
 
 struct ipmmu_features {
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index fc4270733f11..ef7d1f995d6b 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -23,6 +23,7 @@
 #include <linux/platform_device.h>
 #include <linux/errno.h>
 #include <linux/io.h>
+#include <linux/io-pgtable.h>
 #include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
@@ -37,7 +38,6 @@
 
 #include "msm_iommu_hw-8xxx.h"
 #include "msm_iommu.h"
-#include "io-pgtable.h"
 
 #define MRC(reg, processor, op1, crn, crm, op2)				\
 __asm__ __volatile__ (							\
diff --git a/drivers/iommu/mtk_iommu.h b/drivers/iommu/mtk_iommu.h
index 778498b8633f..62c2c3e8c5df 100644
--- a/drivers/iommu/mtk_iommu.h
+++ b/drivers/iommu/mtk_iommu.h
@@ -19,13 +19,12 @@
 #include <linux/component.h>
 #include <linux/device.h>
 #include <linux/io.h>
+#include <linux/io-pgtable.h>
 #include <linux/iommu.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <soc/mediatek/smi.h>
 
-#include "io-pgtable.h"
-
 struct mtk_iommu_suspend_reg {
 	u32				standard_axi_mode;
 	u32				dcm_dis;
diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c
index d8595f0a987d..8cdd3f059513 100644
--- a/drivers/iommu/qcom_iommu.c
+++ b/drivers/iommu/qcom_iommu.c
@@ -26,6 +26,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/io-pgtable.h>
 #include <linux/iommu.h>
 #include <linux/iopoll.h>
 #include <linux/kconfig.h>
@@ -42,7 +43,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 
-#include "io-pgtable.h"
 #include "arm-smmu-regs.h"
 
 #define SMMU_INTR_SEL_NS     0x2000
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
new file mode 100644
index 000000000000..47d5ae559329
--- /dev/null
+++ b/include/linux/io-pgtable.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __IO_PGTABLE_H
+#define __IO_PGTABLE_H
+#include <linux/bitops.h>
+
+/*
+ * Public API for use by IOMMU drivers
+ */
+enum io_pgtable_fmt {
+	ARM_32_LPAE_S1,
+	ARM_32_LPAE_S2,
+	ARM_64_LPAE_S1,
+	ARM_64_LPAE_S2,
+	ARM_V7S,
+	IO_PGTABLE_NUM_FMTS,
+};
+
+/**
+ * struct iommu_gather_ops - IOMMU callbacks for TLB and page table management.
+ *
+ * @tlb_flush_all: Synchronously invalidate the entire TLB context.
+ * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range.
+ * @tlb_sync:      Ensure any queued TLB invalidation has taken effect, and
+ *                 any corresponding page table updates are visible to the
+ *                 IOMMU.
+ *
+ * Note that these can all be called in atomic context and must therefore
+ * not block.
+ */
+struct iommu_gather_ops {
+	void (*tlb_flush_all)(void *cookie);
+	void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule,
+			      bool leaf, void *cookie);
+	void (*tlb_sync)(void *cookie);
+};
+
+/**
+ * struct io_pgtable_cfg - Configuration data for a set of page tables.
+ *
+ * @quirks:        A bitmap of hardware quirks that require some special
+ *                 action by the low-level page table allocator.
+ * @pgsize_bitmap: A bitmap of page sizes supported by this set of page
+ *                 tables.
+ * @ias:           Input address (iova) size, in bits.
+ * @oas:           Output address (paddr) size, in bits.
+ * @tlb:           TLB management callbacks for this set of tables.
+ * @iommu_dev:     The device representing the DMA configuration for the
+ *                 page table walker.
+ */
+struct io_pgtable_cfg {
+	/*
+	 * IO_PGTABLE_QUIRK_ARM_NS: (ARM formats) Set NS and NSTABLE bits in
+	 *	stage 1 PTEs, for hardware which insists on validating them
+	 *	even in	non-secure state where they should normally be ignored.
+	 *
+	 * IO_PGTABLE_QUIRK_NO_PERMS: Ignore the IOMMU_READ, IOMMU_WRITE and
+	 *	IOMMU_NOEXEC flags and map everything with full access, for
+	 *	hardware which does not implement the permissions of a given
+	 *	format, and/or requires some format-specific default value.
+	 *
+	 * IO_PGTABLE_QUIRK_TLBI_ON_MAP: If the format forbids caching invalid
+	 *	(unmapped) entries but the hardware might do so anyway, perform
+	 *	TLB maintenance when mapping as well as when unmapping.
+	 *
+	 * IO_PGTABLE_QUIRK_ARM_MTK_4GB: (ARM v7s format) Set bit 9 in all
+	 *	PTEs, for Mediatek IOMMUs which treat it as a 33rd address bit
+	 *	when the SoC is in "4GB mode" and they can only access the high
+	 *	remap of DRAM (0x1_00000000 to 0x1_ffffffff).
+	 *
+	 * IO_PGTABLE_QUIRK_NO_DMA: Guarantees that the tables will only ever
+	 *	be accessed by a fully cache-coherent IOMMU or CPU (e.g. for a
+	 *	software-emulated IOMMU), such that pagetable updates need not
+	 *	be treated as explicit DMA data.
+	 *
+	 * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs
+	 *	on unmap, for DMA domains using the flush queue mechanism for
+	 *	delayed invalidation.
+	 */
+	#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
+	#define IO_PGTABLE_QUIRK_NO_PERMS	BIT(1)
+	#define IO_PGTABLE_QUIRK_TLBI_ON_MAP	BIT(2)
+	#define IO_PGTABLE_QUIRK_ARM_MTK_4GB	BIT(3)
+	#define IO_PGTABLE_QUIRK_NO_DMA		BIT(4)
+	#define IO_PGTABLE_QUIRK_NON_STRICT	BIT(5)
+	unsigned long			quirks;
+	unsigned long			pgsize_bitmap;
+	unsigned int			ias;
+	unsigned int			oas;
+	const struct iommu_gather_ops	*tlb;
+	struct device			*iommu_dev;
+
+	/* Low-level data specific to the table format */
+	union {
+		struct {
+			u64	ttbr[2];
+			u64	tcr;
+			u64	mair[2];
+		} arm_lpae_s1_cfg;
+
+		struct {
+			u64	vttbr;
+			u64	vtcr;
+		} arm_lpae_s2_cfg;
+
+		struct {
+			u32	ttbr[2];
+			u32	tcr;
+			u32	nmrr;
+			u32	prrr;
+		} arm_v7s_cfg;
+	};
+};
+
+/**
+ * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers.
+ *
+ * @map:          Map a physically contiguous memory region.
+ * @unmap:        Unmap a physically contiguous memory region.
+ * @iova_to_phys: Translate iova to physical address.
+ *
+ * These functions map directly onto the iommu_ops member functions with
+ * the same names.
+ */
+struct io_pgtable_ops {
+	int (*map)(struct io_pgtable_ops *ops, unsigned long iova,
+		   phys_addr_t paddr, size_t size, int prot);
+	size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
+			size_t size);
+	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
+				    unsigned long iova);
+};
+
+/**
+ * alloc_io_pgtable_ops() - Allocate a page table allocator for use by an IOMMU.
+ *
+ * @fmt:    The page table format.
+ * @cfg:    The page table configuration. This will be modified to represent
+ *          the configuration actually provided by the allocator (e.g. the
+ *          pgsize_bitmap may be restricted).
+ * @cookie: An opaque token provided by the IOMMU driver and passed back to
+ *          the callback routines in cfg->tlb.
+ */
+struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
+					    struct io_pgtable_cfg *cfg,
+					    void *cookie);
+
+/**
+ * free_io_pgtable_ops() - Free an io_pgtable_ops structure. The caller
+ *                         *must* ensure that the page table is no longer
+ *                         live, but the TLB can be dirty.
+ *
+ * @ops: The ops returned from alloc_io_pgtable_ops.
+ */
+void free_io_pgtable_ops(struct io_pgtable_ops *ops);
+
+
+/*
+ * Internal structures for page table allocator implementations.
+ */
+
+/**
+ * struct io_pgtable - Internal structure describing a set of page tables.
+ *
+ * @fmt:    The page table format.
+ * @cookie: An opaque token provided by the IOMMU driver and passed back to
+ *          any callback routines.
+ * @cfg:    A copy of the page table configuration.
+ * @ops:    The page table operations in use for this set of page tables.
+ */
+struct io_pgtable {
+	enum io_pgtable_fmt	fmt;
+	void			*cookie;
+	struct io_pgtable_cfg	cfg;
+	struct io_pgtable_ops	ops;
+};
+
+#define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
+
+static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
+{
+	iop->cfg.tlb->tlb_flush_all(iop->cookie);
+}
+
+static inline void io_pgtable_tlb_add_flush(struct io_pgtable *iop,
+		unsigned long iova, size_t size, size_t granule, bool leaf)
+{
+	iop->cfg.tlb->tlb_add_flush(iova, size, granule, leaf, iop->cookie);
+}
+
+static inline void io_pgtable_tlb_sync(struct io_pgtable *iop)
+{
+	iop->cfg.tlb->tlb_sync(iop->cookie);
+}
+
+/**
+ * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
+ *                              particular format.
+ *
+ * @alloc: Allocate a set of page tables described by cfg.
+ * @free:  Free the page tables associated with iop.
+ */
+struct io_pgtable_init_fns {
+	struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie);
+	void (*free)(struct io_pgtable *iop);
+};
+
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
+
+#endif /* __IO_PGTABLE_H */
-- 
cgit v1.2.3


From d123fab71f63aae129aebe052664fda73131921a Mon Sep 17 00:00:00 2001
From: Wesley Sheng <wesley.sheng@microchip.com>
Date: Thu, 6 Dec 2018 21:30:51 +0800
Subject: ntb_hw_switchtec: NT req id mapping table register entry number
 should be 512

The number of available NT req id mapping table entries per NTB control
register is 512. The driver mistakenly limits the number to 256.

Fix the array size of NT req id mapping table.

Fixes: c082b04c9d40 ("NTB: switchtec: Add NTB hardware register definitions")
Signed-off-by: Wesley Sheng <wesley.sheng@microchip.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 include/linux/switchtec.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index eee0412bdf4b..32b282cd0ead 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -249,8 +249,8 @@ struct ntb_ctrl_regs {
 		u64 xlate_addr;
 	} bar_entry[6];
 	u32 reserved2[216];
-	u32 req_id_table[256];
-	u32 reserved3[512];
+	u32 req_id_table[512];
+	u32 reserved3[256];
 	u64 lut_entry[512];
 } __packed;
 
-- 
cgit v1.2.3


From a2585cdc9e4cda6afaea5f5687eaabce3bebbb2c Mon Sep 17 00:00:00 2001
From: Paul Selles <paul.selles@microchip.com>
Date: Thu, 6 Dec 2018 21:30:52 +0800
Subject: ntb_hw_switchtec: Added support of >=4G memory windows

Current Switchtec's BAR setup registers are limited to 32bits,
corresponding to the maximum MW (memory window) size is <4G.

Increase the MW sizes with the addition of the BAR Setup Extension
Register for the upper 32bits of a 64bits MW size. This increases the MW
range to between 4K and 2^63.

Reported-by: Boris Glimcher <boris.glimcher@emc.com>
Signed-off-by: Paul Selles <paul.selles@microchip.com>
Signed-off-by: Wesley Sheng <wesley.sheng@microchip.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 9 +++++++--
 include/linux/switchtec.h              | 6 +++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 9916bc5b6759..f6f00354047b 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -264,6 +264,7 @@ static void switchtec_ntb_mw_clr_direct(struct switchtec_ntb *sndev, int idx)
 	ctl_val &= ~NTB_CTRL_BAR_DIR_WIN_EN;
 	iowrite32(ctl_val, &ctl->bar_entry[bar].ctl);
 	iowrite32(0, &ctl->bar_entry[bar].win_size);
+	iowrite32(0, &ctl->bar_ext_entry[bar].win_size);
 	iowrite64(sndev->self_partition, &ctl->bar_entry[bar].xlate_addr);
 }
 
@@ -286,7 +287,9 @@ static void switchtec_ntb_mw_set_direct(struct switchtec_ntb *sndev, int idx,
 	ctl_val |= NTB_CTRL_BAR_DIR_WIN_EN;
 
 	iowrite32(ctl_val, &ctl->bar_entry[bar].ctl);
-	iowrite32(xlate_pos | size, &ctl->bar_entry[bar].win_size);
+	iowrite32(xlate_pos | (lower_32_bits(size) & 0xFFFFF000),
+		  &ctl->bar_entry[bar].win_size);
+	iowrite32(upper_32_bits(size), &ctl->bar_ext_entry[bar].win_size);
 	iowrite64(sndev->self_partition | addr,
 		  &ctl->bar_entry[bar].xlate_addr);
 }
@@ -1053,7 +1056,9 @@ static int crosslink_setup_mws(struct switchtec_ntb *sndev, int ntb_lut_idx,
 		ctl_val |= NTB_CTRL_BAR_DIR_WIN_EN;
 
 		iowrite32(ctl_val, &ctl->bar_entry[bar].ctl);
-		iowrite32(xlate_pos | size, &ctl->bar_entry[bar].win_size);
+		iowrite32(xlate_pos | (lower_32_bits(size) & 0xFFFFF000),
+			  &ctl->bar_entry[bar].win_size);
+		iowrite32(upper_32_bits(size), &ctl->bar_ext_entry[bar].win_size);
 		iowrite64(sndev->peer_partition | addr,
 			  &ctl->bar_entry[bar].xlate_addr);
 	}
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 32b282cd0ead..52a079b3a9a6 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -248,7 +248,11 @@ struct ntb_ctrl_regs {
 		u32 win_size;
 		u64 xlate_addr;
 	} bar_entry[6];
-	u32 reserved2[216];
+	struct {
+		u32 win_size;
+		u32 reserved[3];
+	} bar_ext_entry[6];
+	u32 reserved2[192];
 	u32 req_id_table[512];
 	u32 reserved3[256];
 	u64 lut_entry[512];
-- 
cgit v1.2.3


From bf7fbeeae6db644ef5995085de2bc5c6121f8c8d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 8 Feb 2019 17:02:56 +0100
Subject: module: Cure the MODULE_LICENSE "GPL" vs. "GPL v2" bogosity

The original MODULE_LICENSE string for kernel modules licensed under the
GPL v2 (only / or later) was simply "GPL", which was - and still is -
completely sufficient for the purpose of module loading and checking
whether the module is free software or proprietary.

In January 2003 this was changed with commit 3344ea3ad4b7 ("[PATCH]
MODULE_LICENSE and EXPORT_SYMBOL_GPL support"). This commit can be found in
the history git repository which holds the 1:1 import of Linus' bitkeeper
repository:

  https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=3344ea3ad4b7c302c846a680dbaeedf96ed45c02

The main intention of the patch was to refuse linking proprietary modules
against symbols exported with EXPORT_SYMBOL_GPL() at module load time.

As a completely undocumented side effect it also introduced the distinction
between "GPL" and "GPL v2" MODULE_LICENSE() strings:

 *      "GPL"                           [GNU Public License v2 or later]
 *      "GPL v2"                        [GNU Public License v2]
 *      "GPL and additional rights"     [GNU Public License v2 rights and more]
 *      "Dual BSD/GPL"                  [GNU Public License v2
 *                                       or BSD license choice]
 *      "Dual MPL/GPL"                  [GNU Public License v2
 *                                       or Mozilla license choice]

This distinction was and still is wrong in several aspects:

 1) It broke all modules which were using the "GPL" string in the
    MODULE_LICENSE() already and were licensed under GPL v2 only.

    A quick license scan over the tree at that time shows that at least 480
    out of 1484 modules have been affected by this change back then. The
    number is probably way higher as this was just a quick check for
    clearly identifiable license information.

    There was exactly ONE instance of a "GPL v2" module license string in
    the kernel back then - drivers/net/tulip/xircom_tulip_cb.c which
    otherwise had no license information at all. There is no indication
    that the change above is any way related to this driver. The change
    happend with the 2.4.11 release which was on Oct. 9 2001 - so quite
    some time before the above commit. Unfortunately there is no trace on
    the intertubes to any discussion of this.

 2) The dual licensed strings became ill defined as well because following
    the "GPL" vs. "GPL v2" distinction all dual licensed (or additional
    rights) MODULE_LICENSE strings would either require those dual licensed
    modules to be licensed under GPL v2 or later or just be unspecified for
    the dual licensing case. Neither choice is coherent with the GPL
    distinction.

Due to the lack of a proper changelog and no real discussion on the patch
submission other than a few implementation details, it's completely unclear
why this distinction was introduced at all. Other than the comment in the
module header file exists no documentation for this at all.

From a license compliance and license scanning POV this distinction is a
total nightmare.

As of 5.0-rc2 2873 out of 9200 instances of MODULE_LICENSE() strings are
conflicting with the actual license in the source code (either SPDX or
license boilerplate/reference). A comparison between the scan of the
history tree and a scan of current Linus tree shows to the extent that the
git rename detection over Linus tree grafted with the history tree is
halfways complete that almost none of the files which got broken in 2003
have been cleaned up vs. the MODULE_LICENSE string. So subtracting those
480 known instances from the conflicting 2800 of today more than 25% of the
module authors got it wrong and it's a high propability that a large
portion of the rest just got it right by chance.

There is no value for the module loader to convey the detailed license
information as the only decision to be made is whether the module is free
software or not.

The "and additional rights", "BSD" and "MPL" strings are not conclusive
license information either. So there is no point in trying to make the GPL
part conclusive and exact. As shown above it's already non conclusive for
dual licensing and incoherent with a large portion of the module source.

As an unintended side effect this distinction causes a major headache for
license compliance, license scanners and the ongoing effort to clean up the
license mess of the kernel.

Therefore remove the well meant, but ill defined, distinction between "GPL"
and "GPL v2" and document that:

  - "GPL" and "GPL v2" both express that the module is licensed under GPLv2
    (without a distinction of 'only' and 'or later') and is therefore kernel
    license compliant.

  - None of the MODULE_LICENSE strings can be used for expressing or
    determining the exact license

  - Their sole purpose is to decide whether the module is free software or
    not.

Add a MODULE_LICENSE subsection to the license rule documentation as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Philippe Ombredanne <pombredanne@nexb.com>
Acked-by: Joe Perches <joe@perches.com>
[jc: Did s/merily/merely/ ]
Acked-by: Jessica Yu <jeyu@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/license-rules.rst | 62 +++++++++++++++++++++++++++++++++
 include/linux/module.h                  | 18 +++++++++-
 2 files changed, 79 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/process/license-rules.rst b/Documentation/process/license-rules.rst
index 2bb8c0fc2238..43b6a1ee0193 100644
--- a/Documentation/process/license-rules.rst
+++ b/Documentation/process/license-rules.rst
@@ -372,3 +372,65 @@ in the LICENSE subdirectories. This is required to allow tool
 verification (e.g. checkpatch.pl) and to have the licenses ready to read
 and extract right from the source, which is recommended by various FOSS
 organizations, e.g. the `FSFE REUSE initiative <https://reuse.software/>`_.
+
+_`MODULE_LICENSE`
+-----------------
+
+   Loadable kernel modules also require a MODULE_LICENSE() tag. This tag is
+   neither a replacement for proper source code license information
+   (SPDX-License-Identifier) nor in any way relevant for expressing or
+   determining the exact license under which the source code of the module
+   is provided.
+
+   The sole purpose of this tag is to provide sufficient information
+   whether the module is free software or proprietary for the kernel
+   module loader and for user space tools.
+
+   The valid license strings for MODULE_LICENSE() are:
+
+    ============================= =============================================
+    "GPL"			  Module is licensed under GPL version 2. This
+				  does not express any distinction between
+				  GPL-2.0-only or GPL-2.0-or-later. The exact
+				  license information can only be determined
+				  via the license information in the
+				  corresponding source files.
+
+    "GPL v2"			  Same as "GPL". It exists for historic
+				  reasons.
+
+    "GPL and additional rights"   Historical variant of expressing that the
+				  module source is dual licensed under a
+				  GPL v2 variant and MIT license. Please do
+				  not use in new code.
+
+    "Dual MIT/GPL"		  The correct way of expressing that the
+				  module is dual licensed under a GPL v2
+				  variant or MIT license choice.
+
+    "Dual BSD/GPL"		  The module is dual licensed under a GPL v2
+				  variant or BSD license choice. The exact
+				  variant of the BSD license can only be
+				  determined via the license information
+				  in the corresponding source files.
+
+    "Dual MPL/GPL"		  The module is dual licensed under a GPL v2
+				  variant or Mozilla Public License (MPL)
+				  choice. The exact variant of the MPL
+				  license can only be determined via the
+				  license information in the corresponding
+				  source files.
+
+    "Proprietary"		  The module is under a proprietary license.
+				  This string is solely for proprietary third
+				  party modules and cannot be used for modules
+				  which have their source code in the kernel
+				  tree. Modules tagged that way are tainting
+				  the kernel with the 'P' flag when loaded and
+				  the kernel module loader refuses to link such
+				  modules against symbols which are exported
+				  with EXPORT_SYMBOL_GPL().
+    ============================= =============================================
+
+
+
diff --git a/include/linux/module.h b/include/linux/module.h
index 9a21fe3509af..3a2402b8d790 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -172,7 +172,7 @@ extern void cleanup_module(void);
  * The following license idents are currently accepted as indicating free
  * software modules
  *
- *	"GPL"				[GNU Public License v2 or later]
+ *	"GPL"				[GNU Public License v2]
  *	"GPL v2"			[GNU Public License v2]
  *	"GPL and additional rights"	[GNU Public License v2 rights and more]
  *	"Dual BSD/GPL"			[GNU Public License v2
@@ -186,6 +186,22 @@ extern void cleanup_module(void);
  *
  *	"Proprietary"			[Non free products]
  *
+ * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are
+ * merely stating that the module is licensed under the GPL v2, but are not
+ * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there
+ * are two variants is a historic and failed attempt to convey more
+ * information in the MODULE_LICENSE string. For module loading the
+ * "only/or later" distinction is completely irrelevant and does neither
+ * replace the proper license identifiers in the corresponding source file
+ * nor amends them in any way. The sole purpose is to make the
+ * 'Proprietary' flagging work and to refuse to bind symbols which are
+ * exported with EXPORT_SYMBOL_GPL when a non free module is loaded.
+ *
+ * In the same way "BSD" is not a clear license information. It merely
+ * states, that the module is licensed under one of the compatible BSD
+ * license variants. The detailed and correct license information is again
+ * to be found in the corresponding source files.
+ *
  * There are dual licensed components, but when running with Linux it is the
  * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL
  * is a GPL combined work.
-- 
cgit v1.2.3


From d901b2760dc6cd5fbbf2eac31d71d94baa6c4aef Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 4 Jan 2019 11:40:21 -0700
Subject: lib/scatterlist: Provide a DMA page iterator

Commit 2db76d7c3c6d ("lib/scatterlist: sg_page_iter: support sg lists w/o
backing pages") introduced the sg_page_iter_dma_address() function without
providing a way to use it in the general case. If the sg_dma_len() is not
equal to the sg length callers cannot safely use the
for_each_sg_page/sg_page_iter_dma_address combination.

Resolve this API mistake by providing a DMA specific iterator,
for_each_sg_dma_page(), that uses the right length so
sg_page_iter_dma_address() works as expected with all sglists.

A new iterator type is introduced to provide compile-time safety against
wrongly mixing accessors and iterators.

Acked-by: Christoph Hellwig <hch@lst.de> (for scatterlist)
Acked-by: Thomas Hellstrom <thellstrom@vmware.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com> (ipu3-cio2)
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 .clang-format                              |  1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c |  8 ++++-
 drivers/media/pci/intel/ipu3/ipu3-cio2.c   |  4 +--
 include/linux/scatterlist.h                | 49 ++++++++++++++++++++++++------
 lib/scatterlist.c                          | 26 ++++++++++++++++
 5 files changed, 76 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index bc2ffb2a0b53..335ce29ab813 100644
--- a/.clang-format
+++ b/.clang-format
@@ -240,6 +240,7 @@ ForEachMacros:
   - 'for_each_set_bit'
   - 'for_each_set_bit_from'
   - 'for_each_sg'
+  - 'for_each_sg_dma_page'
   - 'for_each_sg_page'
   - 'for_each_sibling_event'
   - '__for_each_thread'
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index 31786b200afc..a3357ff7540d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -311,7 +311,13 @@ static dma_addr_t __vmw_piter_dma_addr(struct vmw_piter *viter)
 
 static dma_addr_t __vmw_piter_sg_addr(struct vmw_piter *viter)
 {
-	return sg_page_iter_dma_address(&viter->iter);
+	/*
+	 * FIXME: This driver wrongly mixes DMA and CPU SG list iteration and
+	 * needs revision. See
+	 * https://lore.kernel.org/lkml/20190104223531.GA1705@ziepe.ca/
+	 */
+	return sg_page_iter_dma_address(
+		container_of(&viter->iter, struct sg_dma_page_iter, base));
 }
 
 
diff --git a/drivers/media/pci/intel/ipu3/ipu3-cio2.c b/drivers/media/pci/intel/ipu3/ipu3-cio2.c
index cdb79ae2d8dc..9fbfbda74171 100644
--- a/drivers/media/pci/intel/ipu3/ipu3-cio2.c
+++ b/drivers/media/pci/intel/ipu3/ipu3-cio2.c
@@ -846,7 +846,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb)
 	unsigned int pages = DIV_ROUND_UP(vb->planes[0].length, CIO2_PAGE_SIZE);
 	unsigned int lops = DIV_ROUND_UP(pages + 1, entries_per_page);
 	struct sg_table *sg;
-	struct sg_page_iter sg_iter;
+	struct sg_dma_page_iter sg_iter;
 	int i, j;
 
 	if (lops <= 0 || lops > CIO2_MAX_LOPS) {
@@ -873,7 +873,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb)
 		b->offset = sg->sgl->offset;
 
 	i = j = 0;
-	for_each_sg_page(sg->sgl, &sg_iter, sg->nents, 0) {
+	for_each_sg_dma_page (sg->sgl, &sg_iter, sg->nents, 0) {
 		if (!pages--)
 			break;
 		b->lop[i][j] = sg_page_iter_dma_address(&sg_iter) >> PAGE_SHIFT;
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index b96f0d0b5b8f..b4be960c7e5d 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -339,12 +339,12 @@ int sg_alloc_table_chained(struct sg_table *table, int nents,
 /*
  * sg page iterator
  *
- * Iterates over sg entries page-by-page.  On each successful iteration,
- * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter)
- * to get the current page and its dma address. @piter->sg will point to the
- * sg holding this page and @piter->sg_pgoffset to the page's page offset
- * within the sg. The iteration will stop either when a maximum number of sg
- * entries was reached or a terminating sg (sg_last(sg) == true) was reached.
+ * Iterates over sg entries page-by-page.  On each successful iteration, you
+ * can call sg_page_iter_page(@piter) to get the current page and its dma
+ * address. @piter->sg will point to the sg holding this page and
+ * @piter->sg_pgoffset to the page's page offset within the sg. The iteration
+ * will stop either when a maximum number of sg entries was reached or a
+ * terminating sg (sg_last(sg) == true) was reached.
  */
 struct sg_page_iter {
 	struct scatterlist	*sg;		/* sg holding the page */
@@ -356,7 +356,19 @@ struct sg_page_iter {
 						 * next step */
 };
 
+/*
+ * sg page iterator for DMA addresses
+ *
+ * This is the same as sg_page_iter however you can call
+ * sg_page_iter_dma_address(@dma_iter) to get the page's DMA
+ * address. sg_page_iter_page() cannot be called on this iterator.
+ */
+struct sg_dma_page_iter {
+	struct sg_page_iter base;
+};
+
 bool __sg_page_iter_next(struct sg_page_iter *piter);
+bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter);
 void __sg_page_iter_start(struct sg_page_iter *piter,
 			  struct scatterlist *sglist, unsigned int nents,
 			  unsigned long pgoffset);
@@ -372,11 +384,13 @@ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter)
 /**
  * sg_page_iter_dma_address - get the dma address of the current page held by
  * the page iterator.
- * @piter:	page iterator holding the page
+ * @dma_iter:	page iterator holding the page
  */
-static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter)
+static inline dma_addr_t
+sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter)
 {
-	return sg_dma_address(piter->sg) + (piter->sg_pgoffset << PAGE_SHIFT);
+	return sg_dma_address(dma_iter->base.sg) +
+	       (dma_iter->base.sg_pgoffset << PAGE_SHIFT);
 }
 
 /**
@@ -385,11 +399,28 @@ static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter)
  * @piter:	page iterator to hold current page, sg, sg_pgoffset
  * @nents:	maximum number of sg entries to iterate over
  * @pgoffset:	starting page offset
+ *
+ * Callers may use sg_page_iter_page() to get each page pointer.
  */
 #define for_each_sg_page(sglist, piter, nents, pgoffset)		   \
 	for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \
 	     __sg_page_iter_next(piter);)
 
+/**
+ * for_each_sg_dma_page - iterate over the pages of the given sg list
+ * @sglist:	sglist to iterate over
+ * @dma_iter:	page iterator to hold current page
+ * @dma_nents:	maximum number of sg entries to iterate over, this is the value
+ *              returned from dma_map_sg
+ * @pgoffset:	starting page offset
+ *
+ * Callers may use sg_page_iter_dma_address() to get each page's DMA address.
+ */
+#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset)            \
+	for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents,        \
+				  pgoffset);                                   \
+	     __sg_page_iter_dma_next(dma_iter);)
+
 /*
  * Mapping sg iterator
  *
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 9ba349e775ef..739dc9fe2c55 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -625,6 +625,32 @@ bool __sg_page_iter_next(struct sg_page_iter *piter)
 }
 EXPORT_SYMBOL(__sg_page_iter_next);
 
+static int sg_dma_page_count(struct scatterlist *sg)
+{
+	return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT;
+}
+
+bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter)
+{
+	struct sg_page_iter *piter = &dma_iter->base;
+
+	if (!piter->__nents || !piter->sg)
+		return false;
+
+	piter->sg_pgoffset += piter->__pg_advance;
+	piter->__pg_advance = 1;
+
+	while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) {
+		piter->sg_pgoffset -= sg_dma_page_count(piter->sg);
+		piter->sg = sg_next(piter->sg);
+		if (!--piter->__nents || !piter->sg)
+			return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(__sg_page_iter_dma_next);
+
 /**
  * sg_miter_start - start mapping iteration over a sg list
  * @miter: sg mapping iter to be started
-- 
cgit v1.2.3


From 9fa505adf9f2914637e4e81620f62dcdaa575f71 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 8 Feb 2019 13:25:03 -0800
Subject: scsi: core: Move resid from scsi_data_buffer to scsi_cmnd

This patch does not change any functionality but reduces the size of
struct scsi_cmnd.

Cc: Douglas Gilbert <dgilbert@interlog.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_lib.c  | 1 -
 include/scsi/scsi_cmnd.h | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d25101c5e3b8..c77cb0d31dbc 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -938,7 +938,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		 * scsi_result_to_blk_status may have reset the host_byte
 		 */
 		scsi_req(req)->result = cmd->result;
-		scsi_req(req)->resid_len = scsi_get_resid(cmd);
 	}
 
 	/*
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index af6ae3e1eba6..6c7793c2ad01 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -35,7 +35,6 @@ struct scsi_driver;
 struct scsi_data_buffer {
 	struct sg_table table;
 	unsigned length;
-	int resid;
 };
 
 /* embedded in scsi_cmnd */
@@ -202,12 +201,12 @@ static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
 
 static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid)
 {
-	cmd->sdb.resid = resid;
+	cmd->req.resid_len = resid;
 }
 
 static inline int scsi_get_resid(struct scsi_cmnd *cmd)
 {
-	return cmd->sdb.resid;
+	return cmd->req.resid_len;
 }
 
 #define scsi_for_each_sg(cmd, sg, nseg, __i)			\
-- 
cgit v1.2.3


From c8faabfc6f48009fb0d9ad4203aecfa569e5ff8d Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Thu, 24 Jan 2019 16:49:05 +0100
Subject: tpm: add _head suffix to tcg_efi_specid_event and tcg_pcr_event2

TCG defines two structures, TCG_EfiSpecIDEventStruct and TCG_PCR_EVENT2,
which contain variable-sized arrays in the middle of the definition.

Since these structures are not suitable for type casting, this patch
removes structure members after the variable-sized arrays and adds the
_head suffix to the structure name, to indicate that the renamed structures
do not contain all fields defined by TCG.

Lastly, given that variable-sized arrays are now in the last position, and
given that the size of the arrays cannot be determined in advance, this
patch also sets the size of those arrays to zero and removes the definition
of TPM2_ACTIVE_PCR_BANKS.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Nayna Jain <nayna@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/eventlog/tpm2.c | 12 ++++++------
 include/linux/tpm_eventlog.h     | 12 ++++--------
 2 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/eventlog/tpm2.c b/drivers/char/tpm/eventlog/tpm2.c
index 1b8fa9de2cac..d8b77133a83a 100644
--- a/drivers/char/tpm/eventlog/tpm2.c
+++ b/drivers/char/tpm/eventlog/tpm2.c
@@ -37,10 +37,10 @@
  *
  * Returns size of the event. If it is an invalid event, returns 0.
  */
-static int calc_tpm2_event_size(struct tcg_pcr_event2 *event,
+static int calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 				struct tcg_pcr_event *event_header)
 {
-	struct tcg_efi_specid_event *efispecid;
+	struct tcg_efi_specid_event_head *efispecid;
 	struct tcg_event_field *event_field;
 	void *marker;
 	void *marker_start;
@@ -55,7 +55,7 @@ static int calc_tpm2_event_size(struct tcg_pcr_event2 *event,
 	marker = marker + sizeof(event->pcr_idx) + sizeof(event->event_type)
 		+ sizeof(event->count);
 
-	efispecid = (struct tcg_efi_specid_event *)event_header->event;
+	efispecid = (struct tcg_efi_specid_event_head *)event_header->event;
 
 	/* Check if event is malformed. */
 	if (event->count > efispecid->num_algs)
@@ -95,7 +95,7 @@ static void *tpm2_bios_measurements_start(struct seq_file *m, loff_t *pos)
 	void *addr = log->bios_event_log;
 	void *limit = log->bios_event_log_end;
 	struct tcg_pcr_event *event_header;
-	struct tcg_pcr_event2 *event;
+	struct tcg_pcr_event2_head *event;
 	size_t size;
 	int i;
 
@@ -136,7 +136,7 @@ static void *tpm2_bios_measurements_next(struct seq_file *m, void *v,
 					 loff_t *pos)
 {
 	struct tcg_pcr_event *event_header;
-	struct tcg_pcr_event2 *event;
+	struct tcg_pcr_event2_head *event;
 	struct tpm_chip *chip = m->private;
 	struct tpm_bios_log *log = &chip->log;
 	void *limit = log->bios_event_log_end;
@@ -180,7 +180,7 @@ static int tpm2_binary_bios_measurements_show(struct seq_file *m, void *v)
 	struct tpm_chip *chip = m->private;
 	struct tpm_bios_log *log = &chip->log;
 	struct tcg_pcr_event *event_header = log->bios_event_log;
-	struct tcg_pcr_event2 *event = v;
+	struct tcg_pcr_event2_head *event = v;
 	void *temp_ptr;
 	size_t size;
 
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 20d9da77fc11..f47342361e87 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -8,7 +8,6 @@
 #define TCG_EVENT_NAME_LEN_MAX	255
 #define MAX_TEXT_EVENT		1000	/* Max event string length */
 #define ACPI_TCPA_SIG		"TCPA"	/* 0x41504354 /'TCPA' */
-#define TPM2_ACTIVE_PCR_BANKS	3
 
 #define EFI_TCG2_EVENT_LOG_FORMAT_TCG_1_2 0x1
 #define EFI_TCG2_EVENT_LOG_FORMAT_TCG_2   0x2
@@ -82,7 +81,7 @@ struct tcg_efi_specid_event_algs {
 	u16 digest_size;
 } __packed;
 
-struct tcg_efi_specid_event {
+struct tcg_efi_specid_event_head {
 	u8 signature[16];
 	u32 platform_class;
 	u8 spec_version_minor;
@@ -90,9 +89,7 @@ struct tcg_efi_specid_event {
 	u8 spec_errata;
 	u8 uintnsize;
 	u32 num_algs;
-	struct tcg_efi_specid_event_algs digest_sizes[TPM2_ACTIVE_PCR_BANKS];
-	u8 vendor_info_size;
-	u8 vendor_info[0];
+	struct tcg_efi_specid_event_algs digest_sizes[];
 } __packed;
 
 struct tcg_pcr_event {
@@ -113,12 +110,11 @@ struct tpm2_digest {
 	u8 digest[SHA512_DIGEST_SIZE];
 } __packed;
 
-struct tcg_pcr_event2 {
+struct tcg_pcr_event2_head {
 	u32 pcr_idx;
 	u32 event_type;
 	u32 count;
-	struct tpm2_digest digests[TPM2_ACTIVE_PCR_BANKS];
-	struct tcg_event_field event;
+	struct tpm2_digest digests[];
 } __packed;
 
 #endif
-- 
cgit v1.2.3


From 36ce089758b1b55df5854d6b6d74713f609e125d Mon Sep 17 00:00:00 2001
From: Jerry Snitselaar <jsnitsel@redhat.com>
Date: Wed, 30 Jan 2019 15:06:58 -0700
Subject: tpm: don't return bool from update_timeouts

Set tpm_chip->timeouts_adjusted directly in the update_timeouts
code instead of returning bool. In case of tpm read failing
print warning that the read failed and continue on.

Signed-off-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/tpm1-cmd.c     |  3 +--
 drivers/char/tpm/tpm_tis_core.c | 15 +++++++++------
 include/linux/tpm.h             |  2 +-
 3 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm1-cmd.c b/drivers/char/tpm/tpm1-cmd.c
index 6f306338953b..bda9a16b44f6 100644
--- a/drivers/char/tpm/tpm1-cmd.c
+++ b/drivers/char/tpm/tpm1-cmd.c
@@ -380,8 +380,7 @@ int tpm1_get_timeouts(struct tpm_chip *chip)
 	 * of misreporting.
 	 */
 	if (chip->ops->update_timeouts)
-		chip->timeout_adjusted =
-			chip->ops->update_timeouts(chip, timeout_eff);
+		chip->ops->update_timeouts(chip, timeout_eff);
 
 	if (!chip->timeout_adjusted) {
 		/* Restore default if chip reported 0 */
diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
index bb0c2e160562..c6b0c6d541a5 100644
--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -521,35 +521,38 @@ static const struct tis_vendor_timeout_override vendor_timeout_overrides[] = {
 			(TIS_SHORT_TIMEOUT*1000), (TIS_SHORT_TIMEOUT*1000) } },
 };
 
-static bool tpm_tis_update_timeouts(struct tpm_chip *chip,
+static void tpm_tis_update_timeouts(struct tpm_chip *chip,
 				    unsigned long *timeout_cap)
 {
 	struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
 	int i, rc;
 	u32 did_vid;
 
+	chip->timeout_adjusted = false;
+
 	if (chip->ops->clk_enable != NULL)
 		chip->ops->clk_enable(chip, true);
 
 	rc = tpm_tis_read32(priv, TPM_DID_VID(0), &did_vid);
-	if (rc < 0)
+	if (rc < 0) {
+		dev_warn(&chip->dev, "%s: failed to read did_vid: %d\n",
+			 __func__, rc);
 		goto out;
+	}
 
 	for (i = 0; i != ARRAY_SIZE(vendor_timeout_overrides); i++) {
 		if (vendor_timeout_overrides[i].did_vid != did_vid)
 			continue;
 		memcpy(timeout_cap, vendor_timeout_overrides[i].timeout_us,
 		       sizeof(vendor_timeout_overrides[i].timeout_us));
-		rc = true;
+		chip->timeout_adjusted = true;
 	}
 
-	rc = false;
-
 out:
 	if (chip->ops->clk_enable != NULL)
 		chip->ops->clk_enable(chip, false);
 
-	return rc;
+	return;
 }
 
 /*
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index b49a55cf775f..13563b8c0c3a 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -41,7 +41,7 @@ struct tpm_class_ops {
 	int (*send) (struct tpm_chip *chip, u8 *buf, size_t len);
 	void (*cancel) (struct tpm_chip *chip);
 	u8 (*status) (struct tpm_chip *chip);
-	bool (*update_timeouts)(struct tpm_chip *chip,
+	void (*update_timeouts)(struct tpm_chip *chip,
 				unsigned long *timeout_cap);
 	int (*go_idle)(struct tpm_chip *chip);
 	int (*cmd_ready)(struct tpm_chip *chip);
-- 
cgit v1.2.3


From aa042475938f5818b0c1b6203061e85ad2535dbc Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Wed, 6 Feb 2019 17:24:48 +0100
Subject: tpm: rename and export tpm2_digest and tpm2_algorithms

Rename tpm2_* to tpm_* and move the definitions to include/linux/tpm.h so
that these can be used by other kernel subsystems (e.g. IMA).

Also, set the length of the digest array in tpm_digest to a new constant
named TPM_MAX_DIGEST_SIZE, equal to SHA512_DIGEST_SIZE.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Acked-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/tpm-interface.c |  2 +-
 drivers/char/tpm/tpm.h           | 13 +------------
 drivers/char/tpm/tpm1-cmd.c      |  2 +-
 drivers/char/tpm/tpm2-cmd.c      | 18 +++++++++---------
 include/linux/tpm.h              | 19 +++++++++++++++++++
 include/linux/tpm_eventlog.h     |  9 ++-------
 6 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 2b31eff06b0e..9c6aa77b5dee 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(tpm_pcr_read);
 int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, const u8 *hash)
 {
 	int rc;
-	struct tpm2_digest *digest_list;
+	struct tpm_digest *digest_list;
 	int i;
 
 	chip = tpm_find_get_ops(chip);
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index cd330ace6248..0e54061d3fd1 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -122,17 +122,6 @@ enum tpm2_return_codes {
 	TPM2_RC_RETRY		= 0x0922,
 };
 
-enum tpm2_algorithms {
-	TPM2_ALG_ERROR		= 0x0000,
-	TPM2_ALG_SHA1		= 0x0004,
-	TPM2_ALG_KEYEDHASH	= 0x0008,
-	TPM2_ALG_SHA256		= 0x000B,
-	TPM2_ALG_SHA384		= 0x000C,
-	TPM2_ALG_SHA512		= 0x000D,
-	TPM2_ALG_NULL		= 0x0010,
-	TPM2_ALG_SM3_256	= 0x0012,
-};
-
 enum tpm2_command_codes {
 	TPM2_CC_FIRST		        = 0x011F,
 	TPM2_CC_HIERARCHY_CONTROL       = 0x0121,
@@ -545,7 +534,7 @@ static inline u32 tpm2_rc_value(u32 rc)
 int tpm2_get_timeouts(struct tpm_chip *chip);
 int tpm2_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf);
 int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
-		    struct tpm2_digest *digests);
+		    struct tpm_digest *digests);
 int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max);
 void tpm2_flush_context(struct tpm_chip *chip, u32 handle);
 int tpm2_seal_trusted(struct tpm_chip *chip,
diff --git a/drivers/char/tpm/tpm1-cmd.c b/drivers/char/tpm/tpm1-cmd.c
index e7d3228a0f37..3eb7e03889a0 100644
--- a/drivers/char/tpm/tpm1-cmd.c
+++ b/drivers/char/tpm/tpm1-cmd.c
@@ -703,7 +703,7 @@ int tpm1_auto_startup(struct tpm_chip *chip)
 		goto out;
 	}
 
-	chip->allocated_banks[0] = TPM2_ALG_SHA1;
+	chip->allocated_banks[0] = TPM_ALG_SHA1;
 	chip->nr_allocated_banks = 1;
 
 	return rc;
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index bd20b9a61fc0..440ae6ee29e4 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -33,11 +33,11 @@ struct tpm2_hash {
 };
 
 static struct tpm2_hash tpm2_hash_map[] = {
-	{HASH_ALGO_SHA1, TPM2_ALG_SHA1},
-	{HASH_ALGO_SHA256, TPM2_ALG_SHA256},
-	{HASH_ALGO_SHA384, TPM2_ALG_SHA384},
-	{HASH_ALGO_SHA512, TPM2_ALG_SHA512},
-	{HASH_ALGO_SM3_256, TPM2_ALG_SM3_256},
+	{HASH_ALGO_SHA1, TPM_ALG_SHA1},
+	{HASH_ALGO_SHA256, TPM_ALG_SHA256},
+	{HASH_ALGO_SHA384, TPM_ALG_SHA384},
+	{HASH_ALGO_SHA512, TPM_ALG_SHA512},
+	{HASH_ALGO_SM3_256, TPM_ALG_SM3_256},
 };
 
 int tpm2_get_timeouts(struct tpm_chip *chip)
@@ -192,7 +192,7 @@ int tpm2_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf)
 	pcr_select[pcr_idx >> 3] = 1 << (pcr_idx & 0x7);
 
 	tpm_buf_append_u32(&buf, 1);
-	tpm_buf_append_u16(&buf, TPM2_ALG_SHA1);
+	tpm_buf_append_u16(&buf, TPM_ALG_SHA1);
 	tpm_buf_append_u8(&buf, TPM2_PCR_SELECT_MIN);
 	tpm_buf_append(&buf, (const unsigned char *)pcr_select,
 		       sizeof(pcr_select));
@@ -226,7 +226,7 @@ struct tpm2_null_auth_area {
  * Return: Same as with tpm_transmit_cmd.
  */
 int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
-		    struct tpm2_digest *digests)
+		    struct tpm_digest *digests)
 {
 	struct tpm_buf buf;
 	struct tpm2_null_auth_area auth_area;
@@ -443,7 +443,7 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 
 	/* public */
 	tpm_buf_append_u16(&buf, 14 + options->policydigest_len);
-	tpm_buf_append_u16(&buf, TPM2_ALG_KEYEDHASH);
+	tpm_buf_append_u16(&buf, TPM_ALG_KEYEDHASH);
 	tpm_buf_append_u16(&buf, hash);
 
 	/* policy */
@@ -458,7 +458,7 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 	}
 
 	/* public parameters */
-	tpm_buf_append_u16(&buf, TPM2_ALG_NULL);
+	tpm_buf_append_u16(&buf, TPM_ALG_NULL);
 	tpm_buf_append_u16(&buf, 0);
 
 	/* outside info */
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 13563b8c0c3a..9fe8c9816cf0 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -22,12 +22,31 @@
 #ifndef __LINUX_TPM_H__
 #define __LINUX_TPM_H__
 
+#include <crypto/hash_info.h>
+
 #define TPM_DIGEST_SIZE 20	/* Max TPM v1.2 PCR size */
+#define TPM_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE
 
 struct tpm_chip;
 struct trusted_key_payload;
 struct trusted_key_options;
 
+enum tpm_algorithms {
+	TPM_ALG_ERROR		= 0x0000,
+	TPM_ALG_SHA1		= 0x0004,
+	TPM_ALG_KEYEDHASH	= 0x0008,
+	TPM_ALG_SHA256		= 0x000B,
+	TPM_ALG_SHA384		= 0x000C,
+	TPM_ALG_SHA512		= 0x000D,
+	TPM_ALG_NULL		= 0x0010,
+	TPM_ALG_SM3_256		= 0x0012,
+};
+
+struct tpm_digest {
+	u16 alg_id;
+	u8 digest[TPM_MAX_DIGEST_SIZE];
+} __packed;
+
 enum TPM_OPS_FLAGS {
 	TPM_OPS_AUTO_STARTUP = BIT(0),
 };
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index f47342361e87..81519f163211 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -3,7 +3,7 @@
 #ifndef __LINUX_TPM_EVENTLOG_H__
 #define __LINUX_TPM_EVENTLOG_H__
 
-#include <crypto/hash_info.h>
+#include <linux/tpm.h>
 
 #define TCG_EVENT_NAME_LEN_MAX	255
 #define MAX_TEXT_EVENT		1000	/* Max event string length */
@@ -105,16 +105,11 @@ struct tcg_event_field {
 	u8 event[0];
 } __packed;
 
-struct tpm2_digest {
-	u16 alg_id;
-	u8 digest[SHA512_DIGEST_SIZE];
-} __packed;
-
 struct tcg_pcr_event2_head {
 	u32 pcr_idx;
 	u32 event_type;
 	u32 count;
-	struct tpm2_digest digests[];
+	struct tpm_digest digests[];
 } __packed;
 
 #endif
-- 
cgit v1.2.3


From 879b589210a9a0c9f77d301aaf0ddee20f2c5052 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Wed, 6 Feb 2019 17:24:49 +0100
Subject: tpm: retrieve digest size of unknown algorithms with PCR read

Currently, the TPM driver retrieves the digest size from a table mapping
TPM algorithms identifiers to identifiers defined by the crypto subsystem.
If the algorithm is not defined by the latter, the digest size can be
retrieved from the output of the PCR read command.

The patch modifies the definition of tpm_pcr_read() and tpm2_pcr_read() to
pass the desired hash algorithm and obtain the digest size at TPM startup.
Algorithms and corresponding digest sizes are stored in the new structure
tpm_bank_info, member of tpm_chip, so that the information can be used by
other kernel subsystems.

tpm_bank_info contains: the TPM algorithm identifier, necessary to generate
the event log as defined by Trusted Computing Group (TCG); the digest size,
to pad/truncate a digest calculated with a different algorithm; the crypto
subsystem identifier, to calculate the digest of event data.

This patch also protects against data corruption that could happen in the
bus, by checking that the digest size returned by the TPM during a PCR read
matches the size of the algorithm passed to tpm2_pcr_read().

For the initial PCR read, when digest sizes are not yet available, this
patch ensures that the amount of data copied from the output returned by
the TPM does not exceed the size of the array data are copied to.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Acked-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/tpm-interface.c    | 16 +++----
 drivers/char/tpm/tpm.h              |  5 ++-
 drivers/char/tpm/tpm1-cmd.c         |  4 +-
 drivers/char/tpm/tpm2-cmd.c         | 85 +++++++++++++++++++++++++++++--------
 include/linux/tpm.h                 | 12 +++++-
 security/integrity/ima/ima_crypto.c | 10 ++---
 6 files changed, 96 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 9c6aa77b5dee..1c92dbeef736 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -281,11 +281,12 @@ EXPORT_SYMBOL_GPL(tpm_is_tpm2);
  * tpm_pcr_read - read a PCR value from SHA1 bank
  * @chip:	a &struct tpm_chip instance, %NULL for the default chip
  * @pcr_idx:	the PCR to be retrieved
- * @res_buf:	the value of the PCR
+ * @digest:	the PCR bank and buffer current PCR value is written to
  *
  * Return: same as with tpm_transmit_cmd()
  */
-int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf)
+int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
+		 struct tpm_digest *digest)
 {
 	int rc;
 
@@ -294,9 +295,9 @@ int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf)
 		return -ENODEV;
 
 	if (chip->flags & TPM_CHIP_FLAG_TPM2)
-		rc = tpm2_pcr_read(chip, pcr_idx, res_buf);
+		rc = tpm2_pcr_read(chip, pcr_idx, digest, NULL);
 	else
-		rc = tpm1_pcr_read(chip, pcr_idx, res_buf);
+		rc = tpm1_pcr_read(chip, pcr_idx, digest->digest);
 
 	tpm_put_ops(chip);
 	return rc;
@@ -309,9 +310,8 @@ EXPORT_SYMBOL_GPL(tpm_pcr_read);
  * @pcr_idx:	the PCR to be retrieved
  * @hash:	the hash value used to extend the PCR value
  *
- * Note: with TPM 2.0 extends also those banks with a known digest size to the
- * cryto subsystem in order to prevent malicious use of those PCR banks. In the
- * future we should dynamically determine digest sizes.
+ * Note: with TPM 2.0 extends also those banks for which no digest was
+ * specified in order to prevent malicious use of those PCR banks.
  *
  * Return: same as with tpm_transmit_cmd()
  */
@@ -332,7 +332,7 @@ int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, const u8 *hash)
 			return -ENOMEM;
 
 		for (i = 0; i < chip->nr_allocated_banks; i++) {
-			digest_list[i].alg_id = chip->allocated_banks[i];
+			digest_list[i].alg_id = chip->allocated_banks[i].alg_id;
 			memcpy(digest_list[i].digest, hash, TPM_DIGEST_SIZE);
 		}
 
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 0e54061d3fd1..4efa304e9ece 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -247,7 +247,7 @@ struct tpm_chip {
 	unsigned int groups_cnt;
 
 	u32 nr_allocated_banks;
-	u16 *allocated_banks;
+	struct tpm_bank_info *allocated_banks;
 #ifdef CONFIG_ACPI
 	acpi_handle acpi_dev_handle;
 	char ppi_version[TPM_PPI_VERSION_LEN + 1];
@@ -532,7 +532,8 @@ static inline u32 tpm2_rc_value(u32 rc)
 }
 
 int tpm2_get_timeouts(struct tpm_chip *chip);
-int tpm2_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf);
+int tpm2_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
+		  struct tpm_digest *digest, u16 *digest_size_ptr);
 int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
 		    struct tpm_digest *digests);
 int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max);
diff --git a/drivers/char/tpm/tpm1-cmd.c b/drivers/char/tpm/tpm1-cmd.c
index 3eb7e03889a0..85dcf2654d11 100644
--- a/drivers/char/tpm/tpm1-cmd.c
+++ b/drivers/char/tpm/tpm1-cmd.c
@@ -703,7 +703,9 @@ int tpm1_auto_startup(struct tpm_chip *chip)
 		goto out;
 	}
 
-	chip->allocated_banks[0] = TPM_ALG_SHA1;
+	chip->allocated_banks[0].alg_id = TPM_ALG_SHA1;
+	chip->allocated_banks[0].digest_size = hash_digest_size[HASH_ALGO_SHA1];
+	chip->allocated_banks[0].crypto_id = HASH_ALGO_SHA1;
 	chip->nr_allocated_banks = 1;
 
 	return rc;
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 440ae6ee29e4..6967f15a6585 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -171,20 +171,36 @@ struct tpm2_pcr_read_out {
  * tpm2_pcr_read() - read a PCR value
  * @chip:	TPM chip to use.
  * @pcr_idx:	index of the PCR to read.
- * @res_buf:	buffer to store the resulting hash.
+ * @digest:	PCR bank and buffer current PCR value is written to.
+ * @digest_size_ptr:	pointer to variable that stores the digest size.
  *
  * Return: Same as with tpm_transmit_cmd.
  */
-int tpm2_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf)
+int tpm2_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
+		  struct tpm_digest *digest, u16 *digest_size_ptr)
 {
+	int i;
 	int rc;
 	struct tpm_buf buf;
 	struct tpm2_pcr_read_out *out;
 	u8 pcr_select[TPM2_PCR_SELECT_MIN] = {0};
+	u16 digest_size;
+	u16 expected_digest_size = 0;
 
 	if (pcr_idx >= TPM2_PLATFORM_PCR)
 		return -EINVAL;
 
+	if (!digest_size_ptr) {
+		for (i = 0; i < chip->nr_allocated_banks &&
+		     chip->allocated_banks[i].alg_id != digest->alg_id; i++)
+			;
+
+		if (i == chip->nr_allocated_banks)
+			return -EINVAL;
+
+		expected_digest_size = chip->allocated_banks[i].digest_size;
+	}
+
 	rc = tpm_buf_init(&buf, TPM2_ST_NO_SESSIONS, TPM2_CC_PCR_READ);
 	if (rc)
 		return rc;
@@ -192,18 +208,28 @@ int tpm2_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf)
 	pcr_select[pcr_idx >> 3] = 1 << (pcr_idx & 0x7);
 
 	tpm_buf_append_u32(&buf, 1);
-	tpm_buf_append_u16(&buf, TPM_ALG_SHA1);
+	tpm_buf_append_u16(&buf, digest->alg_id);
 	tpm_buf_append_u8(&buf, TPM2_PCR_SELECT_MIN);
 	tpm_buf_append(&buf, (const unsigned char *)pcr_select,
 		       sizeof(pcr_select));
 
-	rc = tpm_transmit_cmd(chip, &buf, 0, res_buf ?
-			      "attempting to read a pcr value" : NULL);
-	if (rc == 0 && res_buf) {
-		out = (struct tpm2_pcr_read_out *)&buf.data[TPM_HEADER_SIZE];
-		memcpy(res_buf, out->digest, SHA1_DIGEST_SIZE);
+	rc = tpm_transmit_cmd(chip, &buf, 0, "attempting to read a pcr value");
+	if (rc)
+		goto out;
+
+	out = (struct tpm2_pcr_read_out *)&buf.data[TPM_HEADER_SIZE];
+	digest_size = be16_to_cpu(out->digest_size);
+	if (digest_size > sizeof(digest->digest) ||
+	    (!digest_size_ptr && digest_size != expected_digest_size)) {
+		rc = -EINVAL;
+		goto out;
 	}
 
+	if (digest_size_ptr)
+		*digest_size_ptr = digest_size;
+
+	memcpy(digest->digest, out->digest, digest_size);
+out:
 	tpm_buf_destroy(&buf);
 	return rc;
 }
@@ -232,7 +258,6 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
 	struct tpm2_null_auth_area auth_area;
 	int rc;
 	int i;
-	int j;
 
 	if (count > chip->nr_allocated_banks)
 		return -EINVAL;
@@ -254,14 +279,9 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
 	tpm_buf_append_u32(&buf, count);
 
 	for (i = 0; i < count; i++) {
-		for (j = 0; j < ARRAY_SIZE(tpm2_hash_map); j++) {
-			if (digests[i].alg_id != tpm2_hash_map[j].tpm_id)
-				continue;
-			tpm_buf_append_u16(&buf, digests[i].alg_id);
-			tpm_buf_append(&buf, (const unsigned char
-					      *)&digests[i].digest,
-			       hash_digest_size[tpm2_hash_map[j].crypto_id]);
-		}
+		tpm_buf_append_u16(&buf, digests[i].alg_id);
+		tpm_buf_append(&buf, (const unsigned char *)&digests[i].digest,
+			       chip->allocated_banks[i].digest_size);
 	}
 
 	rc = tpm_transmit_cmd(chip, &buf, 0, "attempting extend a PCR value");
@@ -795,6 +815,30 @@ int tpm2_probe(struct tpm_chip *chip)
 }
 EXPORT_SYMBOL_GPL(tpm2_probe);
 
+static int tpm2_init_bank_info(struct tpm_chip *chip, u32 bank_index)
+{
+	struct tpm_bank_info *bank = chip->allocated_banks + bank_index;
+	struct tpm_digest digest = { .alg_id = bank->alg_id };
+	int i;
+
+	/*
+	 * Avoid unnecessary PCR read operations to reduce overhead
+	 * and obtain identifiers of the crypto subsystem.
+	 */
+	for (i = 0; i < ARRAY_SIZE(tpm2_hash_map); i++) {
+		enum hash_algo crypto_algo = tpm2_hash_map[i].crypto_id;
+
+		if (bank->alg_id != tpm2_hash_map[i].tpm_id)
+			continue;
+
+		bank->digest_size = hash_digest_size[crypto_algo];
+		bank->crypto_id = crypto_algo;
+		return 0;
+	}
+
+	return tpm2_pcr_read(chip, 0, &digest, &bank->digest_size);
+}
+
 struct tpm2_pcr_selection {
 	__be16  hash_alg;
 	u8  size_of_select;
@@ -858,7 +902,12 @@ static ssize_t tpm2_get_pcr_allocation(struct tpm_chip *chip)
 		pcr_select_offset = memchr_inv(pcr_selection.pcr_select, 0,
 					       pcr_selection.size_of_select);
 		if (pcr_select_offset) {
-			chip->allocated_banks[nr_alloc_banks] = hash_alg;
+			chip->allocated_banks[nr_alloc_banks].alg_id = hash_alg;
+
+			rc = tpm2_init_bank_info(chip, nr_alloc_banks);
+			if (rc < 0)
+				break;
+
 			nr_alloc_banks++;
 		}
 
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 9fe8c9816cf0..afd022fc9d3d 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -47,6 +47,12 @@ struct tpm_digest {
 	u8 digest[TPM_MAX_DIGEST_SIZE];
 } __packed;
 
+struct tpm_bank_info {
+	u16 alg_id;
+	u16 digest_size;
+	u16 crypto_id;
+};
+
 enum TPM_OPS_FLAGS {
 	TPM_OPS_AUTO_STARTUP = BIT(0),
 };
@@ -72,7 +78,8 @@ struct tpm_class_ops {
 #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE)
 
 extern int tpm_is_tpm2(struct tpm_chip *chip);
-extern int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf);
+extern int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
+			struct tpm_digest *digest);
 extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, const u8 *hash);
 extern int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen);
 extern int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max);
@@ -89,7 +96,8 @@ static inline int tpm_is_tpm2(struct tpm_chip *chip)
 	return -ENODEV;
 }
 
-static inline int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx, u8 *res_buf)
+static inline int tpm_pcr_read(struct tpm_chip *chip, int pcr_idx,
+			       struct tpm_digest *digest)
 {
 	return -ENODEV;
 }
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index acf2c7df7145..16a4f45863b1 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -643,12 +643,12 @@ int ima_calc_buffer_hash(const void *buf, loff_t len,
 	return calc_buffer_shash(buf, len, hash);
 }
 
-static void __init ima_pcrread(u32 idx, u8 *pcr)
+static void __init ima_pcrread(u32 idx, struct tpm_digest *d)
 {
 	if (!ima_tpm_chip)
 		return;
 
-	if (tpm_pcr_read(ima_tpm_chip, idx, pcr) != 0)
+	if (tpm_pcr_read(ima_tpm_chip, idx, d) != 0)
 		pr_err("Error Communicating to TPM chip\n");
 }
 
@@ -658,7 +658,7 @@ static void __init ima_pcrread(u32 idx, u8 *pcr)
 static int __init ima_calc_boot_aggregate_tfm(char *digest,
 					      struct crypto_shash *tfm)
 {
-	u8 pcr_i[TPM_DIGEST_SIZE];
+	struct tpm_digest d = { .alg_id = TPM_ALG_SHA1, .digest = {0} };
 	int rc;
 	u32 i;
 	SHASH_DESC_ON_STACK(shash, tfm);
@@ -672,9 +672,9 @@ static int __init ima_calc_boot_aggregate_tfm(char *digest,
 
 	/* cumulative sha1 over tpm registers 0-7 */
 	for (i = TPM_PCR0; i < TPM_PCR8; i++) {
-		ima_pcrread(i, pcr_i);
+		ima_pcrread(i, &d);
 		/* now accumulate with current aggregate */
-		rc = crypto_shash_update(shash, pcr_i, TPM_DIGEST_SIZE);
+		rc = crypto_shash_update(shash, d.digest, TPM_DIGEST_SIZE);
 	}
 	if (!rc)
 		crypto_shash_final(shash, digest);
-- 
cgit v1.2.3


From 901615cb916dc955fb7bda4e34402bf263532e4a Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Wed, 6 Feb 2019 17:24:50 +0100
Subject: tpm: move tpm_chip definition to include/linux/tpm.h

The tpm_chip structure contains the list of PCR banks currently allocated
in the TPM. When support for crypto agility will be added to the TPM
driver, users of the driver have to provide a digest for each allocated
bank to tpm_pcr_extend(). With this patch, they can obtain the PCR bank
algorithms directly from chip->allocated_banks.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/tpm.h | 101 ++-----------------------------------------------
 include/linux/tpm.h    |  91 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 97 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 4efa304e9ece..4f85ce909122 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -25,30 +25,22 @@
 
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/fs.h>
-#include <linux/hw_random.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/tpm.h>
-#include <linux/acpi.h>
-#include <linux/cdev.h>
 #include <linux/highmem.h>
 #include <linux/tpm_eventlog.h>
-#include <crypto/hash_info.h>
 
 #ifdef CONFIG_X86
 #include <asm/intel-family.h>
 #endif
 
-enum tpm_const {
-	TPM_MINOR = 224,	/* officially assigned */
-	TPM_BUFSIZE = 4096,
-	TPM_NUM_DEVICES = 65536,
-	TPM_RETRY = 50,		/* 5 seconds */
-	TPM_NUM_EVENT_LOG_FILES = 3,
-};
+#define TPM_MINOR		224	/* officially assigned */
+#define TPM_BUFSIZE		4096
+#define TPM_NUM_DEVICES		65536
+#define TPM_RETRY		50
 
 enum tpm_timeout {
 	TPM_TIMEOUT = 5,	/* msecs */
@@ -65,16 +57,6 @@ enum tpm_addr {
 	TPM_ADDR = 0x4E,
 };
 
-/* Indexes the duration array */
-enum tpm_duration {
-	TPM_SHORT = 0,
-	TPM_MEDIUM = 1,
-	TPM_LONG = 2,
-	TPM_LONG_LONG = 3,
-	TPM_UNDEFINED,
-	TPM_NUM_DURATIONS = TPM_UNDEFINED,
-};
-
 #define TPM_WARN_RETRY          0x800
 #define TPM_WARN_DOING_SELFTEST 0x802
 #define TPM_ERR_DEACTIVATED     0x6
@@ -179,15 +161,6 @@ enum tpm2_cc_attrs {
 #define TPM_VID_WINBOND  0x1050
 #define TPM_VID_STM      0x104A
 
-#define TPM_PPI_VERSION_LEN		3
-
-struct tpm_space {
-	u32 context_tbl[3];
-	u8 *context_buf;
-	u32 session_tbl[3];
-	u8 *session_buf;
-};
-
 enum tpm_chip_flags {
 	TPM_CHIP_FLAG_TPM2		= BIT(1),
 	TPM_CHIP_FLAG_IRQ		= BIT(2),
@@ -196,72 +169,6 @@ enum tpm_chip_flags {
 	TPM_CHIP_FLAG_ALWAYS_POWERED	= BIT(5),
 };
 
-struct tpm_bios_log {
-	void *bios_event_log;
-	void *bios_event_log_end;
-};
-
-struct tpm_chip_seqops {
-	struct tpm_chip *chip;
-	const struct seq_operations *seqops;
-};
-
-struct tpm_chip {
-	struct device dev;
-	struct device devs;
-	struct cdev cdev;
-	struct cdev cdevs;
-
-	/* A driver callback under ops cannot be run unless ops_sem is held
-	 * (sometimes implicitly, eg for the sysfs code). ops becomes null
-	 * when the driver is unregistered, see tpm_try_get_ops.
-	 */
-	struct rw_semaphore ops_sem;
-	const struct tpm_class_ops *ops;
-
-	struct tpm_bios_log log;
-	struct tpm_chip_seqops bin_log_seqops;
-	struct tpm_chip_seqops ascii_log_seqops;
-
-	unsigned int flags;
-
-	int dev_num;		/* /dev/tpm# */
-	unsigned long is_open;	/* only one allowed */
-
-	char hwrng_name[64];
-	struct hwrng hwrng;
-
-	struct mutex tpm_mutex;	/* tpm is processing */
-
-	unsigned long timeout_a; /* jiffies */
-	unsigned long timeout_b; /* jiffies */
-	unsigned long timeout_c; /* jiffies */
-	unsigned long timeout_d; /* jiffies */
-	bool timeout_adjusted;
-	unsigned long duration[TPM_NUM_DURATIONS]; /* jiffies */
-	bool duration_adjusted;
-
-	struct dentry *bios_dir[TPM_NUM_EVENT_LOG_FILES];
-
-	const struct attribute_group *groups[3];
-	unsigned int groups_cnt;
-
-	u32 nr_allocated_banks;
-	struct tpm_bank_info *allocated_banks;
-#ifdef CONFIG_ACPI
-	acpi_handle acpi_dev_handle;
-	char ppi_version[TPM_PPI_VERSION_LEN + 1];
-#endif /* CONFIG_ACPI */
-
-	struct tpm_space work_space;
-	u32 last_cc;
-	u32 nr_commands;
-	u32 *cc_attrs_tbl;
-
-	/* active locality */
-	int locality;
-};
-
 #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev)
 
 struct tpm_header {
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index afd022fc9d3d..816e686a73ac 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -22,6 +22,10 @@
 #ifndef __LINUX_TPM_H__
 #define __LINUX_TPM_H__
 
+#include <linux/hw_random.h>
+#include <linux/acpi.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
 #include <crypto/hash_info.h>
 
 #define TPM_DIGEST_SIZE 20	/* Max TPM v1.2 PCR size */
@@ -75,6 +79,93 @@ struct tpm_class_ops {
 	void (*clk_enable)(struct tpm_chip *chip, bool value);
 };
 
+#define TPM_NUM_EVENT_LOG_FILES		3
+
+/* Indexes the duration array */
+enum tpm_duration {
+	TPM_SHORT = 0,
+	TPM_MEDIUM = 1,
+	TPM_LONG = 2,
+	TPM_LONG_LONG = 3,
+	TPM_UNDEFINED,
+	TPM_NUM_DURATIONS = TPM_UNDEFINED,
+};
+
+#define TPM_PPI_VERSION_LEN		3
+
+struct tpm_space {
+	u32 context_tbl[3];
+	u8 *context_buf;
+	u32 session_tbl[3];
+	u8 *session_buf;
+};
+
+struct tpm_bios_log {
+	void *bios_event_log;
+	void *bios_event_log_end;
+};
+
+struct tpm_chip_seqops {
+	struct tpm_chip *chip;
+	const struct seq_operations *seqops;
+};
+
+struct tpm_chip {
+	struct device dev;
+	struct device devs;
+	struct cdev cdev;
+	struct cdev cdevs;
+
+	/* A driver callback under ops cannot be run unless ops_sem is held
+	 * (sometimes implicitly, eg for the sysfs code). ops becomes null
+	 * when the driver is unregistered, see tpm_try_get_ops.
+	 */
+	struct rw_semaphore ops_sem;
+	const struct tpm_class_ops *ops;
+
+	struct tpm_bios_log log;
+	struct tpm_chip_seqops bin_log_seqops;
+	struct tpm_chip_seqops ascii_log_seqops;
+
+	unsigned int flags;
+
+	int dev_num;		/* /dev/tpm# */
+	unsigned long is_open;	/* only one allowed */
+
+	char hwrng_name[64];
+	struct hwrng hwrng;
+
+	struct mutex tpm_mutex;	/* tpm is processing */
+
+	unsigned long timeout_a; /* jiffies */
+	unsigned long timeout_b; /* jiffies */
+	unsigned long timeout_c; /* jiffies */
+	unsigned long timeout_d; /* jiffies */
+	bool timeout_adjusted;
+	unsigned long duration[TPM_NUM_DURATIONS]; /* jiffies */
+	bool duration_adjusted;
+
+	struct dentry *bios_dir[TPM_NUM_EVENT_LOG_FILES];
+
+	const struct attribute_group *groups[3];
+	unsigned int groups_cnt;
+
+	u32 nr_allocated_banks;
+	struct tpm_bank_info *allocated_banks;
+#ifdef CONFIG_ACPI
+	acpi_handle acpi_dev_handle;
+	char ppi_version[TPM_PPI_VERSION_LEN + 1];
+#endif /* CONFIG_ACPI */
+
+	struct tpm_space work_space;
+	u32 last_cc;
+	u32 nr_commands;
+	u32 *cc_attrs_tbl;
+
+	/* active locality */
+	int locality;
+};
+
 #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE)
 
 extern int tpm_is_tpm2(struct tpm_chip *chip);
-- 
cgit v1.2.3


From 0b6cf6b97b7ef1fa3c7fefab0cac897a1c4a3400 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Wed, 6 Feb 2019 17:24:52 +0100
Subject: tpm: pass an array of tpm_extend_digest structures to
 tpm_pcr_extend()

Currently, tpm_pcr_extend() accepts as an input only a SHA1 digest.

This patch replaces the hash parameter of tpm_pcr_extend() with an array of
tpm_digest structures, so that the caller can provide a digest for each PCR
bank currently allocated in the TPM.

tpm_pcr_extend() will not extend banks for which no digest was provided,
as it happened before this patch, but instead it requires that callers
provide the full set of digests. Since the number of digests will always be
chip->nr_allocated_banks, the count parameter has been removed.

Due to the API change, ima_pcr_extend() and pcrlock() have been modified.
Since the number of allocated banks is not known in advance, the memory for
the digests must be dynamically allocated. To avoid performance degradation
and to avoid that a PCR extend is not done due to lack of memory, the array
of tpm_digest structures is allocated by the users of the TPM driver at
initialization time.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Tested-by: Mimi Zohar <zohar@linux.ibm.com> (on x86 for TPM 1.2 & PTT TPM 2.0)
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/tpm-interface.c   | 30 ++++++++++------------------
 drivers/char/tpm/tpm.h             |  2 +-
 drivers/char/tpm/tpm2-cmd.c        | 10 +++-------
 include/linux/tpm.h                |  5 +++--
 security/integrity/ima/ima.h       |  1 +
 security/integrity/ima/ima_init.c  |  4 ++++
 security/integrity/ima/ima_queue.c | 27 ++++++++++++++++++++++++-
 security/keys/trusted.c            | 41 ++++++++++++++++++++++++++++++--------
 8 files changed, 82 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 1c92dbeef736..83ece5639f86 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -308,42 +308,34 @@ EXPORT_SYMBOL_GPL(tpm_pcr_read);
  * tpm_pcr_extend - extend a PCR value in SHA1 bank.
  * @chip:	a &struct tpm_chip instance, %NULL for the default chip
  * @pcr_idx:	the PCR to be retrieved
- * @hash:	the hash value used to extend the PCR value
+ * @digests:	array of tpm_digest structures used to extend PCRs
  *
- * Note: with TPM 2.0 extends also those banks for which no digest was
- * specified in order to prevent malicious use of those PCR banks.
+ * Note: callers must pass a digest for every allocated PCR bank, in the same
+ * order of the banks in chip->allocated_banks.
  *
  * Return: same as with tpm_transmit_cmd()
  */
-int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, const u8 *hash)
+int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
+		   struct tpm_digest *digests)
 {
 	int rc;
-	struct tpm_digest *digest_list;
 	int i;
 
 	chip = tpm_find_get_ops(chip);
 	if (!chip)
 		return -ENODEV;
 
-	if (chip->flags & TPM_CHIP_FLAG_TPM2) {
-		digest_list = kcalloc(chip->nr_allocated_banks,
-				      sizeof(*digest_list), GFP_KERNEL);
-		if (!digest_list)
-			return -ENOMEM;
-
-		for (i = 0; i < chip->nr_allocated_banks; i++) {
-			digest_list[i].alg_id = chip->allocated_banks[i].alg_id;
-			memcpy(digest_list[i].digest, hash, TPM_DIGEST_SIZE);
-		}
+	for (i = 0; i < chip->nr_allocated_banks; i++)
+		if (digests[i].alg_id != chip->allocated_banks[i].alg_id)
+			return -EINVAL;
 
-		rc = tpm2_pcr_extend(chip, pcr_idx, chip->nr_allocated_banks,
-				     digest_list);
-		kfree(digest_list);
+	if (chip->flags & TPM_CHIP_FLAG_TPM2) {
+		rc = tpm2_pcr_extend(chip, pcr_idx, digests);
 		tpm_put_ops(chip);
 		return rc;
 	}
 
-	rc = tpm1_pcr_extend(chip, pcr_idx, hash,
+	rc = tpm1_pcr_extend(chip, pcr_idx, digests[0].digest,
 			     "attempting extend a PCR value");
 	tpm_put_ops(chip);
 	return rc;
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 4f85ce909122..2cce072f25b5 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -441,7 +441,7 @@ static inline u32 tpm2_rc_value(u32 rc)
 int tpm2_get_timeouts(struct tpm_chip *chip);
 int tpm2_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
 		  struct tpm_digest *digest, u16 *digest_size_ptr);
-int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
+int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 		    struct tpm_digest *digests);
 int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max);
 void tpm2_flush_context(struct tpm_chip *chip, u32 handle);
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 6967f15a6585..e74c5b7b64bf 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -246,12 +246,11 @@ struct tpm2_null_auth_area {
  *
  * @chip:	TPM chip to use.
  * @pcr_idx:	index of the PCR.
- * @count:	number of digests passed.
  * @digests:	list of pcr banks and corresponding digest values to extend.
  *
  * Return: Same as with tpm_transmit_cmd.
  */
-int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
+int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 		    struct tpm_digest *digests)
 {
 	struct tpm_buf buf;
@@ -259,9 +258,6 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
 	int rc;
 	int i;
 
-	if (count > chip->nr_allocated_banks)
-		return -EINVAL;
-
 	rc = tpm_buf_init(&buf, TPM2_ST_SESSIONS, TPM2_CC_PCR_EXTEND);
 	if (rc)
 		return rc;
@@ -276,9 +272,9 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, u32 count,
 	tpm_buf_append_u32(&buf, sizeof(struct tpm2_null_auth_area));
 	tpm_buf_append(&buf, (const unsigned char *)&auth_area,
 		       sizeof(auth_area));
-	tpm_buf_append_u32(&buf, count);
+	tpm_buf_append_u32(&buf, chip->nr_allocated_banks);
 
-	for (i = 0; i < count; i++) {
+	for (i = 0; i < chip->nr_allocated_banks; i++) {
 		tpm_buf_append_u16(&buf, digests[i].alg_id);
 		tpm_buf_append(&buf, (const unsigned char *)&digests[i].digest,
 			       chip->allocated_banks[i].digest_size);
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 816e686a73ac..1b5436b213a2 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -171,7 +171,8 @@ struct tpm_chip {
 extern int tpm_is_tpm2(struct tpm_chip *chip);
 extern int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
 			struct tpm_digest *digest);
-extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, const u8 *hash);
+extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
+			  struct tpm_digest *digests);
 extern int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen);
 extern int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max);
 extern int tpm_seal_trusted(struct tpm_chip *chip,
@@ -194,7 +195,7 @@ static inline int tpm_pcr_read(struct tpm_chip *chip, int pcr_idx,
 }
 
 static inline int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
-				 const u8 *hash)
+				 struct tpm_digest *digests)
 {
 	return -ENODEV;
 }
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index cc12f3449a72..89d65cf8053d 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -153,6 +153,7 @@ int ima_measurements_show(struct seq_file *m, void *v);
 unsigned long ima_get_binary_runtime_size(void);
 int ima_init_template(void);
 void ima_init_template_list(void);
+int __init ima_init_digests(void);
 
 /*
  * used to protect h_table and sha_table
diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c
index 6bb42a9c5e47..6c9295449751 100644
--- a/security/integrity/ima/ima_init.c
+++ b/security/integrity/ima/ima_init.c
@@ -123,8 +123,12 @@ int __init ima_init(void)
 	if (rc != 0)
 		return rc;
 
+	/* It can be called before ima_init_digests(), it does not use TPM. */
 	ima_load_kexec_buffer();
 
+	rc = ima_init_digests();
+	if (rc != 0)
+		return rc;
 	rc = ima_add_boot_aggregate();	/* boot aggregate must be first entry */
 	if (rc != 0)
 		return rc;
diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c
index 0e41dc1df1d4..6b6d044e0440 100644
--- a/security/integrity/ima/ima_queue.c
+++ b/security/integrity/ima/ima_queue.c
@@ -27,6 +27,9 @@
 
 #define AUDIT_CAUSE_LEN_MAX 32
 
+/* pre-allocated array of tpm_digest structures to extend a PCR */
+static struct tpm_digest *digests;
+
 LIST_HEAD(ima_measurements);	/* list of all measurements */
 #ifdef CONFIG_IMA_KEXEC
 static unsigned long binary_runtime_size;
@@ -140,11 +143,15 @@ unsigned long ima_get_binary_runtime_size(void)
 static int ima_pcr_extend(const u8 *hash, int pcr)
 {
 	int result = 0;
+	int i;
 
 	if (!ima_tpm_chip)
 		return result;
 
-	result = tpm_pcr_extend(ima_tpm_chip, pcr, hash);
+	for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++)
+		memcpy(digests[i].digest, hash, TPM_DIGEST_SIZE);
+
+	result = tpm_pcr_extend(ima_tpm_chip, pcr, digests);
 	if (result != 0)
 		pr_err("Error Communicating to TPM chip, result: %d\n", result);
 	return result;
@@ -211,3 +218,21 @@ int ima_restore_measurement_entry(struct ima_template_entry *entry)
 	mutex_unlock(&ima_extend_list_mutex);
 	return result;
 }
+
+int __init ima_init_digests(void)
+{
+	int i;
+
+	if (!ima_tpm_chip)
+		return 0;
+
+	digests = kcalloc(ima_tpm_chip->nr_allocated_banks, sizeof(*digests),
+			  GFP_NOFS);
+	if (!digests)
+		return -ENOMEM;
+
+	for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++)
+		digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id;
+
+	return 0;
+}
diff --git a/security/keys/trusted.c b/security/keys/trusted.c
index 5b852263eae1..bcc9c6ead7fd 100644
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -35,6 +35,7 @@
 static const char hmac_alg[] = "hmac(sha1)";
 static const char hash_alg[] = "sha1";
 static struct tpm_chip *chip;
+static struct tpm_digest *digests;
 
 struct sdesc {
 	struct shash_desc shash;
@@ -380,15 +381,10 @@ EXPORT_SYMBOL_GPL(trusted_tpm_send);
  */
 static int pcrlock(const int pcrnum)
 {
-	unsigned char hash[SHA1_DIGEST_SIZE];
-	int ret;
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
-	ret = tpm_get_random(chip, hash, SHA1_DIGEST_SIZE);
-	if (ret != SHA1_DIGEST_SIZE)
-		return ret;
-	return tpm_pcr_extend(chip, pcrnum, hash) ? -EINVAL : 0;
+
+	return tpm_pcr_extend(chip, pcrnum, digests) ? -EINVAL : 0;
 }
 
 /*
@@ -1222,6 +1218,29 @@ hashalg_fail:
 	return ret;
 }
 
+static int __init init_digests(void)
+{
+	u8 digest[TPM_MAX_DIGEST_SIZE];
+	int ret;
+	int i;
+
+	ret = tpm_get_random(chip, digest, TPM_MAX_DIGEST_SIZE);
+	if (ret < 0)
+		return ret;
+	if (ret < TPM_MAX_DIGEST_SIZE)
+		return -EFAULT;
+
+	digests = kcalloc(chip->nr_allocated_banks, sizeof(*digests),
+			  GFP_KERNEL);
+	if (!digests)
+		return -ENOMEM;
+
+	for (i = 0; i < chip->nr_allocated_banks; i++)
+		memcpy(digests[i].digest, digest, TPM_MAX_DIGEST_SIZE);
+
+	return 0;
+}
+
 static int __init init_trusted(void)
 {
 	int ret;
@@ -1229,15 +1248,20 @@ static int __init init_trusted(void)
 	chip = tpm_default_chip();
 	if (!chip)
 		return -ENOENT;
-	ret = trusted_shash_alloc();
+	ret = init_digests();
 	if (ret < 0)
 		goto err_put;
+	ret = trusted_shash_alloc();
+	if (ret < 0)
+		goto err_free;
 	ret = register_key_type(&key_type_trusted);
 	if (ret < 0)
 		goto err_release;
 	return 0;
 err_release:
 	trusted_shash_release();
+err_free:
+	kfree(digests);
 err_put:
 	put_device(&chip->dev);
 	return ret;
@@ -1246,6 +1270,7 @@ err_put:
 static void __exit cleanup_trusted(void)
 {
 	put_device(&chip->dev);
+	kfree(digests);
 	trusted_shash_release();
 	unregister_key_type(&key_type_trusted);
 }
-- 
cgit v1.2.3


From ede033e1e863c36729de25b57145fff287415830 Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
Date: Thu, 7 Feb 2019 17:28:55 +0100
Subject: dt-bindings: gpio: document the new pull-up/pull-down flags

This commit extends the flags that can be used in GPIO specifiers to
indicate if a pull-up resistor or pull-down resistor should be
enabled.

While some pinctrl DT bindings already offer the capability of
configuring pull-up/pull-down resistors at the pin level, a number of
simple GPIO controllers don't have any pinmuxing capability, and
therefore do not rely on the pinctrl DT bindings.

Such simple GPIO controllers however sometimes allow to configure
pull-up and pull-down resistors on a per-pin basis, and whether such
resistors should be enabled or not is a highly board-specific HW
characteristic.

By using two additional bits of the GPIO flag specifier, we can easily
allow the Device Tree to describe which GPIOs should have their
pull-up or pull-down resistors enabled. Even though the two options
are mutually exclusive, we still need two bits to encode at least
three states: no pull-up/pull-down, pull-up, pull-down.

Signed-off-by: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/devicetree/bindings/gpio/gpio.txt | 12 ++++++++++++
 include/dt-bindings/gpio/gpio.h                 |  6 ++++++
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/gpio/gpio.txt b/Documentation/devicetree/bindings/gpio/gpio.txt
index f0ba154b5723..a8895d339bfe 100644
--- a/Documentation/devicetree/bindings/gpio/gpio.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio.txt
@@ -67,6 +67,18 @@ Optional standard bitfield specifiers for the last cell:
            https://en.wikipedia.org/wiki/Open_collector
 - Bit 3: 0 means the output should be maintained during sleep/low-power mode
          1 means the output state can be lost during sleep/low-power mode
+- Bit 4: 0 means no pull-up resistor should be enabled
+         1 means a pull-up resistor should be enabled
+         This setting only applies to hardware with a simple on/off
+         control for pull-up configuration. If the hardware has more
+         elaborate pull-up configuration, it should be represented
+         using a pin control binding.
+- Bit 5: 0 means no pull-down resistor should be enabled
+         1 means a pull-down resistor should be enabled
+         This setting only applies to hardware with a simple on/off
+         control for pull-down configuration. If the hardware has more
+         elaborate pull-down configuration, it should be represented
+         using a pin control binding.
 
 1.1) GPIO specifier best practices
 ----------------------------------
diff --git a/include/dt-bindings/gpio/gpio.h b/include/dt-bindings/gpio/gpio.h
index 2cc10ae4bbb7..c029467e828b 100644
--- a/include/dt-bindings/gpio/gpio.h
+++ b/include/dt-bindings/gpio/gpio.h
@@ -33,4 +33,10 @@
 #define GPIO_PERSISTENT 0
 #define GPIO_TRANSITORY 8
 
+/* Bit 4 express pull up */
+#define GPIO_PULL_UP 16
+
+/* Bit 5 express pull down */
+#define GPIO_PULL_DOWN 32
+
 #endif
-- 
cgit v1.2.3


From d449991c4d1d0663b42db7648510a9911de21298 Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
Date: Thu, 7 Feb 2019 17:28:58 +0100
Subject: gpio: add core support for pull-up/pull-down configuration

This commit adds support for configuring the pull-up and pull-down
resistors available in some GPIO controllers. While configuring
pull-up/pull-down is already possible through the pinctrl subsystem,
some GPIO controllers, especially simple ones such as GPIO expanders
on I2C, don't have any pinmuxing capability and therefore do not use
the pinctrl subsystem.

This commit implements the GPIO_PULL_UP and GPIO_PULL_DOWN flags,
which can be used from the Device Tree, to enable a pull-up or
pull-down resistor on a given GPIO.

The flag is simply propagated all the way to the core GPIO subsystem,
where it is used to call the gpio_chip ->set_config callback with the
appropriate existing PIN_CONFIG_BIAS_* values.

Signed-off-by: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-of.c    |  5 +++++
 drivers/gpio/gpiolib.c       | 18 ++++++++++++++++++
 drivers/gpio/gpiolib.h       |  2 ++
 include/linux/gpio/machine.h |  2 ++
 include/linux/of_gpio.h      |  2 ++
 5 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index a6e1891217e2..9a8b78477f79 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -345,6 +345,11 @@ struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id,
 	if (of_flags & OF_GPIO_TRANSITORY)
 		*flags |= GPIO_TRANSITORY;
 
+	if (of_flags & OF_GPIO_PULL_UP)
+		*flags |= GPIO_PULL_UP;
+	if (of_flags & OF_GPIO_PULL_DOWN)
+		*flags |= GPIO_PULL_DOWN;
+
 	return desc;
 }
 
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 1f239aac43df..22d8b37f5319 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2573,6 +2573,13 @@ int gpiod_direction_input(struct gpio_desc *desc)
 	if (status == 0)
 		clear_bit(FLAG_IS_OUT, &desc->flags);
 
+	if (test_bit(FLAG_PULL_UP, &desc->flags))
+		gpio_set_config(chip, gpio_chip_hwgpio(desc),
+				PIN_CONFIG_BIAS_PULL_UP);
+	else if (test_bit(FLAG_PULL_DOWN, &desc->flags))
+		gpio_set_config(chip, gpio_chip_hwgpio(desc),
+				PIN_CONFIG_BIAS_PULL_DOWN);
+
 	trace_gpio_direction(desc_to_gpio(desc), 1, status);
 
 	return status;
@@ -4050,6 +4057,17 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id,
 	if (lflags & GPIO_OPEN_SOURCE)
 		set_bit(FLAG_OPEN_SOURCE, &desc->flags);
 
+	if ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) {
+		gpiod_err(desc,
+			  "both pull-up and pull-down enabled, invalid configuration\n");
+		return -EINVAL;
+	}
+
+	if (lflags & GPIO_PULL_UP)
+		set_bit(FLAG_PULL_UP, &desc->flags);
+	else if (lflags & GPIO_PULL_DOWN)
+		set_bit(FLAG_PULL_DOWN, &desc->flags);
+
 	status = gpiod_set_transitory(desc, (lflags & GPIO_TRANSITORY));
 	if (status < 0)
 		return status;
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index bc57f0dc5953..078ab17b96bf 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -219,6 +219,8 @@ struct gpio_desc {
 #define FLAG_IRQ_IS_ENABLED 10	/* GPIO is connected to an enabled IRQ */
 #define FLAG_IS_HOGGED	11	/* GPIO is hogged */
 #define FLAG_TRANSITORY 12	/* GPIO may lose value in sleep or reset */
+#define FLAG_PULL_UP    13	/* GPIO has pull up enabled */
+#define FLAG_PULL_DOWN  14	/* GPIO has pull down enabled */
 
 	/* Connection label */
 	const char		*label;
diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h
index daa44eac9241..69673be10213 100644
--- a/include/linux/gpio/machine.h
+++ b/include/linux/gpio/machine.h
@@ -12,6 +12,8 @@ enum gpio_lookup_flags {
 	GPIO_OPEN_SOURCE = (1 << 2),
 	GPIO_PERSISTENT = (0 << 3),
 	GPIO_TRANSITORY = (1 << 3),
+	GPIO_PULL_UP = (1 << 4),
+	GPIO_PULL_DOWN = (1 << 5),
 };
 
 /**
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 163b79ecd01a..f9737dea9d1f 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -28,6 +28,8 @@ enum of_gpio_flags {
 	OF_GPIO_SINGLE_ENDED = 0x2,
 	OF_GPIO_OPEN_DRAIN = 0x4,
 	OF_GPIO_TRANSITORY = 0x8,
+	OF_GPIO_PULL_UP = 0x10,
+	OF_GPIO_PULL_DOWN = 0x20,
 };
 
 #ifdef CONFIG_OF_GPIO
-- 
cgit v1.2.3


From b5c231d8c8037f63d34199ea1667bbe1cd9f940f Mon Sep 17 00:00:00 2001
From: Brian Masney <masneyb@onstation.org>
Date: Thu, 7 Feb 2019 21:16:22 -0500
Subject: genirq: introduce irq_domain_translate_twocell

Add a new function irq_domain_translate_twocell() that is to be used as
the translate function in struct irq_domain_ops for the v2 IRQ API.

This patch also changes irq_domain_xlate_twocell() from the v1 IRQ API
to call irq_domain_translate_twocell() in the v2 IRQ API. This required
changes to of_phandle_args_to_fwspec()'s arguments so that it can be
called from multiple places.

Cc: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Brian Masney <masneyb@onstation.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/irqdomain.h |  5 +++++
 kernel/irq/irqdomain.c    | 45 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 39 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 35965f41d7be..fcefe0c7263f 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -419,6 +419,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr,
 			const u32 *intspec, unsigned int intsize,
 			irq_hw_number_t *out_hwirq, unsigned int *out_type);
 
+int irq_domain_translate_twocell(struct irq_domain *d,
+				 struct irq_fwspec *fwspec,
+				 unsigned long *out_hwirq,
+				 unsigned int *out_type);
+
 /* IPI functions */
 int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest);
 int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8b0be4bd6565..56a30d542b8e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -729,16 +729,17 @@ static int irq_domain_translate(struct irq_domain *d,
 	return 0;
 }
 
-static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
+static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+				      unsigned int count,
 				      struct irq_fwspec *fwspec)
 {
 	int i;
 
-	fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL;
-	fwspec->param_count = irq_data->args_count;
+	fwspec->fwnode = np ? &np->fwnode : NULL;
+	fwspec->param_count = count;
 
-	for (i = 0; i < irq_data->args_count; i++)
-		fwspec->param[i] = irq_data->args[i];
+	for (i = 0; i < count; i++)
+		fwspec->param[i] = args[i];
 }
 
 unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
@@ -836,7 +837,9 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
 {
 	struct irq_fwspec fwspec;
 
-	of_phandle_args_to_fwspec(irq_data, &fwspec);
+	of_phandle_args_to_fwspec(irq_data->np, irq_data->args,
+				  irq_data->args_count, &fwspec);
+
 	return irq_create_fwspec_mapping(&fwspec);
 }
 EXPORT_SYMBOL_GPL(irq_create_of_mapping);
@@ -928,11 +931,10 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
 			const u32 *intspec, unsigned int intsize,
 			irq_hw_number_t *out_hwirq, unsigned int *out_type)
 {
-	if (WARN_ON(intsize < 2))
-		return -EINVAL;
-	*out_hwirq = intspec[0];
-	*out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
-	return 0;
+	struct irq_fwspec fwspec;
+
+	of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+	return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type);
 }
 EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
 
@@ -968,6 +970,27 @@ const struct irq_domain_ops irq_domain_simple_ops = {
 };
 EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
 
+/**
+ * irq_domain_translate_twocell() - Generic translate for direct two cell
+ * bindings
+ *
+ * Device Tree IRQ specifier translation function which works with two cell
+ * bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ */
+int irq_domain_translate_twocell(struct irq_domain *d,
+				 struct irq_fwspec *fwspec,
+				 unsigned long *out_hwirq,
+				 unsigned int *out_type)
+{
+	if (WARN_ON(fwspec->param_count < 2))
+		return -EINVAL;
+	*out_hwirq = fwspec->param[0];
+	*out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twocell);
+
 int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
 			   int node, const struct irq_affinity_desc *affinity)
 {
-- 
cgit v1.2.3


From 5aa5bd563ce041d931c0dc1fc436dd18c27c60a7 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 7 Feb 2019 21:16:23 -0500
Subject: genirq: introduce irq_chip_mask_ack_parent()

The hierarchical irqchip never before ran into a situation
where the parent is not "simple", i.e. does not implement
.irq_ack() and .irq_mask() like most, but the qcom-pm8xxx.c
happens to implement only .irq_mask_ack().

Since we want to make ssbi-gpio a hierarchical child of this
irqchip, it must *also* only implement .irq_mask_ack()
and call down to the parent, and for this we of course
need irq_chip_mask_ack_parent().

Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Brian Masney <masneyb@onstation.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/irq.h |  1 +
 kernel/irq/chip.c   | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index def2b2aac8b1..9a1a67d2e07d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -605,6 +605,7 @@ extern void irq_chip_disable_parent(struct irq_data *data);
 extern void irq_chip_ack_parent(struct irq_data *data);
 extern int irq_chip_retrigger_hierarchy(struct irq_data *data);
 extern void irq_chip_mask_parent(struct irq_data *data);
+extern void irq_chip_mask_ack_parent(struct irq_data *data);
 extern void irq_chip_unmask_parent(struct irq_data *data);
 extern void irq_chip_eoi_parent(struct irq_data *data);
 extern int irq_chip_set_affinity_parent(struct irq_data *data,
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 34e969069488..982b75e127c5 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1277,6 +1277,17 @@ void irq_chip_mask_parent(struct irq_data *data)
 }
 EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
 
+/**
+ * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt
+ * @data:	Pointer to interrupt specific data
+ */
+void irq_chip_mask_ack_parent(struct irq_data *data)
+{
+	data = data->parent_data;
+	data->chip->irq_mask_ack(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent);
+
 /**
  * irq_chip_unmask_parent - Unmask the parent interrupt
  * @data:	Pointer to interrupt specific data
-- 
cgit v1.2.3


From be3d960b0aeb97438a044d8d3ce75393d198c020 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Tue, 12 Feb 2019 17:28:58 +0100
Subject: dt-bindings: clk: add G12A AO Clock and Reset Bindings

Add bindings for the Amlogic G12A AO Clock and Reset controllers.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Acked-by: Jerome Brunet <jbrunet@baylibre.com>
Link: https://lkml.kernel.org/r/20190212162859.20743-2-narmstrong@baylibre.com
---
 .../bindings/clock/amlogic,gxbb-aoclkc.txt         |  1 +
 include/dt-bindings/clock/g12a-aoclkc.h            | 34 ++++++++++++++++++++++
 include/dt-bindings/reset/g12a-aoclkc.h            | 18 ++++++++++++
 3 files changed, 53 insertions(+)
 create mode 100644 include/dt-bindings/clock/g12a-aoclkc.h
 create mode 100644 include/dt-bindings/reset/g12a-aoclkc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/amlogic,gxbb-aoclkc.txt b/Documentation/devicetree/bindings/clock/amlogic,gxbb-aoclkc.txt
index 79511d7bb321..c41f0be5d438 100644
--- a/Documentation/devicetree/bindings/clock/amlogic,gxbb-aoclkc.txt
+++ b/Documentation/devicetree/bindings/clock/amlogic,gxbb-aoclkc.txt
@@ -10,6 +10,7 @@ Required Properties:
 	- GXL (S905X, S905D) : "amlogic,meson-gxl-aoclkc"
 	- GXM (S912) : "amlogic,meson-gxm-aoclkc"
 	- AXG (A113D, A113X) : "amlogic,meson-axg-aoclkc"
+	- G12A (S905X2, S905D2, S905Y2) : "amlogic,meson-g12a-aoclkc"
 	followed by the common "amlogic,meson-gx-aoclkc"
 - clocks: list of clock phandle, one for each entry clock-names.
 - clock-names: should contain the following:
diff --git a/include/dt-bindings/clock/g12a-aoclkc.h b/include/dt-bindings/clock/g12a-aoclkc.h
new file mode 100644
index 000000000000..8db01ffbeb06
--- /dev/null
+++ b/include/dt-bindings/clock/g12a-aoclkc.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2016 BayLibre, SAS
+ * Author: Neil Armstrong <narmstrong@baylibre.com>
+ *
+ * Copyright (c) 2018 Amlogic, inc.
+ * Author: Qiufang Dai <qiufang.dai@amlogic.com>
+ */
+
+#ifndef DT_BINDINGS_CLOCK_AMLOGIC_MESON_G12A_AOCLK
+#define DT_BINDINGS_CLOCK_AMLOGIC_MESON_G12A_AOCLK
+
+#define CLKID_AO_AHB		0
+#define CLKID_AO_IR_IN		1
+#define CLKID_AO_I2C_M0		2
+#define CLKID_AO_I2C_S0		3
+#define CLKID_AO_UART		4
+#define CLKID_AO_PROD_I2C	5
+#define CLKID_AO_UART2		6
+#define CLKID_AO_IR_OUT		7
+#define CLKID_AO_SAR_ADC	8
+#define CLKID_AO_MAILBOX	9
+#define CLKID_AO_M3		10
+#define CLKID_AO_AHB_SRAM	11
+#define CLKID_AO_RTI		12
+#define CLKID_AO_M4_FCLK	13
+#define CLKID_AO_M4_HCLK	14
+#define CLKID_AO_CLK81		15
+#define CLKID_AO_SAR_ADC_CLK	18
+#define CLKID_AO_32K		23
+#define CLKID_AO_CEC		27
+#define CLKID_AO_CTS_RTC_OSCIN	28
+
+#endif
diff --git a/include/dt-bindings/reset/g12a-aoclkc.h b/include/dt-bindings/reset/g12a-aoclkc.h
new file mode 100644
index 000000000000..bd2e2337135c
--- /dev/null
+++ b/include/dt-bindings/reset/g12a-aoclkc.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2016 BayLibre, SAS
+ * Author: Neil Armstrong <narmstrong@baylibre.com>
+ */
+
+#ifndef DT_BINDINGS_RESET_AMLOGIC_MESON_G12A_AOCLK
+#define DT_BINDINGS_RESET_AMLOGIC_MESON_G12A_AOCLK
+
+#define RESET_AO_IR_IN		0
+#define RESET_AO_UART		1
+#define RESET_AO_I2C_M		2
+#define RESET_AO_I2C_S		3
+#define RESET_AO_SAR_ADC	4
+#define RESET_AO_UART2		5
+#define RESET_AO_IR_OUT		6
+
+#endif
-- 
cgit v1.2.3


From 40d08f774c17add49c1db0083390b6a672030b7d Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Sun, 10 Feb 2019 23:26:02 +0100
Subject: dt-bindings: clock: meson8b: add APB clock definition

Commit 8e1dd17c8b0e3f ("dt-bindings: clock: meson8b: export the CPU
post dividers") added a clock with the name "ABP". The actual name of
this clock is "APB".

Add a new #define with the same ID but the correct name. The old #define
will be dropped in a follow-up patch because each commit in the tree
must compile on it's own (the old #define is still used by the clock
controller driver).

Fixes: 8e1dd17c8b0e3f ("dt-bindings: clock: meson8b: export the CPU post dividers")
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Acked-by: Jerome Brunet <jbrunet@baylibre.com>
Link: https://lkml.kernel.org/r/20190210222603.6404-2-martin.blumenstingl@googlemail.com
---
 include/dt-bindings/clock/meson8b-clkc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/meson8b-clkc.h b/include/dt-bindings/clock/meson8b-clkc.h
index 5fe2923382d0..8067077a62ca 100644
--- a/include/dt-bindings/clock/meson8b-clkc.h
+++ b/include/dt-bindings/clock/meson8b-clkc.h
@@ -104,6 +104,7 @@
 #define CLKID_MPLL2		95
 #define CLKID_NAND_CLK		112
 #define CLKID_ABP		124
+#define CLKID_APB		124
 #define CLKID_PERIPH		126
 #define CLKID_AXI		128
 #define CLKID_L2_DRAM		130
-- 
cgit v1.2.3


From d9a9ea94f748f47b1d75c6c5e33edcf74476c445 Mon Sep 17 00:00:00 2001
From: Chad Austin <chadaustin@fb.com>
Date: Mon, 7 Jan 2019 16:53:17 -0800
Subject: fuse: support clients that don't implement 'opendir'

Allow filesystems to return ENOSYS from opendir, preventing the kernel from
sending opendir and releasedir messages in the future. This avoids
userspace transitions when filesystems don't need to keep track of state
per directory handle.

A new capability flag, FUSE_NO_OPENDIR_SUPPORT, parallels
FUSE_NO_OPEN_SUPPORT, indicating the new semantics for returning ENOSYS
from opendir.

Signed-off-by: Chad Austin <chadaustin@fb.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/file.c            | 11 +++++++----
 fs/fuse/fuse_i.h          |  3 +++
 fs/fuse/inode.c           |  3 ++-
 include/uapi/linux/fuse.h |  7 ++++++-
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8ee0446a8322..cc6ffd23b80f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -90,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
 	if (refcount_dec_and_test(&ff->count)) {
 		struct fuse_req *req = ff->reserved_req;
 
-		if (ff->fc->no_open && !isdir) {
+		if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
 			/*
 			 * Drop the release request when client does not
 			 * implement 'open'
@@ -125,7 +125,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 
 	ff->fh = 0;
 	ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
-	if (!fc->no_open || isdir) {
+	if (isdir ? !fc->no_opendir : !fc->no_open) {
 		struct fuse_open_out outarg;
 		int err;
 
@@ -134,11 +134,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 			ff->fh = outarg.fh;
 			ff->open_flags = outarg.open_flags;
 
-		} else if (err != -ENOSYS || isdir) {
+		} else if (err != -ENOSYS) {
 			fuse_file_free(ff);
 			return err;
 		} else {
-			fc->no_open = 1;
+			if (isdir)
+				fc->no_opendir = 1;
+			else
+				fc->no_open = 1;
 		}
 	}
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 033e30af519f..0920c0c032a0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -630,6 +630,9 @@ struct fuse_conn {
 	/** Is open/release not implemented by fs? */
 	unsigned no_open:1;
 
+	/** Is opendir/releasedir not implemented by fs? */
+	unsigned no_opendir:1;
+
 	/** Is fsync not implemented by fs? */
 	unsigned no_fsync:1;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2bbb7c59d6da..1b3f3b67d9f0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -972,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
 		FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
 		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
-		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS;
+		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
+		FUSE_NO_OPENDIR_SUPPORT;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index b4967d48bfda..2ac598614a8f 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -122,6 +122,9 @@
  *  - add FOPEN_CACHE_DIR
  *  - add FUSE_MAX_PAGES, add max_pages to init_out
  *  - add FUSE_CACHE_SYMLINKS
+ *
+ *  7.29
+ *  - add FUSE_NO_OPENDIR_SUPPORT flag
  */
 
 #ifndef _LINUX_FUSE_H
@@ -157,7 +160,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 28
+#define FUSE_KERNEL_MINOR_VERSION 29
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -259,6 +262,7 @@ struct fuse_file_lock {
  * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED
  * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages
  * FUSE_CACHE_SYMLINKS: cache READLINK responses
+ * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -284,6 +288,7 @@ struct fuse_file_lock {
 #define FUSE_ABORT_ERROR	(1 << 21)
 #define FUSE_MAX_PAGES		(1 << 22)
 #define FUSE_CACHE_SYMLINKS	(1 << 23)
+#define FUSE_NO_OPENDIR_SUPPORT (1 << 24)
 
 /**
  * CUSE INIT request/reply flags
-- 
cgit v1.2.3


From 6f701383368d54e8d43fad2a0bac30f877899568 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:00 -0500
Subject: SUNRPC: Display symbolic flag names in RPC trace events

Human-readable flags make it easier to observe RPC scheduling
decisions and other operational details.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/sunrpc.h | 56 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 0d5d0d91f861..f88b0f52aa7e 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -77,6 +77,50 @@ TRACE_EVENT(rpc_request,
 		)
 );
 
+TRACE_DEFINE_ENUM(RPC_TASK_ASYNC);
+TRACE_DEFINE_ENUM(RPC_TASK_SWAPPER);
+TRACE_DEFINE_ENUM(RPC_CALL_MAJORSEEN);
+TRACE_DEFINE_ENUM(RPC_TASK_ROOTCREDS);
+TRACE_DEFINE_ENUM(RPC_TASK_DYNAMIC);
+TRACE_DEFINE_ENUM(RPC_TASK_KILLED);
+TRACE_DEFINE_ENUM(RPC_TASK_SOFT);
+TRACE_DEFINE_ENUM(RPC_TASK_SOFTCONN);
+TRACE_DEFINE_ENUM(RPC_TASK_SENT);
+TRACE_DEFINE_ENUM(RPC_TASK_TIMEOUT);
+TRACE_DEFINE_ENUM(RPC_TASK_NOCONNECT);
+TRACE_DEFINE_ENUM(RPC_TASK_NO_RETRANS_TIMEOUT);
+
+#define rpc_show_task_flags(flags)					\
+	__print_flags(flags, "|",					\
+		{ RPC_TASK_ASYNC, "ASYNC" },				\
+		{ RPC_TASK_SWAPPER, "SWAPPER" },			\
+		{ RPC_CALL_MAJORSEEN, "MAJORSEEN" },			\
+		{ RPC_TASK_ROOTCREDS, "ROOTCREDS" },			\
+		{ RPC_TASK_DYNAMIC, "DYNAMIC" },			\
+		{ RPC_TASK_KILLED, "KILLED" },				\
+		{ RPC_TASK_SOFT, "SOFT" },				\
+		{ RPC_TASK_SOFTCONN, "SOFTCONN" },			\
+		{ RPC_TASK_SENT, "SENT" },				\
+		{ RPC_TASK_TIMEOUT, "TIMEOUT" },			\
+		{ RPC_TASK_NOCONNECT, "NOCONNECT" },			\
+		{ RPC_TASK_NO_RETRANS_TIMEOUT, "NORTO" })
+
+TRACE_DEFINE_ENUM(RPC_TASK_RUNNING);
+TRACE_DEFINE_ENUM(RPC_TASK_QUEUED);
+TRACE_DEFINE_ENUM(RPC_TASK_ACTIVE);
+TRACE_DEFINE_ENUM(RPC_TASK_NEED_XMIT);
+TRACE_DEFINE_ENUM(RPC_TASK_NEED_RECV);
+TRACE_DEFINE_ENUM(RPC_TASK_MSG_PIN_WAIT);
+
+#define rpc_show_runstate(flags)					\
+	__print_flags(flags, "|",					\
+		{ (1UL << RPC_TASK_RUNNING), "RUNNING" },		\
+		{ (1UL << RPC_TASK_QUEUED), "QUEUED" },			\
+		{ (1UL << RPC_TASK_ACTIVE), "ACTIVE" },			\
+		{ (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" },		\
+		{ (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" },		\
+		{ (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" })
+
 DECLARE_EVENT_CLASS(rpc_task_running,
 
 	TP_PROTO(const struct rpc_task *task, const void *action),
@@ -102,10 +146,10 @@ DECLARE_EVENT_CLASS(rpc_task_running,
 		__entry->flags = task->tk_flags;
 		),
 
-	TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d action=%pf",
+	TP_printk("task:%u@%d flags=%s runstate=%s status=%d action=%pf",
 		__entry->task_id, __entry->client_id,
-		__entry->flags,
-		__entry->runstate,
+		rpc_show_task_flags(__entry->flags),
+		rpc_show_runstate(__entry->runstate),
 		__entry->status,
 		__entry->action
 		)
@@ -149,10 +193,10 @@ DECLARE_EVENT_CLASS(rpc_task_queued,
 		__assign_str(q_name, rpc_qname(q));
 		),
 
-	TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s",
+	TP_printk("task:%u@%d flags=%s runstate=%s status=%d timeout=%lu queue=%s",
 		__entry->task_id, __entry->client_id,
-		__entry->flags,
-		__entry->runstate,
+		rpc_show_task_flags(__entry->flags),
+		rpc_show_runstate(__entry->runstate),
 		__entry->status,
 		__entry->timeout,
 		__get_str(q_name)
-- 
cgit v1.2.3


From ebb09b33c60c46fd4f7ffa0af9e693eebe765d1b Mon Sep 17 00:00:00 2001
From: Leonid Ravich <lravich@gmail.com>
Date: Tue, 12 Feb 2019 22:09:28 +0200
Subject: NTB: add new parameter to peer_db_addr() db_bit and db_data

NTB door bell usage depends on NTB hardware.

ex: intel NTB gen1 has one peer door bell register which can be controlled
by the bitmap writen to it, while Intel NTB gen3 has a registers
per door bell and the data trigering the each door bell is always 1.

therefore exposing only peer door bell address forcing the user
to be aware of such low level details

Signed-off-by: Leonid Ravich <Leonid.Ravich@emc.com>
Acked-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Allen Hubbe <allenbh@gmail.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/ntb/hw/intel/ntb_hw_gen1.c     | 25 +++++++++++++++++++------
 drivers/ntb/hw/intel/ntb_hw_gen1.h     |  5 +++--
 drivers/ntb/hw/intel/ntb_hw_gen3.c     | 33 ++++++++++++++++++++++++++++++++-
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c |  9 ++++++++-
 include/linux/ntb.h                    | 10 +++++++---
 5 files changed, 69 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.c b/drivers/ntb/hw/intel/ntb_hw_gen1.c
index 2ad263f708da..bb57ec239029 100644
--- a/drivers/ntb/hw/intel/ntb_hw_gen1.c
+++ b/drivers/ntb/hw/intel/ntb_hw_gen1.c
@@ -180,7 +180,7 @@ int ndev_mw_to_bar(struct intel_ntb_dev *ndev, int idx)
 	return ndev->reg->mw_bar[idx];
 }
 
-static inline int ndev_db_addr(struct intel_ntb_dev *ndev,
+void ndev_db_addr(struct intel_ntb_dev *ndev,
 			       phys_addr_t *db_addr, resource_size_t *db_size,
 			       phys_addr_t reg_addr, unsigned long reg)
 {
@@ -196,8 +196,6 @@ static inline int ndev_db_addr(struct intel_ntb_dev *ndev,
 		*db_size = ndev->reg->db_size;
 		dev_dbg(&ndev->ntb.pdev->dev, "Peer db size %llx\n", *db_size);
 	}
-
-	return 0;
 }
 
 u64 ndev_db_read(struct intel_ntb_dev *ndev,
@@ -1111,13 +1109,28 @@ int intel_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
 				  ndev->self_reg->db_mask);
 }
 
-int intel_ntb_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr,
-			   resource_size_t *db_size)
+static int intel_ntb_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr,
+			   resource_size_t *db_size, u64 *db_data, int db_bit)
 {
+	u64 db_bits;
 	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	return ndev_db_addr(ndev, db_addr, db_size, ndev->peer_addr,
+	if (unlikely(db_bit >= BITS_PER_LONG_LONG))
+		return -EINVAL;
+
+	db_bits = BIT_ULL(db_bit);
+
+	if (unlikely(db_bits & ~ntb_ndev(ntb)->db_valid_mask))
+		return -EINVAL;
+
+	ndev_db_addr(ndev, db_addr, db_size, ndev->peer_addr,
 			    ndev->peer_reg->db_bell);
+
+	if (db_data)
+		*db_data = db_bits;
+
+
+	return 0;
 }
 
 static int intel_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.h b/drivers/ntb/hw/intel/ntb_hw_gen1.h
index ad8ec1444436..544cf5c06f4d 100644
--- a/drivers/ntb/hw/intel/ntb_hw_gen1.h
+++ b/drivers/ntb/hw/intel/ntb_hw_gen1.h
@@ -147,6 +147,9 @@ extern struct intel_b2b_addr xeon_b2b_dsd_addr;
 int ndev_init_isr(struct intel_ntb_dev *ndev, int msix_min, int msix_max,
 		int msix_shift, int total_shift);
 enum ntb_topo xeon_ppd_topo(struct intel_ntb_dev *ndev, u8 ppd);
+void ndev_db_addr(struct intel_ntb_dev *ndev,
+				phys_addr_t *db_addr, resource_size_t *db_size,
+				phys_addr_t reg_addr, unsigned long reg);
 u64 ndev_db_read(struct intel_ntb_dev *ndev, void __iomem *mmio);
 int ndev_db_write(struct intel_ntb_dev *ndev, u64 db_bits,
 				void __iomem *mmio);
@@ -166,8 +169,6 @@ int intel_ntb_db_vector_count(struct ntb_dev *ntb);
 u64 intel_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector);
 int intel_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits);
 int intel_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits);
-int intel_ntb_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr,
-		resource_size_t *db_size);
 int intel_ntb_spad_is_unsafe(struct ntb_dev *ntb);
 int intel_ntb_spad_count(struct ntb_dev *ntb);
 u32 intel_ntb_spad_read(struct ntb_dev *ntb, int idx);
diff --git a/drivers/ntb/hw/intel/ntb_hw_gen3.c b/drivers/ntb/hw/intel/ntb_hw_gen3.c
index b3fa24778f94..f475b56a3f49 100644
--- a/drivers/ntb/hw/intel/ntb_hw_gen3.c
+++ b/drivers/ntb/hw/intel/ntb_hw_gen3.c
@@ -532,6 +532,37 @@ static int intel_ntb3_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx,
 	return 0;
 }
 
+int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr,
+				resource_size_t *db_size,
+				u64 *db_data, int db_bit)
+{
+	phys_addr_t db_addr_base;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
+
+	if (unlikely(db_bit >= BITS_PER_LONG_LONG))
+		return -EINVAL;
+
+	if (unlikely(BIT_ULL(db_bit) & ~ntb_ndev(ntb)->db_valid_mask))
+		return -EINVAL;
+
+	ndev_db_addr(ndev, &db_addr_base, db_size, ndev->peer_addr,
+				ndev->peer_reg->db_bell);
+
+	if (db_addr) {
+		*db_addr = db_addr_base + (db_bit * 4);
+		dev_dbg(&ndev->ntb.pdev->dev, "Peer db addr %llx db bit %d\n",
+				*db_addr, db_bit);
+	}
+
+	if (db_data) {
+		*db_data = 1;
+		dev_dbg(&ndev->ntb.pdev->dev, "Peer db data %llx db bit %d\n",
+				*db_data, db_bit);
+	}
+
+	return 0;
+}
+
 static int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
 {
 	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
@@ -584,7 +615,7 @@ const struct ntb_dev_ops intel_ntb3_ops = {
 	.db_clear		= intel_ntb3_db_clear,
 	.db_set_mask		= intel_ntb_db_set_mask,
 	.db_clear_mask		= intel_ntb_db_clear_mask,
-	.peer_db_addr		= intel_ntb_peer_db_addr,
+	.peer_db_addr		= intel_ntb3_peer_db_addr,
 	.peer_db_set		= intel_ntb3_peer_db_set,
 	.spad_is_unsafe		= intel_ntb_spad_is_unsafe,
 	.spad_count		= intel_ntb_spad_count,
diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index f6f00354047b..9ae944597708 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -710,11 +710,16 @@ static u64 switchtec_ntb_db_read_mask(struct ntb_dev *ntb)
 
 static int switchtec_ntb_peer_db_addr(struct ntb_dev *ntb,
 				      phys_addr_t *db_addr,
-				      resource_size_t *db_size)
+				      resource_size_t *db_size,
+				      u64 *db_data,
+				      int db_bit)
 {
 	struct switchtec_ntb *sndev = ntb_sndev(ntb);
 	unsigned long offset;
 
+	if (unlikely(db_bit >= BITS_PER_LONG_LONG))
+		return -EINVAL;
+
 	offset = (unsigned long)sndev->mmio_peer_dbmsg->odb -
 		(unsigned long)sndev->stdev->mmio;
 
@@ -724,6 +729,8 @@ static int switchtec_ntb_peer_db_addr(struct ntb_dev *ntb,
 		*db_addr = pci_resource_start(ntb->pdev, 0) + offset;
 	if (db_size)
 		*db_size = sizeof(u32);
+	if (db_data)
+		*db_data = BIT_ULL(db_bit) << sndev->db_peer_shift;
 
 	return 0;
 }
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 181d16601dd9..56a92e3ae3ae 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -296,7 +296,8 @@ struct ntb_dev_ops {
 	int (*db_clear_mask)(struct ntb_dev *ntb, u64 db_bits);
 
 	int (*peer_db_addr)(struct ntb_dev *ntb,
-			    phys_addr_t *db_addr, resource_size_t *db_size);
+			    phys_addr_t *db_addr, resource_size_t *db_size,
+				u64 *db_data, int db_bit);
 	u64 (*peer_db_read)(struct ntb_dev *ntb);
 	int (*peer_db_set)(struct ntb_dev *ntb, u64 db_bits);
 	int (*peer_db_clear)(struct ntb_dev *ntb, u64 db_bits);
@@ -1078,6 +1079,8 @@ static inline int ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
  * @ntb:	NTB device context.
  * @db_addr:	OUT - The address of the peer doorbell register.
  * @db_size:	OUT - The number of bytes to write the peer doorbell register.
+ * @db_data:	OUT - The data of peer doorbell register
+ * @db_bit:		door bell bit number
  *
  * Return the address of the peer doorbell register.  This may be used, for
  * example, by drivers that offload memory copy operations to a dma engine.
@@ -1091,12 +1094,13 @@ static inline int ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
  */
 static inline int ntb_peer_db_addr(struct ntb_dev *ntb,
 				   phys_addr_t *db_addr,
-				   resource_size_t *db_size)
+				   resource_size_t *db_size,
+				   u64 *db_data, int db_bit)
 {
 	if (!ntb->ops->peer_db_addr)
 		return -EINVAL;
 
-	return ntb->ops->peer_db_addr(ntb, db_addr, db_size);
+	return ntb->ops->peer_db_addr(ntb, db_addr, db_size, db_data, db_bit);
 }
 
 /**
-- 
cgit v1.2.3


From 0ccc61b1c76e5163c6fea6cf83bd18e7ea244c5b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:05 -0500
Subject: SUNRPC: Add xdr_stream::rqst field

Having access to the controlling rpc_rqst means a trace point in the
XDR code can report:

 - the XID
 - the task ID and client ID
 - the p_name of RPC being processed

Subsequent patches will introduce such trace points.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/callback_xdr.c                  |  5 +++--
 fs/nfs/flexfilelayout/flexfilelayout.c |  2 +-
 include/linux/sunrpc/xdr.h             |  8 ++++++--
 net/sunrpc/auth.c                      |  4 ++--
 net/sunrpc/auth_gss/auth_gss.c         |  4 ++--
 net/sunrpc/xdr.c                       | 12 +++++++++---
 net/sunrpc/xprtrdma/backchannel.c      |  2 +-
 net/sunrpc/xprtrdma/rpc_rdma.c         |  4 ++--
 8 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a87a56273407..bc7c1766a2be 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -943,10 +943,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
 	};
 	unsigned int nops = 0;
 
-	xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
+	xdr_init_decode(&xdr_in, &rqstp->rq_arg,
+			rqstp->rq_arg.head[0].iov_base, NULL);
 
 	p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
-	xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
+	xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL);
 
 	status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
 	if (status == htonl(NFS4ERR_RESOURCE))
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 63abe705f4ca..32701b6a9566 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -2036,7 +2036,7 @@ ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
 
 	dprintk("%s: Begin\n", __func__);
 
-	xdr_init_encode(&tmp_xdr, &tmp_buf, NULL);
+	xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
 
 	ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
 	ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 2ec128060239..787939d13643 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -217,6 +217,8 @@ struct xdr_stream {
 	struct kvec scratch;	/* Scratch buffer */
 	struct page **page_ptr;	/* pointer to the current page */
 	unsigned int nwords;	/* Remaining decode buffer length */
+
+	struct rpc_rqst *rqst;	/* For debugging */
 };
 
 /*
@@ -227,7 +229,8 @@ typedef void	(*kxdreproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 typedef int	(*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		void *obj);
 
-extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf,
+			    __be32 *p, struct rpc_rqst *rqst);
 extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
 extern void xdr_commit_encode(struct xdr_stream *xdr);
 extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len);
@@ -235,7 +238,8 @@ extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen);
 extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
 		unsigned int base, unsigned int len);
 extern unsigned int xdr_stream_pos(const struct xdr_stream *xdr);
-extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf,
+			    __be32 *p, struct rpc_rqst *rqst);
 extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
 		struct page **pages, unsigned int len);
 extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index f3023bbc0b7f..8dfab6119e6a 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -798,7 +798,7 @@ static void rpcauth_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
 {
 	struct xdr_stream xdr;
 
-	xdr_init_encode(&xdr, &rqstp->rq_snd_buf, data);
+	xdr_init_encode(&xdr, &rqstp->rq_snd_buf, data, rqstp);
 	encode(rqstp, &xdr, obj);
 }
 
@@ -823,7 +823,7 @@ rpcauth_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
 {
 	struct xdr_stream xdr;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, data);
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, data, rqstp);
 	return decode(rqstp, &xdr, obj);
 }
 
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 1531b0219344..a42672e81792 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1722,7 +1722,7 @@ static void gss_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
 {
 	struct xdr_stream xdr;
 
-	xdr_init_encode(&xdr, &rqstp->rq_snd_buf, p);
+	xdr_init_encode(&xdr, &rqstp->rq_snd_buf, p, rqstp);
 	encode(rqstp, &xdr, obj);
 }
 
@@ -1998,7 +1998,7 @@ gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
 {
 	struct xdr_stream xdr;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p, rqstp);
 	return decode(rqstp, &xdr, obj);
 }
 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index f302c6eb8779..345f08b634ee 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -483,6 +483,7 @@ EXPORT_SYMBOL_GPL(xdr_stream_pos);
  * @xdr: pointer to xdr_stream struct
  * @buf: pointer to XDR buffer in which to encode data
  * @p: current pointer inside XDR buffer
+ * @rqst: pointer to controlling rpc_rqst, for debugging
  *
  * Note: at the moment the RPC client only passes the length of our
  *	 scratch buffer in the xdr_buf's header kvec. Previously this
@@ -491,7 +492,8 @@ EXPORT_SYMBOL_GPL(xdr_stream_pos);
  *	 of the buffer length, and takes care of adjusting the kvec
  *	 length for us.
  */
-void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
+		     struct rpc_rqst *rqst)
 {
 	struct kvec *iov = buf->head;
 	int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
@@ -513,6 +515,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
 		buf->len += len;
 		iov->iov_len += len;
 	}
+	xdr->rqst = rqst;
 }
 EXPORT_SYMBOL_GPL(xdr_init_encode);
 
@@ -819,8 +822,10 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr)
  * @xdr: pointer to xdr_stream struct
  * @buf: pointer to XDR buffer from which to decode data
  * @p: current pointer inside XDR buffer
+ * @rqst: pointer to controlling rpc_rqst, for debugging
  */
-void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
+		     struct rpc_rqst *rqst)
 {
 	xdr->buf = buf;
 	xdr->scratch.iov_base = NULL;
@@ -836,6 +841,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
 		xdr->nwords -= p - xdr->p;
 		xdr->p = p;
 	}
+	xdr->rqst = rqst;
 }
 EXPORT_SYMBOL_GPL(xdr_init_decode);
 
@@ -854,7 +860,7 @@ void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
 	buf->page_len =  len;
 	buf->buflen =  len;
 	buf->len = len;
-	xdr_init_decode(xdr, buf, NULL);
+	xdr_init_decode(xdr, buf, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
 
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 0de9b3e63770..98c1e43eb7b1 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -123,7 +123,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 
 	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
 	xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf,
-			req->rl_rdmabuf->rg_base);
+			req->rl_rdmabuf->rg_base, rqst);
 
 	p = xdr_reserve_space(&req->rl_stream, 28);
 	if (unlikely(!p))
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 7774aee7c013..6c1fb270f127 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -748,7 +748,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 
 	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
 	xdr_init_encode(xdr, &req->rl_hdrbuf,
-			req->rl_rdmabuf->rg_base);
+			req->rl_rdmabuf->rg_base, rqst);
 
 	/* Fixed header fields */
 	ret = -EMSGSIZE;
@@ -1329,7 +1329,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 
 	/* Fixed transport header fields */
 	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
-			rep->rr_hdrbuf.head[0].iov_base);
+			rep->rr_hdrbuf.head[0].iov_base, NULL);
 	p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
 	if (unlikely(!p))
 		goto out_shortreply;
-- 
cgit v1.2.3


From 5582863f450ca44c472dbf4812a62f4e0e68b0c8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:10 -0500
Subject: SUNRPC: Add XDR overflow trace event

This can help field troubleshooting without needing the overhead of
a full network capture (ie, tcpdump).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/sunrpc.h | 67 +++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xdr.c              | 24 +++++++++++-----
 2 files changed, 84 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index f88b0f52aa7e..fbc41b8142d3 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -254,6 +254,73 @@ TRACE_EVENT(rpc_stats_latency,
 		__entry->backlog, __entry->rtt, __entry->execute)
 );
 
+TRACE_EVENT(rpc_xdr_overflow,
+	TP_PROTO(
+		const struct xdr_stream *xdr,
+		size_t requested
+	),
+
+	TP_ARGS(xdr, requested),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(int, version)
+		__field(size_t, requested)
+		__field(const void *, end)
+		__field(const void *, p)
+		__field(const void *, head_base)
+		__field(size_t, head_len)
+		__field(const void *, tail_base)
+		__field(size_t, tail_len)
+		__field(unsigned int, page_len)
+		__field(unsigned int, len)
+		__string(progname,
+			 xdr->rqst->rq_task->tk_client->cl_program->name)
+		__string(procedure,
+			 xdr->rqst->rq_task->tk_msg.rpc_proc->p_name)
+	),
+
+	TP_fast_assign(
+		if (xdr->rqst) {
+			const struct rpc_task *task = xdr->rqst->rq_task;
+
+			__entry->task_id = task->tk_pid;
+			__entry->client_id = task->tk_client->cl_clid;
+			__assign_str(progname,
+				     task->tk_client->cl_program->name)
+			__entry->version = task->tk_client->cl_vers;
+			__assign_str(procedure, task->tk_msg.rpc_proc->p_name)
+		} else {
+			__entry->task_id = 0;
+			__entry->client_id = 0;
+			__assign_str(progname, "unknown")
+			__entry->version = 0;
+			__assign_str(procedure, "unknown")
+		}
+		__entry->requested = requested;
+		__entry->end = xdr->end;
+		__entry->p = xdr->p;
+		__entry->head_base = xdr->buf->head[0].iov_base,
+		__entry->head_len = xdr->buf->head[0].iov_len,
+		__entry->page_len = xdr->buf->page_len,
+		__entry->tail_base = xdr->buf->tail[0].iov_base,
+		__entry->tail_len = xdr->buf->tail[0].iov_len,
+		__entry->len = xdr->buf->len;
+	),
+
+	TP_printk(
+		"task:%u@%u %sv%d %s requested=%zu p=%p end=%p xdr=[%p,%zu]/%u/[%p,%zu]/%u\n",
+		__entry->task_id, __entry->client_id,
+		__get_str(progname), __entry->version, __get_str(procedure),
+		__entry->requested, __entry->p, __entry->end,
+		__entry->head_base, __entry->head_len,
+		__entry->page_len,
+		__entry->tail_base, __entry->tail_len,
+		__entry->len
+	)
+);
+
 /*
  * First define the enums in the below macros to be exported to userspace
  * via TRACE_DEFINE_ENUM().
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 345f08b634ee..6d0b615a02ae 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -16,6 +16,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/bvec.h>
+#include <trace/events/sunrpc.h>
 
 /*
  * XDR functions for basic NFS types
@@ -554,9 +555,9 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
 	int frag1bytes, frag2bytes;
 
 	if (nbytes > PAGE_SIZE)
-		return NULL; /* Bigger buffers require special handling */
+		goto out_overflow; /* Bigger buffers require special handling */
 	if (xdr->buf->len + nbytes > xdr->buf->buflen)
-		return NULL; /* Sorry, we're totally out of space */
+		goto out_overflow; /* Sorry, we're totally out of space */
 	frag1bytes = (xdr->end - xdr->p) << 2;
 	frag2bytes = nbytes - frag1bytes;
 	if (xdr->iov)
@@ -585,6 +586,9 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
 	xdr->buf->page_len += frag2bytes;
 	xdr->buf->len += nbytes;
 	return p;
+out_overflow:
+	trace_rpc_xdr_overflow(xdr, nbytes);
+	return NULL;
 }
 
 /**
@@ -902,20 +906,23 @@ static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
 	size_t cplen = (char *)xdr->end - (char *)xdr->p;
 
 	if (nbytes > xdr->scratch.iov_len)
-		return NULL;
+		goto out_overflow;
 	p = __xdr_inline_decode(xdr, cplen);
 	if (p == NULL)
 		return NULL;
 	memcpy(cpdest, p, cplen);
+	if (!xdr_set_next_buffer(xdr))
+		goto out_overflow;
 	cpdest += cplen;
 	nbytes -= cplen;
-	if (!xdr_set_next_buffer(xdr))
-		return NULL;
 	p = __xdr_inline_decode(xdr, nbytes);
 	if (p == NULL)
 		return NULL;
 	memcpy(cpdest, p, nbytes);
 	return xdr->scratch.iov_base;
+out_overflow:
+	trace_rpc_xdr_overflow(xdr, nbytes);
+	return NULL;
 }
 
 /**
@@ -932,14 +939,17 @@ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
 {
 	__be32 *p;
 
-	if (nbytes == 0)
+	if (unlikely(nbytes == 0))
 		return xdr->p;
 	if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
-		return NULL;
+		goto out_overflow;
 	p = __xdr_inline_decode(xdr, nbytes);
 	if (p != NULL)
 		return p;
 	return xdr_copy_to_scratch(xdr, nbytes);
+out_overflow:
+	trace_rpc_xdr_overflow(xdr, nbytes);
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(xdr_inline_decode);
 
-- 
cgit v1.2.3


From 7be9cea3600ba24f45733f823cb4ecb5a40f5db7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:16 -0500
Subject: SUNRPC: Add trace event that reports reply page vector alignment

We don't want READ payloads that are partially in the head iovec and
in the page buffer because this requires pull-up, which can be
expensive.

The NFS/RPC client tries hard to predict the size of the head iovec
so that the incoming READ data payload lands only in the page
vector, but it doesn't always get it right. To help diagnose such
problems, add a trace point in the logic that decodes READ-like
operations that reports whether pull-up is being done.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/sunrpc.h | 59 +++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xdr.c              | 33 +++++++++++++++++++-----
 2 files changed, 86 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index fbc41b8142d3..627650800676 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -321,6 +321,65 @@ TRACE_EVENT(rpc_xdr_overflow,
 	)
 );
 
+TRACE_EVENT(rpc_xdr_alignment,
+	TP_PROTO(
+		const struct xdr_stream *xdr,
+		size_t offset,
+		unsigned int copied
+	),
+
+	TP_ARGS(xdr, offset, copied),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(int, version)
+		__field(size_t, offset)
+		__field(unsigned int, copied)
+		__field(const void *, head_base)
+		__field(size_t, head_len)
+		__field(const void *, tail_base)
+		__field(size_t, tail_len)
+		__field(unsigned int, page_len)
+		__field(unsigned int, len)
+		__string(progname,
+			 xdr->rqst->rq_task->tk_client->cl_program->name)
+		__string(procedure,
+			 xdr->rqst->rq_task->tk_msg.rpc_proc->p_name)
+	),
+
+	TP_fast_assign(
+		const struct rpc_task *task = xdr->rqst->rq_task;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__assign_str(progname,
+			     task->tk_client->cl_program->name)
+		__entry->version = task->tk_client->cl_vers;
+		__assign_str(procedure, task->tk_msg.rpc_proc->p_name)
+
+		__entry->offset = offset;
+		__entry->copied = copied;
+		__entry->head_base = xdr->buf->head[0].iov_base,
+		__entry->head_len = xdr->buf->head[0].iov_len,
+		__entry->page_len = xdr->buf->page_len,
+		__entry->tail_base = xdr->buf->tail[0].iov_base,
+		__entry->tail_len = xdr->buf->tail[0].iov_len,
+		__entry->len = xdr->buf->len;
+	),
+
+	TP_printk(
+		"task:%u@%u %sv%d %s offset=%zu copied=%u xdr=[%p,%zu]/%u/[%p,%zu]/%u\n",
+		__entry->task_id, __entry->client_id,
+		__get_str(progname), __entry->version, __get_str(procedure),
+		__entry->offset, __entry->copied,
+		__entry->head_base, __entry->head_len,
+		__entry->page_len,
+		__entry->tail_base, __entry->tail_len,
+		__entry->len
+	)
+);
+
 /*
  * First define the enums in the below macros to be exported to userspace
  * via TRACE_DEFINE_ENUM().
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 6d0b615a02ae..5f0aa53fa4ae 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -347,13 +347,15 @@ EXPORT_SYMBOL_GPL(_copy_from_pages);
  * 'len' bytes. The extra data is not lost, but is instead
  * moved into the inlined pages and/or the tail.
  */
-static void
+static unsigned int
 xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
 {
 	struct kvec *head, *tail;
 	size_t copy, offs;
 	unsigned int pglen = buf->page_len;
+	unsigned int result;
 
+	result = 0;
 	tail = buf->tail;
 	head = buf->head;
 
@@ -367,6 +369,7 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
 			copy = tail->iov_len - len;
 			memmove((char *)tail->iov_base + len,
 					tail->iov_base, copy);
+			result += copy;
 		}
 		/* Copy from the inlined pages into the tail */
 		copy = len;
@@ -377,11 +380,13 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
 			copy = 0;
 		else if (copy > tail->iov_len - offs)
 			copy = tail->iov_len - offs;
-		if (copy != 0)
+		if (copy != 0) {
 			_copy_from_pages((char *)tail->iov_base + offs,
 					buf->pages,
 					buf->page_base + pglen + offs - len,
 					copy);
+			result += copy;
+		}
 		/* Do we also need to copy data from the head into the tail ? */
 		if (len > pglen) {
 			offs = copy = len - pglen;
@@ -391,6 +396,7 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
 					(char *)head->iov_base +
 					head->iov_len - offs,
 					copy);
+			result += copy;
 		}
 	}
 	/* Now handle pages */
@@ -406,12 +412,15 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
 		_copy_to_pages(buf->pages, buf->page_base,
 				(char *)head->iov_base + head->iov_len - len,
 				copy);
+		result += copy;
 	}
 	head->iov_len -= len;
 	buf->buflen -= len;
 	/* Have we truncated the message? */
 	if (buf->len > buf->buflen)
 		buf->len = buf->buflen;
+
+	return result;
 }
 
 /**
@@ -423,14 +432,16 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
  * 'len' bytes. The extra data is not lost, but is instead
  * moved into the tail.
  */
-static void
+static unsigned int
 xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
 {
 	struct kvec *tail;
 	size_t copy;
 	unsigned int pglen = buf->page_len;
 	unsigned int tailbuf_len;
+	unsigned int result;
 
+	result = 0;
 	tail = buf->tail;
 	BUG_ON (len > pglen);
 
@@ -448,18 +459,22 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
 		if (tail->iov_len > len) {
 			char *p = (char *)tail->iov_base + len;
 			memmove(p, tail->iov_base, tail->iov_len - len);
+			result += tail->iov_len - len;
 		} else
 			copy = tail->iov_len;
 		/* Copy from the inlined pages into the tail */
 		_copy_from_pages((char *)tail->iov_base,
 				buf->pages, buf->page_base + pglen - len,
 				copy);
+		result += copy;
 	}
 	buf->page_len -= len;
 	buf->buflen -= len;
 	/* Have we truncated the message? */
 	if (buf->len > buf->buflen)
 		buf->len = buf->buflen;
+
+	return result;
 }
 
 void
@@ -959,13 +974,17 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 	struct kvec *iov;
 	unsigned int nwords = XDR_QUADLEN(len);
 	unsigned int cur = xdr_stream_pos(xdr);
+	unsigned int copied, offset;
 
 	if (xdr->nwords == 0)
 		return 0;
+
 	/* Realign pages to current pointer position */
-	iov  = buf->head;
+	iov = buf->head;
 	if (iov->iov_len > cur) {
-		xdr_shrink_bufhead(buf, iov->iov_len - cur);
+		offset = iov->iov_len - cur;
+		copied = xdr_shrink_bufhead(buf, offset);
+		trace_rpc_xdr_alignment(xdr, offset, copied);
 		xdr->nwords = XDR_QUADLEN(buf->len - cur);
 	}
 
@@ -977,7 +996,9 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 		len = buf->page_len;
 	else if (nwords < xdr->nwords) {
 		/* Truncate page data and move it into the tail */
-		xdr_shrink_pagelen(buf, buf->page_len - len);
+		offset = buf->page_len - len;
+		copied = xdr_shrink_pagelen(buf, offset);
+		trace_rpc_xdr_alignment(xdr, offset, copied);
 		xdr->nwords = XDR_QUADLEN(buf->len - cur);
 	}
 	return len;
-- 
cgit v1.2.3


From 347cb6af8710b72cf9685fdc09d07873cf42d51f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 7 Jan 2019 13:36:20 -0500
Subject: dma-mapping: add a kconfig symbol for arch_setup_dma_ops availability

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Paul Burton <paul.burton@mips.com> # MIPS
Acked-by: Catalin Marinas <catalin.marinas@arm.com> # arm64
---
 arch/arc/Kconfig                     |  1 +
 arch/arc/include/asm/Kbuild          |  1 +
 arch/arc/include/asm/dma-mapping.h   | 13 -------------
 arch/arm/Kconfig                     |  1 +
 arch/arm/include/asm/dma-mapping.h   |  4 ----
 arch/arm64/Kconfig                   |  1 +
 arch/arm64/include/asm/dma-mapping.h |  4 ----
 arch/mips/Kconfig                    |  1 +
 arch/mips/include/asm/dma-mapping.h  | 10 ----------
 arch/mips/mm/dma-noncoherent.c       |  8 ++++++++
 include/linux/dma-mapping.h          | 12 ++++++++----
 kernel/dma/Kconfig                   |  3 +++
 12 files changed, 24 insertions(+), 35 deletions(-)
 delete mode 100644 arch/arc/include/asm/dma-mapping.h

(limited to 'include')

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 376366a7db81..2ab27d88eb1c 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -11,6 +11,7 @@ config ARC
 	select ARC_TIMERS
 	select ARCH_HAS_DMA_COHERENT_TO_PFN
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index caa270261521..b41f8881ecc8 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -3,6 +3,7 @@ generic-y += bugs.h
 generic-y += compat.h
 generic-y += device.h
 generic-y += div64.h
+generic-y += dma-mapping.h
 generic-y += emergency-restart.h
 generic-y += extable.h
 generic-y += ftrace.h
diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
deleted file mode 100644
index c946c0a83e76..000000000000
--- a/arch/arc/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier:  GPL-2.0
-// (C) 2018 Synopsys, Inc. (www.synopsys.com)
-
-#ifndef ASM_ARC_DMA_MAPPING_H
-#define ASM_ARC_DMA_MAPPING_H
-
-#include <asm-generic/dma-mapping.h>
-
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			const struct iommu_ops *iommu, bool coherent);
-#define arch_setup_dma_ops arch_setup_dma_ops
-
-#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 664e918e2624..c1cf44f00870 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -12,6 +12,7 @@ config ARM
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
 	select ARCH_HAS_PHYS_TO_DMA
+	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
 	select ARCH_HAS_STRICT_MODULE_RWX if MMU
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 31d3b96f0f4b..a224b6e39e58 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -96,10 +96,6 @@ static inline unsigned long dma_max_pfn(struct device *dev)
 }
 #define dma_max_pfn(dev) dma_max_pfn(dev)
 
-#define arch_setup_dma_ops arch_setup_dma_ops
-extern void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			       const struct iommu_ops *iommu, bool coherent);
-
 #ifdef CONFIG_MMU
 #define arch_teardown_dma_ops arch_teardown_dma_ops
 extern void arch_teardown_dma_ops(struct device *dev);
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a4168d366127..63909f318d56 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -22,6 +22,7 @@ config ARM64
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index 95dbf3ef735a..de96507ee2c1 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -29,10 +29,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 	return NULL;
 }
 
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			const struct iommu_ops *iommu, bool coherent);
-#define arch_setup_dma_ops	arch_setup_dma_ops
-
 #ifdef CONFIG_IOMMU_DMA
 void arch_teardown_dma_ops(struct device *dev);
 #define arch_teardown_dma_ops	arch_teardown_dma_ops
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 0d14f51d0002..dc5d70f674e0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1118,6 +1118,7 @@ config DMA_MAYBE_COHERENT
 
 config DMA_PERDEV_COHERENT
 	bool
+	select ARCH_HAS_SETUP_DMA_OPS
 	select DMA_NONCOHERENT
 
 config DMA_NONCOHERENT
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 20dfaad3a55d..34de7b17b41b 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -15,14 +15,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 #endif
 }
 
-#define arch_setup_dma_ops arch_setup_dma_ops
-static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
-				      u64 size, const struct iommu_ops *iommu,
-				      bool coherent)
-{
-#ifdef CONFIG_DMA_PERDEV_COHERENT
-	dev->dma_coherent = coherent;
-#endif
-}
-
 #endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c
index cb38461391cb..0606fc87b294 100644
--- a/arch/mips/mm/dma-noncoherent.c
+++ b/arch/mips/mm/dma-noncoherent.c
@@ -159,3 +159,11 @@ void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 
 	dma_sync_virt(vaddr, size, direction);
 }
+
+#ifdef CONFIG_DMA_PERDEV_COHERENT
+void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
+		const struct iommu_ops *iommu, bool coherent)
+{
+	dev->dma_coherent = coherent;
+}
+#endif
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index b904d55247ab..2b20d60e6158 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -671,11 +671,15 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
 	return dma_set_mask_and_coherent(dev, mask);
 }
 
-#ifndef arch_setup_dma_ops
+#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
+void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
+		const struct iommu_ops *iommu, bool coherent);
+#else
 static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
-				      u64 size, const struct iommu_ops *iommu,
-				      bool coherent) { }
-#endif
+		u64 size, const struct iommu_ops *iommu, bool coherent)
+{
+}
+#endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */
 
 #ifndef arch_teardown_dma_ops
 static inline void arch_teardown_dma_ops(struct device *dev) { }
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 61cebea36d89..6014cad35e58 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -19,6 +19,9 @@ config ARCH_HAS_DMA_COHERENCE_H
 config HAVE_GENERIC_DMA_COHERENT
 	bool
 
+config ARCH_HAS_SETUP_DMA_OPS
+	bool
+
 config ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	bool
 
-- 
cgit v1.2.3


From dc2acded38957dfa6b7b7e0203b4b8cb8d818ce6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Dec 2018 22:14:44 +0100
Subject: dma-mapping: add a kconfig symbol for arch_teardown_dma_ops
 availability

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com> # arm64
---
 arch/arm/Kconfig                     |  1 +
 arch/arm/include/asm/dma-mapping.h   |  5 -----
 arch/arm64/Kconfig                   |  1 +
 arch/arm64/include/asm/dma-mapping.h |  5 -----
 include/linux/dma-mapping.h          | 10 +++++++---
 kernel/dma/Kconfig                   |  3 +++
 6 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index c1cf44f00870..4bb36ae71b14 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -16,6 +16,7 @@ config ARM
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
 	select ARCH_HAS_STRICT_MODULE_RWX if MMU
+	select ARCH_HAS_TEARDOWN_DMA_OPS if MMU
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index a224b6e39e58..03ba90ffc0f8 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -96,11 +96,6 @@ static inline unsigned long dma_max_pfn(struct device *dev)
 }
 #define dma_max_pfn(dev) dma_max_pfn(dev)
 
-#ifdef CONFIG_MMU
-#define arch_teardown_dma_ops arch_teardown_dma_ops
-extern void arch_teardown_dma_ops(struct device *dev);
-#endif
-
 /* do not use this function in a driver */
 static inline bool is_device_dma_coherent(struct device *dev)
 {
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 63909f318d56..87ec7be25e97 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -29,6 +29,7 @@ config ARM64
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYSCALL_WRAPPER
+	select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_INLINE_READ_LOCK if !PREEMPT
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index de96507ee2c1..de98191e4c7d 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -29,11 +29,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 	return NULL;
 }
 
-#ifdef CONFIG_IOMMU_DMA
-void arch_teardown_dma_ops(struct device *dev);
-#define arch_teardown_dma_ops	arch_teardown_dma_ops
-#endif
-
 /*
  * Do not use this function in a driver, it is only provided for
  * arch/arm/mm/xen.c, which is used by arm64 as well.
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2b20d60e6158..4210c5c1dd21 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -681,9 +681,13 @@ static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
 }
 #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */
 
-#ifndef arch_teardown_dma_ops
-static inline void arch_teardown_dma_ops(struct device *dev) { }
-#endif
+#ifdef CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS
+void arch_teardown_dma_ops(struct device *dev);
+#else
+static inline void arch_teardown_dma_ops(struct device *dev)
+{
+}
+#endif /* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS */
 
 static inline unsigned int dma_get_max_seg_size(struct device *dev)
 {
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 6014cad35e58..bde9179c6ed7 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -22,6 +22,9 @@ config HAVE_GENERIC_DMA_COHERENT
 config ARCH_HAS_SETUP_DMA_OPS
 	bool
 
+config ARCH_HAS_TEARDOWN_DMA_OPS
+	bool
+
 config ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	bool
 
-- 
cgit v1.2.3


From 067fb11b12af1448f7bbcacca41e470cb775e9fa Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:37 -0500
Subject: SUNRPC: Remove rpc_xprt::tsh_size

tsh_size was added to accommodate transports that send a pre-amble
before each RPC message. However, this assumes the pre-amble is
fixed in size, which isn't true for some transports. That makes
tsh_size not very generic.

Also I'd like to make the estimation of RPC send and receive
buffer sizes more precise. tsh_size doesn't currently appear to be
accounted for at all by call_allocate.

Therefore let's just remove the tsh_size concept, and make the only
transports that have a non-zero tsh_size employ a direct approach.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprt.h                |  7 ---
 net/sunrpc/auth_gss/auth_gss.c             |  3 +-
 net/sunrpc/clnt.c                          |  1 -
 net/sunrpc/svc.c                           | 19 ++-----
 net/sunrpc/xprtrdma/svc_rdma_backchannel.c |  1 -
 net/sunrpc/xprtrdma/transport.c            |  1 -
 net/sunrpc/xprtsock.c                      | 91 ++++++++++++++++++++----------
 7 files changed, 65 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index ad7e910b119d..3a391544299e 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -196,8 +196,6 @@ struct rpc_xprt {
 
 	size_t			max_payload;	/* largest RPC payload size,
 						   in bytes */
-	unsigned int		tsh_size;	/* size of transport specific
-						   header */
 
 	struct rpc_wait_queue	binding;	/* requests waiting on rpcbind */
 	struct rpc_wait_queue	sending;	/* requests waiting to send */
@@ -362,11 +360,6 @@ struct rpc_xprt *	xprt_alloc(struct net *net, size_t size,
 				unsigned int max_req);
 void			xprt_free(struct rpc_xprt *);
 
-static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *p)
-{
-	return p + xprt->tsh_size;
-}
-
 static inline int
 xprt_enable_swap(struct rpc_xprt *xprt)
 {
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a42672e81792..4b52e2b11c58 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1563,8 +1563,7 @@ gss_marshal(struct rpc_task *task, __be32 *p)
 
 	/* We compute the checksum for the verifier over the xdr-encoded bytes
 	 * starting with the xid and ending at the end of the credential: */
-	iov.iov_base = xprt_skip_transport_header(req->rq_xprt,
-					req->rq_snd_buf.head[0].iov_base);
+	iov.iov_base = req->rq_snd_buf.head[0].iov_base;
 	iov.iov_len = (u8 *)p - (u8 *)iov.iov_base;
 	xdr_buf_from_iov(&iov, &verf_buf);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d7ec6132c046..c4203f6138ef 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2331,7 +2331,6 @@ rpc_encode_header(struct rpc_task *task)
 
 	/* FIXME: check buffer size? */
 
-	p = xprt_skip_transport_header(req->rq_xprt, p);
 	*p++ = req->rq_xid;		/* XID */
 	*p++ = htonl(RPC_CALL);		/* CALL */
 	*p++ = htonl(RPC_VERSION);	/* RPC version */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e87ddb9f7feb..dbd19697ee38 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1144,17 +1144,6 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
 static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {}
 #endif
 
-/*
- * Setup response header for TCP, it has a 4B record length field.
- */
-static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
-{
-	struct kvec *resv = &rqstp->rq_res.head[0];
-
-	/* tcp needs a space for the record length... */
-	svc_putnl(resv, 0);
-}
-
 /*
  * Common routine for processing the RPC request.
  */
@@ -1182,10 +1171,6 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
 	clear_bit(RQ_DROPME, &rqstp->rq_flags);
 
-	/* Setup reply header */
-	if (rqstp->rq_prot == IPPROTO_TCP)
-		svc_tcp_prep_reply_hdr(rqstp);
-
 	svc_putu32(resv, rqstp->rq_xid);
 
 	vers = svc_getnl(argv);
@@ -1443,6 +1428,10 @@ svc_process(struct svc_rqst *rqstp)
 		goto out_drop;
 	}
 
+	/* Reserve space for the record marker */
+	if (rqstp->rq_prot == IPPROTO_TCP)
+		svc_putnl(resv, 0);
+
 	/* Returns 1 for send, 0 for drop */
 	if (likely(svc_process_common(rqstp, argv, resv)))
 		return svc_send(rqstp);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index b908f2ca08fd..907464c2a9f0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -304,7 +304,6 @@ xprt_setup_rdma_bc(struct xprt_create *args)
 	xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
 
 	xprt->prot = XPRT_TRANSPORT_BC_RDMA;
-	xprt->tsh_size = 0;
 	xprt->ops = &xprt_rdma_bc_procs;
 
 	memcpy(&xprt->addr, args->dstaddr, args->addrlen);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index fbc171ebfe91..e7274dc10120 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -332,7 +332,6 @@ xprt_setup_rdma(struct xprt_create *args)
 	xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
 
 	xprt->resvport = 0;		/* privileged port not needed */
-	xprt->tsh_size = 0;		/* RPC-RDMA handles framing */
 	xprt->ops = &xprt_rdma_procs;
 
 	/*
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7754aa3e434f..ae09d850cd11 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -696,6 +696,40 @@ xs_stream_reset_connect(struct sock_xprt *transport)
 
 #define XS_SENDMSG_FLAGS	(MSG_DONTWAIT | MSG_NOSIGNAL)
 
+/* Common case:
+ *  - stream transport
+ *  - sending from byte 0 of the message
+ *  - the message is wholly contained in @xdr's head iovec
+ */
+static int xs_send_rm_and_kvec(struct socket *sock, struct xdr_buf *xdr,
+			       unsigned int remainder)
+{
+	struct msghdr msg = {
+		.msg_flags	= XS_SENDMSG_FLAGS | (remainder ? MSG_MORE : 0)
+	};
+	rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
+					 (u32)xdr->len);
+	struct kvec iov[2] = {
+		{
+			.iov_base	= &marker,
+			.iov_len	= sizeof(marker)
+		},
+		{
+			.iov_base	= xdr->head[0].iov_base,
+			.iov_len	= xdr->head[0].iov_len
+		},
+	};
+	int ret;
+
+	ret = kernel_sendmsg(sock, &msg, iov, 2,
+			     iov[0].iov_len + iov[1].iov_len);
+	if (ret < 0)
+		return ret;
+	if (ret < iov[0].iov_len)
+		return -EPIPE;
+	return ret - iov[0].iov_len;
+}
+
 static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more)
 {
 	struct msghdr msg = {
@@ -779,7 +813,11 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
 	if (base < xdr->head[0].iov_len || addr != NULL) {
 		unsigned int len = xdr->head[0].iov_len - base;
 		remainder -= len;
-		err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0);
+		if (!base && !addr)
+			err = xs_send_rm_and_kvec(sock, xdr, remainder);
+		else
+			err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0],
+					   base, remainder != 0);
 		if (remainder == 0 || err != len)
 			goto out;
 		*sent_p += err;
@@ -869,16 +907,6 @@ xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req)
 	return transport->xmit.offset != 0 && req->rq_bytes_sent == 0;
 }
 
-/*
- * Construct a stream transport record marker in @buf.
- */
-static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
-{
-	u32 reclen = buf->len - sizeof(rpc_fraghdr);
-	rpc_fraghdr *base = buf->head[0].iov_base;
-	*base = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | reclen);
-}
-
 /**
  * xs_local_send_request - write an RPC request to an AF_LOCAL socket
  * @req: pointer to RPC request
@@ -905,8 +933,6 @@ static int xs_local_send_request(struct rpc_rqst *req)
 		return -ENOTCONN;
 	}
 
-	xs_encode_stream_record_marker(&req->rq_snd_buf);
-
 	xs_pktdump("packet data:",
 			req->rq_svec->iov_base, req->rq_svec->iov_len);
 
@@ -1057,8 +1083,6 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 		return -ENOTCONN;
 	}
 
-	xs_encode_stream_record_marker(&req->rq_snd_buf);
-
 	xs_pktdump("packet data:",
 				req->rq_svec->iov_base,
 				req->rq_svec->iov_len);
@@ -2534,26 +2558,35 @@ static int bc_sendto(struct rpc_rqst *req)
 {
 	int len;
 	struct xdr_buf *xbufp = &req->rq_snd_buf;
-	struct rpc_xprt *xprt = req->rq_xprt;
 	struct sock_xprt *transport =
-				container_of(xprt, struct sock_xprt, xprt);
-	struct socket *sock = transport->sock;
+			container_of(req->rq_xprt, struct sock_xprt, xprt);
 	unsigned long headoff;
 	unsigned long tailoff;
+	struct page *tailpage;
+	struct msghdr msg = {
+		.msg_flags	= MSG_MORE
+	};
+	rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
+					 (u32)xbufp->len);
+	struct kvec iov = {
+		.iov_base	= &marker,
+		.iov_len	= sizeof(marker),
+	};
 
-	xs_encode_stream_record_marker(xbufp);
+	len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len);
+	if (len != iov.iov_len)
+		return -EAGAIN;
 
+	tailpage = NULL;
+	if (xbufp->tail[0].iov_len)
+		tailpage = virt_to_page(xbufp->tail[0].iov_base);
 	tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
 	headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
-	len = svc_send_common(sock, xbufp,
+	len = svc_send_common(transport->sock, xbufp,
 			      virt_to_page(xbufp->head[0].iov_base), headoff,
-			      xbufp->tail[0].iov_base, tailoff);
-
-	if (len != xbufp->len) {
-		printk(KERN_NOTICE "Error sending entire callback!\n");
-		len = -EAGAIN;
-	}
-
+			      tailpage, tailoff);
+	if (len != xbufp->len)
+		return -EAGAIN;
 	return len;
 }
 
@@ -2793,7 +2826,6 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
 	xprt->prot = 0;
-	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
 	xprt->bind_timeout = XS_BIND_TO;
@@ -2862,7 +2894,6 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
 	xprt->prot = IPPROTO_UDP;
-	xprt->tsh_size = 0;
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
 	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
 
@@ -2942,7 +2973,6 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
 	xprt->prot = IPPROTO_TCP;
-	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
 	xprt->bind_timeout = XS_BIND_TO;
@@ -3015,7 +3045,6 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
 	xprt->prot = IPPROTO_TCP;
-	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 	xprt->timeout = &xs_tcp_default_timeout;
 
-- 
cgit v1.2.3


From 2b2812961302c38500c1027778e371c895f1cac4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 Dec 2018 14:03:32 +0100
Subject: device.h: dma_mem is only needed for HAVE_GENERIC_DMA_COHERENT

No need to carry an unused field around.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index 6cb4640b6160..be544400acdd 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1017,8 +1017,10 @@ struct device {
 
 	struct list_head	dma_pools;	/* dma pools (if dma'ble) */
 
+#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
 	struct dma_coherent_mem	*dma_mem; /* internal for coherent mem
 					     override */
+#endif
 #ifdef CONFIG_DMA_CMA
 	struct cma *cma_area;		/* contiguous memory area for dma
 					   allocations */
-- 
cgit v1.2.3


From fe9a270519c72bccb3af524db7ea6c7b67700d50 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:43 -0500
Subject: SUNRPC: Add build option to disable support for insecure enctypes

Enable distributions to enforce the rejection of ancient and
insecure Kerberos enctypes in the kernel's RPCSEC_GSS
implementation. These are the single-DES encryption types that
were deprecated in 2012 by RFC 6649.

Enctypes that were deprecated more recently (by RFC 8429) remain
fully supported for now because they are still likely to be widely
used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-by: Simo Sorce <simo@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/gss_krb5_enctypes.h | 42 +++++++++++++++++++++++++++++++-
 net/sunrpc/Kconfig                       | 16 ++++++++++++
 net/sunrpc/auth_gss/gss_krb5_mech.c      |  2 ++
 3 files changed, 59 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/gss_krb5_enctypes.h b/include/linux/sunrpc/gss_krb5_enctypes.h
index ec6234eee89c..981c89cef19d 100644
--- a/include/linux/sunrpc/gss_krb5_enctypes.h
+++ b/include/linux/sunrpc/gss_krb5_enctypes.h
@@ -1,4 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
- * Dumb way to share this static piece of information with nfsd
+ * Define the string that exports the set of kernel-supported
+ * Kerberos enctypes. This list is sent via upcall to gssd, and
+ * is also exposed via the nfsd /proc API. The consumers generally
+ * treat this as an ordered list, where the first item in the list
+ * is the most preferred.
+ */
+
+#ifndef _LINUX_SUNRPC_GSS_KRB5_ENCTYPES_H
+#define _LINUX_SUNRPC_GSS_KRB5_ENCTYPES_H
+
+#ifdef CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES
+
+/*
+ * NB: This list includes encryption types that were deprecated
+ * by RFC 8429 (DES3_CBC_SHA1 and ARCFOUR_HMAC).
+ *
+ * ENCTYPE_AES256_CTS_HMAC_SHA1_96
+ * ENCTYPE_AES128_CTS_HMAC_SHA1_96
+ * ENCTYPE_DES3_CBC_SHA1
+ * ENCTYPE_ARCFOUR_HMAC
+ */
+#define KRB5_SUPPORTED_ENCTYPES "18,17,16,23"
+
+#else	/* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */
+
+/*
+ * NB: This list includes encryption types that were deprecated
+ * by RFC 8429 and RFC 6649.
+ *
+ * ENCTYPE_AES256_CTS_HMAC_SHA1_96
+ * ENCTYPE_AES128_CTS_HMAC_SHA1_96
+ * ENCTYPE_DES3_CBC_SHA1
+ * ENCTYPE_ARCFOUR_HMAC
+ * ENCTYPE_DES_CBC_MD5
+ * ENCTYPE_DES_CBC_CRC
+ * ENCTYPE_DES_CBC_MD4
  */
 #define KRB5_SUPPORTED_ENCTYPES "18,17,16,23,3,1,2"
+
+#endif	/* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */
+
+#endif	/* _LINUX_SUNRPC_GSS_KRB5_ENCTYPES_H */
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index ac09ca803296..83f5617bae07 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -34,6 +34,22 @@ config RPCSEC_GSS_KRB5
 
 	  If unsure, say Y.
 
+config CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES
+	bool "Secure RPC: Disable insecure Kerberos encryption types"
+	depends on RPCSEC_GSS_KRB5
+	default n
+	help
+	  Choose Y here to disable the use of deprecated encryption types
+	  with the Kerberos version 5 GSS-API mechanism (RFC 1964). The
+	  deprecated encryption types include DES-CBC-MD5, DES-CBC-CRC,
+	  and DES-CBC-MD4. These types were deprecated by RFC 6649 because
+	  they were found to be insecure.
+
+	  N is the default because many sites have deployed KDCs and
+	  keytabs that contain only these deprecated encryption types.
+	  Choosing Y prevents the use of known-insecure encryption types
+	  but might result in compatibility problems.
+
 config SUNRPC_DEBUG
 	bool "RPC: Enable dprintk debugging"
 	depends on SUNRPC && SYSCTL
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index eab71fc7af3e..be31a58d54e0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -53,6 +53,7 @@
 static struct gss_api_mech gss_kerberos_mech;	/* forward declaration */
 
 static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = {
+#ifndef CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES
 	/*
 	 * DES (All DES enctypes are mapped to the same gss functionality)
 	 */
@@ -74,6 +75,7 @@ static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = {
 	  .cksumlength = 8,
 	  .keyed_cksum = 0,
 	},
+#endif	/* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */
 	/*
 	 * RC4-HMAC
 	 */
-- 
cgit v1.2.3


From e8680a24a269bd6dcb533f4e4a5faba9ae58925c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:48 -0500
Subject: SUNRPC: Use struct xdr_stream when constructing RPC Call header

Modernize and harden the code path that constructs each RPC Call
message.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h    |  15 ++--
 include/linux/sunrpc/xdr.h     |   6 ++
 include/trace/events/sunrpc.h  |  29 +++++++
 net/sunrpc/auth.c              |  56 ++++++++----
 net/sunrpc/auth_gss/auth_gss.c | 191 ++++++++++++++++++++---------------------
 net/sunrpc/auth_null.c         |  23 +++--
 net/sunrpc/auth_unix.c         |  61 ++++++++-----
 net/sunrpc/clnt.c              |  66 +++++++-------
 8 files changed, 266 insertions(+), 181 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index eed3cb16ccf1..96e237f8e60b 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -131,11 +131,12 @@ struct rpc_credops {
 	void			(*crdestroy)(struct rpc_cred *);
 
 	int			(*crmatch)(struct auth_cred *, struct rpc_cred *, int);
-	__be32 *		(*crmarshal)(struct rpc_task *, __be32 *);
+	int			(*crmarshal)(struct rpc_task *task,
+					     struct xdr_stream *xdr);
 	int			(*crrefresh)(struct rpc_task *);
 	__be32 *		(*crvalidate)(struct rpc_task *, __be32 *);
-	int			(*crwrap_req)(struct rpc_task *, kxdreproc_t,
-						void *, __be32 *, void *);
+	int			(*crwrap_req)(struct rpc_task *task,
+					      struct xdr_stream *xdr);
 	int			(*crunwrap_resp)(struct rpc_task *, kxdrdproc_t,
 						void *, __be32 *, void *);
 	int			(*crkey_timeout)(struct rpc_cred *);
@@ -165,9 +166,13 @@ struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *
 void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
 void			put_rpccred(struct rpc_cred *);
-__be32 *		rpcauth_marshcred(struct rpc_task *, __be32 *);
+int			rpcauth_marshcred(struct rpc_task *task,
+					  struct xdr_stream *xdr);
 __be32 *		rpcauth_checkverf(struct rpc_task *, __be32 *);
-int			rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, __be32 *data, void *obj);
+int			rpcauth_wrap_req_encode(struct rpc_task *task,
+						struct xdr_stream *xdr);
+int			rpcauth_wrap_req(struct rpc_task *task,
+					 struct xdr_stream *xdr);
 int			rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *data, void *obj);
 bool			rpcauth_xmit_need_reencode(struct rpc_task *task);
 int			rpcauth_refreshcred(struct rpc_task *);
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 787939d13643..6df9ac1ca471 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -87,6 +87,12 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
 #define	xdr_one		cpu_to_be32(1)
 #define	xdr_two		cpu_to_be32(2)
 
+#define	rpc_auth_null	cpu_to_be32(RPC_AUTH_NULL)
+#define	rpc_auth_unix	cpu_to_be32(RPC_AUTH_UNIX)
+#define	rpc_auth_gss	cpu_to_be32(RPC_AUTH_GSS)
+
+#define	rpc_call	cpu_to_be32(RPC_CALL)
+
 #define	rpc_success		cpu_to_be32(RPC_SUCCESS)
 #define	rpc_prog_unavail	cpu_to_be32(RPC_PROG_UNAVAIL)
 #define	rpc_prog_mismatch	cpu_to_be32(RPC_PROG_MISMATCH)
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 627650800676..2b3f9d139e75 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -213,6 +213,35 @@ DECLARE_EVENT_CLASS(rpc_task_queued,
 DEFINE_RPC_QUEUED_EVENT(sleep);
 DEFINE_RPC_QUEUED_EVENT(wakeup);
 
+DECLARE_EVENT_CLASS(rpc_failure,
+
+	TP_PROTO(const struct rpc_task *task),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+	),
+
+	TP_printk("task:%u@%u",
+		__entry->task_id, __entry->client_id)
+);
+
+#define DEFINE_RPC_FAILURE(name)					\
+	DEFINE_EVENT(rpc_failure, rpc_bad_##name,			\
+			TP_PROTO(					\
+				const struct rpc_task *task		\
+			),						\
+			TP_ARGS(task))
+
+DEFINE_RPC_FAILURE(callhdr);
+
 TRACE_EVENT(rpc_stats_latency,
 
 	TP_PROTO(
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 275e84e817b7..add2135d9b01 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -756,12 +756,21 @@ destroy:
 }
 EXPORT_SYMBOL_GPL(put_rpccred);
 
-__be32 *
-rpcauth_marshcred(struct rpc_task *task, __be32 *p)
+/**
+ * rpcauth_marshcred - Append RPC credential to end of @xdr
+ * @task: controlling RPC task
+ * @xdr: xdr_stream containing initial portion of RPC Call header
+ *
+ * On success, an appropriate verifier is added to @xdr, @xdr is
+ * updated to point past the verifier, and zero is returned.
+ * Otherwise, @xdr is in an undefined state and a negative errno
+ * is returned.
+ */
+int rpcauth_marshcred(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	struct rpc_cred	*cred = task->tk_rqstp->rq_cred;
+	const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops;
 
-	return cred->cr_ops->crmarshal(task, p);
+	return ops->crmarshal(task, xdr);
 }
 
 __be32 *
@@ -772,26 +781,37 @@ rpcauth_checkverf(struct rpc_task *task, __be32 *p)
 	return cred->cr_ops->crvalidate(task, p);
 }
 
-static void rpcauth_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
-				   __be32 *data, void *obj)
+/**
+ * rpcauth_wrap_req_encode - XDR encode the RPC procedure
+ * @task: controlling RPC task
+ * @xdr: stream where on-the-wire bytes are to be marshalled
+ *
+ * On success, @xdr contains the encoded and wrapped message.
+ * Otherwise, @xdr is in an undefined state.
+ */
+int rpcauth_wrap_req_encode(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	struct xdr_stream xdr;
+	kxdreproc_t encode = task->tk_msg.rpc_proc->p_encode;
 
-	xdr_init_encode(&xdr, &rqstp->rq_snd_buf, data, rqstp);
-	encode(rqstp, &xdr, obj);
+	encode(task->tk_rqstp, xdr, task->tk_msg.rpc_argp);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(rpcauth_wrap_req_encode);
 
-int
-rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp,
-		__be32 *data, void *obj)
+/**
+ * rpcauth_wrap_req - XDR encode and wrap the RPC procedure
+ * @task: controlling RPC task
+ * @xdr: stream where on-the-wire bytes are to be marshalled
+ *
+ * On success, @xdr contains the encoded and wrapped message,
+ * and zero is returned. Otherwise, @xdr is in an undefined
+ * state and a negative errno is returned.
+ */
+int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops;
 
-	if (cred->cr_ops->crwrap_req)
-		return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj);
-	/* By default, we encode the arguments normally. */
-	rpcauth_wrap_req_encode(encode, rqstp, data, obj);
-	return 0;
+	return ops->crwrap_req(task, xdr);
 }
 
 static int
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 4b52e2b11c58..b333b1bdad45 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1526,18 +1526,20 @@ out:
 }
 
 /*
-* Marshal credentials.
-* Maybe we should keep a cached credential for performance reasons.
-*/
-static __be32 *
-gss_marshal(struct rpc_task *task, __be32 *p)
+ * Marshal credentials.
+ *
+ * The expensive part is computing the verifier. We can't cache a
+ * pre-computed version of the verifier because the seqno, which
+ * is different every time, is included in the MIC.
+ */
+static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_cred *cred = req->rq_cred;
 	struct gss_cred	*gss_cred = container_of(cred, struct gss_cred,
 						 gc_base);
 	struct gss_cl_ctx	*ctx = gss_cred_get_ctx(cred);
-	__be32		*cred_len;
+	__be32		*p, *cred_len;
 	u32             maj_stat = 0;
 	struct xdr_netobj mic;
 	struct kvec	iov;
@@ -1545,7 +1547,13 @@ gss_marshal(struct rpc_task *task, __be32 *p)
 
 	dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
 
-	*p++ = htonl(RPC_AUTH_GSS);
+	/* Credential */
+
+	p = xdr_reserve_space(xdr, 7 * sizeof(*p) +
+			      ctx->gc_wire_ctx.len);
+	if (!p)
+		goto out_put_ctx;
+	*p++ = rpc_auth_gss;
 	cred_len = p++;
 
 	spin_lock(&ctx->gc_seq_lock);
@@ -1554,12 +1562,14 @@ gss_marshal(struct rpc_task *task, __be32 *p)
 	if (req->rq_seqno == MAXSEQ)
 		goto out_expired;
 
-	*p++ = htonl((u32) RPC_GSS_VERSION);
-	*p++ = htonl((u32) ctx->gc_proc);
-	*p++ = htonl((u32) req->rq_seqno);
-	*p++ = htonl((u32) gss_cred->gc_service);
+	*p++ = cpu_to_be32(RPC_GSS_VERSION);
+	*p++ = cpu_to_be32(ctx->gc_proc);
+	*p++ = cpu_to_be32(req->rq_seqno);
+	*p++ = cpu_to_be32(gss_cred->gc_service);
 	p = xdr_encode_netobj(p, &ctx->gc_wire_ctx);
-	*cred_len = htonl((p - (cred_len + 1)) << 2);
+	*cred_len = cpu_to_be32((p - (cred_len + 1)) << 2);
+
+	/* Verifier */
 
 	/* We compute the checksum for the verifier over the xdr-encoded bytes
 	 * starting with the xid and ending at the end of the credential: */
@@ -1567,27 +1577,27 @@ gss_marshal(struct rpc_task *task, __be32 *p)
 	iov.iov_len = (u8 *)p - (u8 *)iov.iov_base;
 	xdr_buf_from_iov(&iov, &verf_buf);
 
-	/* set verifier flavor*/
-	*p++ = htonl(RPC_AUTH_GSS);
-
+	p = xdr_reserve_space(xdr, sizeof(*p));
+	if (!p)
+		goto out_put_ctx;
+	*p++ = rpc_auth_gss;
 	mic.data = (u8 *)(p + 1);
 	maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
-	if (maj_stat == GSS_S_CONTEXT_EXPIRED) {
+	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		goto out_expired;
-	} else if (maj_stat != 0) {
-		pr_warn("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat);
-		task->tk_status = -EIO;
+	else if (maj_stat != 0)
+		goto out_put_ctx;
+	if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0)
 		goto out_put_ctx;
-	}
-	p = xdr_encode_opaque(p, NULL, mic.len);
 	gss_put_ctx(ctx);
-	return p;
+	return 0;
 out_expired:
+	gss_put_ctx(ctx);
 	clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
-	task->tk_status = -EKEYEXPIRED;
+	return -EKEYEXPIRED;
 out_put_ctx:
 	gss_put_ctx(ctx);
-	return NULL;
+	return -EMSGSIZE;
 }
 
 static int gss_renew_cred(struct rpc_task *task)
@@ -1716,61 +1726,45 @@ out_bad:
 	return ret;
 }
 
-static void gss_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
-				__be32 *p, void *obj)
-{
-	struct xdr_stream xdr;
-
-	xdr_init_encode(&xdr, &rqstp->rq_snd_buf, p, rqstp);
-	encode(rqstp, &xdr, obj);
-}
-
-static inline int
-gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
-		   kxdreproc_t encode, struct rpc_rqst *rqstp,
-		   __be32 *p, void *obj)
+static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+			      struct rpc_task *task, struct xdr_stream *xdr)
 {
-	struct xdr_buf	*snd_buf = &rqstp->rq_snd_buf;
-	struct xdr_buf	integ_buf;
-	__be32          *integ_len = NULL;
+	struct rpc_rqst *rqstp = task->tk_rqstp;
+	struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf;
 	struct xdr_netobj mic;
-	u32		offset;
-	__be32		*q;
-	struct kvec	*iov;
-	u32             maj_stat = 0;
-	int		status = -EIO;
+	__be32 *p, *integ_len;
+	u32 offset, maj_stat;
 
+	p = xdr_reserve_space(xdr, 2 * sizeof(*p));
+	if (!p)
+		goto wrap_failed;
 	integ_len = p++;
-	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
-	*p++ = htonl(rqstp->rq_seqno);
+	*p = cpu_to_be32(rqstp->rq_seqno);
 
-	gss_wrap_req_encode(encode, rqstp, p, obj);
+	if (rpcauth_wrap_req_encode(task, xdr))
+		goto wrap_failed;
 
+	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
 	if (xdr_buf_subsegment(snd_buf, &integ_buf,
 				offset, snd_buf->len - offset))
-		return status;
-	*integ_len = htonl(integ_buf.len);
+		goto wrap_failed;
+	*integ_len = cpu_to_be32(integ_buf.len);
 
-	/* guess whether we're in the head or the tail: */
-	if (snd_buf->page_len || snd_buf->tail[0].iov_len)
-		iov = snd_buf->tail;
-	else
-		iov = snd_buf->head;
-	p = iov->iov_base + iov->iov_len;
+	p = xdr_reserve_space(xdr, 0);
+	if (!p)
+		goto wrap_failed;
 	mic.data = (u8 *)(p + 1);
-
 	maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
-	status = -EIO; /* XXX? */
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	else if (maj_stat)
-		return status;
-	q = xdr_encode_opaque(p, NULL, mic.len);
-
-	offset = (u8 *)q - (u8 *)p;
-	iov->iov_len += offset;
-	snd_buf->len += offset;
+		goto wrap_failed;
+	/* Check that the trailing MIC fit in the buffer, after the fact */
+	if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0)
+		goto wrap_failed;
 	return 0;
+wrap_failed:
+	return -EMSGSIZE;
 }
 
 static void
@@ -1821,61 +1815,63 @@ out:
 	return -EAGAIN;
 }
 
-static inline int
-gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
-		  kxdreproc_t encode, struct rpc_rqst *rqstp,
-		  __be32 *p, void *obj)
+static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+			     struct rpc_task *task, struct xdr_stream *xdr)
 {
+	struct rpc_rqst *rqstp = task->tk_rqstp;
 	struct xdr_buf	*snd_buf = &rqstp->rq_snd_buf;
-	u32		offset;
-	u32             maj_stat;
+	u32		pad, offset, maj_stat;
 	int		status;
-	__be32		*opaque_len;
+	__be32		*p, *opaque_len;
 	struct page	**inpages;
 	int		first;
-	int		pad;
 	struct kvec	*iov;
-	char		*tmp;
 
+	status = -EIO;
+	p = xdr_reserve_space(xdr, 2 * sizeof(*p));
+	if (!p)
+		goto wrap_failed;
 	opaque_len = p++;
-	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
-	*p++ = htonl(rqstp->rq_seqno);
+	*p = cpu_to_be32(rqstp->rq_seqno);
 
-	gss_wrap_req_encode(encode, rqstp, p, obj);
+	if (rpcauth_wrap_req_encode(task, xdr))
+		goto wrap_failed;
 
 	status = alloc_enc_pages(rqstp);
-	if (status)
-		return status;
+	if (unlikely(status))
+		goto wrap_failed;
 	first = snd_buf->page_base >> PAGE_SHIFT;
 	inpages = snd_buf->pages + first;
 	snd_buf->pages = rqstp->rq_enc_pages;
 	snd_buf->page_base -= first << PAGE_SHIFT;
 	/*
-	 * Give the tail its own page, in case we need extra space in the
-	 * head when wrapping:
+	 * Move the tail into its own page, in case gss_wrap needs
+	 * more space in the head when wrapping.
 	 *
-	 * call_allocate() allocates twice the slack space required
-	 * by the authentication flavor to rq_callsize.
-	 * For GSS, slack is GSS_CRED_SLACK.
+	 * Still... Why can't gss_wrap just slide the tail down?
 	 */
 	if (snd_buf->page_len || snd_buf->tail[0].iov_len) {
+		char *tmp;
+
 		tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]);
 		memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
 		snd_buf->tail[0].iov_base = tmp;
 	}
+	status = -EIO;
+	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
 	maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages);
 	/* slack space should prevent this ever happening: */
-	BUG_ON(snd_buf->len > snd_buf->buflen);
-	status = -EIO;
+	if (unlikely(snd_buf->len > snd_buf->buflen))
+		goto wrap_failed;
 	/* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was
 	 * done anyway, so it's safe to put the request on the wire: */
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	else if (maj_stat)
-		return status;
+		goto wrap_failed;
 
-	*opaque_len = htonl(snd_buf->len - offset);
-	/* guess whether we're in the head or the tail: */
+	*opaque_len = cpu_to_be32(snd_buf->len - offset);
+	/* guess whether the pad goes into the head or the tail: */
 	if (snd_buf->page_len || snd_buf->tail[0].iov_len)
 		iov = snd_buf->tail;
 	else
@@ -1887,37 +1883,36 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	snd_buf->len += pad;
 
 	return 0;
+wrap_failed:
+	return status;
 }
 
-static int
-gss_wrap_req(struct rpc_task *task,
-	     kxdreproc_t encode, void *rqstp, __be32 *p, void *obj)
+static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
 {
 	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_cred	*gss_cred = container_of(cred, struct gss_cred,
 			gc_base);
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
-	int             status = -EIO;
+	int status;
 
 	dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+	status = -EIO;
 	if (ctx->gc_proc != RPC_GSS_PROC_DATA) {
 		/* The spec seems a little ambiguous here, but I think that not
 		 * wrapping context destruction requests makes the most sense.
 		 */
-		gss_wrap_req_encode(encode, rqstp, p, obj);
-		status = 0;
+		status = rpcauth_wrap_req_encode(task, xdr);
 		goto out;
 	}
 	switch (gss_cred->gc_service) {
 	case RPC_GSS_SVC_NONE:
-		gss_wrap_req_encode(encode, rqstp, p, obj);
-		status = 0;
+		status = rpcauth_wrap_req_encode(task, xdr);
 		break;
 	case RPC_GSS_SVC_INTEGRITY:
-		status = gss_wrap_req_integ(cred, ctx, encode, rqstp, p, obj);
+		status = gss_wrap_req_integ(cred, ctx, task, xdr);
 		break;
 	case RPC_GSS_SVC_PRIVACY:
-		status = gss_wrap_req_priv(cred, ctx, encode, rqstp, p, obj);
+		status = gss_wrap_req_priv(cred, ctx, task, xdr);
 		break;
 	}
 out:
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index d0ceac57c06e..797f8472c21b 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -59,15 +59,21 @@ nul_match(struct auth_cred *acred, struct rpc_cred *cred, int taskflags)
 /*
  * Marshal credential.
  */
-static __be32 *
-nul_marshal(struct rpc_task *task, __be32 *p)
+static int
+nul_marshal(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	*p++ = htonl(RPC_AUTH_NULL);
-	*p++ = 0;
-	*p++ = htonl(RPC_AUTH_NULL);
-	*p++ = 0;
-
-	return p;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
+	if (!p)
+		return -EMSGSIZE;
+	/* Credential */
+	*p++ = rpc_auth_null;
+	*p++ = xdr_zero;
+	/* Verifier */
+	*p++ = rpc_auth_null;
+	*p   = xdr_zero;
+	return 0;
 }
 
 /*
@@ -125,6 +131,7 @@ const struct rpc_credops null_credops = {
 	.crdestroy	= nul_destroy_cred,
 	.crmatch	= nul_match,
 	.crmarshal	= nul_marshal,
+	.crwrap_req	= rpcauth_wrap_req_encode,
 	.crrefresh	= nul_refresh,
 	.crvalidate	= nul_validate,
 };
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index fc8a59134640..1d5b7ed9c6f7 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -99,37 +99,55 @@ unx_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
  * Marshal credentials.
  * Maybe we should keep a cached credential for performance reasons.
  */
-static __be32 *
-unx_marshal(struct rpc_task *task, __be32 *p)
+static int
+unx_marshal(struct rpc_task *task, struct xdr_stream *xdr)
 {
 	struct rpc_clnt	*clnt = task->tk_client;
 	struct rpc_cred	*cred = task->tk_rqstp->rq_cred;
-	__be32		*base, *hold;
+	__be32		*p, *cred_len, *gidarr_len;
 	int		i;
 	struct group_info *gi = cred->cr_cred->group_info;
 
-	*p++ = htonl(RPC_AUTH_UNIX);
-	base = p++;
-	*p++ = htonl(jiffies/HZ);
-
-	/*
-	 * Copy the UTS nodename captured when the client was created.
-	 */
-	p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
-
-	*p++ = htonl((u32) from_kuid(&init_user_ns, cred->cr_cred->fsuid));
-	*p++ = htonl((u32) from_kgid(&init_user_ns, cred->cr_cred->fsgid));
-	hold = p++;
+	/* Credential */
+
+	p = xdr_reserve_space(xdr, 3 * sizeof(*p));
+	if (!p)
+		goto marshal_failed;
+	*p++ = rpc_auth_unix;
+	cred_len = p++;
+	*p++ = xdr_zero;	/* stamp */
+	if (xdr_stream_encode_opaque(xdr, clnt->cl_nodename,
+				     clnt->cl_nodelen) < 0)
+		goto marshal_failed;
+	p = xdr_reserve_space(xdr, 3 * sizeof(*p));
+	if (!p)
+		goto marshal_failed;
+	*p++ = cpu_to_be32(from_kuid(&init_user_ns, cred->cr_cred->fsuid));
+	*p++ = cpu_to_be32(from_kgid(&init_user_ns, cred->cr_cred->fsgid));
+
+	gidarr_len = p++;
 	if (gi)
 		for (i = 0; i < UNX_NGROUPS && i < gi->ngroups; i++)
-			*p++ = htonl((u32) from_kgid(&init_user_ns, gi->gid[i]));
-	*hold = htonl(p - hold - 1);		/* gid array length */
-	*base = htonl((p - base - 1) << 2);	/* cred length */
+			*p++ = cpu_to_be32(from_kgid(&init_user_ns,
+						     gi->gid[i]));
+	*gidarr_len = cpu_to_be32(p - gidarr_len - 1);
+	*cred_len = cpu_to_be32((p - cred_len - 1) << 2);
+	p = xdr_reserve_space(xdr, (p - gidarr_len - 1) << 2);
+	if (!p)
+		goto marshal_failed;
+
+	/* Verifier */
+
+	p = xdr_reserve_space(xdr, 2 * sizeof(*p));
+	if (!p)
+		goto marshal_failed;
+	*p++ = rpc_auth_null;
+	*p   = xdr_zero;
 
-	*p++ = htonl(RPC_AUTH_NULL);
-	*p++ = htonl(0);
+	return 0;
 
-	return p;
+marshal_failed:
+	return -EMSGSIZE;
 }
 
 /*
@@ -202,6 +220,7 @@ const struct rpc_credops unix_credops = {
 	.crdestroy	= unx_destroy_cred,
 	.crmatch	= unx_match,
 	.crmarshal	= unx_marshal,
+	.crwrap_req	= rpcauth_wrap_req_encode,
 	.crrefresh	= unx_refresh,
 	.crvalidate	= unx_validate,
 };
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c4203f6138ef..d6750b7f169a 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -77,7 +77,8 @@ static void	call_timeout(struct rpc_task *task);
 static void	call_connect(struct rpc_task *task);
 static void	call_connect_status(struct rpc_task *task);
 
-static __be32	*rpc_encode_header(struct rpc_task *task);
+static int	rpc_encode_header(struct rpc_task *task,
+				  struct xdr_stream *xdr);
 static __be32	*rpc_verify_header(struct rpc_task *task);
 static int	rpc_ping(struct rpc_clnt *clnt);
 
@@ -1728,10 +1729,7 @@ static void
 rpc_xdr_encode(struct rpc_task *task)
 {
 	struct rpc_rqst	*req = task->tk_rqstp;
-	kxdreproc_t	encode;
-	__be32		*p;
-
-	dprint_status(task);
+	struct xdr_stream xdr;
 
 	xdr_buf_init(&req->rq_snd_buf,
 		     req->rq_buffer,
@@ -1740,18 +1738,13 @@ rpc_xdr_encode(struct rpc_task *task)
 		     req->rq_rbuffer,
 		     req->rq_rcvsize);
 
-	p = rpc_encode_header(task);
-	if (p == NULL)
+	req->rq_snd_buf.head[0].iov_len = 0;
+	xdr_init_encode(&xdr, &req->rq_snd_buf,
+			req->rq_snd_buf.head[0].iov_base, req);
+	if (rpc_encode_header(task, &xdr))
 		return;
 
-	encode = task->tk_msg.rpc_proc->p_encode;
-	if (encode == NULL)
-		return;
-
-	task->tk_status = rpcauth_wrap_req(task, encode, req, p,
-			task->tk_msg.rpc_argp);
-	if (task->tk_status == 0)
-		xprt_request_prepare(req);
+	task->tk_status = rpcauth_wrap_req(task, &xdr);
 }
 
 /*
@@ -1762,6 +1755,7 @@ call_encode(struct rpc_task *task)
 {
 	if (!rpc_task_need_encode(task))
 		goto out;
+	dprint_status(task);
 	/* Encode here so that rpcsec_gss can use correct sequence number. */
 	rpc_xdr_encode(task);
 	/* Did the encode result in an error condition? */
@@ -1779,6 +1773,8 @@ call_encode(struct rpc_task *task)
 			rpc_exit(task, task->tk_status);
 		}
 		return;
+	} else {
+		xprt_request_prepare(task->tk_rqstp);
 	}
 
 	/* Add task to reply queue before transmission to avoid races */
@@ -2322,25 +2318,33 @@ out_retry:
 	}
 }
 
-static __be32 *
-rpc_encode_header(struct rpc_task *task)
+static int
+rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr)
 {
 	struct rpc_clnt *clnt = task->tk_client;
 	struct rpc_rqst	*req = task->tk_rqstp;
-	__be32		*p = req->rq_svec[0].iov_base;
-
-	/* FIXME: check buffer size? */
-
-	*p++ = req->rq_xid;		/* XID */
-	*p++ = htonl(RPC_CALL);		/* CALL */
-	*p++ = htonl(RPC_VERSION);	/* RPC version */
-	*p++ = htonl(clnt->cl_prog);	/* program number */
-	*p++ = htonl(clnt->cl_vers);	/* program version */
-	*p++ = htonl(task->tk_msg.rpc_proc->p_proc);	/* procedure */
-	p = rpcauth_marshcred(task, p);
-	if (p)
-		req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p);
-	return p;
+	__be32 *p;
+	int error;
+
+	error = -EMSGSIZE;
+	p = xdr_reserve_space(xdr, RPC_CALLHDRSIZE << 2);
+	if (!p)
+		goto out_fail;
+	*p++ = req->rq_xid;
+	*p++ = rpc_call;
+	*p++ = cpu_to_be32(RPC_VERSION);
+	*p++ = cpu_to_be32(clnt->cl_prog);
+	*p++ = cpu_to_be32(clnt->cl_vers);
+	*p   = cpu_to_be32(task->tk_msg.rpc_proc->p_proc);
+
+	error = rpcauth_marshcred(task, xdr);
+	if (error < 0)
+		goto out_fail;
+	return 0;
+out_fail:
+	trace_rpc_bad_callhdr(task);
+	rpc_exit(task, error);
+	return error;
 }
 
 static __be32 *
-- 
cgit v1.2.3


From 7f5667a5f8c4ff85b14ccce9d41f9244bd30ab68 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:53 -0500
Subject: SUNRPC: Clean up rpc_verify_header()

- Recover some instruction count because I'm about to introduce a
  few xdr_inline_decode call sites
- Replace dprintk() call sites with trace points
- Reduce the hot path so it fits in fewer cachelines

I've also renamed it rpc_decode_header() to match everything else
in the RPC client.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xdr.h    |   7 +-
 include/trace/events/sunrpc.h |  52 ++++++++++
 net/sunrpc/clnt.c             | 223 ++++++++++++++++++------------------------
 3 files changed, 154 insertions(+), 128 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 6df9ac1ca471..c54041950cc0 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -92,6 +92,9 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
 #define	rpc_auth_gss	cpu_to_be32(RPC_AUTH_GSS)
 
 #define	rpc_call	cpu_to_be32(RPC_CALL)
+#define	rpc_reply	cpu_to_be32(RPC_REPLY)
+
+#define	rpc_msg_accepted	cpu_to_be32(RPC_MSG_ACCEPTED)
 
 #define	rpc_success		cpu_to_be32(RPC_SUCCESS)
 #define	rpc_prog_unavail	cpu_to_be32(RPC_PROG_UNAVAIL)
@@ -101,6 +104,9 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
 #define	rpc_system_err		cpu_to_be32(RPC_SYSTEM_ERR)
 #define	rpc_drop_reply		cpu_to_be32(RPC_DROP_REPLY)
 
+#define	rpc_mismatch		cpu_to_be32(RPC_MISMATCH)
+#define	rpc_auth_error		cpu_to_be32(RPC_AUTH_ERROR)
+
 #define	rpc_auth_ok		cpu_to_be32(RPC_AUTH_OK)
 #define	rpc_autherr_badcred	cpu_to_be32(RPC_AUTH_BADCRED)
 #define	rpc_autherr_rejectedcred cpu_to_be32(RPC_AUTH_REJECTEDCRED)
@@ -109,7 +115,6 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
 #define	rpc_autherr_tooweak	cpu_to_be32(RPC_AUTH_TOOWEAK)
 #define	rpcsec_gsserr_credproblem	cpu_to_be32(RPCSEC_GSS_CREDPROBLEM)
 #define	rpcsec_gsserr_ctxproblem	cpu_to_be32(RPCSEC_GSS_CTXPROBLEM)
-#define	rpc_autherr_oldseqnum	cpu_to_be32(101)
 
 /*
  * Miscellaneous XDR helper functions
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 2b3f9d139e75..0654e9c50371 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -241,6 +241,58 @@ DECLARE_EVENT_CLASS(rpc_failure,
 			TP_ARGS(task))
 
 DEFINE_RPC_FAILURE(callhdr);
+DEFINE_RPC_FAILURE(verifier);
+
+DECLARE_EVENT_CLASS(rpc_reply_event,
+
+	TP_PROTO(
+		const struct rpc_task *task
+	),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__string(progname, task->tk_client->cl_program->name)
+		__field(u32, version)
+		__string(procname, rpc_proc_name(task))
+		__string(servername, task->tk_xprt->servername)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid);
+		__assign_str(progname, task->tk_client->cl_program->name)
+		__entry->version = task->tk_client->cl_vers;
+		__assign_str(procname, rpc_proc_name(task))
+		__assign_str(servername, task->tk_xprt->servername)
+	),
+
+	TP_printk("task:%u@%d server=%s xid=0x%08x %sv%d %s",
+		__entry->task_id, __entry->client_id, __get_str(servername),
+		__entry->xid, __get_str(progname), __entry->version,
+		__get_str(procname))
+)
+
+#define DEFINE_RPC_REPLY_EVENT(name)					\
+	DEFINE_EVENT(rpc_reply_event, rpc__##name,			\
+			TP_PROTO(					\
+				const struct rpc_task *task		\
+			),						\
+			TP_ARGS(task))
+
+DEFINE_RPC_REPLY_EVENT(prog_unavail);
+DEFINE_RPC_REPLY_EVENT(prog_mismatch);
+DEFINE_RPC_REPLY_EVENT(proc_unavail);
+DEFINE_RPC_REPLY_EVENT(garbage_args);
+DEFINE_RPC_REPLY_EVENT(unparsable);
+DEFINE_RPC_REPLY_EVENT(mismatch);
+DEFINE_RPC_REPLY_EVENT(stale_creds);
+DEFINE_RPC_REPLY_EVENT(bad_creds);
+DEFINE_RPC_REPLY_EVENT(auth_tooweak);
 
 TRACE_EVENT(rpc_stats_latency,
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d6750b7f169a..e9735089bd66 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -79,7 +79,7 @@ static void	call_connect_status(struct rpc_task *task);
 
 static int	rpc_encode_header(struct rpc_task *task,
 				  struct xdr_stream *xdr);
-static __be32	*rpc_verify_header(struct rpc_task *task);
+static __be32	*rpc_decode_header(struct rpc_task *task);
 static int	rpc_ping(struct rpc_clnt *clnt);
 
 static void rpc_register_client(struct rpc_clnt *clnt)
@@ -2292,7 +2292,7 @@ call_decode(struct rpc_task *task)
 		goto out_retry;
 	}
 
-	p = rpc_verify_header(task);
+	p = rpc_decode_header(task);
 	if (IS_ERR(p)) {
 		if (p == ERR_PTR(-EAGAIN))
 			goto out_retry;
@@ -2308,7 +2308,7 @@ call_decode(struct rpc_task *task)
 	return;
 out_retry:
 	task->tk_status = 0;
-	/* Note: rpc_verify_header() may have freed the RPC slot */
+	/* Note: rpc_decode_header() may have freed the RPC slot */
 	if (task->tk_rqstp == req) {
 		xdr_free_bvec(&req->rq_rcv_buf);
 		req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0;
@@ -2347,164 +2347,133 @@ out_fail:
 	return error;
 }
 
-static __be32 *
-rpc_verify_header(struct rpc_task *task)
+static noinline __be32 *
+rpc_decode_header(struct rpc_task *task)
 {
 	struct rpc_clnt *clnt = task->tk_client;
 	struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0];
 	int len = task->tk_rqstp->rq_rcv_buf.len >> 2;
 	__be32	*p = iov->iov_base;
-	u32 n;
 	int error = -EACCES;
 
-	if ((task->tk_rqstp->rq_rcv_buf.len & 3) != 0) {
-		/* RFC-1014 says that the representation of XDR data must be a
-		 * multiple of four bytes
-		 * - if it isn't pointer subtraction in the NFS client may give
-		 *   undefined results
-		 */
-		dprintk("RPC: %5u %s: XDR representation not a multiple of"
-		       " 4 bytes: 0x%x\n", task->tk_pid, __func__,
-		       task->tk_rqstp->rq_rcv_buf.len);
-		error = -EIO;
-		goto out_err;
-	}
+	/* RFC-1014 says that the representation of XDR data must be a
+	 * multiple of four bytes
+	 * - if it isn't pointer subtraction in the NFS client may give
+	 *   undefined results
+	 */
+	if (task->tk_rqstp->rq_rcv_buf.len & 3)
+		goto out_badlen;
 	if ((len -= 3) < 0)
-		goto out_overflow;
+		goto out_unparsable;
 
-	p += 1; /* skip XID */
-	if ((n = ntohl(*p++)) != RPC_REPLY) {
-		dprintk("RPC: %5u %s: not an RPC reply: %x\n",
-			task->tk_pid, __func__, n);
-		error = -EIO;
-		goto out_garbage;
-	}
+	p++;	/* skip XID */
+	if (*p++ != rpc_reply)
+		goto out_unparsable;
+	if (*p++ != rpc_msg_accepted)
+		goto out_msg_denied;
 
-	if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) {
-		if (--len < 0)
-			goto out_overflow;
-		switch ((n = ntohl(*p++))) {
-		case RPC_AUTH_ERROR:
-			break;
-		case RPC_MISMATCH:
-			dprintk("RPC: %5u %s: RPC call version mismatch!\n",
-				task->tk_pid, __func__);
-			error = -EPROTONOSUPPORT;
-			goto out_err;
-		default:
-			dprintk("RPC: %5u %s: RPC call rejected, "
-				"unknown error: %x\n",
-				task->tk_pid, __func__, n);
-			error = -EIO;
-			goto out_err;
-		}
-		if (--len < 0)
-			goto out_overflow;
-		switch ((n = ntohl(*p++))) {
-		case RPC_AUTH_REJECTEDCRED:
-		case RPC_AUTH_REJECTEDVERF:
-		case RPCSEC_GSS_CREDPROBLEM:
-		case RPCSEC_GSS_CTXPROBLEM:
-			if (!task->tk_cred_retry)
-				break;
-			task->tk_cred_retry--;
-			dprintk("RPC: %5u %s: retry stale creds\n",
-					task->tk_pid, __func__);
-			rpcauth_invalcred(task);
-			/* Ensure we obtain a new XID! */
-			xprt_release(task);
-			task->tk_action = call_reserve;
-			goto out_retry;
-		case RPC_AUTH_BADCRED:
-		case RPC_AUTH_BADVERF:
-			/* possibly garbled cred/verf? */
-			if (!task->tk_garb_retry)
-				break;
-			task->tk_garb_retry--;
-			dprintk("RPC: %5u %s: retry garbled creds\n",
-					task->tk_pid, __func__);
-			task->tk_action = call_encode;
-			goto out_retry;
-		case RPC_AUTH_TOOWEAK:
-			printk(KERN_NOTICE "RPC: server %s requires stronger "
-			       "authentication.\n",
-			       task->tk_xprt->servername);
-			break;
-		default:
-			dprintk("RPC: %5u %s: unknown auth error: %x\n",
-					task->tk_pid, __func__, n);
-			error = -EIO;
-		}
-		dprintk("RPC: %5u %s: call rejected %d\n",
-				task->tk_pid, __func__, n);
-		goto out_err;
-	}
 	p = rpcauth_checkverf(task, p);
-	if (IS_ERR(p)) {
-		error = PTR_ERR(p);
-		dprintk("RPC: %5u %s: auth check failed with %d\n",
-				task->tk_pid, __func__, error);
-		goto out_garbage;		/* bad verifier, retry */
-	}
+	if (IS_ERR(p))
+		goto out_verifier;
+
 	len = p - (__be32 *)iov->iov_base - 1;
 	if (len < 0)
-		goto out_overflow;
-	switch ((n = ntohl(*p++))) {
-	case RPC_SUCCESS:
+		goto out_unparsable;
+	switch (*p++) {
+	case rpc_success:
 		return p;
-	case RPC_PROG_UNAVAIL:
-		dprintk("RPC: %5u %s: program %u is unsupported "
-				"by server %s\n", task->tk_pid, __func__,
-				(unsigned int)clnt->cl_prog,
-				task->tk_xprt->servername);
+	case rpc_prog_unavail:
+		trace_rpc__prog_unavail(task);
 		error = -EPFNOSUPPORT;
 		goto out_err;
-	case RPC_PROG_MISMATCH:
-		dprintk("RPC: %5u %s: program %u, version %u unsupported "
-				"by server %s\n", task->tk_pid, __func__,
-				(unsigned int)clnt->cl_prog,
-				(unsigned int)clnt->cl_vers,
-				task->tk_xprt->servername);
+	case rpc_prog_mismatch:
+		trace_rpc__prog_mismatch(task);
 		error = -EPROTONOSUPPORT;
 		goto out_err;
-	case RPC_PROC_UNAVAIL:
-		dprintk("RPC: %5u %s: proc %s unsupported by program %u, "
-				"version %u on server %s\n",
-				task->tk_pid, __func__,
-				rpc_proc_name(task),
-				clnt->cl_prog, clnt->cl_vers,
-				task->tk_xprt->servername);
+	case rpc_proc_unavail:
+		trace_rpc__proc_unavail(task);
 		error = -EOPNOTSUPP;
 		goto out_err;
-	case RPC_GARBAGE_ARGS:
-		dprintk("RPC: %5u %s: server saw garbage\n",
-				task->tk_pid, __func__);
-		break;			/* retry */
+	case rpc_garbage_args:
+		trace_rpc__garbage_args(task);
+		break;
 	default:
-		dprintk("RPC: %5u %s: server accept status: %x\n",
-				task->tk_pid, __func__, n);
-		/* Also retry */
+		trace_rpc__unparsable(task);
 	}
 
 out_garbage:
 	clnt->cl_stats->rpcgarbage++;
 	if (task->tk_garb_retry) {
 		task->tk_garb_retry--;
-		dprintk("RPC: %5u %s: retrying\n",
-				task->tk_pid, __func__);
 		task->tk_action = call_encode;
-out_retry:
 		return ERR_PTR(-EAGAIN);
 	}
 out_err:
 	rpc_exit(task, error);
-	dprintk("RPC: %5u %s: call failed with error %d\n", task->tk_pid,
-			__func__, error);
 	return ERR_PTR(error);
-out_overflow:
-	dprintk("RPC: %5u %s: server reply was truncated.\n", task->tk_pid,
-			__func__);
+
+out_badlen:
+	trace_rpc__unparsable(task);
+	error = -EIO;
+	goto out_err;
+
+out_unparsable:
+	trace_rpc__unparsable(task);
+	error = -EIO;
 	goto out_garbage;
+
+out_verifier:
+	trace_rpc_bad_verifier(task);
+	error = PTR_ERR(p);
+	goto out_garbage;
+
+out_msg_denied:
+	switch (*p++) {
+	case rpc_auth_error:
+		break;
+	case rpc_mismatch:
+		trace_rpc__mismatch(task);
+		error = -EPROTONOSUPPORT;
+		goto out_err;
+	default:
+		trace_rpc__unparsable(task);
+		error = -EIO;
+		goto out_err;
+	}
+
+	switch (*p++) {
+	case rpc_autherr_rejectedcred:
+	case rpc_autherr_rejectedverf:
+	case rpcsec_gsserr_credproblem:
+	case rpcsec_gsserr_ctxproblem:
+		if (!task->tk_cred_retry)
+			break;
+		task->tk_cred_retry--;
+		trace_rpc__stale_creds(task);
+		rpcauth_invalcred(task);
+		/* Ensure we obtain a new XID! */
+		xprt_release(task);
+		task->tk_action = call_reserve;
+		return ERR_PTR(-EAGAIN);
+	case rpc_autherr_badcred:
+	case rpc_autherr_badverf:
+		/* possibly garbled cred/verf? */
+		if (!task->tk_garb_retry)
+			break;
+		task->tk_garb_retry--;
+		trace_rpc__bad_creds(task);
+		task->tk_action = call_encode;
+		return ERR_PTR(-EAGAIN);
+	case rpc_autherr_tooweak:
+		trace_rpc__auth_tooweak(task);
+		pr_warn("RPC: server %s requires stronger authentication.\n",
+			task->tk_xprt->servername);
+		break;
+	default:
+		trace_rpc__unparsable(task);
+		error = -EIO;
+	}
+	goto out_err;
 }
 
 static void rpcproc_encode_null(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
-- 
cgit v1.2.3


From 4fa882c9f628b312d697cfcefaa6e973ce8ece3e Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Tue, 7 Aug 2018 12:07:43 +0200
Subject: eeprom: at24: remove at24_platform_data

There are no more users of at24_platform_data. Remove the relevant
header and modify the driver code to not use it anymore.

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
 MAINTAINERS                        |   1 -
 drivers/misc/eeprom/at24.c         | 162 +++++++++++++++++--------------------
 include/linux/platform_data/at24.h |  60 --------------
 3 files changed, 75 insertions(+), 148 deletions(-)
 delete mode 100644 include/linux/platform_data/at24.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 9919840d54cd..d901919dd475 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2503,7 +2503,6 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git
 S:	Maintained
 F:	Documentation/devicetree/bindings/eeprom/at24.txt
 F:	drivers/misc/eeprom/at24.c
-F:	include/linux/platform_data/at24.h
 
 ATA OVER ETHERNET (AOE) DRIVER
 M:	"Ed L. Cashin" <ed.cashin@acm.org>
diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c
index ddfcf4ade7bf..b806a403ca46 100644
--- a/drivers/misc/eeprom/at24.c
+++ b/drivers/misc/eeprom/at24.c
@@ -22,10 +22,24 @@
 #include <linux/i2c.h>
 #include <linux/nvmem-provider.h>
 #include <linux/regmap.h>
-#include <linux/platform_data/at24.h>
 #include <linux/pm_runtime.h>
 #include <linux/gpio/consumer.h>
 
+/* Address pointer is 16 bit. */
+#define AT24_FLAG_ADDR16	BIT(7)
+/* sysfs-entry will be read-only. */
+#define AT24_FLAG_READONLY	BIT(6)
+/* sysfs-entry will be world-readable. */
+#define AT24_FLAG_IRUGO		BIT(5)
+/* Take always 8 addresses (24c00). */
+#define AT24_FLAG_TAKE8ADDR	BIT(4)
+/* Factory-programmed serial number. */
+#define AT24_FLAG_SERIAL	BIT(3)
+/* Factory-programmed mac address. */
+#define AT24_FLAG_MAC		BIT(2)
+/* Does not auto-rollover reads to the next slave address. */
+#define AT24_FLAG_NO_RDROL	BIT(1)
+
 /*
  * I2C EEPROMs from most vendors are inexpensive and mostly interchangeable.
  * Differences between different vendor product lines (like Atmel AT24C or
@@ -107,10 +121,6 @@ module_param_named(write_timeout, at24_write_timeout, uint, 0);
 MODULE_PARM_DESC(at24_write_timeout, "Time (in ms) to try writes (default 25)");
 
 struct at24_chip_data {
-	/*
-	 * these fields mirror their equivalents in
-	 * struct at24_platform_data
-	 */
 	u32 byte_len;
 	u8 flags;
 };
@@ -471,63 +481,11 @@ static int at24_write(void *priv, unsigned int off, void *val, size_t count)
 	return 0;
 }
 
-static void at24_properties_to_pdata(struct device *dev,
-				     struct at24_platform_data *chip)
-{
-	int err;
-	u32 val;
-
-	if (device_property_present(dev, "read-only"))
-		chip->flags |= AT24_FLAG_READONLY;
-	if (device_property_present(dev, "no-read-rollover"))
-		chip->flags |= AT24_FLAG_NO_RDROL;
-
-	err = device_property_read_u32(dev, "address-width", &val);
-	if (!err) {
-		switch (val) {
-		case 8:
-			if (chip->flags & AT24_FLAG_ADDR16)
-				dev_warn(dev, "Override address width to be 8, while default is 16\n");
-			chip->flags &= ~AT24_FLAG_ADDR16;
-			break;
-		case 16:
-			chip->flags |= AT24_FLAG_ADDR16;
-			break;
-		default:
-			dev_warn(dev, "Bad \"address-width\" property: %u\n",
-				 val);
-		}
-	}
-
-	err = device_property_read_u32(dev, "size", &val);
-	if (!err)
-		chip->byte_len = val;
-
-	err = device_property_read_u32(dev, "pagesize", &val);
-	if (!err) {
-		chip->page_size = val;
-	} else {
-		/*
-		 * This is slow, but we can't know all eeproms, so we better
-		 * play safe. Specifying custom eeprom-types via platform_data
-		 * is recommended anyhow.
-		 */
-		chip->page_size = 1;
-	}
-}
-
-static int at24_get_pdata(struct device *dev, struct at24_platform_data *pdata)
+static const struct at24_chip_data *at24_get_chip_data(struct device *dev)
 {
 	struct device_node *of_node = dev->of_node;
 	const struct at24_chip_data *cdata;
 	const struct i2c_device_id *id;
-	struct at24_platform_data *pd;
-
-	pd = dev_get_platdata(dev);
-	if (pd) {
-		memcpy(pdata, pd, sizeof(*pdata));
-		return 0;
-	}
 
 	id = i2c_match_id(at24_ids, to_i2c_client(dev));
 
@@ -544,13 +502,9 @@ static int at24_get_pdata(struct device *dev, struct at24_platform_data *pdata)
 		cdata = acpi_device_get_match_data(dev);
 
 	if (!cdata)
-		return -ENODEV;
+		return ERR_PTR(-ENODEV);
 
-	pdata->byte_len = cdata->byte_len;
-	pdata->flags = cdata->flags;
-	at24_properties_to_pdata(dev, pdata);
-
-	return 0;
+	return cdata;
 }
 
 static void at24_remove_dummy_clients(struct at24_data *at24)
@@ -619,7 +573,8 @@ static int at24_probe(struct i2c_client *client)
 {
 	struct regmap_config regmap_config = { };
 	struct nvmem_config nvmem_config = { };
-	struct at24_platform_data pdata = { };
+	u32 byte_len, page_size, flags, addrw;
+	const struct at24_chip_data *cdata;
 	struct device *dev = &client->dev;
 	bool i2c_fn_i2c, i2c_fn_block;
 	unsigned int i, num_addresses;
@@ -634,35 +589,72 @@ static int at24_probe(struct i2c_client *client)
 	i2c_fn_block = i2c_check_functionality(client->adapter,
 					       I2C_FUNC_SMBUS_WRITE_I2C_BLOCK);
 
-	err = at24_get_pdata(dev, &pdata);
+	cdata = at24_get_chip_data(dev);
+	if (IS_ERR(cdata))
+		return PTR_ERR(cdata);
+
+	err = device_property_read_u32(dev, "pagesize", &page_size);
 	if (err)
-		return err;
+		/*
+		 * This is slow, but we can't know all eeproms, so we better
+		 * play safe. Specifying custom eeprom-types via platform_data
+		 * is recommended anyhow.
+		 */
+		page_size = 1;
+
+	flags = cdata->flags;
+	if (device_property_present(dev, "read-only"))
+		flags |= AT24_FLAG_READONLY;
+	if (device_property_present(dev, "no-read-rollover"))
+		flags |= AT24_FLAG_NO_RDROL;
+
+	err = device_property_read_u32(dev, "address-width", &addrw);
+	if (!err) {
+		switch (addrw) {
+		case 8:
+			if (flags & AT24_FLAG_ADDR16)
+				dev_warn(dev,
+					 "Override address width to be 8, while default is 16\n");
+			flags &= ~AT24_FLAG_ADDR16;
+			break;
+		case 16:
+			flags |= AT24_FLAG_ADDR16;
+			break;
+		default:
+			dev_warn(dev, "Bad \"address-width\" property: %u\n",
+				 addrw);
+		}
+	}
+
+	err = device_property_read_u32(dev, "size", &byte_len);
+	if (err)
+		byte_len = cdata->byte_len;
 
 	if (!i2c_fn_i2c && !i2c_fn_block)
-		pdata.page_size = 1;
+		page_size = 1;
 
-	if (!pdata.page_size) {
+	if (!page_size) {
 		dev_err(dev, "page_size must not be 0!\n");
 		return -EINVAL;
 	}
 
-	if (!is_power_of_2(pdata.page_size))
+	if (!is_power_of_2(page_size))
 		dev_warn(dev, "page_size looks suspicious (no power of 2)!\n");
 
-	if (pdata.flags & AT24_FLAG_TAKE8ADDR)
+	if (flags & AT24_FLAG_TAKE8ADDR)
 		num_addresses = 8;
 	else
-		num_addresses =	DIV_ROUND_UP(pdata.byte_len,
-			(pdata.flags & AT24_FLAG_ADDR16) ? 65536 : 256);
+		num_addresses =	DIV_ROUND_UP(byte_len,
+			(flags & AT24_FLAG_ADDR16) ? 65536 : 256);
 
-	if ((pdata.flags & AT24_FLAG_SERIAL) && (pdata.flags & AT24_FLAG_MAC)) {
+	if ((flags & AT24_FLAG_SERIAL) && (flags & AT24_FLAG_MAC)) {
 		dev_err(dev,
 			"invalid device data - cannot have both AT24_FLAG_SERIAL & AT24_FLAG_MAC.");
 		return -EINVAL;
 	}
 
 	regmap_config.val_bits = 8;
-	regmap_config.reg_bits = (pdata.flags & AT24_FLAG_ADDR16) ? 16 : 8;
+	regmap_config.reg_bits = (flags & AT24_FLAG_ADDR16) ? 16 : 8;
 	regmap_config.disable_locking = true;
 
 	regmap = devm_regmap_init_i2c(client, &regmap_config);
@@ -675,11 +667,11 @@ static int at24_probe(struct i2c_client *client)
 		return -ENOMEM;
 
 	mutex_init(&at24->lock);
-	at24->byte_len = pdata.byte_len;
-	at24->page_size = pdata.page_size;
-	at24->flags = pdata.flags;
+	at24->byte_len = byte_len;
+	at24->page_size = page_size;
+	at24->flags = flags;
 	at24->num_addresses = num_addresses;
-	at24->offset_adj = at24_get_offset_adj(pdata.flags, pdata.byte_len);
+	at24->offset_adj = at24_get_offset_adj(flags, byte_len);
 	at24->client[0].client = client;
 	at24->client[0].regmap = regmap;
 
@@ -687,10 +679,10 @@ static int at24_probe(struct i2c_client *client)
 	if (IS_ERR(at24->wp_gpio))
 		return PTR_ERR(at24->wp_gpio);
 
-	writable = !(pdata.flags & AT24_FLAG_READONLY);
+	writable = !(flags & AT24_FLAG_READONLY);
 	if (writable) {
 		at24->write_max = min_t(unsigned int,
-					pdata.page_size, at24_io_limit);
+					page_size, at24_io_limit);
 		if (!i2c_fn_i2c && at24->write_max > I2C_SMBUS_BLOCK_MAX)
 			at24->write_max = I2C_SMBUS_BLOCK_MAX;
 	}
@@ -733,7 +725,7 @@ static int at24_probe(struct i2c_client *client)
 	nvmem_config.priv = at24;
 	nvmem_config.stride = 1;
 	nvmem_config.word_size = 1;
-	nvmem_config.size = pdata.byte_len;
+	nvmem_config.size = byte_len;
 
 	at24->nvmem = devm_nvmem_register(dev, &nvmem_config);
 	if (IS_ERR(at24->nvmem)) {
@@ -742,13 +734,9 @@ static int at24_probe(struct i2c_client *client)
 	}
 
 	dev_info(dev, "%u byte %s EEPROM, %s, %u bytes/write\n",
-		 pdata.byte_len, client->name,
+		 byte_len, client->name,
 		 writable ? "writable" : "read-only", at24->write_max);
 
-	/* export data to kernel code */
-	if (pdata.setup)
-		pdata.setup(at24->nvmem, pdata.context);
-
 	return 0;
 
 err_clients:
diff --git a/include/linux/platform_data/at24.h b/include/linux/platform_data/at24.h
deleted file mode 100644
index 63507ff464ee..000000000000
--- a/include/linux/platform_data/at24.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * at24.h - platform_data for the at24 (generic eeprom) driver
- * (C) Copyright 2008 by Pengutronix
- * (C) Copyright 2012 by Wolfram Sang
- * same license as the driver
- */
-
-#ifndef _LINUX_AT24_H
-#define _LINUX_AT24_H
-
-#include <linux/types.h>
-#include <linux/nvmem-consumer.h>
-#include <linux/bitops.h>
-
-/**
- * struct at24_platform_data - data to set up at24 (generic eeprom) driver
- * @byte_len: size of eeprom in byte
- * @page_size: number of byte which can be written in one go
- * @flags: tunable options, check AT24_FLAG_* defines
- * @setup: an optional callback invoked after eeprom is probed; enables kernel
-	code to access eeprom via nvmem, see example
- * @context: optional parameter passed to setup()
- *
- * If you set up a custom eeprom type, please double-check the parameters.
- * Especially page_size needs extra care, as you risk data loss if your value
- * is bigger than what the chip actually supports!
- *
- * An example in pseudo code for a setup() callback:
- *
- * void get_mac_addr(struct nvmem_device *nvmem, void *context)
- * {
- *	u8 *mac_addr = ethernet_pdata->mac_addr;
- *	off_t offset = context;
- *
- *	// Read MAC addr from EEPROM
- *	if (nvmem_device_read(nvmem, offset, ETH_ALEN, mac_addr) == ETH_ALEN)
- *		pr_info("Read MAC addr from EEPROM: %pM\n", mac_addr);
- * }
- *
- * This function pointer and context can now be set up in at24_platform_data.
- */
-
-struct at24_platform_data {
-	u32		byte_len;		/* size (sum of all addr) */
-	u16		page_size;		/* for writes */
-	u8		flags;
-#define AT24_FLAG_ADDR16	BIT(7)	/* address pointer is 16 bit */
-#define AT24_FLAG_READONLY	BIT(6)	/* sysfs-entry will be read-only */
-#define AT24_FLAG_IRUGO		BIT(5)	/* sysfs-entry will be world-readable */
-#define AT24_FLAG_TAKE8ADDR	BIT(4)	/* take always 8 addresses (24c00) */
-#define AT24_FLAG_SERIAL	BIT(3)	/* factory-programmed serial number */
-#define AT24_FLAG_MAC		BIT(2)	/* factory-programmed mac address */
-#define AT24_FLAG_NO_RDROL	BIT(1)	/* does not auto-rollover reads to */
-					/* the next slave address */
-
-	void		(*setup)(struct nvmem_device *nvmem, void *context);
-	void		*context;
-};
-
-#endif /* _LINUX_AT24_H */
-- 
cgit v1.2.3


From a0584ee9aed805446b044ce855e67264f0dc619e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:24:58 -0500
Subject: SUNRPC: Use struct xdr_stream when decoding RPC Reply header

Modernize and harden the code path that parses an RPC Reply
message.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h    |  15 ++-
 include/linux/sunrpc/xdr.h     |   1 +
 net/sunrpc/auth.c              |  63 ++++++++-----
 net/sunrpc/auth_gss/auth_gss.c | 204 ++++++++++++++++++++++-------------------
 net/sunrpc/auth_null.c         |  31 +++----
 net/sunrpc/auth_unix.c         |  42 +++++----
 net/sunrpc/clnt.c              |  88 +++++++++---------
 7 files changed, 243 insertions(+), 201 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 96e237f8e60b..c51e1893f77e 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -134,11 +134,12 @@ struct rpc_credops {
 	int			(*crmarshal)(struct rpc_task *task,
 					     struct xdr_stream *xdr);
 	int			(*crrefresh)(struct rpc_task *);
-	__be32 *		(*crvalidate)(struct rpc_task *, __be32 *);
+	int			(*crvalidate)(struct rpc_task *task,
+					      struct xdr_stream *xdr);
 	int			(*crwrap_req)(struct rpc_task *task,
 					      struct xdr_stream *xdr);
-	int			(*crunwrap_resp)(struct rpc_task *, kxdrdproc_t,
-						void *, __be32 *, void *);
+	int			(*crunwrap_resp)(struct rpc_task *task,
+						 struct xdr_stream *xdr);
 	int			(*crkey_timeout)(struct rpc_cred *);
 	char *			(*crstringify_acceptor)(struct rpc_cred *);
 	bool			(*crneed_reencode)(struct rpc_task *);
@@ -168,12 +169,16 @@ struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
 void			put_rpccred(struct rpc_cred *);
 int			rpcauth_marshcred(struct rpc_task *task,
 					  struct xdr_stream *xdr);
-__be32 *		rpcauth_checkverf(struct rpc_task *, __be32 *);
+int			rpcauth_checkverf(struct rpc_task *task,
+					  struct xdr_stream *xdr);
 int			rpcauth_wrap_req_encode(struct rpc_task *task,
 						struct xdr_stream *xdr);
 int			rpcauth_wrap_req(struct rpc_task *task,
 					 struct xdr_stream *xdr);
-int			rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *data, void *obj);
+int			rpcauth_unwrap_resp_decode(struct rpc_task *task,
+						   struct xdr_stream *xdr);
+int			rpcauth_unwrap_resp(struct rpc_task *task,
+					    struct xdr_stream *xdr);
 bool			rpcauth_xmit_need_reencode(struct rpc_task *task);
 int			rpcauth_refreshcred(struct rpc_task *);
 void			rpcauth_invalcred(struct rpc_task *);
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index c54041950cc0..65af6a204b75 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -89,6 +89,7 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
 
 #define	rpc_auth_null	cpu_to_be32(RPC_AUTH_NULL)
 #define	rpc_auth_unix	cpu_to_be32(RPC_AUTH_UNIX)
+#define	rpc_auth_short	cpu_to_be32(RPC_AUTH_SHORT)
 #define	rpc_auth_gss	cpu_to_be32(RPC_AUTH_GSS)
 
 #define	rpc_call	cpu_to_be32(RPC_CALL)
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index add2135d9b01..e7861026b9e5 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -17,6 +17,8 @@
 #include <linux/sunrpc/gss_api.h>
 #include <linux/spinlock.h>
 
+#include <trace/events/sunrpc.h>
+
 #define RPC_CREDCACHE_DEFAULT_HASHBITS	(4)
 struct rpc_cred_cache {
 	struct hlist_head	*hashtable;
@@ -773,14 +775,6 @@ int rpcauth_marshcred(struct rpc_task *task, struct xdr_stream *xdr)
 	return ops->crmarshal(task, xdr);
 }
 
-__be32 *
-rpcauth_checkverf(struct rpc_task *task, __be32 *p)
-{
-	struct rpc_cred	*cred = task->tk_rqstp->rq_cred;
-
-	return cred->cr_ops->crvalidate(task, p);
-}
-
 /**
  * rpcauth_wrap_req_encode - XDR encode the RPC procedure
  * @task: controlling RPC task
@@ -814,27 +808,52 @@ int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
 	return ops->crwrap_req(task, xdr);
 }
 
-static int
-rpcauth_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
-			  __be32 *data, void *obj)
+/**
+ * rpcauth_checkverf - Validate verifier in RPC Reply header
+ * @task: controlling RPC task
+ * @xdr: xdr_stream containing RPC Reply header
+ *
+ * On success, @xdr is updated to point past the verifier and
+ * zero is returned. Otherwise, @xdr is in an undefined state
+ * and a negative errno is returned.
+ */
+int
+rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	struct xdr_stream xdr;
+	const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, data, rqstp);
-	return decode(rqstp, &xdr, obj);
+	return ops->crvalidate(task, xdr);
 }
 
+/**
+ * rpcauth_unwrap_resp_decode - Invoke XDR decode function
+ * @task: controlling RPC task
+ * @xdr: stream where the Reply message resides
+ *
+ * Returns zero on success; otherwise a negative errno is returned.
+ */
 int
-rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp,
-		__be32 *data, void *obj)
+rpcauth_unwrap_resp_decode(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode;
+
+	return decode(task->tk_rqstp, xdr, task->tk_msg.rpc_resp);
+}
+EXPORT_SYMBOL_GPL(rpcauth_unwrap_resp_decode);
+
+/**
+ * rpcauth_unwrap_resp - Invoke unwrap and decode function for the cred
+ * @task: controlling RPC task
+ * @xdr: stream where the Reply message resides
+ *
+ * Returns zero on success; otherwise a negative errno is returned.
+ */
+int
+rpcauth_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr)
+{
+	const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops;
 
-	if (cred->cr_ops->crunwrap_resp)
-		return cred->cr_ops->crunwrap_resp(task, decode, rqstp,
-						   data, obj);
-	/* By default, we decode the arguments normally. */
-	return rpcauth_unwrap_req_decode(decode, rqstp, data, obj);
+	return ops->crunwrap_resp(task, xdr);
 }
 
 bool
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index b333b1bdad45..206788e8b787 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1671,59 +1671,62 @@ gss_refresh_null(struct rpc_task *task)
 	return 0;
 }
 
-static __be32 *
-gss_validate(struct rpc_task *task, __be32 *p)
+static int
+gss_validate(struct rpc_task *task, struct xdr_stream *xdr)
 {
 	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
-	__be32		*seq = NULL;
+	__be32		*p, *seq = NULL;
 	struct kvec	iov;
 	struct xdr_buf	verf_buf;
 	struct xdr_netobj mic;
-	u32		flav,len;
-	u32		maj_stat;
-	__be32		*ret = ERR_PTR(-EIO);
+	u32		len, maj_stat;
+	int		status;
 
-	dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+	p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+	if (!p)
+		goto validate_failed;
+	if (*p++ != rpc_auth_gss)
+		goto validate_failed;
+	len = be32_to_cpup(p);
+	if (len > RPC_MAX_AUTH_SIZE)
+		goto validate_failed;
+	p = xdr_inline_decode(xdr, len);
+	if (!p)
+		goto validate_failed;
 
-	flav = ntohl(*p++);
-	if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE)
-		goto out_bad;
-	if (flav != RPC_AUTH_GSS)
-		goto out_bad;
 	seq = kmalloc(4, GFP_NOFS);
 	if (!seq)
-		goto out_bad;
-	*seq = htonl(task->tk_rqstp->rq_seqno);
+		goto validate_failed;
+	*seq = cpu_to_be32(task->tk_rqstp->rq_seqno);
 	iov.iov_base = seq;
 	iov.iov_len = 4;
 	xdr_buf_from_iov(&iov, &verf_buf);
 	mic.data = (u8 *)p;
 	mic.len = len;
-
-	ret = ERR_PTR(-EACCES);
 	maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
-	if (maj_stat) {
-		dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n",
-			task->tk_pid, __func__, maj_stat);
-		goto out_bad;
-	}
+	if (maj_stat)
+		goto bad_mic;
+
 	/* We leave it to unwrap to calculate au_rslack. For now we just
 	 * calculate the length of the verifier: */
 	cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
+	status = 0;
+out:
 	gss_put_ctx(ctx);
-	dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n",
-			task->tk_pid, __func__);
-	kfree(seq);
-	return p + XDR_QUADLEN(len);
-out_bad:
-	gss_put_ctx(ctx);
-	dprintk("RPC: %5u %s failed ret %ld.\n", task->tk_pid, __func__,
-		PTR_ERR(ret));
 	kfree(seq);
-	return ret;
+	return status;
+
+validate_failed:
+	status = -EIO;
+	goto out;
+bad_mic:
+	dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n",
+		task->tk_pid, __func__, maj_stat);
+	status = -EACCES;
+	goto out;
 }
 
 static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
@@ -1921,79 +1924,98 @@ out:
 	return status;
 }
 
-static inline int
+static int
+gss_unwrap_resp_auth(struct rpc_cred *cred)
+{
+	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize;
+	return 0;
+}
+
+static int
 gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
-		struct rpc_rqst *rqstp, __be32 **p)
+		      struct rpc_rqst *rqstp, struct xdr_stream *xdr)
 {
-	struct xdr_buf	*rcv_buf = &rqstp->rq_rcv_buf;
-	struct xdr_buf integ_buf;
+	struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
+	u32 data_offset, mic_offset, integ_len, maj_stat;
 	struct xdr_netobj mic;
-	u32 data_offset, mic_offset;
-	u32 integ_len;
-	u32 maj_stat;
-	int status = -EIO;
+	__be32 *p;
 
-	integ_len = ntohl(*(*p)++);
+	p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+	if (unlikely(!p))
+		goto unwrap_failed;
+	integ_len = be32_to_cpup(p++);
 	if (integ_len & 3)
-		return status;
-	data_offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+		goto unwrap_failed;
+	data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base;
 	mic_offset = integ_len + data_offset;
 	if (mic_offset > rcv_buf->len)
-		return status;
-	if (ntohl(*(*p)++) != rqstp->rq_seqno)
-		return status;
-
-	if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset,
-				mic_offset - data_offset))
-		return status;
+		goto unwrap_failed;
+	if (be32_to_cpup(p) != rqstp->rq_seqno)
+		goto unwrap_failed;
 
+	if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
+		goto unwrap_failed;
 	if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset))
-		return status;
-
+		goto unwrap_failed;
 	maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	if (maj_stat != GSS_S_COMPLETE)
-		return status;
+		goto bad_mic;
+
+	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + 2 +
+				   1 + XDR_QUADLEN(mic.len);
 	return 0;
+unwrap_failed:
+	return -EIO;
+bad_mic:
+	dprintk("RPC:       %s: gss_verify_mic returned error 0x%08x\n",
+		__func__, maj_stat);
+	return -EIO;
 }
 
-static inline int
+static int
 gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
-		struct rpc_rqst *rqstp, __be32 **p)
-{
-	struct xdr_buf  *rcv_buf = &rqstp->rq_rcv_buf;
-	u32 offset;
-	u32 opaque_len;
-	u32 maj_stat;
-	int status = -EIO;
-
-	opaque_len = ntohl(*(*p)++);
-	offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+		     struct rpc_rqst *rqstp, struct xdr_stream *xdr)
+{
+	struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
+	struct kvec *head = rqstp->rq_rcv_buf.head;
+	unsigned int savedlen = rcv_buf->len;
+	u32 offset, opaque_len, maj_stat;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+	if (unlikely(!p))
+		goto unwrap_failed;
+	opaque_len = be32_to_cpup(p++);
+	offset = (u8 *)(p) - (u8 *)head->iov_base;
 	if (offset + opaque_len > rcv_buf->len)
-		return status;
-	/* remove padding: */
+		goto unwrap_failed;
 	rcv_buf->len = offset + opaque_len;
 
 	maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	if (maj_stat != GSS_S_COMPLETE)
-		return status;
-	if (ntohl(*(*p)++) != rqstp->rq_seqno)
-		return status;
-
-	return 0;
-}
+		goto bad_unwrap;
+	/* gss_unwrap decrypted the sequence number */
+	if (be32_to_cpup(p++) != rqstp->rq_seqno)
+		goto unwrap_failed;
 
-static int
-gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
-		      __be32 *p, void *obj)
-{
-	struct xdr_stream xdr;
+	/* gss_unwrap redacts the opaque blob from the head iovec.
+	 * rcv_buf has changed, thus the stream needs to be reset.
+	 */
+	xdr_init_decode(xdr, rcv_buf, p, rqstp);
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p, rqstp);
-	return decode(rqstp, &xdr, obj);
+	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + 2 +
+				   XDR_QUADLEN(savedlen - rcv_buf->len);
+	return 0;
+unwrap_failed:
+	return -EIO;
+bad_unwrap:
+	dprintk("RPC:       %s: gss_unwrap returned error 0x%08x\n",
+		__func__, maj_stat);
+	return -EIO;
 }
 
 static bool
@@ -2037,39 +2059,33 @@ out:
 }
 
 static int
-gss_unwrap_resp(struct rpc_task *task,
-		kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj)
+gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	struct rpc_rqst *rqstp = task->tk_rqstp;
+	struct rpc_cred *cred = rqstp->rq_cred;
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
 			gc_base);
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
-	__be32		*savedp = p;
-	struct kvec	*head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head;
-	int		savedlen = head->iov_len;
-	int             status = -EIO;
+	int status = -EIO;
 
 	if (ctx->gc_proc != RPC_GSS_PROC_DATA)
 		goto out_decode;
 	switch (gss_cred->gc_service) {
 	case RPC_GSS_SVC_NONE:
+		status = gss_unwrap_resp_auth(cred);
 		break;
 	case RPC_GSS_SVC_INTEGRITY:
-		status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p);
-		if (status)
-			goto out;
+		status = gss_unwrap_resp_integ(cred, ctx, rqstp, xdr);
 		break;
 	case RPC_GSS_SVC_PRIVACY:
-		status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p);
-		if (status)
-			goto out;
+		status = gss_unwrap_resp_priv(cred, ctx, rqstp, xdr);
 		break;
 	}
-	/* take into account extra slack for integrity and privacy cases: */
-	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp)
-						+ (savedlen - head->iov_len);
+	if (status)
+		goto out;
+
 out_decode:
-	status = gss_unwrap_req_decode(decode, rqstp, p, obj);
+	status = rpcauth_unwrap_resp_decode(task, xdr);
 out:
 	gss_put_ctx(ctx);
 	dprintk("RPC: %5u %s returning %d\n",
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 797f8472c21b..bf96975ffc4b 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -86,25 +86,19 @@ nul_refresh(struct rpc_task *task)
 	return 0;
 }
 
-static __be32 *
-nul_validate(struct rpc_task *task, __be32 *p)
+static int
+nul_validate(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	rpc_authflavor_t	flavor;
-	u32			size;
-
-	flavor = ntohl(*p++);
-	if (flavor != RPC_AUTH_NULL) {
-		printk("RPC: bad verf flavor: %u\n", flavor);
-		return ERR_PTR(-EIO);
-	}
-
-	size = ntohl(*p++);
-	if (size != 0) {
-		printk("RPC: bad verf size: %u\n", size);
-		return ERR_PTR(-EIO);
-	}
-
-	return p;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+	if (!p)
+		return -EIO;
+	if (*p++ != rpc_auth_null)
+		return -EIO;
+	if (*p != xdr_zero)
+		return -EIO;
+	return 0;
 }
 
 const struct rpc_authops authnull_ops = {
@@ -134,6 +128,7 @@ const struct rpc_credops null_credops = {
 	.crwrap_req	= rpcauth_wrap_req_encode,
 	.crrefresh	= nul_refresh,
 	.crvalidate	= nul_validate,
+	.crunwrap_resp	= rpcauth_unwrap_resp_decode,
 };
 
 static
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 1d5b7ed9c6f7..5ea84a96f96e 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -160,29 +160,32 @@ unx_refresh(struct rpc_task *task)
 	return 0;
 }
 
-static __be32 *
-unx_validate(struct rpc_task *task, __be32 *p)
+static int
+unx_validate(struct rpc_task *task, struct xdr_stream *xdr)
 {
-	rpc_authflavor_t	flavor;
-	u32			size;
-
-	flavor = ntohl(*p++);
-	if (flavor != RPC_AUTH_NULL &&
-	    flavor != RPC_AUTH_UNIX &&
-	    flavor != RPC_AUTH_SHORT) {
-		printk("RPC: bad verf flavor: %u\n", flavor);
-		return ERR_PTR(-EIO);
-	}
+	__be32 *p;
+	u32 size;
 
-	size = ntohl(*p++);
-	if (size > RPC_MAX_AUTH_SIZE) {
-		printk("RPC: giant verf size: %u\n", size);
-		return ERR_PTR(-EIO);
+	p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+	if (!p)
+		return -EIO;
+	switch (*p++) {
+	case rpc_auth_null:
+	case rpc_auth_unix:
+	case rpc_auth_short:
+		break;
+	default:
+		return -EIO;
 	}
-	task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2;
-	p += (size >> 2);
+	size = be32_to_cpup(p);
+	if (size > RPC_MAX_AUTH_SIZE)
+		return -EIO;
+	p = xdr_inline_decode(xdr, size);
+	if (!p)
+		return -EIO;
 
-	return p;
+	task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2;
+	return 0;
 }
 
 int __init rpc_init_authunix(void)
@@ -223,4 +226,5 @@ const struct rpc_credops unix_credops = {
 	.crwrap_req	= rpcauth_wrap_req_encode,
 	.crrefresh	= unx_refresh,
 	.crvalidate	= unx_validate,
+	.crunwrap_resp	= rpcauth_unwrap_resp_decode,
 };
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e9735089bd66..803e93105af1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -79,7 +79,8 @@ static void	call_connect_status(struct rpc_task *task);
 
 static int	rpc_encode_header(struct rpc_task *task,
 				  struct xdr_stream *xdr);
-static __be32	*rpc_decode_header(struct rpc_task *task);
+static int	rpc_decode_header(struct rpc_task *task,
+				  struct xdr_stream *xdr);
 static int	rpc_ping(struct rpc_clnt *clnt);
 
 static void rpc_register_client(struct rpc_clnt *clnt)
@@ -2251,12 +2252,11 @@ call_decode(struct rpc_task *task)
 {
 	struct rpc_clnt	*clnt = task->tk_client;
 	struct rpc_rqst	*req = task->tk_rqstp;
-	kxdrdproc_t	decode = task->tk_msg.rpc_proc->p_decode;
-	__be32		*p;
+	struct xdr_stream xdr;
 
 	dprint_status(task);
 
-	if (!decode) {
+	if (!task->tk_msg.rpc_proc->p_decode) {
 		task->tk_action = rpc_exit_task;
 		return;
 	}
@@ -2292,29 +2292,27 @@ call_decode(struct rpc_task *task)
 		goto out_retry;
 	}
 
-	p = rpc_decode_header(task);
-	if (IS_ERR(p)) {
-		if (p == ERR_PTR(-EAGAIN))
-			goto out_retry;
+	xdr_init_decode(&xdr, &req->rq_rcv_buf,
+			req->rq_rcv_buf.head[0].iov_base, req);
+	switch (rpc_decode_header(task, &xdr)) {
+	case 0:
+		task->tk_action = rpc_exit_task;
+		task->tk_status = rpcauth_unwrap_resp(task, &xdr);
+		dprintk("RPC: %5u %s result %d\n",
+			task->tk_pid, __func__, task->tk_status);
 		return;
-	}
-	task->tk_action = rpc_exit_task;
-
-	task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
-					      task->tk_msg.rpc_resp);
-
-	dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
-			task->tk_status);
-	return;
+	case -EAGAIN:
 out_retry:
-	task->tk_status = 0;
-	/* Note: rpc_decode_header() may have freed the RPC slot */
-	if (task->tk_rqstp == req) {
-		xdr_free_bvec(&req->rq_rcv_buf);
-		req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0;
-		if (task->tk_client->cl_discrtry)
-			xprt_conditional_disconnect(req->rq_xprt,
-					req->rq_connect_cookie);
+		task->tk_status = 0;
+		/* Note: rpc_decode_header() may have freed the RPC slot */
+		if (task->tk_rqstp == req) {
+			xdr_free_bvec(&req->rq_rcv_buf);
+			req->rq_reply_bytes_recvd = 0;
+			req->rq_rcv_buf.len = 0;
+			if (task->tk_client->cl_discrtry)
+				xprt_conditional_disconnect(req->rq_xprt,
+							    req->rq_connect_cookie);
+		}
 	}
 }
 
@@ -2347,14 +2345,12 @@ out_fail:
 	return error;
 }
 
-static noinline __be32 *
-rpc_decode_header(struct rpc_task *task)
+static noinline int
+rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr)
 {
 	struct rpc_clnt *clnt = task->tk_client;
-	struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0];
-	int len = task->tk_rqstp->rq_rcv_buf.len >> 2;
-	__be32	*p = iov->iov_base;
 	int error = -EACCES;
+	__be32 *p;
 
 	/* RFC-1014 says that the representation of XDR data must be a
 	 * multiple of four bytes
@@ -2363,25 +2359,26 @@ rpc_decode_header(struct rpc_task *task)
 	 */
 	if (task->tk_rqstp->rq_rcv_buf.len & 3)
 		goto out_badlen;
-	if ((len -= 3) < 0)
-		goto out_unparsable;
 
+	p = xdr_inline_decode(xdr, 3 * sizeof(*p));
+	if (!p)
+		goto out_unparsable;
 	p++;	/* skip XID */
 	if (*p++ != rpc_reply)
 		goto out_unparsable;
 	if (*p++ != rpc_msg_accepted)
 		goto out_msg_denied;
 
-	p = rpcauth_checkverf(task, p);
-	if (IS_ERR(p))
+	error = rpcauth_checkverf(task, xdr);
+	if (error)
 		goto out_verifier;
 
-	len = p - (__be32 *)iov->iov_base - 1;
-	if (len < 0)
+	p = xdr_inline_decode(xdr, sizeof(*p));
+	if (!p)
 		goto out_unparsable;
-	switch (*p++) {
+	switch (*p) {
 	case rpc_success:
-		return p;
+		return 0;
 	case rpc_prog_unavail:
 		trace_rpc__prog_unavail(task);
 		error = -EPFNOSUPPORT;
@@ -2406,11 +2403,11 @@ out_garbage:
 	if (task->tk_garb_retry) {
 		task->tk_garb_retry--;
 		task->tk_action = call_encode;
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 	}
 out_err:
 	rpc_exit(task, error);
-	return ERR_PTR(error);
+	return error;
 
 out_badlen:
 	trace_rpc__unparsable(task);
@@ -2424,10 +2421,12 @@ out_unparsable:
 
 out_verifier:
 	trace_rpc_bad_verifier(task);
-	error = PTR_ERR(p);
 	goto out_garbage;
 
 out_msg_denied:
+	p = xdr_inline_decode(xdr, sizeof(*p));
+	if (!p)
+		goto out_unparsable;
 	switch (*p++) {
 	case rpc_auth_error:
 		break;
@@ -2441,6 +2440,9 @@ out_msg_denied:
 		goto out_err;
 	}
 
+	p = xdr_inline_decode(xdr, sizeof(*p));
+	if (!p)
+		goto out_unparsable;
 	switch (*p++) {
 	case rpc_autherr_rejectedcred:
 	case rpc_autherr_rejectedverf:
@@ -2454,7 +2456,7 @@ out_msg_denied:
 		/* Ensure we obtain a new XID! */
 		xprt_release(task);
 		task->tk_action = call_reserve;
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 	case rpc_autherr_badcred:
 	case rpc_autherr_badverf:
 		/* possibly garbled cred/verf? */
@@ -2463,7 +2465,7 @@ out_msg_denied:
 		task->tk_garb_retry--;
 		trace_rpc__bad_creds(task);
 		task->tk_action = call_encode;
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 	case rpc_autherr_tooweak:
 		trace_rpc__auth_tooweak(task);
 		pr_warn("RPC: server %s requires stronger authentication.\n",
-- 
cgit v1.2.3


From 0c77668ddb4e7bdfbca462c6185d154d0b8889ae Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:25:04 -0500
Subject: SUNRPC: Introduce trace points in rpc_auth_gss.ko

Add infrastructure for trace points in the RPC_AUTH_GSS kernel
module, and add a few sample trace points. These report exceptional
or unexpected events, and observe the assignment of GSS sequence
numbers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcgss.h  | 361 +++++++++++++++++++++++++++++++++++++++++
 include/trace/events/rpcrdma.h |  12 +-
 include/trace/events/sunrpc.h  |  61 ++++++-
 net/sunrpc/auth_gss/Makefile   |   2 +-
 net/sunrpc/auth_gss/auth_gss.c | 165 +++++++++----------
 net/sunrpc/auth_gss/trace.c    |  11 ++
 net/sunrpc/xprt.c              |  10 +-
 7 files changed, 530 insertions(+), 92 deletions(-)
 create mode 100644 include/trace/events/rpcgss.h
 create mode 100644 net/sunrpc/auth_gss/trace.c

(limited to 'include')

diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h
new file mode 100644
index 000000000000..d1f7fe1b6fe4
--- /dev/null
+++ b/include/trace/events/rpcgss.h
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 Oracle.  All rights reserved.
+ *
+ * Trace point definitions for the "rpcgss" subsystem.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rpcgss
+
+#if !defined(_TRACE_RPCRDMA_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RPCGSS_H
+
+#include <linux/tracepoint.h>
+
+/**
+ ** GSS-API related trace events
+ **/
+
+TRACE_DEFINE_ENUM(GSS_S_BAD_MECH);
+TRACE_DEFINE_ENUM(GSS_S_BAD_NAME);
+TRACE_DEFINE_ENUM(GSS_S_BAD_NAMETYPE);
+TRACE_DEFINE_ENUM(GSS_S_BAD_BINDINGS);
+TRACE_DEFINE_ENUM(GSS_S_BAD_STATUS);
+TRACE_DEFINE_ENUM(GSS_S_BAD_SIG);
+TRACE_DEFINE_ENUM(GSS_S_NO_CRED);
+TRACE_DEFINE_ENUM(GSS_S_NO_CONTEXT);
+TRACE_DEFINE_ENUM(GSS_S_DEFECTIVE_TOKEN);
+TRACE_DEFINE_ENUM(GSS_S_DEFECTIVE_CREDENTIAL);
+TRACE_DEFINE_ENUM(GSS_S_CREDENTIALS_EXPIRED);
+TRACE_DEFINE_ENUM(GSS_S_CONTEXT_EXPIRED);
+TRACE_DEFINE_ENUM(GSS_S_FAILURE);
+TRACE_DEFINE_ENUM(GSS_S_BAD_QOP);
+TRACE_DEFINE_ENUM(GSS_S_UNAUTHORIZED);
+TRACE_DEFINE_ENUM(GSS_S_UNAVAILABLE);
+TRACE_DEFINE_ENUM(GSS_S_DUPLICATE_ELEMENT);
+TRACE_DEFINE_ENUM(GSS_S_NAME_NOT_MN);
+TRACE_DEFINE_ENUM(GSS_S_CONTINUE_NEEDED);
+TRACE_DEFINE_ENUM(GSS_S_DUPLICATE_TOKEN);
+TRACE_DEFINE_ENUM(GSS_S_OLD_TOKEN);
+TRACE_DEFINE_ENUM(GSS_S_UNSEQ_TOKEN);
+TRACE_DEFINE_ENUM(GSS_S_GAP_TOKEN);
+
+#define show_gss_status(x)						\
+	__print_flags(x, "|",						\
+		{ GSS_S_BAD_MECH, "GSS_S_BAD_MECH" },			\
+		{ GSS_S_BAD_NAME, "GSS_S_BAD_NAME" },			\
+		{ GSS_S_BAD_NAMETYPE, "GSS_S_BAD_NAMETYPE" },		\
+		{ GSS_S_BAD_BINDINGS, "GSS_S_BAD_BINDINGS" },		\
+		{ GSS_S_BAD_STATUS, "GSS_S_BAD_STATUS" },		\
+		{ GSS_S_BAD_SIG, "GSS_S_BAD_SIG" },			\
+		{ GSS_S_NO_CRED, "GSS_S_NO_CRED" },			\
+		{ GSS_S_NO_CONTEXT, "GSS_S_NO_CONTEXT" },		\
+		{ GSS_S_DEFECTIVE_TOKEN, "GSS_S_DEFECTIVE_TOKEN" },	\
+		{ GSS_S_DEFECTIVE_CREDENTIAL, "GSS_S_DEFECTIVE_CREDENTIAL" }, \
+		{ GSS_S_CREDENTIALS_EXPIRED, "GSS_S_CREDENTIALS_EXPIRED" }, \
+		{ GSS_S_CONTEXT_EXPIRED, "GSS_S_CONTEXT_EXPIRED" },	\
+		{ GSS_S_FAILURE, "GSS_S_FAILURE" },			\
+		{ GSS_S_BAD_QOP, "GSS_S_BAD_QOP" },			\
+		{ GSS_S_UNAUTHORIZED, "GSS_S_UNAUTHORIZED" },		\
+		{ GSS_S_UNAVAILABLE, "GSS_S_UNAVAILABLE" },		\
+		{ GSS_S_DUPLICATE_ELEMENT, "GSS_S_DUPLICATE_ELEMENT" },	\
+		{ GSS_S_NAME_NOT_MN, "GSS_S_NAME_NOT_MN" },		\
+		{ GSS_S_CONTINUE_NEEDED, "GSS_S_CONTINUE_NEEDED" },	\
+		{ GSS_S_DUPLICATE_TOKEN, "GSS_S_DUPLICATE_TOKEN" },	\
+		{ GSS_S_OLD_TOKEN, "GSS_S_OLD_TOKEN" },			\
+		{ GSS_S_UNSEQ_TOKEN, "GSS_S_UNSEQ_TOKEN" },		\
+		{ GSS_S_GAP_TOKEN, "GSS_S_GAP_TOKEN" })
+
+
+DECLARE_EVENT_CLASS(rpcgss_gssapi_event,
+	TP_PROTO(
+		const struct rpc_task *task,
+		u32 maj_stat
+	),
+
+	TP_ARGS(task, maj_stat),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, maj_stat)
+
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->maj_stat = maj_stat;
+	),
+
+	TP_printk("task:%u@%u maj_stat=%s",
+		__entry->task_id, __entry->client_id,
+		__entry->maj_stat == 0 ?
+		"GSS_S_COMPLETE" : show_gss_status(__entry->maj_stat))
+);
+
+#define DEFINE_GSSAPI_EVENT(name)					\
+	DEFINE_EVENT(rpcgss_gssapi_event, rpcgss_##name,		\
+			TP_PROTO(					\
+				const struct rpc_task *task,		\
+				u32 maj_stat				\
+			),						\
+			TP_ARGS(task, maj_stat))
+
+TRACE_EVENT(rpcgss_import_ctx,
+	TP_PROTO(
+		int status
+	),
+
+	TP_ARGS(status),
+
+	TP_STRUCT__entry(
+		__field(int, status)
+	),
+
+	TP_fast_assign(
+		__entry->status = status;
+	),
+
+	TP_printk("status=%d", __entry->status)
+);
+
+DEFINE_GSSAPI_EVENT(get_mic);
+DEFINE_GSSAPI_EVENT(verify_mic);
+DEFINE_GSSAPI_EVENT(wrap);
+DEFINE_GSSAPI_EVENT(unwrap);
+
+
+/**
+ ** GSS auth unwrap failures
+ **/
+
+TRACE_EVENT(rpcgss_unwrap_failed,
+	TP_PROTO(
+		const struct rpc_task *task
+	),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+	),
+
+	TP_printk("task:%u@%u", __entry->task_id, __entry->client_id)
+);
+
+TRACE_EVENT(rpcgss_bad_seqno,
+	TP_PROTO(
+		const struct rpc_task *task,
+		u32 expected,
+		u32 received
+	),
+
+	TP_ARGS(task, expected, received),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, expected)
+		__field(u32, received)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->expected = expected;
+		__entry->received = received;
+	),
+
+	TP_printk("task:%u@%u expected seqno %u, received seqno %u",
+		__entry->task_id, __entry->client_id,
+		__entry->expected, __entry->received)
+);
+
+TRACE_EVENT(rpcgss_seqno,
+	TP_PROTO(
+		const struct rpc_task *task
+	),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(u32, seqno)
+	),
+
+	TP_fast_assign(
+		const struct rpc_rqst *rqst = task->tk_rqstp;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->seqno = rqst->rq_seqno;
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x seqno=%u",
+		__entry->task_id, __entry->client_id,
+		__entry->xid, __entry->seqno)
+);
+
+TRACE_EVENT(rpcgss_need_reencode,
+	TP_PROTO(
+		const struct rpc_task *task,
+		u32 seq_xmit,
+		bool ret
+	),
+
+	TP_ARGS(task, seq_xmit, ret),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(u32, seq_xmit)
+		__field(u32, seqno)
+		__field(bool, ret)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid);
+		__entry->seq_xmit = seq_xmit;
+		__entry->seqno = task->tk_rqstp->rq_seqno;
+		__entry->ret = ret;
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x rq_seqno=%u seq_xmit=%u reencode %sneeded",
+		__entry->task_id, __entry->client_id,
+		__entry->xid, __entry->seqno, __entry->seq_xmit,
+		__entry->ret ? "" : "un")
+);
+
+/**
+ ** gssd upcall related trace events
+ **/
+
+TRACE_EVENT(rpcgss_upcall_msg,
+	TP_PROTO(
+		const char *buf
+	),
+
+	TP_ARGS(buf),
+
+	TP_STRUCT__entry(
+		__string(msg, buf)
+	),
+
+	TP_fast_assign(
+		__assign_str(msg, buf)
+	),
+
+	TP_printk("msg='%s'", __get_str(msg))
+);
+
+TRACE_EVENT(rpcgss_upcall_result,
+	TP_PROTO(
+		u32 uid,
+		int result
+	),
+
+	TP_ARGS(uid, result),
+
+	TP_STRUCT__entry(
+		__field(u32, uid)
+		__field(int, result)
+
+	),
+
+	TP_fast_assign(
+		__entry->uid = uid;
+		__entry->result = result;
+	),
+
+	TP_printk("for uid %u, result=%d", __entry->uid, __entry->result)
+);
+
+TRACE_EVENT(rpcgss_context,
+	TP_PROTO(
+		unsigned long expiry,
+		unsigned long now,
+		unsigned int timeout,
+		unsigned int len,
+		const u8 *data
+	),
+
+	TP_ARGS(expiry, now, timeout, len, data),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, expiry)
+		__field(unsigned long, now)
+		__field(unsigned int, timeout)
+		__field(int, len)
+		__string(acceptor, data)
+	),
+
+	TP_fast_assign(
+		__entry->expiry = expiry;
+		__entry->now = now;
+		__entry->timeout = timeout;
+		__entry->len = len;
+		strncpy(__get_str(acceptor), data, len);
+	),
+
+	TP_printk("gc_expiry=%lu now=%lu timeout=%u acceptor=%.*s",
+		__entry->expiry, __entry->now, __entry->timeout,
+		__entry->len, __get_str(acceptor))
+);
+
+
+/**
+ ** Miscellaneous events
+ */
+
+TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5);
+TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5I);
+TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5P);
+
+#define show_pseudoflavor(x)						\
+	__print_symbolic(x,						\
+		{ RPC_AUTH_GSS_KRB5, "RPC_AUTH_GSS_KRB5" },		\
+		{ RPC_AUTH_GSS_KRB5I, "RPC_AUTH_GSS_KRB5I" },		\
+		{ RPC_AUTH_GSS_KRB5P, "RPC_AUTH_GSS_KRB5P" })
+
+
+TRACE_EVENT(rpcgss_createauth,
+	TP_PROTO(
+		unsigned int flavor,
+		int error
+	),
+
+	TP_ARGS(flavor, error),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, flavor)
+		__field(int, error)
+
+	),
+
+	TP_fast_assign(
+		__entry->flavor = flavor;
+		__entry->error = error;
+	),
+
+	TP_printk("flavor=%s error=%d",
+		show_pseudoflavor(__entry->flavor), __entry->error)
+);
+
+
+#endif	/* _TRACE_RPCGSS_H */
+
+#include <trace/define_trace.h>
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 399b1aedc927..962975b4313f 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -521,12 +521,18 @@ TRACE_EVENT(xprtrdma_post_send,
 
 	TP_STRUCT__entry(
 		__field(const void *, req)
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
 		__field(int, num_sge)
 		__field(int, signaled)
 		__field(int, status)
 	),
 
 	TP_fast_assign(
+		const struct rpc_rqst *rqst = &req->rl_slot;
+
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
 		__entry->req = req;
 		__entry->num_sge = req->rl_sendctx->sc_wr.num_sge;
 		__entry->signaled = req->rl_sendctx->sc_wr.send_flags &
@@ -534,9 +540,11 @@ TRACE_EVENT(xprtrdma_post_send,
 		__entry->status = status;
 	),
 
-	TP_printk("req=%p, %d SGEs%s, status=%d",
+	TP_printk("task:%u@%u req=%p (%d SGE%s) %sstatus=%d",
+		__entry->task_id, __entry->client_id,
 		__entry->req, __entry->num_sge,
-		(__entry->signaled ? ", signaled" : ""),
+		(__entry->num_sge == 1 ? "" : "s"),
+		(__entry->signaled ? "signaled " : ""),
 		__entry->status
 	)
 );
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 0654e9c50371..e58dda8e038c 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -655,9 +655,68 @@ DECLARE_EVENT_CLASS(rpc_xprt_event,
 
 DEFINE_RPC_XPRT_EVENT(timer);
 DEFINE_RPC_XPRT_EVENT(lookup_rqst);
-DEFINE_RPC_XPRT_EVENT(transmit);
 DEFINE_RPC_XPRT_EVENT(complete_rqst);
 
+TRACE_EVENT(xprt_transmit,
+	TP_PROTO(
+		const struct rpc_rqst *rqst,
+		int status
+	),
+
+	TP_ARGS(rqst, status),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(u32, seqno)
+		__field(int, status)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->seqno = rqst->rq_seqno;
+		__entry->status = status;
+	),
+
+	TP_printk(
+		"task:%u@%u xid=0x%08x seqno=%u status=%d",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->seqno, __entry->status)
+);
+
+TRACE_EVENT(xprt_enq_xmit,
+	TP_PROTO(
+		const struct rpc_task *task,
+		int stage
+	),
+
+	TP_ARGS(task, stage),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(u32, seqno)
+		__field(int, stage)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid);
+		__entry->seqno = task->tk_rqstp->rq_seqno;
+		__entry->stage = stage;
+	),
+
+	TP_printk(
+		"task:%u@%u xid=0x%08x seqno=%u stage=%d",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->seqno, __entry->stage)
+);
+
 TRACE_EVENT(xprt_ping,
 	TP_PROTO(const struct rpc_xprt *xprt, int status),
 
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index c374268b008f..4a29f4c5dac4 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
 
 auth_rpcgss-y := auth_gss.o gss_generic_token.o \
 	gss_mech_switch.o svcauth_gss.o \
-	gss_rpc_upcall.o gss_rpc_xdr.o
+	gss_rpc_upcall.o gss_rpc_xdr.o trace.o
 
 obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
 
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 206788e8b787..3d1fbd6d3370 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -55,6 +55,8 @@
 
 #include "../netns.h"
 
+#include <trace/events/rpcgss.h>
+
 static const struct rpc_authops authgss_ops;
 
 static const struct rpc_credops gss_credops;
@@ -260,6 +262,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
 	}
 	ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS);
 	if (ret < 0) {
+		trace_rpcgss_import_ctx(ret);
 		p = ERR_PTR(ret);
 		goto err;
 	}
@@ -275,12 +278,9 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
 	if (IS_ERR(p))
 		goto err;
 done:
-	dprintk("RPC:       %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
-		__func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
-		ctx->gc_acceptor.data);
-	return p;
+	trace_rpcgss_context(ctx->gc_expiry, now, timeout,
+			     ctx->gc_acceptor.len, ctx->gc_acceptor.data);
 err:
-	dprintk("RPC:       %s returns error %ld\n", __func__, -PTR_ERR(p));
 	return p;
 }
 
@@ -354,10 +354,8 @@ __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth
 		if (auth && pos->auth->service != auth->service)
 			continue;
 		refcount_inc(&pos->count);
-		dprintk("RPC:       %s found msg %p\n", __func__, pos);
 		return pos;
 	}
-	dprintk("RPC:       %s found nothing\n", __func__);
 	return NULL;
 }
 
@@ -456,7 +454,7 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
 	size_t buflen = sizeof(gss_msg->databuf);
 	int len;
 
-	len = scnprintf(p, buflen, "mech=%s uid=%d ", mech->gm_name,
+	len = scnprintf(p, buflen, "mech=%s uid=%d", mech->gm_name,
 			from_kuid(&init_user_ns, gss_msg->uid));
 	buflen -= len;
 	p += len;
@@ -467,7 +465,7 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
 	 * identity that we are authenticating to.
 	 */
 	if (target_name) {
-		len = scnprintf(p, buflen, "target=%s ", target_name);
+		len = scnprintf(p, buflen, " target=%s", target_name);
 		buflen -= len;
 		p += len;
 		gss_msg->msg.len += len;
@@ -487,11 +485,11 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
 		char *c = strchr(service_name, '@');
 
 		if (!c)
-			len = scnprintf(p, buflen, "service=%s ",
+			len = scnprintf(p, buflen, " service=%s",
 					service_name);
 		else
 			len = scnprintf(p, buflen,
-					"service=%.*s srchost=%s ",
+					" service=%.*s srchost=%s",
 					(int)(c - service_name),
 					service_name, c + 1);
 		buflen -= len;
@@ -500,17 +498,17 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
 	}
 
 	if (mech->gm_upcall_enctypes) {
-		len = scnprintf(p, buflen, "enctypes=%s ",
+		len = scnprintf(p, buflen, " enctypes=%s",
 				mech->gm_upcall_enctypes);
 		buflen -= len;
 		p += len;
 		gss_msg->msg.len += len;
 	}
+	trace_rpcgss_upcall_msg(gss_msg->databuf);
 	len = scnprintf(p, buflen, "\n");
 	if (len == 0)
 		goto out_overflow;
 	gss_msg->msg.len += len;
-
 	gss_msg->msg.data = gss_msg->databuf;
 	return 0;
 out_overflow:
@@ -603,8 +601,6 @@ gss_refresh_upcall(struct rpc_task *task)
 	struct rpc_pipe *pipe;
 	int err = 0;
 
-	dprintk("RPC: %5u %s for uid %u\n",
-		task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid));
 	gss_msg = gss_setup_upcall(gss_auth, cred);
 	if (PTR_ERR(gss_msg) == -EAGAIN) {
 		/* XXX: warning on the first, under the assumption we
@@ -612,7 +608,8 @@ gss_refresh_upcall(struct rpc_task *task)
 		warn_gssd();
 		task->tk_timeout = 15*HZ;
 		rpc_sleep_on(&pipe_version_rpc_waitqueue, task, NULL);
-		return -EAGAIN;
+		err = -EAGAIN;
+		goto out;
 	}
 	if (IS_ERR(gss_msg)) {
 		err = PTR_ERR(gss_msg);
@@ -635,9 +632,8 @@ gss_refresh_upcall(struct rpc_task *task)
 	spin_unlock(&pipe->lock);
 	gss_release_msg(gss_msg);
 out:
-	dprintk("RPC: %5u %s for uid %u result %d\n",
-		task->tk_pid, __func__,
-		from_kuid(&init_user_ns, cred->cr_cred->fsuid),	err);
+	trace_rpcgss_upcall_result(from_kuid(&init_user_ns,
+					     cred->cr_cred->fsuid), err);
 	return err;
 }
 
@@ -652,14 +648,13 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
 	DEFINE_WAIT(wait);
 	int err;
 
-	dprintk("RPC:       %s for uid %u\n",
-		__func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid));
 retry:
 	err = 0;
 	/* if gssd is down, just skip upcalling altogether */
 	if (!gssd_running(net)) {
 		warn_gssd();
-		return -EACCES;
+		err = -EACCES;
+		goto out;
 	}
 	gss_msg = gss_setup_upcall(gss_auth, cred);
 	if (PTR_ERR(gss_msg) == -EAGAIN) {
@@ -700,8 +695,8 @@ out_intr:
 	finish_wait(&gss_msg->waitqueue, &wait);
 	gss_release_msg(gss_msg);
 out:
-	dprintk("RPC:       %s for uid %u result %d\n",
-		__func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid), err);
+	trace_rpcgss_upcall_result(from_kuid(&init_user_ns,
+					     cred->cr_cred->fsuid), err);
 	return err;
 }
 
@@ -794,7 +789,6 @@ err_put_ctx:
 err:
 	kfree(buf);
 out:
-	dprintk("RPC:       %s returning %zd\n", __func__, err);
 	return err;
 }
 
@@ -863,8 +857,6 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
 	struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg);
 
 	if (msg->errno < 0) {
-		dprintk("RPC:       %s releasing msg %p\n",
-			__func__, gss_msg);
 		refcount_inc(&gss_msg->count);
 		gss_unhash_msg(gss_msg);
 		if (msg->errno == -ETIMEDOUT)
@@ -1024,8 +1016,6 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 	struct rpc_auth * auth;
 	int err = -ENOMEM; /* XXX? */
 
-	dprintk("RPC:       creating GSS authenticator for client %p\n", clnt);
-
 	if (!try_module_get(THIS_MODULE))
 		return ERR_PTR(err);
 	if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
@@ -1041,10 +1031,8 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 	gss_auth->net = get_net(rpc_net_ns(clnt));
 	err = -EINVAL;
 	gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
-	if (!gss_auth->mech) {
-		dprintk("RPC:       Pseudoflavor %d not found!\n", flavor);
+	if (!gss_auth->mech)
 		goto err_put_net;
-	}
 	gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
 	if (gss_auth->service == 0)
 		goto err_put_mech;
@@ -1099,6 +1087,7 @@ err_free:
 	kfree(gss_auth);
 out_dec:
 	module_put(THIS_MODULE);
+	trace_rpcgss_createauth(flavor, err);
 	return ERR_PTR(err);
 }
 
@@ -1135,9 +1124,6 @@ gss_destroy(struct rpc_auth *auth)
 	struct gss_auth *gss_auth = container_of(auth,
 			struct gss_auth, rpc_auth);
 
-	dprintk("RPC:       destroying GSS authenticator %p flavor %d\n",
-			auth, auth->au_flavor);
-
 	if (hash_hashed(&gss_auth->hash)) {
 		spin_lock(&gss_auth_hash_lock);
 		hash_del(&gss_auth->hash);
@@ -1300,8 +1286,6 @@ gss_send_destroy_context(struct rpc_cred *cred)
 static void
 gss_do_free_ctx(struct gss_cl_ctx *ctx)
 {
-	dprintk("RPC:       %s\n", __func__);
-
 	gss_delete_sec_context(&ctx->gc_gss_ctx);
 	kfree(ctx->gc_wire_ctx.data);
 	kfree(ctx->gc_acceptor.data);
@@ -1324,7 +1308,6 @@ gss_free_ctx(struct gss_cl_ctx *ctx)
 static void
 gss_free_cred(struct gss_cred *gss_cred)
 {
-	dprintk("RPC:       %s cred=%p\n", __func__, gss_cred);
 	kfree(gss_cred);
 }
 
@@ -1381,10 +1364,6 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
 	struct gss_cred	*cred = NULL;
 	int err = -ENOMEM;
 
-	dprintk("RPC:       %s for uid %d, flavor %d\n",
-		__func__, from_kuid(&init_user_ns, acred->cred->fsuid),
-		auth->au_flavor);
-
 	if (!(cred = kzalloc(sizeof(*cred), gfp)))
 		goto out_err;
 
@@ -1400,7 +1379,6 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
 	return &cred->gc_base;
 
 out_err:
-	dprintk("RPC:       %s failed with error %d\n", __func__, err);
 	return ERR_PTR(err);
 }
 
@@ -1544,15 +1522,14 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr)
 	struct xdr_netobj mic;
 	struct kvec	iov;
 	struct xdr_buf	verf_buf;
-
-	dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+	int status;
 
 	/* Credential */
 
 	p = xdr_reserve_space(xdr, 7 * sizeof(*p) +
 			      ctx->gc_wire_ctx.len);
 	if (!p)
-		goto out_put_ctx;
+		goto marshal_failed;
 	*p++ = rpc_auth_gss;
 	cred_len = p++;
 
@@ -1560,7 +1537,8 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr)
 	req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ;
 	spin_unlock(&ctx->gc_seq_lock);
 	if (req->rq_seqno == MAXSEQ)
-		goto out_expired;
+		goto expired;
+	trace_rpcgss_seqno(task);
 
 	*p++ = cpu_to_be32(RPC_GSS_VERSION);
 	*p++ = cpu_to_be32(ctx->gc_proc);
@@ -1579,25 +1557,31 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr)
 
 	p = xdr_reserve_space(xdr, sizeof(*p));
 	if (!p)
-		goto out_put_ctx;
+		goto marshal_failed;
 	*p++ = rpc_auth_gss;
 	mic.data = (u8 *)(p + 1);
 	maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
-		goto out_expired;
+		goto expired;
 	else if (maj_stat != 0)
-		goto out_put_ctx;
+		goto bad_mic;
 	if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0)
-		goto out_put_ctx;
-	gss_put_ctx(ctx);
-	return 0;
-out_expired:
+		goto marshal_failed;
+	status = 0;
+out:
 	gss_put_ctx(ctx);
+	return status;
+expired:
 	clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
-	return -EKEYEXPIRED;
-out_put_ctx:
-	gss_put_ctx(ctx);
-	return -EMSGSIZE;
+	status = -EKEYEXPIRED;
+	goto out;
+marshal_failed:
+	status = -EMSGSIZE;
+	goto out;
+bad_mic:
+	trace_rpcgss_get_mic(task, maj_stat);
+	status = -EIO;
+	goto out;
 }
 
 static int gss_renew_cred(struct rpc_task *task)
@@ -1723,8 +1707,7 @@ validate_failed:
 	status = -EIO;
 	goto out;
 bad_mic:
-	dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n",
-		task->tk_pid, __func__, maj_stat);
+	trace_rpcgss_verify_mic(task, maj_stat);
 	status = -EACCES;
 	goto out;
 }
@@ -1761,13 +1744,16 @@ static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	else if (maj_stat)
-		goto wrap_failed;
+		goto bad_mic;
 	/* Check that the trailing MIC fit in the buffer, after the fact */
 	if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0)
 		goto wrap_failed;
 	return 0;
 wrap_failed:
 	return -EMSGSIZE;
+bad_mic:
+	trace_rpcgss_get_mic(task, maj_stat);
+	return -EIO;
 }
 
 static void
@@ -1860,7 +1846,6 @@ static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 		memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
 		snd_buf->tail[0].iov_base = tmp;
 	}
-	status = -EIO;
 	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
 	maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages);
 	/* slack space should prevent this ever happening: */
@@ -1871,7 +1856,7 @@ static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	else if (maj_stat)
-		goto wrap_failed;
+		goto bad_wrap;
 
 	*opaque_len = cpu_to_be32(snd_buf->len - offset);
 	/* guess whether the pad goes into the head or the tail: */
@@ -1888,6 +1873,9 @@ static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	return 0;
 wrap_failed:
 	return status;
+bad_wrap:
+	trace_rpcgss_wrap(task, maj_stat);
+	return -EIO;
 }
 
 static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
@@ -1898,7 +1886,6 @@ static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
 	int status;
 
-	dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
 	status = -EIO;
 	if (ctx->gc_proc != RPC_GSS_PROC_DATA) {
 		/* The spec seems a little ambiguous here, but I think that not
@@ -1917,10 +1904,11 @@ static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
 	case RPC_GSS_SVC_PRIVACY:
 		status = gss_wrap_req_priv(cred, ctx, task, xdr);
 		break;
+	default:
+		status = -EIO;
 	}
 out:
 	gss_put_ctx(ctx);
-	dprintk("RPC: %5u %s returning %d\n", task->tk_pid, __func__, status);
 	return status;
 }
 
@@ -1932,8 +1920,9 @@ gss_unwrap_resp_auth(struct rpc_cred *cred)
 }
 
 static int
-gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
-		      struct rpc_rqst *rqstp, struct xdr_stream *xdr)
+gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
+		      struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
+		      struct xdr_stream *xdr)
 {
 	struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
 	u32 data_offset, mic_offset, integ_len, maj_stat;
@@ -1951,7 +1940,7 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	if (mic_offset > rcv_buf->len)
 		goto unwrap_failed;
 	if (be32_to_cpup(p) != rqstp->rq_seqno)
-		goto unwrap_failed;
+		goto bad_seqno;
 
 	if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
 		goto unwrap_failed;
@@ -1967,16 +1956,20 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 				   1 + XDR_QUADLEN(mic.len);
 	return 0;
 unwrap_failed:
+	trace_rpcgss_unwrap_failed(task);
+	return -EIO;
+bad_seqno:
+	trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p));
 	return -EIO;
 bad_mic:
-	dprintk("RPC:       %s: gss_verify_mic returned error 0x%08x\n",
-		__func__, maj_stat);
+	trace_rpcgss_verify_mic(task, maj_stat);
 	return -EIO;
 }
 
 static int
-gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
-		     struct rpc_rqst *rqstp, struct xdr_stream *xdr)
+gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
+		     struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
+		     struct xdr_stream *xdr)
 {
 	struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
 	struct kvec *head = rqstp->rq_rcv_buf.head;
@@ -2000,7 +1993,7 @@ gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 		goto bad_unwrap;
 	/* gss_unwrap decrypted the sequence number */
 	if (be32_to_cpup(p++) != rqstp->rq_seqno)
-		goto unwrap_failed;
+		goto bad_seqno;
 
 	/* gss_unwrap redacts the opaque blob from the head iovec.
 	 * rcv_buf has changed, thus the stream needs to be reset.
@@ -2011,10 +2004,13 @@ gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 				   XDR_QUADLEN(savedlen - rcv_buf->len);
 	return 0;
 unwrap_failed:
+	trace_rpcgss_unwrap_failed(task);
+	return -EIO;
+bad_seqno:
+	trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p));
 	return -EIO;
 bad_unwrap:
-	dprintk("RPC:       %s: gss_unwrap returned error 0x%08x\n",
-		__func__, maj_stat);
+	trace_rpcgss_unwrap(task, maj_stat);
 	return -EIO;
 }
 
@@ -2030,14 +2026,14 @@ gss_xmit_need_reencode(struct rpc_task *task)
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_cred *cred = req->rq_cred;
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
-	u32 win, seq_xmit;
+	u32 win, seq_xmit = 0;
 	bool ret = true;
 
 	if (!ctx)
-		return true;
+		goto out;
 
 	if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq)))
-		goto out;
+		goto out_ctx;
 
 	seq_xmit = READ_ONCE(ctx->gc_seq_xmit);
 	while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) {
@@ -2046,15 +2042,18 @@ gss_xmit_need_reencode(struct rpc_task *task)
 		seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno);
 		if (seq_xmit == tmp) {
 			ret = false;
-			goto out;
+			goto out_ctx;
 		}
 	}
 
 	win = ctx->gc_win;
 	if (win > 0)
 		ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win);
-out:
+
+out_ctx:
 	gss_put_ctx(ctx);
+out:
+	trace_rpcgss_need_reencode(task, seq_xmit, ret);
 	return ret;
 }
 
@@ -2075,10 +2074,10 @@ gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr)
 		status = gss_unwrap_resp_auth(cred);
 		break;
 	case RPC_GSS_SVC_INTEGRITY:
-		status = gss_unwrap_resp_integ(cred, ctx, rqstp, xdr);
+		status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr);
 		break;
 	case RPC_GSS_SVC_PRIVACY:
-		status = gss_unwrap_resp_priv(cred, ctx, rqstp, xdr);
+		status = gss_unwrap_resp_priv(task, cred, ctx, rqstp, xdr);
 		break;
 	}
 	if (status)
@@ -2088,8 +2087,6 @@ out_decode:
 	status = rpcauth_unwrap_resp_decode(task, xdr);
 out:
 	gss_put_ctx(ctx);
-	dprintk("RPC: %5u %s returning %d\n",
-		task->tk_pid, __func__, status);
 	return status;
 }
 
diff --git a/net/sunrpc/auth_gss/trace.c b/net/sunrpc/auth_gss/trace.c
new file mode 100644
index 000000000000..5576f1e66de9
--- /dev/null
+++ b/net/sunrpc/auth_gss/trace.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018, 2019 Oracle. All rights reserved.
+ */
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/gss_err.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rpcgss.h>
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index f1ec2110efeb..bc7489f1fe55 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1165,6 +1165,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 				/* Note: req is added _before_ pos */
 				list_add_tail(&req->rq_xmit, &pos->rq_xmit);
 				INIT_LIST_HEAD(&req->rq_xmit2);
+				trace_xprt_enq_xmit(task, 1);
 				goto out;
 			}
 		} else if (RPC_IS_SWAPPER(task)) {
@@ -1176,6 +1177,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 				/* Note: req is added _before_ pos */
 				list_add_tail(&req->rq_xmit, &pos->rq_xmit);
 				INIT_LIST_HEAD(&req->rq_xmit2);
+				trace_xprt_enq_xmit(task, 2);
 				goto out;
 			}
 		} else if (!req->rq_seqno) {
@@ -1184,11 +1186,13 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 					continue;
 				list_add_tail(&req->rq_xmit2, &pos->rq_xmit2);
 				INIT_LIST_HEAD(&req->rq_xmit);
+				trace_xprt_enq_xmit(task, 3);
 				goto out;
 			}
 		}
 		list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
 		INIT_LIST_HEAD(&req->rq_xmit2);
+		trace_xprt_enq_xmit(task, 4);
 out:
 		set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
 		spin_unlock(&xprt->queue_lock);
@@ -1313,8 +1317,6 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
 	int is_retrans = RPC_WAS_SENT(task);
 	int status;
 
-	dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
-
 	if (!req->rq_bytes_sent) {
 		if (xprt_request_data_received(task)) {
 			status = 0;
@@ -1336,9 +1338,9 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
 
 	connect_cookie = xprt->connect_cookie;
 	status = xprt->ops->send_request(req);
-	trace_xprt_transmit(xprt, req->rq_xid, status);
 	if (status != 0) {
 		req->rq_ntrans--;
+		trace_xprt_transmit(req, status);
 		return status;
 	}
 
@@ -1347,7 +1349,6 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
 
 	xprt_inject_disconnect(xprt);
 
-	dprintk("RPC: %5u xmit complete\n", task->tk_pid);
 	task->tk_flags |= RPC_TASK_SENT;
 	spin_lock_bh(&xprt->transport_lock);
 
@@ -1360,6 +1361,7 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
 
 	req->rq_connect_cookie = connect_cookie;
 out_dequeue:
+	trace_xprt_transmit(req, status);
 	xprt_request_dequeue_transmit(task);
 	rpc_wake_up_queued_task_set_status(&xprt->sending, task, status);
 	return status;
-- 
cgit v1.2.3


From 241b1f419f0ea9966d574d7cc67377c74982a125 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:25:09 -0500
Subject: SUNRPC: Remove xdr_buf_trim()

The key action of xdr_buf_trim() is that it shortens buf->len, the
length of the xdr_buf's content. The other actions -- shortening the
head, pages, and tail components -- are actually not necessary. In
particular, changing the size of those components can corrupt the
RPC message contained in the buffer. This is an accident waiting to
happen rather than a current bug, as far as we know.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-by: Bruce Fields <bfields@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xdr.h          |  1 -
 net/sunrpc/auth_gss/gss_krb5_wrap.c |  8 +++++---
 net/sunrpc/auth_gss/svcauth_gss.c   |  2 +-
 net/sunrpc/xdr.c                    | 41 -------------------------------------
 4 files changed, 6 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 65af6a204b75..9ee3970ba59c 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -179,7 +179,6 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p)
 extern void xdr_shift_buf(struct xdr_buf *, size_t);
 extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
 extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
-extern void xdr_buf_trim(struct xdr_buf *, unsigned int);
 extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, unsigned int);
 extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
 extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 5cdde6cb703a..14a0aff0cd84 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -570,14 +570,16 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
 	 */
 	movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len);
 	movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip;
-	BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen >
-							buf->head[0].iov_len);
+	if (offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen >
+	    buf->head[0].iov_len)
+		return GSS_S_FAILURE;
 	memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen);
 	buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip;
 	buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip;
 
 	/* Trim off the trailing "extra count" and checksum blob */
-	xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip);
+	buf->len -= ec + GSS_KRB5_TOK_HDR_LEN + tailskip;
+
 	return GSS_S_COMPLETE;
 }
 
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 152790ed309c..f1aabab4a4c2 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -896,7 +896,7 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g
 	if (svc_getnl(&buf->head[0]) != seq)
 		goto out;
 	/* trim off the mic and padding at the end before returning */
-	xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
+	buf->len -= 4 + round_up_to_quad(mic.len);
 	stat = 0;
 out:
 	kfree(mic.data);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 5f0aa53fa4ae..4bce61978062 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1139,47 +1139,6 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
 }
 EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
 
-/**
- * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
- * @buf: buf to be trimmed
- * @len: number of bytes to reduce "buf" by
- *
- * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note
- * that it's possible that we'll trim less than that amount if the xdr_buf is
- * too small, or if (for instance) it's all in the head and the parser has
- * already read too far into it.
- */
-void xdr_buf_trim(struct xdr_buf *buf, unsigned int len)
-{
-	size_t cur;
-	unsigned int trim = len;
-
-	if (buf->tail[0].iov_len) {
-		cur = min_t(size_t, buf->tail[0].iov_len, trim);
-		buf->tail[0].iov_len -= cur;
-		trim -= cur;
-		if (!trim)
-			goto fix_len;
-	}
-
-	if (buf->page_len) {
-		cur = min_t(unsigned int, buf->page_len, trim);
-		buf->page_len -= cur;
-		trim -= cur;
-		if (!trim)
-			goto fix_len;
-	}
-
-	if (buf->head[0].iov_len) {
-		cur = min_t(size_t, buf->head[0].iov_len, trim);
-		buf->head[0].iov_len -= cur;
-		trim -= cur;
-	}
-fix_len:
-	buf->len -= (len - trim);
-}
-EXPORT_SYMBOL_GPL(xdr_buf_trim);
-
 static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
 {
 	unsigned int this_len;
-- 
cgit v1.2.3


From cf500bac8fd48b57f38ece890235923d4ed5ee91 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:25:20 -0500
Subject: SUNRPC: Introduce rpc_prepare_reply_pages()

prepare_reply_buffer() and its NFSv4 equivalents expose the details
of the RPC header and the auth slack values to upper layer
consumers, creating a layering violation, and duplicating code.

Remedy these issues by adding a new RPC client API that hides those
details from upper layers in a common helper function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs2xdr.c              | 27 +++++------------------
 fs/nfs/nfs3xdr.c              | 29 ++++++------------------
 fs/nfs/nfs4xdr.c              | 51 ++++++++++++++++++-------------------------
 include/linux/sunrpc/clnt.h   |  3 +++
 include/trace/events/sunrpc.h | 37 +++++++++++++++++++++++++++++++
 net/sunrpc/clnt.c             | 19 ++++++++++++++++
 net/sunrpc/xdr.c              |  9 ++++++++
 7 files changed, 102 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index bac3a4e2cb5d..1dcd0feda32d 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -65,21 +65,6 @@
 
 static int nfs_stat_to_errno(enum nfs_stat);
 
-/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
-				 unsigned int base, unsigned int len,
-				 unsigned int bufsize)
-{
-	struct rpc_auth	*auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-
-	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
-	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
 /*
  * Encode/decode NFSv2 basic data types
  *
@@ -593,8 +578,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
 	const struct nfs_readlinkargs *args = data;
 
 	encode_fhandle(xdr, args->fh);
-	prepare_reply_buffer(req, args->pages, args->pgbase,
-					args->pglen, NFS_readlinkres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->pglen, NFS_readlinkres_sz);
 }
 
 /*
@@ -629,8 +614,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
 	const struct nfs_pgio_args *args = data;
 
 	encode_readargs(xdr, args);
-	prepare_reply_buffer(req, args->pages, args->pgbase,
-					args->count, NFS_readres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, NFS_readres_sz);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 }
 
@@ -787,8 +772,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
 	const struct nfs_readdirargs *args = data;
 
 	encode_readdirargs(xdr, args);
-	prepare_reply_buffer(req, args->pages, 0,
-					args->count, NFS_readdirres_sz);
+	rpc_prepare_reply_pages(req, args->pages, 0,
+				args->count, NFS_readdirres_sz);
 }
 
 /*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 4aa3ffe1800e..a54dcf4bfb1d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -104,21 +104,6 @@ static const umode_t nfs_type2fmt[] = {
 	[NF3FIFO] = S_IFIFO,
 };
 
-/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
-				 unsigned int base, unsigned int len,
-				 unsigned int bufsize)
-{
-	struct rpc_auth	*auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-
-	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
-	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
 /*
  * Encode/decode NFSv3 basic data types
  *
@@ -910,8 +895,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
 	const struct nfs3_readlinkargs *args = data;
 
 	encode_nfs_fh3(xdr, args->fh);
-	prepare_reply_buffer(req, args->pages, args->pgbase,
-					args->pglen, NFS3_readlinkres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->pglen, NFS3_readlinkres_sz);
 }
 
 /*
@@ -943,8 +928,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
 	unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
 
 	encode_read3args(xdr, args);
-	prepare_reply_buffer(req, args->pages, args->pgbase,
-					args->count, replen);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, replen);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 }
 
@@ -1236,7 +1221,7 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
 	const struct nfs3_readdirargs *args = data;
 
 	encode_readdir3args(xdr, args);
-	prepare_reply_buffer(req, args->pages, 0,
+	rpc_prepare_reply_pages(req, args->pages, 0,
 				args->count, NFS3_readdirres_sz);
 }
 
@@ -1278,7 +1263,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
 	const struct nfs3_readdirargs *args = data;
 
 	encode_readdirplus3args(xdr, args);
-	prepare_reply_buffer(req, args->pages, 0,
+	rpc_prepare_reply_pages(req, args->pages, 0,
 				args->count, NFS3_readdirres_sz);
 }
 
@@ -1323,7 +1308,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
 	encode_nfs_fh3(xdr, args->fh);
 	encode_uint32(xdr, args->mask);
 	if (args->mask & (NFS_ACL | NFS_DFACL)) {
-		prepare_reply_buffer(req, args->pages, 0,
+		rpc_prepare_reply_pages(req, args->pages, 0,
 					NFSACL_MAXPAGES << PAGE_SHIFT,
 					ACL3_getaclres_sz);
 		req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 38a4cbc18657..d0fa18df32ea 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1016,12 +1016,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 				struct compound_hdr *hdr)
 {
 	__be32 *p;
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
 
 	/* initialize running count of expected bytes in reply.
 	 * NOTE: the replied tag SHOULD be the same is the one sent,
 	 * but this is not required as a MUST for the server to do so. */
-	hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
+	hdr->replen = 3 + hdr->taglen;
 
 	WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
 	encode_string(xdr, hdr->taglen, hdr->tag);
@@ -2341,9 +2340,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
 	if (args->lg_args) {
 		encode_layoutget(xdr, args->lg_args, &hdr);
-		xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-				 args->lg_args->layout.pages,
-				 0, args->lg_args->layout.pglen);
+		rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+					args->lg_args->layout.pglen,
+					hdr.replen);
 	}
 	encode_nops(&hdr);
 }
@@ -2387,9 +2386,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
 	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
 	if (args->lg_args) {
 		encode_layoutget(xdr, args->lg_args, &hdr);
-		xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-				 args->lg_args->layout.pages,
-				 0, args->lg_args->layout.pglen);
+		rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+					args->lg_args->layout.pglen,
+					hdr.replen);
 	}
 	encode_nops(&hdr);
 }
@@ -2499,8 +2498,8 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_readlink(xdr, args, req, &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
-			args->pgbase, args->pglen);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->pglen, hdr.replen);
 	encode_nops(&hdr);
 }
 
@@ -2520,11 +2519,8 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_readdir(xdr, args, req, &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
-			 args->pgbase, args->count);
-	dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
-			__func__, hdr.replen << 2, args->pages,
-			args->pgbase, args->count);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, hdr.replen);
 	encode_nops(&hdr);
 }
 
@@ -2544,8 +2540,8 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_read(xdr, args, &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-			 args->pages, args->pgbase, args->count);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, hdr.replen);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	encode_nops(&hdr);
 }
@@ -2591,9 +2587,8 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_getattr(xdr, nfs4_acl_bitmap, NULL,
 			ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
-		args->acl_pages, 0, args->acl_len);
-
+	rpc_prepare_reply_pages(req, args->acl_pages, 0,
+				args->acl_len, replen);
 	encode_nops(&hdr);
 }
 
@@ -2814,9 +2809,8 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
 		encode_fs_locations(xdr, args->bitmask, &hdr);
 	}
 
-	/* Set up reply kvec to capture returned fs_locations array. */
-	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
-			 (struct page **)&args->page, 0, PAGE_SIZE);
+	rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
+				PAGE_SIZE, replen);
 	encode_nops(&hdr);
 }
 
@@ -3018,10 +3012,8 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
 
 	/* set up reply kvec. Subtract notification bitmap max size (2)
 	 * so that notification bitmap is put in xdr_buf tail */
-	xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
-			 args->pdev->pages, args->pdev->pgbase,
-			 args->pdev->pglen);
-
+	rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
+				args->pdev->pglen, hdr.replen - 2);
 	encode_nops(&hdr);
 }
 
@@ -3042,9 +3034,8 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
 	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
 	encode_layoutget(xdr, args, &hdr);
 
-	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
-	    args->layout.pages, 0, args->layout.pglen);
-
+	rpc_prepare_reply_pages(req, args->layout.pages, 0,
+				args->layout.pglen, hdr.replen);
 	encode_nops(&hdr);
 }
 
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 1c441714d569..98bc9883b230 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -169,6 +169,9 @@ int		rpcb_v4_register(struct net *net, const u32 program,
 				 const char *netid);
 void		rpcb_getport_async(struct rpc_task *);
 
+void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
+			     unsigned int base, unsigned int len,
+			     unsigned int hdrsize);
 void		rpc_call_start(struct rpc_task *);
 int		rpc_call_async(struct rpc_clnt *clnt,
 			       const struct rpc_message *msg, int flags,
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index e58dda8e038c..8451f30c6a0f 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -461,6 +461,43 @@ TRACE_EVENT(rpc_xdr_alignment,
 	)
 );
 
+TRACE_EVENT(rpc_reply_pages,
+	TP_PROTO(
+		const struct rpc_rqst *req
+	),
+
+	TP_ARGS(req),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(const void *, head_base)
+		__field(size_t, head_len)
+		__field(const void *, tail_base)
+		__field(size_t, tail_len)
+		__field(unsigned int, page_len)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = req->rq_task->tk_pid;
+		__entry->client_id = req->rq_task->tk_client->cl_clid;
+
+		__entry->head_base = req->rq_rcv_buf.head[0].iov_base;
+		__entry->head_len = req->rq_rcv_buf.head[0].iov_len;
+		__entry->page_len = req->rq_rcv_buf.page_len;
+		__entry->tail_base = req->rq_rcv_buf.tail[0].iov_base;
+		__entry->tail_len = req->rq_rcv_buf.tail[0].iov_len;
+	),
+
+	TP_printk(
+		"task:%u@%u xdr=[%p,%zu]/%u/[%p,%zu]\n",
+		__entry->task_id, __entry->client_id,
+		__entry->head_base, __entry->head_len,
+		__entry->page_len,
+		__entry->tail_base, __entry->tail_len
+	)
+);
+
 /*
  * First define the enums in the below macros to be exported to userspace
  * via TRACE_DEFINE_ENUM().
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 803e93105af1..f780605fffe0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1164,6 +1164,25 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
 }
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
+/**
+ * rpc_prepare_reply_pages - Prepare to receive a reply data payload into pages
+ * @req: RPC request to prepare
+ * @pages: vector of struct page pointers
+ * @base: offset in first page where receive should start, in bytes
+ * @len: expected size of the upper layer data payload, in bytes
+ * @hdrsize: expected size of upper layer reply header, in XDR words
+ *
+ */
+void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
+			     unsigned int base, unsigned int len,
+			     unsigned int hdrsize)
+{
+	hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_rslack;
+	xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
+	trace_rpc_reply_pages(req);
+}
+EXPORT_SYMBOL_GPL(rpc_prepare_reply_pages);
+
 void
 rpc_call_start(struct rpc_task *task)
 {
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 4bce61978062..7cca51560442 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -163,6 +163,15 @@ xdr_free_bvec(struct xdr_buf *buf)
 	buf->bvec = NULL;
 }
 
+/**
+ * xdr_inline_pages - Prepare receive buffer for a large reply
+ * @xdr: xdr_buf into which reply will be placed
+ * @offset: expected offset where data payload will start, in bytes
+ * @pages: vector of struct page pointers
+ * @base: offset in first page where receive should start, in bytes
+ * @len: expected size of the upper layer data payload, in bytes
+ *
+ */
 void
 xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 		 struct page **pages, unsigned int base, unsigned int len)
-- 
cgit v1.2.3


From 41cb8d189c9d4964df52a6f497cab7b301ae831b Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Mon, 14 Jan 2019 16:44:59 +0530
Subject: PCI: endpoint: Add new pci_epc_ops to get EPC features

Add a new pci_epc_ops ->get_features() to get the features
supported by the EPC. Since EPC can provide different features to
different functions, the ->get_features() ops takes _func_no_ as
an argument.

Tested-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
---
 drivers/pci/endpoint/pci-epc-core.c | 30 ++++++++++++++++++++++++++++++
 include/linux/pci-epc.h             | 22 ++++++++++++++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 094dcc3203b8..5a099479d9ab 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -83,6 +83,36 @@ err:
 }
 EXPORT_SYMBOL_GPL(pci_epc_get);
 
+/**
+ * pci_epc_get_features() - get the features supported by EPC
+ * @epc: the features supported by *this* EPC device will be returned
+ * @func_no: the features supported by the EPC device specific to the
+ *	     endpoint function with func_no will be returned
+ *
+ * Invoke to get the features provided by the EPC which may be
+ * specific to an endpoint function. Returns pci_epc_features on success
+ * and NULL for any failures.
+ */
+const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
+						    u8 func_no)
+{
+	const struct pci_epc_features *epc_features;
+	unsigned long flags;
+
+	if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions)
+		return NULL;
+
+	if (!epc->ops->get_features)
+		return NULL;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	epc_features = epc->ops->get_features(epc, func_no);
+	spin_unlock_irqrestore(&epc->lock, flags);
+
+	return epc_features;
+}
+EXPORT_SYMBOL_GPL(pci_epc_get_features);
+
 /**
  * pci_epc_stop() - stop the PCI link
  * @epc: the link of the EPC device that has to be stopped
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 37dab8116901..79fbcf94e14d 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -59,6 +59,8 @@ struct pci_epc_ops {
 			     enum pci_epc_irq_type type, u16 interrupt_num);
 	int	(*start)(struct pci_epc *epc);
 	void	(*stop)(struct pci_epc *epc);
+	const struct pci_epc_features* (*get_features)(struct pci_epc *epc,
+						       u8 func_no);
 	struct module *owner;
 };
 
@@ -100,6 +102,24 @@ struct pci_epc {
 	unsigned int			features;
 };
 
+/**
+ * struct pci_epc_features - features supported by a EPC device per function
+ * @linkup_notifier: indicate if the EPC device can notify EPF driver on link up
+ * @msi_capable: indicate if the endpoint function has MSI capability
+ * @msix_capable: indicate if the endpoint function has MSI-X capability
+ * @reserved_bar: bitmap to indicate reserved BAR unavailable to function driver
+ * @bar_fixed_64bit: bitmap to indicate fixed 64bit BARs
+ * @bar_fixed_size: Array specifying the size supported by each BAR
+ */
+struct pci_epc_features {
+	unsigned int	linkup_notifier : 1;
+	unsigned int	msi_capable : 1;
+	unsigned int	msix_capable : 1;
+	u8	reserved_bar;
+	u8	bar_fixed_64bit;
+	u64	bar_fixed_size[BAR_5 + 1];
+};
+
 #define EPC_FEATURE_NO_LINKUP_NOTIFIER		BIT(0)
 #define EPC_FEATURE_BAR_MASK			(BIT(1) | BIT(2) | BIT(3))
 #define EPC_FEATURE_MSIX_AVAILABLE		BIT(4)
@@ -158,6 +178,8 @@ int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
 		      enum pci_epc_irq_type type, u16 interrupt_num);
 int pci_epc_start(struct pci_epc *epc);
 void pci_epc_stop(struct pci_epc *epc);
+const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
+						    u8 func_no);
 struct pci_epc *pci_epc_get(const char *epc_name);
 void pci_epc_put(struct pci_epc *epc);
 
-- 
cgit v1.2.3


From a00275baa68e1ee226cc659f54dc3a571f3ad600 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:25:31 -0500
Subject: SUNRPC: Make AUTH_SYS and AUTH_NULL set au_verfsize

au_verfsize will be needed for a non-flavor-specific computation
in a subsequent patch.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h    | 3 +--
 net/sunrpc/auth_gss/auth_gss.c | 1 +
 net/sunrpc/auth_null.c         | 1 +
 net/sunrpc/auth_unix.c         | 5 ++++-
 4 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index c51e1893f77e..359dfdd04e77 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -77,8 +77,7 @@ struct rpc_auth {
 				/* guess at number of u32's auth adds before
 				 * reply data; normally the verifier size: */
 	unsigned int		au_rslack;
-				/* for gss, used to calculate au_rslack: */
-	unsigned int		au_verfsize;
+	unsigned int		au_verfsize;	/* size of reply verifier */
 
 	unsigned int		au_flags;	/* various flags */
 	const struct rpc_authops *au_ops;		/* operations */
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index fda454c9b594..731e7a482e18 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1016,6 +1016,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 	auth = &gss_auth->rpc_auth;
 	auth->au_cslack = GSS_CRED_SLACK >> 2;
 	auth->au_rslack = GSS_VERF_SLACK >> 2;
+	auth->au_verfsize = GSS_VERF_SLACK >> 2;
 	auth->au_flags = 0;
 	auth->au_ops = &authgss_ops;
 	auth->au_flavor = flavor;
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index bf96975ffc4b..9ae08248a9e1 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -114,6 +114,7 @@ static
 struct rpc_auth null_auth = {
 	.au_cslack	= NUL_CALLSLACK,
 	.au_rslack	= NUL_REPLYSLACK,
+	.au_verfsize	= NUL_REPLYSLACK,
 	.au_ops		= &authnull_ops,
 	.au_flavor	= RPC_AUTH_NULL,
 	.au_count	= REFCOUNT_INIT(1),
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 5ea84a96f96e..a93c56442487 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -163,6 +163,7 @@ unx_refresh(struct rpc_task *task)
 static int
 unx_validate(struct rpc_task *task, struct xdr_stream *xdr)
 {
+	struct rpc_auth *auth = task->tk_rqstp->rq_cred->cr_auth;
 	__be32 *p;
 	u32 size;
 
@@ -184,7 +185,8 @@ unx_validate(struct rpc_task *task, struct xdr_stream *xdr)
 	if (!p)
 		return -EIO;
 
-	task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2;
+	auth->au_verfsize = XDR_QUADLEN(size) + 2;
+	auth->au_rslack = XDR_QUADLEN(size) + 2;
 	return 0;
 }
 
@@ -212,6 +214,7 @@ static
 struct rpc_auth		unix_auth = {
 	.au_cslack	= UNX_CALLSLACK,
 	.au_rslack	= NUL_REPLYSLACK,
+	.au_verfsize	= NUL_REPLYSLACK,
 	.au_ops		= &authunix_ops,
 	.au_flavor	= RPC_AUTH_UNIX,
 	.au_count	= REFCOUNT_INIT(1),
-- 
cgit v1.2.3


From 35e77d21baa04b554bf3dc9a08dfa7e569286e51 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Feb 2019 11:25:36 -0500
Subject: SUNRPC: Add rpc_auth::au_ralign field

Currently rpc_inline_rcv_pages() uses au_rslack to estimate the
size of the upper layer reply header. This is fine for auth flavors
where au_verfsize == au_rslack.

However, some auth flavors have more going on. krb5i for example has
two more words after the verifier, and another blob following the
RPC message. The calculation involving au_rslack pushes the upper
layer reply header too far into the rcv_buf.

au_rslack is still valuable: it's the amount of buffer space needed
for the reply, and is used when allocating the reply buffer. We'll
keep that.

But, add a new field that can be used to properly estimate the
location of the upper layer header in each RPC reply, based on the
auth flavor in use.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h    |  9 ++++-----
 net/sunrpc/auth_gss/auth_gss.c | 18 +++++++++++++-----
 net/sunrpc/auth_null.c         |  1 +
 net/sunrpc/auth_unix.c         |  1 +
 net/sunrpc/clnt.c              |  2 +-
 5 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 359dfdd04e77..5f9076fdb090 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -74,13 +74,12 @@ struct rpc_cred_cache;
 struct rpc_authops;
 struct rpc_auth {
 	unsigned int		au_cslack;	/* call cred size estimate */
-				/* guess at number of u32's auth adds before
-				 * reply data; normally the verifier size: */
-	unsigned int		au_rslack;
+	unsigned int		au_rslack;	/* reply cred size estimate */
 	unsigned int		au_verfsize;	/* size of reply verifier */
+	unsigned int		au_ralign;	/* words before UL header */
 
-	unsigned int		au_flags;	/* various flags */
-	const struct rpc_authops *au_ops;		/* operations */
+	unsigned int		au_flags;
+	const struct rpc_authops *au_ops;
 	rpc_authflavor_t	au_flavor;	/* pseudoflavor (note may
 						 * differ from the flavor in
 						 * au_ops->au_flavor in gss
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 731e7a482e18..c67e2ad151ae 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1017,6 +1017,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 	auth->au_cslack = GSS_CRED_SLACK >> 2;
 	auth->au_rslack = GSS_VERF_SLACK >> 2;
 	auth->au_verfsize = GSS_VERF_SLACK >> 2;
+	auth->au_ralign = GSS_VERF_SLACK >> 2;
 	auth->au_flags = 0;
 	auth->au_ops = &authgss_ops;
 	auth->au_flavor = flavor;
@@ -1891,7 +1892,10 @@ out:
 static int
 gss_unwrap_resp_auth(struct rpc_cred *cred)
 {
-	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize;
+	struct rpc_auth *auth = cred->cr_auth;
+
+	auth->au_rslack = auth->au_verfsize;
+	auth->au_ralign = auth->au_verfsize;
 	return 0;
 }
 
@@ -1902,6 +1906,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
 {
 	struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
 	u32 data_offset, mic_offset, integ_len, maj_stat;
+	struct rpc_auth *auth = cred->cr_auth;
 	struct xdr_netobj mic;
 	__be32 *p;
 
@@ -1928,8 +1933,8 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
 	if (maj_stat != GSS_S_COMPLETE)
 		goto bad_mic;
 
-	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + 2 +
-				   1 + XDR_QUADLEN(mic.len);
+	auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len);
+	auth->au_ralign = auth->au_verfsize + 2;
 	return 0;
 unwrap_failed:
 	trace_rpcgss_unwrap_failed(task);
@@ -1949,6 +1954,7 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
 {
 	struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
 	struct kvec *head = rqstp->rq_rcv_buf.head;
+	struct rpc_auth *auth = cred->cr_auth;
 	unsigned int savedlen = rcv_buf->len;
 	u32 offset, opaque_len, maj_stat;
 	__be32 *p;
@@ -1976,8 +1982,10 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
 	 */
 	xdr_init_decode(xdr, rcv_buf, p, rqstp);
 
-	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + 2 +
-				   XDR_QUADLEN(savedlen - rcv_buf->len);
+	auth->au_rslack = auth->au_verfsize + 2 +
+			  XDR_QUADLEN(savedlen - rcv_buf->len);
+	auth->au_ralign = auth->au_verfsize + 2 +
+			  XDR_QUADLEN(savedlen - rcv_buf->len);
 	return 0;
 unwrap_failed:
 	trace_rpcgss_unwrap_failed(task);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 9ae08248a9e1..41a633a4049e 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -115,6 +115,7 @@ struct rpc_auth null_auth = {
 	.au_cslack	= NUL_CALLSLACK,
 	.au_rslack	= NUL_REPLYSLACK,
 	.au_verfsize	= NUL_REPLYSLACK,
+	.au_ralign	= NUL_REPLYSLACK,
 	.au_ops		= &authnull_ops,
 	.au_flavor	= RPC_AUTH_NULL,
 	.au_count	= REFCOUNT_INIT(1),
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index a93c56442487..c048eb6deaaf 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -187,6 +187,7 @@ unx_validate(struct rpc_task *task, struct xdr_stream *xdr)
 
 	auth->au_verfsize = XDR_QUADLEN(size) + 2;
 	auth->au_rslack = XDR_QUADLEN(size) + 2;
+	auth->au_ralign = XDR_QUADLEN(size) + 2;
 	return 0;
 }
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 4ea38b029e2f..99bfeb17367c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1180,7 +1180,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
 	/* Subtract one to force an extra word of buffer space for the
 	 * payload's XDR pad to fall into the rcv_buf's tail iovec.
 	 */
-	hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_rslack - 1;
+	hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1;
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
 	trace_rpc_reply_pages(req);
-- 
cgit v1.2.3


From 2c7e4928b35660e2147d14d5e42849c22f44b55f Mon Sep 17 00:00:00 2001
From: Federico Vaga <federico.vaga@cern.ch>
Date: Thu, 14 Feb 2019 09:51:33 +0100
Subject: i2c: ocores: add SPDX tag

It adds the SPDX tag and it removes the old text about the GPLv2.

Signed-off-by: Federico Vaga <federico.vaga@cern.ch>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/busses/i2c-ocores.c          | 5 +----
 include/linux/platform_data/i2c-ocores.h | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index 5dea7b9ab7e5..78085a88d866 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * i2c-ocores.c: I2C bus driver for OpenCores I2C controller
  * (https://opencores.org/project/i2c/overview)
@@ -6,10 +7,6 @@
  *
  * Support for the GRLIB port of the controller by
  * Andreas Larsson <andreas@gaisler.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2.  This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
  */
 
 #include <linux/clk.h>
diff --git a/include/linux/platform_data/i2c-ocores.h b/include/linux/platform_data/i2c-ocores.h
index 113d6b12f650..8c416ff8affd 100644
--- a/include/linux/platform_data/i2c-ocores.h
+++ b/include/linux/platform_data/i2c-ocores.h
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * i2c-ocores.h - definitions for the i2c-ocores interface
  *
  * Peter Korsgaard <peter@korsgaard.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2.  This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
  */
 
 #ifndef _LINUX_I2C_OCORES_H
-- 
cgit v1.2.3


From 237b5f66e1ed8a58662f29bcd04442953cdb8b55 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 14 Feb 2019 04:24:50 +0100
Subject: i2c: ocores: Add support for bus clock via platform data

Add the I2C bus clock speed to the platform data structure.
If not set, default to 100KHz as before.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/busses/i2c-ocores.c          | 5 ++++-
 include/linux/platform_data/i2c-ocores.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index 0d90a82a2c03..4eea18689e99 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -628,7 +628,10 @@ static int ocores_i2c_probe(struct platform_device *pdev)
 		i2c->reg_shift = pdata->reg_shift;
 		i2c->reg_io_width = pdata->reg_io_width;
 		i2c->ip_clock_khz = pdata->clock_khz;
-		i2c->bus_clock_khz = 100;
+		if (pdata->bus_khz)
+			i2c->bus_clock_khz = pdata->bus_khz;
+		else
+			i2c->bus_clock_khz = 100;
 	} else {
 		ret = ocores_i2c_of_probe(pdev, i2c);
 		if (ret)
diff --git a/include/linux/platform_data/i2c-ocores.h b/include/linux/platform_data/i2c-ocores.h
index 8c416ff8affd..e6326cbafe59 100644
--- a/include/linux/platform_data/i2c-ocores.h
+++ b/include/linux/platform_data/i2c-ocores.h
@@ -12,6 +12,7 @@ struct ocores_i2c_platform_data {
 	u32 reg_shift; /* register offset shift value */
 	u32 reg_io_width; /* register io read/write width */
 	u32 clock_khz; /* input clock in kHz */
+	u32 bus_khz; /* bus clock in kHz */
 	bool big_endian; /* registers are big endian */
 	u8 num_devices; /* number of devices in the devices list */
 	struct i2c_board_info const *devices; /* devices connected to the bus */
-- 
cgit v1.2.3


From d8434c31378d5f2009741fa171bd1143c77a6e4a Mon Sep 17 00:00:00 2001
From: John Sperbeck <jsperbeck@google.com>
Date: Tue, 12 Feb 2019 19:40:57 -0800
Subject: i2c: core-smbus: don't trace smbus_reply data on errors

If an smbus transfer fails, there's no guarantee that the output
buffer was written.  So, avoid trying to show the output buffer when
tracing after an error.  This was 'mostly harmless', but would trip
up kasan checking if left-over cruft in byte 0 is a large length,
causing us to read from unwritten memory.

Signed-off-by: John Sperbeck <jsperbeck@google.com>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-smbus.c | 2 +-
 include/trace/events/smbus.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/i2c-core-smbus.c b/drivers/i2c/i2c-core-smbus.c
index 9cd66cabb84f..132119112596 100644
--- a/drivers/i2c/i2c-core-smbus.c
+++ b/drivers/i2c/i2c-core-smbus.c
@@ -585,7 +585,7 @@ s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
 trace:
 	/* If enabled, the reply tracepoint is conditional on read_write. */
 	trace_smbus_reply(adapter, addr, flags, read_write,
-			  command, protocol, data);
+			  command, protocol, data, res);
 	trace_smbus_result(adapter, addr, flags, read_write,
 			   command, protocol, res);
 
diff --git a/include/trace/events/smbus.h b/include/trace/events/smbus.h
index d2fb6e1d3e10..a4892a187842 100644
--- a/include/trace/events/smbus.h
+++ b/include/trace/events/smbus.h
@@ -138,9 +138,9 @@ TRACE_EVENT_CONDITION(smbus_reply,
 	TP_PROTO(const struct i2c_adapter *adap,
 		 u16 addr, unsigned short flags,
 		 char read_write, u8 command, int protocol,
-		 const union i2c_smbus_data *data),
-	TP_ARGS(adap, addr, flags, read_write, command, protocol, data),
-	TP_CONDITION(read_write == I2C_SMBUS_READ),
+		 const union i2c_smbus_data *data, int res),
+	TP_ARGS(adap, addr, flags, read_write, command, protocol, data, res),
+	TP_CONDITION(res >= 0 && read_write == I2C_SMBUS_READ),
 	TP_STRUCT__entry(
 		__field(int,	adapter_nr		)
 		__field(__u16,	addr			)
-- 
cgit v1.2.3


From 1e9efe6c9976552e88c6e6feaca3a78b8cf5aaf6 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Mon, 14 Jan 2019 16:45:05 +0530
Subject: PCI: endpoint: Add helper to get first unreserved BAR

Add a helper function pci_epc_get_first_free_bar() to get the first
unreserved BAR that can be used for endpoint function.

Tested-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
---
 drivers/pci/endpoint/pci-epc-core.c | 23 +++++++++++++++++++++++
 include/linux/pci-epc.h             |  2 ++
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 5a099479d9ab..e4712a0f249c 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -83,6 +83,29 @@ err:
 }
 EXPORT_SYMBOL_GPL(pci_epc_get);
 
+/**
+ * pci_epc_get_first_free_bar() - helper to get first unreserved BAR
+ * @epc_features: pci_epc_features structure that holds the reserved bar bitmap
+ *
+ * Invoke to get the first unreserved BAR that can be used for endpoint
+ * function. For any incorrect value in reserved_bar return '0'.
+ */
+unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
+					*epc_features)
+{
+	int free_bar;
+
+	if (!epc_features)
+		return 0;
+
+	free_bar = ffz(epc_features->reserved_bar);
+	if (free_bar > 5)
+		return 0;
+
+	return free_bar;
+}
+EXPORT_SYMBOL_GPL(pci_epc_get_first_free_bar);
+
 /**
  * pci_epc_get_features() - get the features supported by EPC
  * @epc: the features supported by *this* EPC device will be returned
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 79fbcf94e14d..94e1ecff98ce 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -180,6 +180,8 @@ int pci_epc_start(struct pci_epc *epc);
 void pci_epc_stop(struct pci_epc *epc);
 const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
 						    u8 func_no);
+unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
+					*epc_features);
 struct pci_epc *pci_epc_get(const char *epc_name);
 void pci_epc_put(struct pci_epc *epc);
 
-- 
cgit v1.2.3


From 35ce0d7922d68021062a955407740d262f9ac811 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Mon, 14 Jan 2019 16:45:13 +0530
Subject: PCI: endpoint: Remove features member in struct pci_epc

Since EPC features are now implemented using pci_epc_features and
all the EPC drivers are moved to using pci_epc_features, remove
features member in struct pci_epc and all the helper macros for
configuring the features.

Tested-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
---
 include/linux/pci-epc.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 94e1ecff98ce..c3ffa3917f88 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -99,7 +99,6 @@ struct pci_epc {
 	struct config_group		*group;
 	/* spinlock to protect against concurrent access of EP controller */
 	spinlock_t			lock;
-	unsigned int			features;
 };
 
 /**
@@ -120,14 +119,6 @@ struct pci_epc_features {
 	u64	bar_fixed_size[BAR_5 + 1];
 };
 
-#define EPC_FEATURE_NO_LINKUP_NOTIFIER		BIT(0)
-#define EPC_FEATURE_BAR_MASK			(BIT(1) | BIT(2) | BIT(3))
-#define EPC_FEATURE_MSIX_AVAILABLE		BIT(4)
-#define EPC_FEATURE_SET_BAR(features, bar)	\
-		(features |= (EPC_FEATURE_BAR_MASK & (bar << 1)))
-#define EPC_FEATURE_GET_BAR(features)		\
-		((features & EPC_FEATURE_BAR_MASK) >> 1)
-
 #define to_pci_epc(device) container_of((device), struct pci_epc, dev)
 
 #define pci_epc_create(dev, ops)    \
-- 
cgit v1.2.3


From d277ce2d3a75c6c116a6119c3745694f5941eff5 Mon Sep 17 00:00:00 2001
From: Andreas Kemnade <andreas@kemnade.info>
Date: Wed, 16 Jan 2019 23:04:27 +0100
Subject: clk: ti: add a usecount for autoidle

Multiple users might deny autoidle on a clock. So we should have some
counting here, also according to the comment in  _setup_iclk_autoidle().
Also setting autoidle regs is not atomic, so there is another reason
for locking.

Signed-off-by: Andreas Kemnade <andreas@kemnade.info>
Acked-by: Tony Lindgren <tony@atomide.com>
Tested-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Tero Kristo <t-kristo@ti.com>
---
 drivers/clk/ti/autoidle.c | 32 ++++++++++++++++++++++++++++----
 include/linux/clk/ti.h    |  1 +
 2 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/ti/autoidle.c b/drivers/clk/ti/autoidle.c
index a129b4b36ea3..964e97b5478a 100644
--- a/drivers/clk/ti/autoidle.c
+++ b/drivers/clk/ti/autoidle.c
@@ -36,17 +36,41 @@ struct clk_ti_autoidle {
 
 static LIST_HEAD(autoidle_clks);
 
+/*
+ * we have some non-atomic read/write
+ * operations behind it, so lets
+ * take one lock for handling autoidle
+ * of all clocks
+ */
+static DEFINE_SPINLOCK(autoidle_spinlock);
+
 static int _omap2_clk_deny_idle(struct clk_hw_omap *clk)
 {
-	if (clk->ops && clk->ops->deny_idle)
-		clk->ops->deny_idle(clk);
+	if (clk->ops && clk->ops->deny_idle) {
+		unsigned long irqflags;
+
+		spin_lock_irqsave(&autoidle_spinlock, irqflags);
+		clk->autoidle_count++;
+		if (clk->autoidle_count == 1)
+			clk->ops->deny_idle(clk);
+
+		spin_unlock_irqrestore(&autoidle_spinlock, irqflags);
+	}
 	return 0;
 }
 
 static int _omap2_clk_allow_idle(struct clk_hw_omap *clk)
 {
-	if (clk->ops && clk->ops->allow_idle)
-		clk->ops->allow_idle(clk);
+	if (clk->ops && clk->ops->allow_idle) {
+		unsigned long irqflags;
+
+		spin_lock_irqsave(&autoidle_spinlock, irqflags);
+		clk->autoidle_count--;
+		if (clk->autoidle_count == 0)
+			clk->ops->allow_idle(clk);
+
+		spin_unlock_irqrestore(&autoidle_spinlock, irqflags);
+	}
 	return 0;
 }
 
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index eacc5df57b99..78872efc7be0 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -160,6 +160,7 @@ struct clk_hw_omap {
 	struct clockdomain	*clkdm;
 	const struct clk_hw_omap_ops	*ops;
 	u32			context;
+	int			autoidle_count;
 };
 
 /*
-- 
cgit v1.2.3


From 8a2ee44a371c8cbef587ea609908c3cbf1645231 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 15 Feb 2019 19:13:07 +0800
Subject: btrfs: look at bi_size for repair decisions

bio_readpage_error currently uses bi_vcnt to decide if it is worth
retrying an I/O.  But the vector count is mostly an implementation
artifact - it really should figure out if there is more than a
single sector worth retrying.  Use bi_size for that and shift by
PAGE_SHIFT.  This really should be blocks/sectors, but given that
btrfs doesn't support a sector size different from the PAGE_SIZE
using the page size keeps the changes to a minimum.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/btrfs/extent_io.c | 2 +-
 include/linux/bio.h  | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 52abe4082680..dc8ba3ee515d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2350,7 +2350,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	int read_mode = 0;
 	blk_status_t status;
 	int ret;
-	unsigned failed_bio_pages = bio_pages_all(failed_bio);
+	unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7380b094dcca..72b4f7be2106 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -263,12 +263,6 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
 		bv->bv_len = iter.bi_bvec_done;
 }
 
-static inline unsigned bio_pages_all(struct bio *bio)
-{
-	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-	return bio->bi_vcnt;
-}
-
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
 	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-- 
cgit v1.2.3


From 19d62f6d00972f957c94aba0975c14490cfed385 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:09 +0800
Subject: block: remove bvec_iter_rewind()

Commit 7759eb23fd980 ("block: remove bio_rewind_iter()") removes
bio_rewind_iter(), then no one uses bvec_iter_rewind() any more,
so remove it.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bvec.h | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 02c73c6aa805..ba0ae40e77c9 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -92,30 +92,6 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
 	return true;
 }
 
-static inline bool bvec_iter_rewind(const struct bio_vec *bv,
-				     struct bvec_iter *iter,
-				     unsigned int bytes)
-{
-	while (bytes) {
-		unsigned len = min(bytes, iter->bi_bvec_done);
-
-		if (iter->bi_bvec_done == 0) {
-			if (WARN_ONCE(iter->bi_idx == 0,
-				      "Attempted to rewind iter beyond "
-				      "bvec's boundaries\n")) {
-				return false;
-			}
-			iter->bi_idx--;
-			iter->bi_bvec_done = __bvec_iter_bvec(bv, *iter)->bv_len;
-			continue;
-		}
-		bytes -= len;
-		iter->bi_size += len;
-		iter->bi_bvec_done -= len;
-	}
-	return true;
-}
-
 #define for_each_bvec(bvl, bio_vec, iter, start)			\
 	for (iter = (start);						\
 	     (iter).bi_size &&						\
-- 
cgit v1.2.3


From 3d75ca0adef4280650c6690a0c4702a74a6f3c95 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:10 +0800
Subject: block: introduce multi-page bvec helpers

This patch introduces helpers of 'mp_bvec_iter_*' for multi-page bvec
support.

The introduced helpers treate one bvec as real multi-page segment,
which may include more than one pages.

The existed helpers of bvec_iter_* are interfaces for supporting current
bvec iterator which is thought as single-page by drivers, fs, dm and
etc. These introduced helpers will build single-page bvec in flight, so
this way won't break current bio/bvec users, which needn't any change.

Follows some multi-page bvec background:

- bvecs stored in bio->bi_io_vec is always multi-page style

- bvec(struct bio_vec) represents one physically contiguous I/O
  buffer, now the buffer may include more than one page after
  multi-page bvec is supported, and all these pages represented
  by one bvec is physically contiguous. Before multi-page bvec
  support, at most one page is included in one bvec, we call it
  single-page bvec.

- .bv_page of the bvec points to the 1st page in the multi-page bvec

- .bv_offset of the bvec is the offset of the buffer in the bvec

The effect on the current drivers/filesystem/dm/bcache/...:

- almost everyone supposes that one bvec only includes one single
  page, so we keep the sp interface not changed, for example,
  bio_for_each_segment() still returns single-page bvec

- bio_for_each_segment_all() will return single-page bvec too

- during iterating, iterator variable(struct bvec_iter) is always
  updated in multi-page bvec style, and bvec_iter_advance() is kept
  not changed

- returned(copied) single-page bvec is built in flight by bvec
  helpers from the stored multi-page bvec

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bvec.h | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ba0ae40e77c9..0ae729b1c9fe 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -23,6 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/bug.h>
 #include <linux/errno.h>
+#include <linux/mm.h>
 
 /*
  * was unsigned short, but we might as well be ready for > 64kB I/O pages
@@ -50,16 +51,39 @@ struct bvec_iter {
  */
 #define __bvec_iter_bvec(bvec, iter)	(&(bvec)[(iter).bi_idx])
 
-#define bvec_iter_page(bvec, iter)				\
+/* multi-page (mp_bvec) helpers */
+#define mp_bvec_iter_page(bvec, iter)				\
 	(__bvec_iter_bvec((bvec), (iter))->bv_page)
 
-#define bvec_iter_len(bvec, iter)				\
+#define mp_bvec_iter_len(bvec, iter)				\
 	min((iter).bi_size,					\
 	    __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
 
-#define bvec_iter_offset(bvec, iter)				\
+#define mp_bvec_iter_offset(bvec, iter)				\
 	(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
 
+#define mp_bvec_iter_page_idx(bvec, iter)			\
+	(mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)
+
+#define mp_bvec_iter_bvec(bvec, iter)				\
+((struct bio_vec) {						\
+	.bv_page	= mp_bvec_iter_page((bvec), (iter)),	\
+	.bv_len		= mp_bvec_iter_len((bvec), (iter)),	\
+	.bv_offset	= mp_bvec_iter_offset((bvec), (iter)),	\
+})
+
+/* For building single-page bvec in flight */
+ #define bvec_iter_offset(bvec, iter)				\
+	(mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)
+
+#define bvec_iter_len(bvec, iter)				\
+	min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),		\
+	      PAGE_SIZE - bvec_iter_offset((bvec), (iter)))
+
+#define bvec_iter_page(bvec, iter)				\
+	nth_page(mp_bvec_iter_page((bvec), (iter)),		\
+		 mp_bvec_iter_page_idx((bvec), (iter)))
+
 #define bvec_iter_bvec(bvec, iter)				\
 ((struct bio_vec) {						\
 	.bv_page	= bvec_iter_page((bvec), (iter)),	\
-- 
cgit v1.2.3


From d18d91740ad22e9d7998884c4d80523d0ba95ddf Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:11 +0800
Subject: block: introduce bio_for_each_bvec() and rq_for_each_bvec()

bio_for_each_bvec() is used for iterating over multi-page bvec for bio
split & merge code.

rq_for_each_bvec() can be used for drivers which may handle the
multi-page bvec directly, so far loop is one perfect use case.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h    | 10 ++++++++++
 include/linux/blkdev.h |  4 ++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 72b4f7be2106..7ef8a7505c0a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -156,6 +156,16 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 #define bio_for_each_segment(bvl, bio, iter)				\
 	__bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)
 
+#define __bio_for_each_bvec(bvl, bio, iter, start)		\
+	for (iter = (start);						\
+	     (iter).bi_size &&						\
+		((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \
+	     bio_advance_iter((bio), &(iter), (bvl).bv_len))
+
+/* iterate over multi-page bvec */
+#define bio_for_each_bvec(bvl, bio, iter)			\
+	__bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter)
+
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
 static inline unsigned bio_segments(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3603270cb82d..b6292d469ea4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -792,6 +792,10 @@ struct req_iterator {
 	__rq_for_each_bio(_iter.bio, _rq)			\
 		bio_for_each_segment(bvl, _iter.bio, _iter.iter)
 
+#define rq_for_each_bvec(bvl, _rq, _iter)			\
+	__rq_for_each_bio(_iter.bio, _rq)			\
+		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
+
 #define rq_iter_last(bvec, _iter)				\
 		(_iter.bio->bi_next == NULL &&			\
 		 bio_iter_last(bvec, _iter.iter))
-- 
cgit v1.2.3


From 45a3fb95298b326ab8175f2bd97bd8666017b692 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:14 +0800
Subject: block: introduce mp_bvec_last_segment()

BTRFS and guard_bio_eod() need to get the last singlepage segment
from one multipage bvec, so introduce this helper to make them happy.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bvec.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 0ae729b1c9fe..21f76bad7be2 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -131,4 +131,26 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
 	.bi_bvec_done	= 0,						\
 }
 
+/*
+ * Get the last single-page segment from the multi-page bvec and store it
+ * in @seg
+ */
+static inline void mp_bvec_last_segment(const struct bio_vec *bvec,
+					struct bio_vec *seg)
+{
+	unsigned total = bvec->bv_offset + bvec->bv_len;
+	unsigned last_page = (total - 1) / PAGE_SIZE;
+
+	seg->bv_page = nth_page(bvec->bv_page, last_page);
+
+	/* the whole segment is inside the last page */
+	if (bvec->bv_offset >= last_page * PAGE_SIZE) {
+		seg->bv_offset = bvec->bv_offset % PAGE_SIZE;
+		seg->bv_len = bvec->bv_len;
+	} else {
+		seg->bv_offset = 0;
+		seg->bv_len = total - last_page * PAGE_SIZE;
+	}
+}
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
cgit v1.2.3


From 6dc4f100c175dd0511ae8674786e7c9006cdfbfa Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:19 +0800
Subject: block: allow bio_for_each_segment_all() to iterate over multi-page
 bvec

This patch introduces one extra iterator variable to bio_for_each_segment_all(),
then we can allow bio_for_each_segment_all() to iterate over multi-page bvec.

Given it is just one mechannical & simple change on all bio_for_each_segment_all()
users, this patch does tree-wide change in one single patch, so that we can
avoid to use a temporary helper for this conversion.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                       | 27 ++++++++++++++++++---------
 block/bounce.c                    |  6 ++++--
 drivers/md/bcache/btree.c         |  3 ++-
 drivers/md/dm-crypt.c             |  3 ++-
 drivers/md/raid1.c                |  3 ++-
 drivers/staging/erofs/data.c      |  3 ++-
 drivers/staging/erofs/unzip_vle.c |  3 ++-
 fs/block_dev.c                    |  6 ++++--
 fs/btrfs/compression.c            |  3 ++-
 fs/btrfs/disk-io.c                |  3 ++-
 fs/btrfs/extent_io.c              |  9 ++++++---
 fs/btrfs/inode.c                  |  6 ++++--
 fs/btrfs/raid56.c                 |  3 ++-
 fs/crypto/bio.c                   |  3 ++-
 fs/direct-io.c                    |  4 +++-
 fs/exofs/ore.c                    |  3 ++-
 fs/exofs/ore_raid.c               |  3 ++-
 fs/ext4/page-io.c                 |  3 ++-
 fs/ext4/readpage.c                |  3 ++-
 fs/f2fs/data.c                    |  9 ++++++---
 fs/gfs2/lops.c                    |  9 ++++++---
 fs/gfs2/meta_io.c                 |  3 ++-
 fs/iomap.c                        |  6 ++++--
 fs/mpage.c                        |  3 ++-
 fs/xfs/xfs_aops.c                 |  5 +++--
 include/linux/bio.h               | 11 +++++++++--
 include/linux/bvec.h              | 30 ++++++++++++++++++++++++++++++
 27 files changed, 127 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index 4db1008309ed..968b12fea564 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1072,8 +1072,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
 {
 	int i;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		ssize_t ret;
 
 		ret = copy_page_from_iter(bvec->bv_page,
@@ -1103,8 +1104,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
 {
 	int i;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		ssize_t ret;
 
 		ret = copy_page_to_iter(bvec->bv_page,
@@ -1126,8 +1128,9 @@ void bio_free_pages(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		__free_page(bvec->bv_page);
 }
 EXPORT_SYMBOL(bio_free_pages);
@@ -1295,6 +1298,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 	struct bio *bio;
 	int ret;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
 	if (!iov_iter_count(iter))
 		return ERR_PTR(-EINVAL);
@@ -1368,7 +1372,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 	return bio;
 
  out_unmap:
-	bio_for_each_segment_all(bvec, bio, j) {
+	bio_for_each_segment_all(bvec, bio, j, iter_all) {
 		put_page(bvec->bv_page);
 	}
 	bio_put(bio);
@@ -1379,11 +1383,12 @@ static void __bio_unmap_user(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	/*
 	 * make sure we dirty pages we wrote to
 	 */
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		if (bio_data_dir(bio) == READ)
 			set_page_dirty_lock(bvec->bv_page);
 
@@ -1475,8 +1480,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
 	char *p = bio->bi_private;
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
 		p += bvec->bv_len;
 	}
@@ -1585,8 +1591,9 @@ void bio_set_pages_dirty(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		if (!PageCompound(bvec->bv_page))
 			set_page_dirty_lock(bvec->bv_page);
 	}
@@ -1596,8 +1603,9 @@ static void bio_release_pages(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		put_page(bvec->bv_page);
 }
 
@@ -1644,8 +1652,9 @@ void bio_check_pages_dirty(struct bio *bio)
 	struct bio_vec *bvec;
 	unsigned long flags;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
 			goto defer;
 	}
diff --git a/block/bounce.c b/block/bounce.c
index ffb9e9ecfa7e..add085e28b1d 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -165,11 +165,12 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 	struct bio_vec *bvec, orig_vec;
 	int i;
 	struct bvec_iter orig_iter = bio_orig->bi_iter;
+	struct bvec_iter_all iter_all;
 
 	/*
 	 * free up bounce indirect pages used
 	 */
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		orig_vec = bio_iter_iovec(bio_orig, orig_iter);
 		if (bvec->bv_page != orig_vec.bv_page) {
 			dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
@@ -294,6 +295,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	bool bounce = false;
 	int sectors = 0;
 	bool passthrough = bio_is_passthrough(*bio_orig);
+	struct bvec_iter_all iter_all;
 
 	bio_for_each_segment(from, *bio_orig, iter) {
 		if (i++ < BIO_MAX_PAGES)
@@ -313,7 +315,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
 			&bounce_bio_set);
 
-	bio_for_each_segment_all(to, bio, i) {
+	bio_for_each_segment_all(to, bio, i, iter_all) {
 		struct page *page = to->bv_page;
 
 		if (page_to_pfn(page) <= q->limits.bounce_pfn)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 23cb1dc7296b..64def336f053 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -432,8 +432,9 @@ static void do_btree_node_write(struct btree *b)
 		int j;
 		struct bio_vec *bv;
 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+		struct bvec_iter_all iter_all;
 
-		bio_for_each_segment_all(bv, b->bio, j)
+		bio_for_each_segment_all(bv, b->bio, j, iter_all)
 			memcpy(page_address(bv->bv_page),
 			       base + j * PAGE_SIZE, PAGE_SIZE);
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 47d4e0d30bf0..9a29037f5615 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1447,8 +1447,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
 {
 	unsigned int i;
 	struct bio_vec *bv;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, clone, i) {
+	bio_for_each_segment_all(bv, clone, i, iter_all) {
 		BUG_ON(!bv->bv_page);
 		mempool_free(bv->bv_page, &cc->page_pool);
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7e63ccc4ae7b..88c61d3090b0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2112,13 +2112,14 @@ static void process_checks(struct r1bio *r1_bio)
 		struct page **spages = get_resync_pages(sbio)->pages;
 		struct bio_vec *bi;
 		int page_len[RESYNC_PAGES] = { 0 };
+		struct bvec_iter_all iter_all;
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
 		/* Now we can 'fixup' the error value */
 		sbio->bi_status = 0;
 
-		bio_for_each_segment_all(bi, sbio, j)
+		bio_for_each_segment_all(bi, sbio, j, iter_all)
 			page_len[j] = bi->bv_len;
 
 		if (!status) {
diff --git a/drivers/staging/erofs/data.c b/drivers/staging/erofs/data.c
index 5a55f0bfdfbb..4871ba7b7d9a 100644
--- a/drivers/staging/erofs/data.c
+++ b/drivers/staging/erofs/data.c
@@ -20,8 +20,9 @@ static inline void read_endio(struct bio *bio)
 	int i;
 	struct bio_vec *bvec;
 	const blk_status_t err = bio->bi_status;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 
 		/* page is already locked */
diff --git a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c
index 4ac1099a39c6..c057c5616b1d 100644
--- a/drivers/staging/erofs/unzip_vle.c
+++ b/drivers/staging/erofs/unzip_vle.c
@@ -830,8 +830,9 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 #ifdef EROFS_FS_HAS_MANAGED_CACHE
 	struct address_space *mc = NULL;
 #endif
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		bool cachemngd = false;
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 58a4c1217fa8..7758adee6efe 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -211,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	ssize_t ret;
 	blk_qc_t qc;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if ((pos | iov_iter_alignment(iter)) &
 	    (bdev_logical_block_size(bdev) - 1))
@@ -260,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	}
 	__set_current_state(TASK_RUNNING);
 
-	bio_for_each_segment_all(bvec, &bio, i) {
+	bio_for_each_segment_all(bvec, &bio, i, iter_all) {
 		if (should_dirty && !PageCompound(bvec->bv_page))
 			set_page_dirty_lock(bvec->bv_page);
 		put_page(bvec->bv_page);
@@ -329,8 +330,9 @@ static void blkdev_bio_end_io(struct bio *bio)
 	} else {
 		struct bio_vec *bvec;
 		int i;
+		struct bvec_iter_all iter_all;
 
-		bio_for_each_segment_all(bvec, bio, i)
+		bio_for_each_segment_all(bvec, bio, i, iter_all)
 			put_page(bvec->bv_page);
 		bio_put(bio);
 	}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 548057630b69..6896ea60c843 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -162,13 +162,14 @@ csum_failed:
 	} else {
 		int i;
 		struct bio_vec *bvec;
+		struct bvec_iter_all iter_all;
 
 		/*
 		 * we have verified the checksum already, set page
 		 * checked so the end_io handlers know about it
 		 */
 		ASSERT(!bio_flagged(bio, BIO_CLONED));
-		bio_for_each_segment_all(bvec, cb->orig_bio, i)
+		bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
 			SetPageChecked(bvec->bv_page);
 
 		bio_endio(cb->orig_bio);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6a2a2a951705..ca1b7da6dd1b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -832,9 +832,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
 	struct bio_vec *bvec;
 	struct btrfs_root *root;
 	int i, ret = 0;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
 		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
 		if (ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 986ef49b0269..4ed58c9a94a9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2422,9 +2422,10 @@ static void end_bio_extent_writepage(struct bio *bio)
 	u64 start;
 	u64 end;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2493,9 +2494,10 @@ static void end_bio_extent_readpage(struct bio *bio)
 	int mirror;
 	int ret;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3635,9 +3637,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
 	struct bio_vec *bvec;
 	struct extent_buffer *eb;
 	int i, done;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 
 		eb = (struct extent_buffer *)page->private;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c349667c761..7ade5769f691 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7777,6 +7777,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 	struct bio_vec *bvec;
 	struct extent_io_tree *io_tree, *failure_tree;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (bio->bi_status)
 		goto end;
@@ -7788,7 +7789,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 
 	done->uptodate = 1;
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
 				 io_tree, done->start, bvec->bv_page,
 				 btrfs_ino(BTRFS_I(inode)), 0);
@@ -7867,6 +7868,7 @@ static void btrfs_retry_endio(struct bio *bio)
 	int uptodate;
 	int ret;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (bio->bi_status)
 		goto end;
@@ -7880,7 +7882,7 @@ static void btrfs_retry_endio(struct bio *bio)
 	failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
 					     bvec->bv_offset, done->start,
 					     bvec->bv_len);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index e74455eb42f9..1869ba8e5981 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		SetPageUptodate(bvec->bv_page);
 }
 
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0959044c5cee..5759bcd018cd 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
 {
 	struct bio_vec *bv;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		struct page *page = bv->bv_page;
 		int ret = fscrypt_decrypt_page(page->mapping->host, page,
 				PAGE_SIZE, 0, page->index);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ec2fb6fe6d37..9bb015bc4a83 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
-		bio_for_each_segment_all(bvec, bio, i) {
+		struct bvec_iter_all iter_all;
+
+		bio_for_each_segment_all(bvec, bio, i, iter_all) {
 			struct page *page = bvec->bv_page;
 
 			if (dio->op == REQ_OP_READ && !PageCompound(page) &&
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 5331a15a61f1..24a8e34882e9 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -420,8 +420,9 @@ static void _clear_bio(struct bio *bio)
 {
 	struct bio_vec *bv;
 	unsigned i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		unsigned this_count = bv->bv_len;
 
 		if (likely(PAGE_SIZE == this_count))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 199590f36203..e83bab54b03e 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -468,11 +468,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
 	/* loop on all devices all pages */
 	for (d = 0; d < ios->numdevs; d++) {
 		struct bio *bio = ios->per_dev[d].bio;
+		struct bvec_iter_all iter_all;
 
 		if (!bio)
 			continue;
 
-		bio_for_each_segment_all(bv, bio, i) {
+		bio_for_each_segment_all(bv, bio, i, iter_all) {
 			struct page *page = bv->bv_page;
 
 			SetPageUptodate(page);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 2aa62d58d8dd..cff4c4aa7a9c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -63,8 +63,9 @@ static void ext4_finish_bio(struct bio *bio)
 {
 	int i;
 	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 		struct page *data_page = NULL;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 6aa282ee455a..e53639784892 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -72,6 +72,7 @@ static void mpage_end_io(struct bio *bio)
 {
 	struct bio_vec *bv;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (ext4_bio_encrypted(bio)) {
 		if (bio->bi_status) {
@@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio)
 			return;
 		}
 	}
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		struct page *page = bv->bv_page;
 
 		if (!bio->bi_status) {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f91d8630c9a2..da060b77f64d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio)
 	struct page *page;
 	struct bio_vec *bv;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		page = bv->bv_page;
 
 		/* PG_error was set if any post_read step failed */
@@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio)
 	struct f2fs_sb_info *sbi = bio->bi_private;
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (time_to_inject(sbi, FAULT_WRITE_IO)) {
 		f2fs_show_injection_info(FAULT_WRITE_IO);
 		bio->bi_status = BLK_STS_IOERR;
 	}
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		enum count_type type = WB_DATA_TYPE(page);
 
@@ -347,6 +349,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
 	struct bio_vec *bvec;
 	struct page *target;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (!io->bio)
 		return false;
@@ -354,7 +357,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
 	if (!inode && !page && !ino)
 		return true;
 
-	bio_for_each_segment_all(bvec, io->bio, i) {
+	bio_for_each_segment_all(bvec, io->bio, i, iter_all) {
 
 		if (bvec->bv_page->mapping)
 			target = bvec->bv_page;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 94dcab655bc0..15deefeaafd0 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -170,7 +170,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
  * that is pinned in the pagecache.
  */
 
-static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+				  struct bio_vec *bvec,
 				  blk_status_t error)
 {
 	struct buffer_head *bh, *next;
@@ -208,6 +209,7 @@ static void gfs2_end_log_write(struct bio *bio)
 	struct bio_vec *bvec;
 	struct page *page;
 	int i;
+	struct bvec_iter_all iter_all;
 
 	if (bio->bi_status) {
 		fs_err(sdp, "Error %d writing to journal, jid=%u\n",
@@ -215,7 +217,7 @@ static void gfs2_end_log_write(struct bio *bio)
 		wake_up(&sdp->sd_logd_waitq);
 	}
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		page = bvec->bv_page;
 		if (page_has_buffers(page))
 			gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
@@ -388,8 +390,9 @@ static void gfs2_end_log_read(struct bio *bio)
 	struct page *page;
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		page = bvec->bv_page;
 		if (bio->bi_status) {
 			int err = blk_status_to_errno(bio->bi_status);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index be9c0bf697fe..3201342404a7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i, iter_all) {
 		struct page *page = bvec->bv_page;
 		struct buffer_head *bh = page_buffers(page);
 		unsigned int len = bvec->bv_len;
diff --git a/fs/iomap.c b/fs/iomap.c
index a3088fae567b..af736acd9006 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -267,8 +267,9 @@ iomap_read_end_io(struct bio *bio)
 	int error = blk_status_to_errno(bio->bi_status);
 	struct bio_vec *bvec;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, i, iter_all)
 		iomap_read_page_end_io(bvec, error);
 	bio_put(bio);
 }
@@ -1559,8 +1560,9 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	} else {
 		struct bio_vec *bvec;
 		int i;
+		struct bvec_iter_all iter_all;
 
-		bio_for_each_segment_all(bvec, bio, i)
+		bio_for_each_segment_all(bvec, bio, i, iter_all)
 			put_page(bvec->bv_page);
 		bio_put(bio);
 	}
diff --git a/fs/mpage.c b/fs/mpage.c
index c820dc9bebab..3f19da75178b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio)
 {
 	struct bio_vec *bv;
 	int i;
+	struct bvec_iter_all iter_all;
 
-	bio_for_each_segment_all(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i, iter_all) {
 		struct page *page = bv->bv_page;
 		page_endio(page, bio_op(bio),
 			   blk_status_to_errno(bio->bi_status));
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 338b9d9984e0..1f1829e506e8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -62,7 +62,7 @@ xfs_find_daxdev_for_inode(
 static void
 xfs_finish_page_writeback(
 	struct inode		*inode,
-	struct bio_vec		*bvec,
+	struct bio_vec	*bvec,
 	int			error)
 {
 	struct iomap_page	*iop = to_iomap_page(bvec->bv_page);
@@ -98,6 +98,7 @@ xfs_destroy_ioend(
 	for (bio = &ioend->io_inline_bio; bio; bio = next) {
 		struct bio_vec	*bvec;
 		int		i;
+		struct bvec_iter_all iter_all;
 
 		/*
 		 * For the last bio, bi_private points to the ioend, so we
@@ -109,7 +110,7 @@ xfs_destroy_ioend(
 			next = bio->bi_private;
 
 		/* walk each page on bio, ending page IO on them */
-		bio_for_each_segment_all(bvec, bio, i)
+		bio_for_each_segment_all(bvec, bio, i, iter_all)
 			xfs_finish_page_writeback(inode, bvec, error);
 		bio_put(bio);
 	}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7ef8a7505c0a..089370eb84d9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -128,12 +128,19 @@ static inline bool bio_full(struct bio *bio)
 	return bio->bi_vcnt >= bio->bi_max_vecs;
 }
 
+#define mp_bvec_for_each_segment(bv, bvl, i, iter_all)			\
+	for (bv = bvec_init_iter_all(&iter_all);			\
+		(iter_all.done < (bvl)->bv_len) &&			\
+		(mp_bvec_next_segment((bvl), &iter_all), 1);		\
+		iter_all.done += bv->bv_len, i += 1)
+
 /*
  * drivers should _never_ use the all version - the bio may have been split
  * before it got to the driver and the driver won't own all of it
  */
-#define bio_for_each_segment_all(bvl, bio, i)				\
-	for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++)
+#define bio_for_each_segment_all(bvl, bio, i, iter_all)		\
+	for (i = 0, iter_all.idx = 0; iter_all.idx < (bio)->bi_vcnt; iter_all.idx++)	\
+		mp_bvec_for_each_segment(bvl, &((bio)->bi_io_vec[iter_all.idx]), i, iter_all)
 
 static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 				    unsigned bytes)
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 21f76bad7be2..30a57b68d017 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -45,6 +45,12 @@ struct bvec_iter {
 						   current bvec */
 };
 
+struct bvec_iter_all {
+	struct bio_vec	bv;
+	int		idx;
+	unsigned	done;
+};
+
 /*
  * various member access, note that bio_data should of course not be used
  * on highmem page vectors
@@ -131,6 +137,30 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
 	.bi_bvec_done	= 0,						\
 }
 
+static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all)
+{
+	iter_all->bv.bv_page = NULL;
+	iter_all->done = 0;
+
+	return &iter_all->bv;
+}
+
+static inline void mp_bvec_next_segment(const struct bio_vec *bvec,
+					struct bvec_iter_all *iter_all)
+{
+	struct bio_vec *bv = &iter_all->bv;
+
+	if (bv->bv_page) {
+		bv->bv_page = nth_page(bv->bv_page, 1);
+		bv->bv_offset = 0;
+	} else {
+		bv->bv_page = bvec->bv_page;
+		bv->bv_offset = bvec->bv_offset;
+	}
+	bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset,
+			   bvec->bv_len - iter_all->done);
+}
+
 /*
  * Get the last single-page segment from the multi-page bvec and store it
  * in @seg
-- 
cgit v1.2.3


From 07173c3ec276cbb18dc0e0687d37d310e98a1480 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:20 +0800
Subject: block: enable multipage bvecs

This patch pulls the trigger for multi-page bvecs.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 22 +++++++++++++++-------
 fs/iomap.c          |  4 ++--
 fs/xfs/xfs_aops.c   |  4 ++--
 include/linux/bio.h |  2 +-
 4 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index 968b12fea564..83a2dfa417ca 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * @page: page to add
  * @len: length of the data to add
  * @off: offset of the data in @page
+ * @same_page: if %true only merge if the new data is in the same physical
+ *		page as the last segment of the bio.
  *
  * Try to add the data at @page + @off to the last bvec of @bio.  This is a
  * a useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * Return %true on success or %false on failure.
  */
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int off)
+		unsigned int len, unsigned int off, bool same_page)
 {
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return false;
 
 	if (bio->bi_vcnt > 0) {
 		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+		phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
+			bv->bv_offset + bv->bv_len - 1;
+		phys_addr_t page_addr = page_to_phys(page);
 
-		if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
-			bv->bv_len += len;
-			bio->bi_iter.bi_size += len;
-			return true;
-		}
+		if (vec_end_addr + 1 != page_addr + off)
+			return false;
+		if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
+			return false;
+
+		bv->bv_len += len;
+		bio->bi_iter.bi_size += len;
+		return true;
 	}
 	return false;
 }
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
 int bio_add_page(struct bio *bio, struct page *page,
 		 unsigned int len, unsigned int offset)
 {
-	if (!__bio_try_merge_page(bio, page, len, offset)) {
+	if (!__bio_try_merge_page(bio, page, len, offset, false)) {
 		if (bio_full(bio))
 			return 0;
 		__bio_add_page(bio, page, len, offset);
diff --git a/fs/iomap.c b/fs/iomap.c
index af736acd9006..0c350e658b7f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -318,7 +318,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	 */
 	sector = iomap_sector(iomap, pos);
 	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-		if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+		if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
 			goto done;
 		is_contig = true;
 	}
@@ -349,7 +349,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		ctx->bio->bi_end_io = iomap_read_end_io;
 	}
 
-	__bio_add_page(ctx->bio, page, plen, poff);
+	bio_add_page(ctx->bio, page, plen, poff);
 done:
 	/*
 	 * Move the caller beyond our range so that it keeps making progress.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1f1829e506e8..b9fd44168f61 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -616,12 +616,12 @@ xfs_add_to_ioend(
 				bdev, sector);
 	}
 
-	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
 		if (iop)
 			atomic_inc(&iop->write_count);
 		if (bio_full(wpc->ioend->io_bio))
 			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
-		__bio_add_page(wpc->ioend->io_bio, page, len, poff);
+		bio_add_page(wpc->ioend->io_bio, page, len, poff);
 	}
 
 	wpc->ioend->io_size += len;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 089370eb84d9..9f77adcfde82 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -441,7 +441,7 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int off);
+		unsigned int len, unsigned int off, bool same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-- 
cgit v1.2.3


From 6861428921b51113520cd47897be6c2774e4fc58 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:21 +0800
Subject: block: always define BIO_MAX_PAGES as 256

Now multi-page bvec can cover CONFIG_THP_SWAP, so we don't need to
increase BIO_MAX_PAGES for it.

CONFIG_THP_SWAP needs to split one THP into normal pages and adds
them all to one bio. With multipage-bvec, it just takes one bvec to
hold them all.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9f77adcfde82..bdd11d4c2f05 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -34,15 +34,7 @@
 #define BIO_BUG_ON
 #endif
 
-#ifdef CONFIG_THP_SWAP
-#if HPAGE_PMD_NR > 256
-#define BIO_MAX_PAGES		HPAGE_PMD_NR
-#else
 #define BIO_MAX_PAGES		256
-#endif
-#else
-#define BIO_MAX_PAGES		256
-#endif
 
 #define bio_prio(bio)			(bio)->bi_ioprio
 #define bio_set_prio(bio, prio)		((bio)->bi_ioprio = prio)
-- 
cgit v1.2.3


From 2705c93742e91730d335838025d75d8043861174 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:23 +0800
Subject: block: kill QUEUE_FLAG_NO_SG_MERGE

Since bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting"),
physical segment number is mainly figured out in blk_queue_split() for
fast path, and the flag of BIO_SEG_VALID is set there too.

Now only blk_recount_segments() and blk_recalc_rq_segments() use this
flag.

Basically blk_recount_segments() is bypassed in fast path given BIO_SEG_VALID
is set in blk_queue_split().

For another user of blk_recalc_rq_segments():

- run in partial completion branch of blk_update_request, which is an unusual case

- run in blk_cloned_rq_check_limits(), still not a big problem if the flag is killed
since dm-rq is the only user.

Multi-page bvec is enabled now, not doing S/G merging is rather pointless with the
current setup of the I/O path, as it isn't going to save you a significant amount
of cycles.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c      | 31 ++++++-------------------------
 block/blk-mq-debugfs.c |  1 -
 block/blk-mq.c         |  3 ---
 drivers/md/dm-table.c  | 13 -------------
 include/linux/blkdev.h |  1 -
 5 files changed, 6 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1912499b08b7..bed065904677 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -358,8 +358,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 EXPORT_SYMBOL(blk_queue_split);
 
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
-					     struct bio *bio,
-					     bool no_sg_merge)
+					     struct bio *bio)
 {
 	struct bio_vec bv, bvprv = { NULL };
 	int prev = 0;
@@ -385,13 +384,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	nr_phys_segs = 0;
 	for_each_bio(bio) {
 		bio_for_each_bvec(bv, bio, iter) {
-			/*
-			 * If SG merging is disabled, each bio vector is
-			 * a segment
-			 */
-			if (no_sg_merge)
-				goto new_segment;
-
 			if (prev) {
 				if (seg_size + bv.bv_len
 				    > queue_max_segment_size(q))
@@ -421,27 +413,16 @@ new_segment:
 
 void blk_recalc_rq_segments(struct request *rq)
 {
-	bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
-			&rq->q->queue_flags);
-
-	rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
-			no_sg_merge);
+	rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-	unsigned short seg_cnt = bio_segments(bio);
-
-	if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
-			(seg_cnt < queue_max_segments(q)))
-		bio->bi_phys_segments = seg_cnt;
-	else {
-		struct bio *nxt = bio->bi_next;
+	struct bio *nxt = bio->bi_next;
 
-		bio->bi_next = NULL;
-		bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
-		bio->bi_next = nxt;
-	}
+	bio->bi_next = NULL;
+	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+	bio->bi_next = nxt;
 
 	bio_set_flag(bio, BIO_SEG_VALID);
 }
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c782e81db627..697d6213c82b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -128,7 +128,6 @@ static const char *const blk_queue_flag_name[] = {
 	QUEUE_FLAG_NAME(SAME_FORCE),
 	QUEUE_FLAG_NAME(DEAD),
 	QUEUE_FLAG_NAME(INIT_DONE),
-	QUEUE_FLAG_NAME(NO_SG_MERGE),
 	QUEUE_FLAG_NAME(POLL),
 	QUEUE_FLAG_NAME(WC),
 	QUEUE_FLAG_NAME(FUA),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 44d471ff8754..fa508ee31742 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2837,9 +2837,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	    set->map[HCTX_TYPE_POLL].nr_queues)
 		blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
-	if (!(set->flags & BLK_MQ_F_SG_MERGE))
-		blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
 	q->sg_reserved_size = INT_MAX;
 
 	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4b1be754cc41..ba9481f1bf3c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
 	return q && !blk_queue_add_random(q);
 }
 
-static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
-				   sector_t start, sector_t len, void *data)
-{
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
-}
-
 static bool dm_table_all_devices_attribute(struct dm_table *t,
 					   iterate_devices_callout_fn func)
 {
@@ -1902,11 +1894,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	if (!dm_table_supports_write_zeroes(t))
 		q->limits.max_write_zeroes_sectors = 0;
 
-	if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
-		blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q);
-	else
-		blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
 	dm_table_verify_integrity(t);
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b6292d469ea4..faed9d9eb84c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -588,7 +588,6 @@ struct request_queue {
 #define QUEUE_FLAG_SAME_FORCE	12	/* force complete on same CPU */
 #define QUEUE_FLAG_DEAD		13	/* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE	14	/* queue is initialized */
-#define QUEUE_FLAG_NO_SG_MERGE	15	/* don't attempt to merge SG segments*/
 #define QUEUE_FLAG_POLL		16	/* IO polling enabled if set */
 #define QUEUE_FLAG_WC		17	/* Write back caching */
 #define QUEUE_FLAG_FUA		18	/* device supports FUA writes */
-- 
cgit v1.2.3


From 56d18f62f556b80105e38e7975975cf7465aae3e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 15 Feb 2019 19:13:24 +0800
Subject: block: kill BLK_MQ_F_SG_MERGE

QUEUE_FLAG_NO_SG_MERGE has been killed, so kill BLK_MQ_F_SG_MERGE too.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c       | 1 -
 drivers/block/loop.c         | 2 +-
 drivers/block/nbd.c          | 2 +-
 drivers/block/rbd.c          | 2 +-
 drivers/block/skd_main.c     | 1 -
 drivers/block/xen-blkfront.c | 2 +-
 drivers/md/dm-rq.c           | 2 +-
 drivers/mmc/core/queue.c     | 3 +--
 drivers/scsi/scsi_lib.c      | 2 +-
 include/linux/blk-mq.h       | 1 -
 10 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 697d6213c82b..c39247c5ddb6 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -249,7 +249,6 @@ static const char *const alloc_policy_name[] = {
 static const char *const hctx_flag_name[] = {
 	HCTX_FLAG_NAME(SHOULD_MERGE),
 	HCTX_FLAG_NAME(TAG_SHARED),
-	HCTX_FLAG_NAME(SG_MERGE),
 	HCTX_FLAG_NAME(BLOCKING),
 	HCTX_FLAG_NAME(NO_SCHED),
 };
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 8ef583197414..3d63ad036398 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1937,7 +1937,7 @@ static int loop_add(struct loop_device **l, int i)
 	lo->tag_set.queue_depth = 128;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	lo->tag_set.driver_data = lo;
 
 	err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7c9a949e876b..32a7ba1674b7 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1571,7 +1571,7 @@ static int nbd_dev_add(int index)
 	nbd->tag_set.numa_node = NUMA_NO_NODE;
 	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
 	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-		BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
+		BLK_MQ_F_BLOCKING;
 	nbd->tag_set.driver_data = nbd;
 
 	err = blk_mq_alloc_tag_set(&nbd->tag_set);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1e92b61d0bd5..abe9e1c89227 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3988,7 +3988,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	rbd_dev->tag_set.ops = &rbd_mq_ops;
 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
-	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	rbd_dev->tag_set.nr_hw_queues = 1;
 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
 
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index ab893a7571a2..7d3ad6c22ee5 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -2843,7 +2843,6 @@ static int skd_cons_disk(struct skd_device *skdev)
 		skdev->sgs_per_request * sizeof(struct scatterlist);
 	skdev->tag_set.numa_node = NUMA_NO_NODE;
 	skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-		BLK_MQ_F_SG_MERGE |
 		BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO);
 	skdev->tag_set.driver_data = skdev;
 	rc = blk_mq_alloc_tag_set(&skdev->tag_set);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0ed4b200fa58..d43a5677ccbc 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -977,7 +977,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 	} else
 		info->tag_set.queue_depth = BLK_RING_SIZE(info);
 	info->tag_set.numa_node = NUMA_NO_NODE;
-	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	info->tag_set.cmd_size = sizeof(struct blkif_req);
 	info->tag_set.driver_data = info;
 
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 4eb5f8c56535..b2f8eb2365ee 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 	md->tag_set->ops = &dm_mq_ops;
 	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
 	md->tag_set->numa_node = md->numa_node_id;
-	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
 	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
 	md->tag_set->driver_data = md;
 
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 35cc138b096d..cc19e71c71d4 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -410,8 +410,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
 	else
 		mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
 	mq->tag_set.numa_node = NUMA_NO_NODE;
-	mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
-			    BLK_MQ_F_BLOCKING;
+	mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
 	mq->tag_set.nr_hw_queues = 1;
 	mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
 	mq->tag_set.driver_data = mq;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 6d65ac584eba..6cadbe945bdb 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1899,7 +1899,7 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
 	shost->tag_set.queue_depth = shost->can_queue;
 	shost->tag_set.cmd_size = cmd_size;
 	shost->tag_set.numa_node = NUMA_NO_NODE;
-	shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	shost->tag_set.flags |=
 		BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
 	shost->tag_set.driver_data = shost;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0e030f5f76b6..b0c814bcc7e3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -218,7 +218,6 @@ struct blk_mq_ops {
 enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_TAG_SHARED	= 1 << 1,
-	BLK_MQ_F_SG_MERGE	= 1 << 2,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
-- 
cgit v1.2.3


From 3d9dfd060391928bd615db62ecddea5e1255edfd Mon Sep 17 00:00:00 2001
From: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Date: Thu, 7 Feb 2019 18:44:47 +0200
Subject: IB/uverbs: Add ib_ucontext to uverbs_attr_bundle sent from ioctl and
 cmd flows

Add ib_ucontext to the uverbs_attr_bundle sent down the iocl and cmd flows
as soon as the flow has ib_uobject.

In addition, remove rdma_get_ucontext helper function that is only used by
ib_umem_get.

Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c    | 32 ++++++++++++++++++++++++++++++++
 drivers/infiniband/core/umem.c         | 10 +++++++---
 drivers/infiniband/core/uverbs_cmd.c   |  2 +-
 drivers/infiniband/core/uverbs_ioctl.c |  3 +++
 drivers/infiniband/core/uverbs_main.c  | 25 +------------------------
 include/rdma/ib_verbs.h                |  2 --
 include/rdma/uverbs_ioctl.h            |  1 +
 include/rdma/uverbs_std_types.h        | 18 +++++++++++++-----
 8 files changed, 58 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index a260d2f8e0b7..96f919fe86e7 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -438,6 +438,38 @@ free:
 	uverbs_uobject_put(uobj);
 	return ERR_PTR(ret);
 }
+struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type,
+				  u32 object_id,
+				  struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj;
+
+	uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
+				       object_id, UVERBS_LOOKUP_READ);
+	if (IS_ERR(uobj))
+		return uobj;
+
+	attrs->context = uobj->context;
+
+	return uobj;
+}
+
+struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type,
+				   u32 object_id,
+				   struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj;
+
+	uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
+				       object_id, UVERBS_LOOKUP_WRITE);
+
+	if (IS_ERR(uobj))
+		return uobj;
+
+	attrs->context = uobj->context;
+
+	return uobj;
+}
 
 static struct ib_uobject *
 alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index b69d3efa8712..fe5551562dbc 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -96,9 +96,13 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	struct scatterlist *sg, *sg_list_start;
 	unsigned int gup_flags = FOLL_WRITE;
 
-	context = rdma_get_ucontext(udata);
-	if (IS_ERR(context))
-		return ERR_CAST(context);
+	if (!udata)
+		return ERR_PTR(-EIO);
+
+	context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
+			  ->context;
+	if (!context)
+		return ERR_PTR(-EIO);
 
 	if (dmasync)
 		dma_attrs |= DMA_ATTR_WRITE_BARRIER;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 5ac143f22df0..c2e7733a7fe0 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2634,7 +2634,7 @@ void flow_resources_add(struct ib_uflow_resources *uflow_res,
 }
 EXPORT_SYMBOL(flow_resources_add);
 
-static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs,
+static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
 				       struct ib_uverbs_flow_spec *kern_spec,
 				       union ib_flow_spec *ib_spec,
 				       struct ib_uflow_resources *uflow_res)
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 0ca04d224015..e1379949e663 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -213,6 +213,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
 			ret = PTR_ERR(attr->uobjects[i]);
 			break;
 		}
+		pbundle->bundle.context = attr->uobjects[i]->context;
 	}
 
 	attr->len = i;
@@ -330,6 +331,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
 					uattr->data_s64);
 		if (IS_ERR(o_attr->uobject))
 			return PTR_ERR(o_attr->uobject);
+		pbundle->bundle.context = o_attr->uobject->context;
 		__set_bit(attr_bkey, pbundle->uobj_finalize);
 
 		if (spec->u.obj.access == UVERBS_ACCESS_NEW) {
@@ -592,6 +594,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
 	pbundle->method_elm = method_elm;
 	pbundle->method_key = attrs_iter.index;
 	pbundle->bundle.ufile = ufile;
+	pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
 	pbundle->radix = &uapi->radix;
 	pbundle->radix_slots = slot;
 	pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index accc61cc93ac..70b7d80431a9 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -101,30 +101,6 @@ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile)
 }
 EXPORT_SYMBOL(ib_uverbs_get_ucontext_file);
 
-/* rdma_get_ucontext - Return the ucontext from a udata
- * @udata: The udata to get the context from
- *
- * This can only be called from within a uapi method that was passed ib_udata
- * as a parameter. It returns the ucontext associated with the udata, or ERR_PTR
- * if the udata is NULL or the ucontext has been disassociated.
- */
-struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata)
-{
-	if (!udata)
-		return ERR_PTR(-EIO);
-
-	/*
-	 * FIXME: Really all cases that get here with a udata will have
-	 * already called ib_uverbs_get_ucontext_file, or located a uobject
-	 * that points to a ucontext. We could store that result in the udata
-	 * so this function can't fail.
-	 */
-	return ib_uverbs_get_ucontext_file(
-		container_of(udata, struct uverbs_attr_bundle, driver_udata)
-			->ufile);
-}
-EXPORT_SYMBOL(rdma_get_ucontext);
-
 int uverbs_dealloc_mw(struct ib_mw *mw)
 {
 	struct ib_pd *pd = mw->pd;
@@ -719,6 +695,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 
 	memset(bundle.attr_present, 0, sizeof(bundle.attr_present));
 	bundle.ufile = file;
+	bundle.context = NULL; /* only valid if bundle has uobject */
 	if (!method_elm->is_ex) {
 		size_t in_len = hdr.in_words * 4 - sizeof(hdr);
 		size_t out_len = hdr.out_words * 4;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 135fab2c016c..64ee7c08be22 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4247,8 +4247,6 @@ void rdma_roce_rescan_device(struct ib_device *ibdev);
 
 struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile);
 
-struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata);
-
 int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs);
 
 struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
index 27da906beea7..b14a9ee786e9 100644
--- a/include/rdma/uverbs_ioctl.h
+++ b/include/rdma/uverbs_ioctl.h
@@ -652,6 +652,7 @@ struct uverbs_attr_bundle {
 	struct ib_udata driver_udata;
 	struct ib_udata ucore;
 	struct ib_uverbs_file *ufile;
+	struct ib_ucontext *context;
 	DECLARE_BITMAP(attr_present, UVERBS_API_ATTR_BKEY_LEN);
 	struct uverbs_attr attrs[];
 };
diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h
index 883abcf6d36e..794c47565971 100644
--- a/include/rdma/uverbs_std_types.h
+++ b/include/rdma/uverbs_std_types.h
@@ -48,9 +48,12 @@
 #define uobj_get_type(_attrs, _object)                                         \
 	uapi_get_object((_attrs)->ufile->device->uapi, _object)
 
+struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type,
+				  u32 object_id,
+				  struct uverbs_attr_bundle *attrs);
+
 #define uobj_get_read(_type, _id, _attrs)                                      \
-	rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
-				_uobj_check_id(_id), UVERBS_LOOKUP_READ)
+	_uobj_get_read(_type, _uobj_check_id(_id), _attrs)
 
 #define ufd_get_read(_type, _fdnum, _attrs)                                    \
 	rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
@@ -67,9 +70,12 @@ static inline void *_uobj_get_obj_read(struct ib_uobject *uobj)
 	((struct ib_##_object *)_uobj_get_obj_read(                            \
 		uobj_get_read(_type, _id, _attrs)))
 
+struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type,
+				   u32 object_id,
+				   struct uverbs_attr_bundle *attrs);
+
 #define uobj_get_write(_type, _id, _attrs)                                     \
-	rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
-				_uobj_check_id(_id), UVERBS_LOOKUP_WRITE)
+	_uobj_get_write(_type, _uobj_check_id(_id), _attrs)
 
 int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
 			   const struct uverbs_attr_bundle *attrs);
@@ -123,8 +129,10 @@ __uobj_alloc(const struct uverbs_api_object *obj,
 {
 	struct ib_uobject *uobj = rdma_alloc_begin_uobject(obj, attrs->ufile);
 
-	if (!IS_ERR(uobj))
+	if (!IS_ERR(uobj)) {
 		*ib_dev = uobj->context->device;
+		attrs->context = uobj->context;
+	}
 	return uobj;
 }
 
-- 
cgit v1.2.3


From 730623f4a56fa42d4559715ff2f4a5c32b3ae8bf Mon Sep 17 00:00:00 2001
From: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Date: Thu, 7 Feb 2019 18:44:48 +0200
Subject: IB/verbs: Add helper function rdma_udata_to_drv_context

Helper function to get driver's context out of ib_udata wrapped in
uverbs_attr_bundle for user objects or NULL for kernel objects.

Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/rdma/uverbs_ioctl.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
index b14a9ee786e9..28570ac2b6a0 100644
--- a/include/rdma/uverbs_ioctl.h
+++ b/include/rdma/uverbs_ioctl.h
@@ -664,6 +664,23 @@ static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_b
 			attrs_bundle->attr_present);
 }
 
+/**
+ * rdma_udata_to_drv_context - Helper macro to get the driver's context out of
+ *                             ib_udata which is embedded in uverbs_attr_bundle.
+ *
+ * If udata is not NULL this cannot fail. Otherwise a NULL udata will result
+ * in a NULL ucontext pointer, as a safety precaution. Callers should be using
+ * 'udata' to determine if the driver call is in user or kernel mode, not
+ * 'ucontext'.
+ *
+ */
+#define rdma_udata_to_drv_context(udata, drv_dev_struct, member)               \
+	(udata ? container_of(container_of(udata, struct uverbs_attr_bundle,   \
+					   driver_udata)                       \
+				      ->context,                               \
+			      drv_dev_struct, member) :                        \
+		 (drv_dev_struct *)NULL)
+
 #define IS_UVERBS_COPY_ERR(_ret)		((_ret) && (_ret) != -ENOENT)
 
 static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle,
-- 
cgit v1.2.3


From 89944450547334aa6655e0cd4aec8df1897a205a Mon Sep 17 00:00:00 2001
From: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Date: Thu, 7 Feb 2019 18:44:49 +0200
Subject: IB/{hw,sw}: Remove 'uobject->context' dependency in object creation
 APIs

Now when we have the udata passed to all the ib_xxx object creation APIs
and the additional macro 'rdma_udata_to_drv_context' to get the
ib_ucontext from ib_udata stored in uverbs_attr_bundle, we can finally
start to remove the dependency of the drivers in the
ib_xxx->uobject->context.

Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c     | 20 ++++----
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |  4 +-
 drivers/infiniband/hw/cxgb4/qp.c             |  9 ++--
 drivers/infiniband/hw/hns/hns_roce_qp.c      | 21 ++++----
 drivers/infiniband/hw/i40iw/i40iw_verbs.c    |  9 ++--
 drivers/infiniband/hw/mlx4/mr.c              | 12 ++---
 drivers/infiniband/hw/mlx4/qp.c              | 72 +++++++++++++++-------------
 drivers/infiniband/hw/mlx4/srq.c             |  9 ++--
 drivers/infiniband/hw/mlx5/devx.c            | 21 ++++----
 drivers/infiniband/hw/mlx5/qp.c              | 58 ++++++++++++----------
 drivers/infiniband/hw/mlx5/srq.c             |  8 ++--
 drivers/infiniband/hw/mthca/mthca_provider.c | 20 ++++----
 drivers/infiniband/hw/mthca/mthca_qp.c       | 13 +++--
 drivers/infiniband/hw/mthca/mthca_srq.c      | 21 ++++----
 drivers/infiniband/hw/nes/nes_verbs.c        | 12 ++---
 drivers/infiniband/hw/qedr/verbs.c           |  2 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |  5 +-
 drivers/infiniband/sw/rdmavt/qp.c            |  5 +-
 drivers/infiniband/sw/rdmavt/srq.c           |  5 +-
 drivers/infiniband/sw/rdmavt/vt.c            |  4 --
 drivers/infiniband/sw/rxe/rxe_qp.c           |  8 ++--
 drivers/infiniband/sw/rxe/rxe_verbs.c        |  6 ++-
 include/rdma/rdma_vt.h                       |  5 ++
 23 files changed, 194 insertions(+), 155 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 2ed778683c6b..83bf6f5d67c0 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -48,6 +48,7 @@
 #include <rdma/ib_addr.h>
 #include <rdma/ib_mad.h>
 #include <rdma/ib_cache.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "bnxt_ulp.h"
 
@@ -722,12 +723,11 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
 
 	/* Write AVID to shared page. */
 	if (udata) {
-		struct ib_ucontext *ib_uctx = ib_pd->uobject->context;
-		struct bnxt_re_ucontext *uctx;
+		struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
+			udata, struct bnxt_re_ucontext, ib_uctx);
 		unsigned long flag;
 		u32 *wrptr;
 
-		uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx);
 		spin_lock_irqsave(&uctx->sh_lock, flag);
 		wrptr = (u32 *)(uctx->shpg + BNXT_RE_AVID_OFFT);
 		*wrptr = ah->qplib_ah.id;
@@ -872,10 +872,9 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 	struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp;
 	struct ib_umem *umem;
 	int bytes = 0, psn_sz;
-	struct ib_ucontext *context = pd->ib_pd.uobject->context;
-	struct bnxt_re_ucontext *cntx = container_of(context,
-						     struct bnxt_re_ucontext,
-						     ib_uctx);
+	struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
+		udata, struct bnxt_re_ucontext, ib_uctx);
+
 	if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
 		return -EFAULT;
 
@@ -1359,10 +1358,9 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
 	struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq;
 	struct ib_umem *umem;
 	int bytes = 0;
-	struct ib_ucontext *context = pd->ib_pd.uobject->context;
-	struct bnxt_re_ucontext *cntx = container_of(context,
-						     struct bnxt_re_ucontext,
-						     ib_uctx);
+	struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
+		udata, struct bnxt_re_ucontext, ib_uctx);
+
 	if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
 		return -EFAULT;
 
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 80dff6804e48..b74fd90a22dc 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -53,6 +53,7 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "cxio_hal.h"
 #include "iwch.h"
@@ -825,7 +826,8 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
 	 * Kernel users need more wq space for fastreg WRs which can take
 	 * 2 WR fragments.
 	 */
-	ucontext = udata ? to_iwch_ucontext(pd->uobject->context) : NULL;
+	ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext,
+					     ibucontext);
 	if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
 		wqsize = roundup_pow_of_two(rqsize +
 				roundup_pow_of_two(attrs->cap.max_send_wr * 2));
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 0fe87b9c1e10..2509f65f4420 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/module.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "iw_cxgb4.h"
 
@@ -2136,7 +2137,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
 	struct c4iw_cq *rchp;
 	struct c4iw_create_qp_resp uresp;
 	unsigned int sqsize, rqsize = 0;
-	struct c4iw_ucontext *ucontext;
+	struct c4iw_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct c4iw_ucontext, ibucontext);
 	int ret;
 	struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm;
 	struct c4iw_mm_entry *rq_db_key_mm = NULL, *ma_sync_key_mm = NULL;
@@ -2170,8 +2172,6 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
 	if (sqsize < 8)
 		sqsize = 8;
 
-	ucontext = udata ? to_c4iw_ucontext(pd->uobject->context) : NULL;
-
 	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
 	if (!qhp)
 		return ERR_PTR(-ENOMEM);
@@ -2718,7 +2718,8 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
 	rqsize = attrs->attr.max_wr + 1;
 	rqsize = roundup_pow_of_two(max_t(u16, rqsize, 16));
 
-	ucontext = udata ? to_c4iw_ucontext(pd->uobject->context) : NULL;
+	ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
+					     ibucontext);
 
 	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
 	if (!srq)
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 728ad8b6de2f..57c76eafef2f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -35,6 +35,7 @@
 #include <linux/platform_device.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
 #include "hns_roce_common.h"
 #include "hns_roce_device.h"
 #include "hns_roce_hem.h"
@@ -560,6 +561,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_ib_create_qp ucmd;
 	struct hns_roce_ib_create_qp_resp resp = {};
+	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct hns_roce_ucontext, ibucontext);
 	unsigned long qpn = 0;
 	int ret = 0;
 	u32 page_shift;
@@ -670,9 +673,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		    (udata->inlen >= sizeof(ucmd)) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_sq(init_attr)) {
-			ret = hns_roce_db_map_user(
-				to_hr_ucontext(ib_pd->uobject->context), udata,
-				ucmd.sdb_addr, &hr_qp->sdb);
+			ret = hns_roce_db_map_user(uctx, udata, ucmd.sdb_addr,
+						   &hr_qp->sdb);
 			if (ret) {
 				dev_err(dev, "sq record doorbell map failed!\n");
 				goto err_mtt;
@@ -686,9 +688,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_rq(init_attr)) {
-			ret = hns_roce_db_map_user(
-				to_hr_ucontext(ib_pd->uobject->context), udata,
-				ucmd.db_addr, &hr_qp->rdb);
+			ret = hns_roce_db_map_user(uctx, udata, ucmd.db_addr,
+						   &hr_qp->rdb);
 			if (ret) {
 				dev_err(dev, "rq record doorbell map failed!\n");
 				goto err_sq_dbmap;
@@ -838,9 +839,7 @@ err_wrid:
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_rq(init_attr))
-			hns_roce_db_unmap_user(
-					to_hr_ucontext(ib_pd->uobject->context),
-					&hr_qp->rdb);
+			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
 	} else {
 		kfree(hr_qp->sq.wrid);
 		kfree(hr_qp->rq.wrid);
@@ -852,9 +851,7 @@ err_sq_dbmap:
 		    (udata->inlen >= sizeof(ucmd)) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_sq(init_attr))
-			hns_roce_db_unmap_user(
-					to_hr_ucontext(ib_pd->uobject->context),
-					&hr_qp->sdb);
+			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
 
 err_mtt:
 	hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index d5fb2b927587..76b4d1218696 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -45,6 +45,7 @@
 #include <rdma/iw_cm.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
 #include "i40iw.h"
 
 /**
@@ -556,7 +557,8 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
 	struct i40iw_device *iwdev = to_iwdev(ibpd->device);
 	struct i40iw_cqp *iwcqp = &iwdev->cqp;
 	struct i40iw_qp *iwqp;
-	struct i40iw_ucontext *ucontext;
+	struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct i40iw_ucontext, ibucontext);
 	struct i40iw_create_qp_req req;
 	struct i40iw_create_qp_resp uresp;
 	u32 qp_num = 0;
@@ -665,7 +667,6 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
 		}
 		iwqp->ctx_info.qp_compl_ctx = req.user_compl_ctx;
 		iwqp->user_mode = 1;
-		ucontext = to_ucontext(ibpd->uobject->context);
 
 		if (req.user_wqe_buffers) {
 			struct i40iw_pbl *iwpbl;
@@ -1819,7 +1820,8 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
 {
 	struct i40iw_pd *iwpd = to_iwpd(pd);
 	struct i40iw_device *iwdev = to_iwdev(pd->device);
-	struct i40iw_ucontext *ucontext;
+	struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct i40iw_ucontext, ibucontext);
 	struct i40iw_pble_alloc *palloc;
 	struct i40iw_pbl *iwpbl;
 	struct i40iw_mr *iwmr;
@@ -1860,7 +1862,6 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
 	iwmr->region = region;
 	iwmr->ibmr.pd = pd;
 	iwmr->ibmr.device = pd->device;
-	ucontext = to_ucontext(pd->uobject->context);
 
 	iwmr->page_size = PAGE_SIZE;
 	iwmr->page_msk = PAGE_MASK;
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 56639ecd53ad..395379a480cb 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -367,8 +367,7 @@ end:
 	return block_shift;
 }
 
-static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context,
-					struct ib_udata *udata, u64 start,
+static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start,
 					u64 length, u64 virt_addr,
 					int access_flags)
 {
@@ -416,8 +415,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = mlx4_get_umem_mr(pd->uobject->context, udata, start, length,
-				    virt_addr, access_flags);
+	mr->umem =
+		mlx4_get_umem_mr(udata, start, length, virt_addr, access_flags);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		goto err_free;
@@ -506,9 +505,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 
 		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
 		ib_umem_release(mmr->umem);
-		mmr->umem =
-			mlx4_get_umem_mr(mr->uobject->context, udata, start,
-					 length, virt_addr, mr_access_flags);
+		mmr->umem = mlx4_get_umem_mr(udata, start, length, virt_addr,
+					     mr_access_flags);
 		if (IS_ERR(mmr->umem)) {
 			err = PTR_ERR(mmr->umem);
 			/* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index e38bab50cecf..429a59c5801c 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -41,6 +41,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_mad.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/qp.h>
@@ -52,7 +53,8 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
 			     struct mlx4_ib_cq *recv_cq);
 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
 			       struct mlx4_ib_cq *recv_cq);
-static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state);
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
+			      struct ib_udata *udata);
 
 enum {
 	MLX4_IB_ACK_REQ_FREQ	= 8,
@@ -863,6 +865,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	int err;
 	struct mlx4_ib_sqp *sqp = NULL;
 	struct mlx4_ib_qp *qp;
+	struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
 	struct mlx4_ib_cq *mcq;
 	unsigned long flags;
@@ -1038,7 +1042,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 
 		if (qp_has_rq(init_attr)) {
 			err = mlx4_ib_db_map_user(
-				to_mucontext(pd->uobject->context), udata,
+				context, udata,
 				(src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr :
 							  ucmd.wq.db_addr,
 				&qp->db);
@@ -1112,8 +1116,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			}
 		}
 	} else if (src == MLX4_IB_RWQ_SRC) {
-		err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp,
-					range_size, &qpn);
+		err = mlx4_ib_alloc_wqn(context, qp, range_size, &qpn);
 		if (err)
 			goto err_wrid;
 	} else {
@@ -1184,8 +1187,7 @@ err_qpn:
 		if (qp->flags & MLX4_IB_QP_NETIF)
 			mlx4_ib_steer_qp_free(dev, qpn, 1);
 		else if (src == MLX4_IB_RWQ_SRC)
-			mlx4_ib_release_wqn(to_mucontext(pd->uobject->context),
-					    qp, 0);
+			mlx4_ib_release_wqn(context, qp, 0);
 		else
 			mlx4_qp_release_range(dev->dev, qpn, 1);
 	}
@@ -1195,7 +1197,7 @@ err_proxy:
 err_wrid:
 	if (udata) {
 		if (qp_has_rq(init_attr))
-			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
+			mlx4_ib_db_unmap_user(context, &qp->db);
 	} else {
 		kvfree(qp->sq.wrid);
 		kvfree(qp->rq.wrid);
@@ -1942,7 +1944,8 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
  * Go over all RSS QP's childes (WQs) and apply their HW state according to
  * their logic state if the RSS QP is the first RSS QP associated for the WQ.
  */
-static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
+static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num,
+			    struct ib_udata *udata)
 {
 	int err = 0;
 	int i;
@@ -1966,7 +1969,7 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
 		}
 		wq->port = port_num;
 		if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) {
-			err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY);
+			err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY, udata);
 			if (err) {
 				mutex_unlock(&wq->mutex);
 				break;
@@ -1988,7 +1991,8 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
 
 			if ((wq->rss_usecnt == 1) &&
 			    (ibwq->state == IB_WQS_RDY))
-				if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET))
+				if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET,
+						       udata))
 					pr_warn("failed to reverse WQN=0x%06x\n",
 						ibwq->wq_num);
 			wq->rss_usecnt--;
@@ -2000,7 +2004,8 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
 	return err;
 }
 
-static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl)
+static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl,
+				struct ib_udata *udata)
 {
 	int i;
 
@@ -2011,7 +2016,7 @@ static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl)
 		mutex_lock(&wq->mutex);
 
 		if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY))
-			if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET))
+			if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET, udata))
 				pr_warn("failed to reverse WQN=%x\n",
 					ibwq->wq_num);
 		wq->rss_usecnt--;
@@ -2043,9 +2048,10 @@ static void fill_qp_rss_context(struct mlx4_qp_context *context,
 
 static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 			       const struct ib_qp_attr *attr, int attr_mask,
-			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
+			       enum ib_qp_state cur_state,
+			       enum ib_qp_state new_state,
+			       struct ib_udata *udata)
 {
-	struct ib_uobject *ibuobject;
 	struct ib_srq  *ibsrq;
 	const struct ib_gid_attr *gid_attr = NULL;
 	struct ib_rwq_ind_table *rwq_ind_tbl;
@@ -2054,6 +2060,8 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 	struct mlx4_ib_qp *qp;
 	struct mlx4_ib_pd *pd;
 	struct mlx4_ib_cq *send_cq, *recv_cq;
+	struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
 	struct mlx4_qp_context *context;
 	enum mlx4_qp_optpar optpar = 0;
 	int sqd_event;
@@ -2065,7 +2073,6 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 		struct ib_wq *ibwq;
 
 		ibwq	    = (struct ib_wq *)src;
-		ibuobject   = ibwq->uobject;
 		ibsrq	    = NULL;
 		rwq_ind_tbl = NULL;
 		qp_type     = IB_QPT_RAW_PACKET;
@@ -2076,7 +2083,6 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 		struct ib_qp *ibqp;
 
 		ibqp	    = (struct ib_qp *)src;
-		ibuobject   = ibqp->uobject;
 		ibsrq	    = ibqp->srq;
 		rwq_ind_tbl = ibqp->rwq_ind_tbl;
 		qp_type     = ibqp->qp_type;
@@ -2161,11 +2167,9 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 			context->param3 |= cpu_to_be32(1 << 30);
 	}
 
-	if (ibuobject)
+	if (ucontext)
 		context->usr_page = cpu_to_be32(
-			mlx4_to_hw_uar_index(dev->dev,
-					     to_mucontext(ibuobject->context)
-					     ->uar.index));
+			mlx4_to_hw_uar_index(dev->dev, ucontext->uar.index));
 	else
 		context->usr_page = cpu_to_be32(
 			mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
@@ -2297,7 +2301,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 	context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
 
 	/* Set "fast registration enabled" for all kernel QPs */
-	if (!ibuobject)
+	if (!ucontext)
 		context->params1 |= cpu_to_be32(1 << 11);
 
 	if (attr_mask & IB_QP_RNR_RETRY) {
@@ -2434,7 +2438,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 	else
 		sqd_event = 0;
 
-	if (!ibuobject &&
+	if (!ucontext &&
 	    cur_state == IB_QPS_RESET &&
 	    new_state == IB_QPS_INIT)
 		context->rlkey_roce_mode |= (1 << 4);
@@ -2445,7 +2449,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 	 * headroom is stamped so that the hardware doesn't start
 	 * processing stale work requests.
 	 */
-	if (!ibuobject &&
+	if (!ucontext &&
 	    cur_state == IB_QPS_RESET &&
 	    new_state == IB_QPS_INIT) {
 		struct mlx4_wqe_ctrl_seg *ctrl;
@@ -2509,7 +2513,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 	 * entries and reinitialize the QP.
 	 */
 	if (new_state == IB_QPS_RESET) {
-		if (!ibuobject) {
+		if (!ucontext) {
 			mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
 					 ibsrq ? to_msrq(ibsrq) : NULL);
 			if (send_cq != recv_cq)
@@ -2735,16 +2739,17 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	}
 
 	if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) {
-		err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num);
+		err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num,
+				       udata);
 		if (err)
 			goto out;
 	}
 
 	err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask,
-				  cur_state, new_state);
+				  cur_state, new_state, udata);
 
 	if (ibqp->rwq_ind_tbl && err)
-		bring_down_rss_rwqs(ibqp->rwq_ind_tbl);
+		bring_down_rss_rwqs(ibqp->rwq_ind_tbl, udata);
 
 	if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
 		attr->port_num = 1;
@@ -4122,7 +4127,8 @@ static int ib_wq2qp_state(enum ib_wq_state state)
 	}
 }
 
-static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state)
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
+			      struct ib_udata *udata)
 {
 	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
 	enum ib_qp_state qp_cur_state;
@@ -4146,7 +4152,8 @@ static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state)
 		attr_mask = IB_QP_PORT;
 
 		err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr,
-					  attr_mask, IB_QPS_RESET, IB_QPS_INIT);
+					  attr_mask, IB_QPS_RESET, IB_QPS_INIT,
+					  udata);
 		if (err) {
 			pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n",
 				 ibwq->wq_num);
@@ -4158,12 +4165,13 @@ static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state)
 
 	attr_mask = 0;
 	err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask,
-				  qp_cur_state,  qp_new_state);
+				  qp_cur_state,  qp_new_state, udata);
 
 	if (err && (qp_cur_state == IB_QPS_INIT)) {
 		qp_new_state = IB_QPS_RESET;
 		if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL,
-					attr_mask, IB_QPS_INIT, IB_QPS_RESET)) {
+					attr_mask, IB_QPS_INIT, IB_QPS_RESET,
+					udata)) {
 			pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n",
 				ibwq->wq_num);
 			qp_new_state = IB_QPS_INIT;
@@ -4226,7 +4234,7 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
 	 * WQ, so we can apply its port on the WQ.
 	 */
 	if (qp->rss_usecnt)
-		err = _mlx4_ib_modify_wq(ibwq, new_state);
+		err = _mlx4_ib_modify_wq(ibwq, new_state, udata);
 
 	if (!err)
 		ibwq->state = new_state;
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 498588eac051..381cf899bcef 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -37,6 +37,7 @@
 
 #include "mlx4_ib.h"
 #include <rdma/mlx4-abi.h>
+#include <rdma/uverbs_ioctl.h>
 
 static void *get_wqe(struct mlx4_ib_srq *srq, int n)
 {
@@ -73,6 +74,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 				  struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
 	struct mlx4_ib_srq *srq;
 	struct mlx4_wqe_srq_next_seg *next;
 	struct mlx4_wqe_data_seg *scatter;
@@ -128,8 +131,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 		if (err)
 			goto err_mtt;
 
-		err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
-					  udata, ucmd.db_addr, &srq->db);
+		err = mlx4_ib_db_map_user(ucontext, udata, ucmd.db_addr,
+					  &srq->db);
 		if (err)
 			goto err_mtt;
 	} else {
@@ -202,7 +205,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 
 err_wrid:
 	if (udata)
-		mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+		mlx4_ib_db_unmap_user(ucontext, &srq->db);
 	else
 		kvfree(srq->wrid);
 
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 8e6d23d6859f..eaa055007f28 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1151,7 +1151,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 	void *cmd_out;
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE);
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
 	struct devx_obj *obj;
@@ -1227,8 +1228,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)(
 					MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT);
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
 							  MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE);
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
-	struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+	struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
 	void *cmd_out;
 	int err;
 	int uid;
@@ -1268,11 +1270,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
 					      MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT);
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
 							  MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE);
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 	void *cmd_out;
 	int err;
 	int uid;
-	struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+	struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
 
 	uid = devx_get_uid(c, cmd_in);
 	if (uid < 0)
@@ -1370,11 +1373,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
 				attrs,
 				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE);
 	u16 cmd_out_len;
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 	struct ib_uobject *fd_uobj;
 	int err;
 	int uid;
-	struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+	struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
 	struct devx_async_cmd_event_file *ev_file;
 	struct devx_async_data *async_data;
 
@@ -1530,7 +1534,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE);
 	u32 obj_id;
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
 	int err;
 
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 4f2bc101b061..6b1f0e76900b 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -794,6 +794,8 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			  struct ib_udata *udata, struct mlx5_ib_rwq *rwq,
 			  struct mlx5_ib_create_wq *ucmd)
 {
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	int page_shift = 0;
 	int npages;
 	u32 offset = 0;
@@ -828,8 +830,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		    (unsigned long long)ucmd->buf_addr, rwq->buf_size,
 		    npages, page_shift, ncont, offset);
 
-	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), udata,
-				  ucmd->db_addr, &rwq->db);
+	err = mlx5_ib_db_map_user(ucontext, udata, ucmd->db_addr, &rwq->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map failed\n");
 		goto err_umem;
@@ -877,7 +878,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		return err;
 	}
 
-	context = to_mucontext(pd->uobject->context);
+	context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext,
+					    ibucontext);
 	if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
 		uar_index = bfregn_to_uar_index(dev, &context->bfregi,
 						ucmd.bfreg_index, true);
@@ -1456,9 +1458,8 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
 	struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
 	struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
-	struct ib_uobject *uobj = pd->uobject;
-	struct ib_ucontext *ucontext = uobj->context;
-	struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+	struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	int err;
 	u32 tdn = mucontext->tdn;
 	u16 uid = to_mpd(pd)->uid;
@@ -1572,9 +1573,8 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 				 struct ib_qp_init_attr *init_attr,
 				 struct ib_udata *udata)
 {
-	struct ib_uobject *uobj = pd->uobject;
-	struct ib_ucontext *ucontext = uobj->context;
-	struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+	struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_create_qp_resp resp = {};
 	int inlen;
 	int err;
@@ -1916,6 +1916,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
 	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_create_qp_resp resp = {};
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_cq *send_cq;
 	struct mlx5_ib_cq *recv_cq;
 	unsigned long flags;
@@ -2018,8 +2020,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 				      MLX5_QP_FLAG_TYPE_DCT))
 			return -EINVAL;
 
-		err = get_qp_user_index(to_mucontext(pd->uobject->context),
-					&ucmd, udata->inlen, &uidx);
+		err = get_qp_user_index(ucontext, &ucmd, udata->inlen, &uidx);
 		if (err)
 			return err;
 
@@ -2503,8 +2504,11 @@ static const char *ib_qp_type_str(enum ib_qp_type type)
 
 static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
 					struct ib_qp_init_attr *attr,
-					struct mlx5_ib_create_qp *ucmd)
+					struct mlx5_ib_create_qp *ucmd,
+					struct ib_udata *udata)
 {
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_qp *qp;
 	int err = 0;
 	u32 uidx = MLX5_IB_DEFAULT_UIDX;
@@ -2513,8 +2517,7 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
 	if (!attr->srq || !attr->recv_cq)
 		return ERR_PTR(-EINVAL);
 
-	err = get_qp_user_index(to_mucontext(pd->uobject->context),
-				ucmd, sizeof(*ucmd), &uidx);
+	err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx);
 	if (err)
 		return ERR_PTR(err);
 
@@ -2596,15 +2599,17 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 	int err;
 	struct ib_qp_init_attr mlx_init_attr;
 	struct ib_qp_init_attr *init_attr = verbs_init_attr;
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 
 	if (pd) {
 		dev = to_mdev(pd->device);
 
 		if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
-			if (!udata) {
+			if (!ucontext) {
 				mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
 				return ERR_PTR(-EINVAL);
-			} else if (!to_mucontext(pd->uobject->context)->cqe_version) {
+			} else if (!ucontext->cqe_version) {
 				mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
 				return ERR_PTR(-EINVAL);
 			}
@@ -2636,7 +2641,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 				return ERR_PTR(-EINVAL);
 			}
 		} else {
-			return mlx5_ib_create_dct(pd, init_attr, &ucmd);
+			return mlx5_ib_create_dct(pd, init_attr, &ucmd, udata);
 		}
 	}
 
@@ -3274,14 +3279,12 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
 				    struct mlx5_ib_pd *pd,
 				    struct mlx5_ib_qp_base *qp_base,
-				    u8 port_num)
+				    u8 port_num, struct ib_udata *udata)
 {
-	struct mlx5_ib_ucontext *ucontext = NULL;
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	unsigned int tx_port_affinity;
 
-	if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context)
-		ucontext = to_mucontext(pd->ibpd.uobject->context);
-
 	if (ucontext) {
 		tx_port_affinity = (unsigned int)atomic_add_return(
 					   1, &ucontext->tx_port_affinity) %
@@ -3304,8 +3307,10 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
 
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
-			       enum ib_qp_state cur_state, enum ib_qp_state new_state,
-			       const struct mlx5_ib_modify_qp *ucmd)
+			       enum ib_qp_state cur_state,
+			       enum ib_qp_state new_state,
+			       const struct mlx5_ib_modify_qp *ucmd,
+			       struct ib_udata *udata)
 {
 	static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
 		[MLX5_QP_STATE_RST] = {
@@ -3396,7 +3401,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 		    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
 			if (dev->lag_active) {
 				u8 p = mlx5_core_native_port_num(dev->mdev);
-				tx_affinity = get_tx_affinity(dev, pd, base, p);
+				tx_affinity = get_tx_affinity(dev, pd, base, p,
+							      udata);
 				context->flags |= cpu_to_be32(tx_affinity << 24);
 			}
 		}
@@ -3879,7 +3885,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	}
 
 	err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state,
-				  new_state, &ucmd);
+				  new_state, &ucmd, udata);
 
 out:
 	mutex_unlock(&qp->mutex);
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 22bd774e0b4e..1ec1beb1296b 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -47,6 +47,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_ib_create_srq ucmd = {};
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	size_t ucmdlen;
 	int err;
 	int npages;
@@ -71,8 +73,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 		return -EINVAL;
 
 	if (in->type != IB_SRQT_BASIC) {
-		err = get_srq_user_index(to_mucontext(pd->uobject->context),
-					 &ucmd, udata->inlen, &uidx);
+		err = get_srq_user_index(ucontext, &ucmd, udata->inlen, &uidx);
 		if (err)
 			return err;
 	}
@@ -103,8 +104,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 
 	mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
 
-	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), udata,
-				  ucmd.db_addr, &srq->db);
+	err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map doorbell failed\n");
 		goto err_in;
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 516c8cf9c0fd..80c3af217d96 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -37,6 +37,7 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -435,7 +436,8 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
 				       struct ib_udata *udata)
 {
 	struct mthca_create_srq ucmd;
-	struct mthca_ucontext *context = NULL;
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	struct mthca_srq *srq;
 	int err;
 
@@ -447,8 +449,6 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
 		return ERR_PTR(-ENOMEM);
 
 	if (udata) {
-		context = to_mucontext(pd->uobject->context);
-
 		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
 			err = -EFAULT;
 			goto err_free;
@@ -510,6 +510,8 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
 				     struct ib_qp_init_attr *init_attr,
 				     struct ib_udata *udata)
 {
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	struct mthca_create_qp ucmd;
 	struct mthca_qp *qp;
 	int err;
@@ -522,15 +524,11 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
 	case IB_QPT_UC:
 	case IB_QPT_UD:
 	{
-		struct mthca_ucontext *context;
-
 		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
 
 		if (udata) {
-			context = to_mucontext(pd->uobject->context);
-
 			if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
 				kfree(qp);
 				return ERR_PTR(-EFAULT);
@@ -568,8 +566,6 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
 				     &init_attr->cap, qp, udata);
 
 		if (err && udata) {
-			context = to_mucontext(pd->uobject->context);
-
 			mthca_unmap_user_db(to_mdev(pd->device),
 					    &context->uar,
 					    context->db_tab,
@@ -898,6 +894,8 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 {
 	struct mthca_dev *dev = to_mdev(pd->device);
 	struct sg_dma_page_iter sg_iter;
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	struct mthca_mr *mr;
 	struct mthca_reg_mr ucmd;
 	u64 *pages;
@@ -906,12 +904,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	int write_mtt_size;
 
 	if (udata->inlen < sizeof ucmd) {
-		if (!to_mucontext(pd->uobject->context)->reg_mr_warned) {
+		if (!context->reg_mr_warned) {
 			mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n",
 				   current->comm);
 			mthca_warn(dev, "  Update libmthca to fix this.\n");
 		}
-		++to_mucontext(pd->uobject->context)->reg_mr_warned;
+		++context->reg_mr_warned;
 		ucmd.mr_attrs = 0;
 	} else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
 		return ERR_PTR(-EFAULT);
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 4e5b5cc17f1d..7a5b25d13faa 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -42,6 +42,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
@@ -554,10 +555,14 @@ static int mthca_path_set(struct mthca_dev *dev, const struct rdma_ah_attr *ah,
 
 static int __mthca_modify_qp(struct ib_qp *ibqp,
 			     const struct ib_qp_attr *attr, int attr_mask,
-			     enum ib_qp_state cur_state, enum ib_qp_state new_state)
+			     enum ib_qp_state cur_state,
+			     enum ib_qp_state new_state,
+			     struct ib_udata *udata)
 {
 	struct mthca_dev *dev = to_mdev(ibqp->device);
 	struct mthca_qp *qp = to_mqp(ibqp);
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	struct mthca_mailbox *mailbox;
 	struct mthca_qp_param *qp_param;
 	struct mthca_qp_context *qp_context;
@@ -619,8 +624,7 @@ static int __mthca_modify_qp(struct ib_qp *ibqp,
 	/* leave arbel_sched_queue as 0 */
 
 	if (qp->ibqp.uobject)
-		qp_context->usr_page =
-			cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index);
+		qp_context->usr_page = cpu_to_be32(context->uar.index);
 	else
 		qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
 	qp_context->local_qpn  = cpu_to_be32(qp->qpn);
@@ -913,7 +917,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
 		goto out;
 	}
 
-	err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+	err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state,
+				udata);
 
 out:
 	mutex_unlock(&qp->mutex);
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index b8333c79e3fa..06b920385512 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -36,6 +36,8 @@
 
 #include <asm/io.h>
 
+#include <rdma/uverbs_ioctl.h>
+
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
 #include "mthca_memfree.h"
@@ -96,17 +98,19 @@ static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
 					 struct mthca_pd *pd,
 					 struct mthca_srq *srq,
 					 struct mthca_tavor_srq_context *context,
-					 bool is_user)
+					 struct ib_udata *udata)
 {
+	struct mthca_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
+
 	memset(context, 0, sizeof *context);
 
 	context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4));
 	context->state_pd    = cpu_to_be32(pd->pd_num);
 	context->lkey        = cpu_to_be32(srq->mr.ibmr.lkey);
 
-	if (is_user)
-		context->uar =
-			cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+	if (udata)
+		context->uar = cpu_to_be32(ucontext->uar.index);
 	else
 		context->uar = cpu_to_be32(dev->driver_uar.index);
 }
@@ -115,8 +119,10 @@ static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
 					 struct mthca_pd *pd,
 					 struct mthca_srq *srq,
 					 struct mthca_arbel_srq_context *context,
-					 bool is_user)
+					 struct ib_udata *udata)
 {
+	struct mthca_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	int logsize, max;
 
 	memset(context, 0, sizeof *context);
@@ -131,9 +137,8 @@ static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
 	context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
 	context->db_index = cpu_to_be32(srq->db_index);
 	context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29);
-	if (is_user)
-		context->logstride_usrpage |=
-			cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+	if (udata)
+		context->logstride_usrpage |= cpu_to_be32(ucontext->uar.index);
 	else
 		context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index);
 	context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num);
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 138844299d73..38480b7708eb 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -41,6 +41,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/iw_cm.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "nes.h"
 
@@ -974,7 +975,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
 	struct nes_qp *nesqp;
 	struct nes_cq *nescq;
-	struct nes_ucontext *nes_ucontext;
+	struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context(
+		udata, struct nes_ucontext, ibucontext);
 	struct nes_hw_cqp_wqe *cqp_wqe;
 	struct nes_cqp_request *cqp_request;
 	struct nes_create_qp_req req;
@@ -1055,9 +1057,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
 				}
 				if (req.user_qp_buffer)
 					nesqp->nesuqp_addr = req.user_qp_buffer;
-				if (udata && (ibpd->uobject->context)) {
+				if (udata) {
 					nesqp->user_mode = 1;
-					nes_ucontext = to_nesucontext(ibpd->uobject->context);
 					if (virt_wqs) {
 						err = 1;
 						list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
@@ -1078,7 +1079,6 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
 						}
 					}
 
-					nes_ucontext = to_nesucontext(ibpd->uobject->context);
 					nesqp->mmap_sq_db_index =
 						find_next_zero_bit(nes_ucontext->allocated_wqs,
 								   NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
@@ -2099,7 +2099,8 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
 	struct ib_mr *ibmr = ERR_PTR(-EINVAL);
 	struct sg_dma_page_iter dma_iter;
-	struct nes_ucontext *nes_ucontext;
+	struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context(
+		udata, struct nes_ucontext, ibucontext);
 	struct nes_pbl *nespbl;
 	struct nes_mr *nesmr;
 	struct ib_umem *region;
@@ -2342,7 +2343,6 @@ reg_user_mr_err:
 				return ERR_PTR(-ENOMEM);
 			}
 			nesmr->region = region;
-			nes_ucontext = to_nesucontext(pd->uobject->context);
 			pbl_depth = region->length >> 12;
 			pbl_depth += (region->length & (4096-1)) ? 1 : 0;
 			nespbl->pbl_size = pbl_depth*sizeof(u64);
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index a613ebde322f..d51bc3ede9d1 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -1434,7 +1434,7 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
 	hw_srq->max_wr = init_attr->attr.max_wr;
 	hw_srq->max_sges = init_attr->attr.max_sge;
 
-	if (udata && ibpd->uobject && ibpd->uobject->context) {
+	if (udata) {
 		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
 			DP_ERR(dev,
 			       "create srq: problem copying data from user space\n");
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 0ced89b51448..b4575b1c2a62 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -37,6 +37,7 @@
 
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "usnic_abi.h"
 #include "usnic_ib.h"
@@ -482,7 +483,8 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
 	int err;
 	struct usnic_ib_dev *us_ibdev;
 	struct usnic_ib_qp_grp *qp_grp;
-	struct usnic_ib_ucontext *ucontext;
+	struct usnic_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct usnic_ib_ucontext, ibucontext);
 	int cq_cnt;
 	struct usnic_vnic_res_spec res_spec;
 	struct usnic_ib_create_qp_cmd cmd;
@@ -490,7 +492,6 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
 
 	usnic_dbg("\n");
 
-	ucontext = to_uucontext(pd->uobject->context);
 	us_ibdev = to_usdev(pd->device);
 
 	if (init_attr->create_flags)
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 6b1b2b75ef60..72664f0a52ee 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -53,6 +53,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_hdrs.h>
 #include <rdma/opa_addr.h>
+#include <rdma/uverbs_ioctl.h>
 #include "qp.h"
 #include "vt.h"
 #include "trace.h"
@@ -956,6 +957,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 	size_t sg_list_sz;
 	struct ib_qp *ret = ERR_PTR(-ENOMEM);
 	struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
+	struct rvt_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct rvt_ucontext, ibucontext);
 	void *priv = NULL;
 	size_t sqsize;
 
@@ -1129,7 +1132,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 			u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
 
 			qp->ip = rvt_create_mmap_info(rdi, s,
-						      ibpd->uobject->context,
+						      &ucontext->ibucontext,
 						      qp->r_rq.wq);
 			if (!qp->ip) {
 				ret = ERR_PTR(-ENOMEM);
diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c
index 78e06fc456c5..895b3fabd0bf 100644
--- a/drivers/infiniband/sw/rdmavt/srq.c
+++ b/drivers/infiniband/sw/rdmavt/srq.c
@@ -48,6 +48,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "srq.h"
 #include "vt.h"
@@ -77,6 +78,8 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
 			      struct ib_udata *udata)
 {
 	struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
+	struct rvt_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct rvt_ucontext, ibucontext);
 	struct rvt_srq *srq;
 	u32 sz;
 	struct ib_srq *ret;
@@ -119,7 +122,7 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
 		u32 s = sizeof(struct rvt_rwq) + srq->rq.size * sz;
 
 		srq->ip =
-		    rvt_create_mmap_info(dev, s, ibpd->uobject->context,
+		    rvt_create_mmap_info(dev, s, &ucontext->ibucontext,
 					 srq->rq.wq);
 		if (!srq->ip) {
 			ret = ERR_PTR(-ENOMEM);
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index a19832c73d5a..ee9c82cb3b6b 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -284,10 +284,6 @@ static int rvt_query_gid(struct ib_device *ibdev, u8 port_num,
 					 &gid->global.interface_id);
 }
 
-struct rvt_ucontext {
-	struct ib_ucontext ibucontext;
-};
-
 static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext
 						*ibucontext)
 {
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index be5d76b2bcca..09ede70dc1e8 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -35,6 +35,7 @@
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "rxe.h"
 #include "rxe_loc.h"
@@ -343,7 +344,8 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
 	struct rxe_cq *rcq = to_rcq(init->recv_cq);
 	struct rxe_cq *scq = to_rcq(init->send_cq);
 	struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL;
-	struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
+	struct rxe_ucontext *ucontext =
+		rdma_udata_to_drv_context(udata, struct rxe_ucontext, ibuc);
 
 	rxe_add_ref(pd);
 	rxe_add_ref(rcq);
@@ -358,11 +360,11 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
 
 	rxe_qp_init_misc(rxe, qp, init);
 
-	err = rxe_qp_init_req(rxe, qp, init, context, uresp);
+	err = rxe_qp_init_req(rxe, qp, init, &ucontext->ibuc, uresp);
 	if (err)
 		goto err1;
 
-	err = rxe_qp_init_resp(rxe, qp, init, context, uresp);
+	err = rxe_qp_init_resp(rxe, qp, init, &ucontext->ibuc, uresp);
 	if (err)
 		goto err2;
 
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 051c3930e808..ffca654c8697 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -33,6 +33,7 @@
 
 #include <linux/dma-mapping.h>
 #include <net/addrconf.h>
+#include <rdma/uverbs_ioctl.h>
 #include "rxe.h"
 #include "rxe_loc.h"
 #include "rxe_queue.h"
@@ -320,8 +321,9 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
 	int err;
 	struct rxe_dev *rxe = to_rdev(ibpd->device);
 	struct rxe_pd *pd = to_rpd(ibpd);
+	struct rxe_ucontext *ucontext =
+		rdma_udata_to_drv_context(udata, struct rxe_ucontext, ibuc);
 	struct rxe_srq *srq;
-	struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
 	struct rxe_create_srq_resp __user *uresp = NULL;
 
 	if (udata) {
@@ -344,7 +346,7 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
 	rxe_add_ref(pd);
 	srq->pd = pd;
 
-	err = rxe_srq_from_init(rxe, srq, init, context, uresp);
+	err = rxe_srq_from_init(rxe, srq, init, &ucontext->ibuc, uresp);
 	if (err)
 		goto err2;
 
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index 87d66c9630d7..4c257aff7d32 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -186,6 +186,11 @@ struct rvt_driver_params {
 	u8 reserved_operations;
 };
 
+/* User context */
+struct rvt_ucontext {
+	struct ib_ucontext ibucontext;
+};
+
 /* Protection domain */
 struct rvt_pd {
 	struct ib_pd ibpd;
-- 
cgit v1.2.3


From db610a640eeeb268c36a4558414f28e1c269433e Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 24 Jan 2019 17:48:38 -0800
Subject: f2fs: add quick mode of checkpoint=disable for QA

This mode returns mount() quickly with EAGAIN. We can trigger this by
shutdown(F2FS_GOING_DOWN_NEED_FSCK).

Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c    | 5 +++++
 fs/f2fs/f2fs.h          | 2 ++
 fs/f2fs/file.c          | 6 +++---
 fs/f2fs/segment.c       | 3 +++
 fs/f2fs/super.c         | 5 +++++
 include/linux/f2fs_fs.h | 1 +
 6 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f955cd3e0677..622dca707752 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1259,6 +1259,11 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	else
 		__clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);
 
+	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK))
+		__set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+	else
+		__clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+
 	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
 		__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
 	else
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6b6ec5600089..fe95abb05d40 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -191,6 +191,7 @@ enum {
 #define DEF_CP_INTERVAL			60	/* 60 secs */
 #define DEF_IDLE_INTERVAL		5	/* 5 secs */
 #define DEF_DISABLE_INTERVAL		5	/* 5 secs */
+#define DEF_DISABLE_QUICK_INTERVAL	1	/* 1 secs */
 #define DEF_UMOUNT_DISCARD_TIMEOUT	5	/* 5 secs */
 
 struct cp_control {
@@ -1101,6 +1102,7 @@ enum {
 	SBI_IS_SHUTDOWN,			/* shutdown by ioctl */
 	SBI_IS_RECOVERED,			/* recovered orphan/data */
 	SBI_CP_DISABLED,			/* CP was disabled last mount */
+	SBI_CP_DISABLED_QUICK,			/* CP was disabled quickly */
 	SBI_QUOTA_NEED_FLUSH,			/* need to flush quota info in CP */
 	SBI_QUOTA_SKIP_FLUSH,			/* skip flushing quota in current CP */
 	SBI_QUOTA_NEED_REPAIR,			/* quota file may be corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0d461321edfc..fe6f92fbba38 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1972,11 +1972,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
 		break;
 	case F2FS_GOING_DOWN_NEED_FSCK:
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
+		set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+		set_sbi_flag(sbi, SBI_IS_DIRTY);
 		/* do checkpoint only */
 		ret = f2fs_sync_fs(sb, 1);
-		if (ret)
-			goto out;
-		break;
+		goto out;
 	default:
 		ret = -EINVAL;
 		goto out;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 5b2b9be6f28d..342b720fb4db 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -868,6 +868,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi)
 
 	if (holes[DATA] > ovp || holes[NODE] > ovp)
 		return -EAGAIN;
+	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
+		dirty_segments(sbi) > overprovision_segments(sbi))
+		return -EAGAIN;
 	return 0;
 }
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 24efd76ca151..5e1f8573a17f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3205,6 +3205,10 @@ try_onemore:
 
 	if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG))
 		set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+	if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) {
+		set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+		sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL;
+	}
 
 	/* Initialize device list */
 	err = f2fs_scan_devices(sbi);
@@ -3392,6 +3396,7 @@ skip_recovery:
 				cur_cp_version(F2FS_CKPT(sbi)));
 	f2fs_update_time(sbi, CP_TIME);
 	f2fs_update_time(sbi, REQ_TIME);
+	clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
 	return 0;
 
 free_meta:
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index d7711048ef93..d6befe1f9dc7 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -116,6 +116,7 @@ struct f2fs_super_block {
 /*
  * For checkpoint
  */
+#define CP_DISABLED_QUICK_FLAG		0x00002000
 #define CP_DISABLED_FLAG		0x00001000
 #define CP_QUOTA_NEED_FSCK_FLAG		0x00000800
 #define CP_LARGE_NAT_BITMAP_FLAG	0x00000400
-- 
cgit v1.2.3


From 1ffdc3807589779626924eb600db784d3d943a08 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@gmail.com>
Date: Fri, 25 Jan 2019 15:35:01 +0800
Subject: f2fs: fix typos in code comments

lengh -> length

Signed-off-by: Geliang Tang <geliangtang@gmail.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index d6befe1f9dc7..8d57aaee8166 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -187,7 +187,7 @@ struct f2fs_orphan_block {
 struct f2fs_extent {
 	__le32 fofs;		/* start file offset of the extent */
 	__le32 blk;		/* start block address of the extent */
-	__le32 len;		/* lengh of the extent */
+	__le32 len;		/* length of the extent */
 } __packed;
 
 #define F2FS_NAME_LEN		255
@@ -512,7 +512,7 @@ typedef __le32	f2fs_hash_t;
 struct f2fs_dir_entry {
 	__le32 hash_code;	/* hash code of file name */
 	__le32 ino;		/* inode number */
-	__le16 name_len;	/* lengh of file name */
+	__le16 name_len;	/* length of file name */
 	__u8 file_type;		/* file type */
 } __packed;
 
-- 
cgit v1.2.3


From 357b4da50a62e2fd70eacee21cdbd22d4c7a7b60 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 14 Feb 2019 11:42:39 +0100
Subject: x86: respect memory size limiting via mem= parameter

When limiting memory size via kernel parameter "mem=" this should be
respected even in case of memory made accessible via a PCI card.

Today this kind of memory won't be made usable in initial memory
setup as the memory won't be visible in E820 map, but it might be
added when adding PCI devices due to corresponding ACPI table entries.

Not respecting "mem=" can be corrected by adding a global max_mem_size
variable set by parse_memopt() which will result in rejecting adding
memory areas resulting in a memory size above the allowed limit.

Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/kernel/e820.c         | 5 +++++
 include/linux/memory_hotplug.h | 2 ++
 mm/memory_hotplug.c            | 6 ++++++
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 50895c2f937d..e67513e2cbbb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
 #include <linux/acpi.h>
 #include <linux/firmware-map.h>
 #include <linux/sort.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/e820/api.h>
 #include <asm/setup.h>
@@ -881,6 +882,10 @@ static int __init parse_memopt(char *p)
 
 	e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	max_mem_size = mem_size;
+#endif
+
 	return 0;
 }
 early_param("mem", parse_memopt);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 368267c1b71b..cfd12078172a 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -100,6 +100,8 @@ extern void __online_page_free(struct page *page);
 
 extern int try_online_node(int nid);
 
+extern u64 max_mem_size;
+
 extern bool memhp_auto_online;
 /* If movable_node boot option specified */
 extern bool movable_node_enabled;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 124e794867c5..519f9db063ff 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -96,10 +96,16 @@ void mem_hotplug_done(void)
 	cpus_read_unlock();
 }
 
+u64 max_mem_size = U64_MAX;
+
 /* add this memory to iomem resource */
 static struct resource *register_memory_resource(u64 start, u64 size)
 {
 	struct resource *res, *conflict;
+
+	if (start + size > max_mem_size)
+		return ERR_PTR(-E2BIG);
+
 	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 	if (!res)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From 1d988ed46543ca36c010634c97ac32114362ddb1 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 14 Feb 2019 11:42:40 +0100
Subject: x86/xen: dont add memory above max allowed allocation

Don't allow memory to be added above the allowed maximum allocation
limit set by Xen.

Trying to do so would result in cases like the following:

[  584.559652] ------------[ cut here ]------------
[  584.564897] WARNING: CPU: 2 PID: 1 at ../arch/x86/xen/multicalls.c:129 xen_alloc_pte+0x1c7/0x390()
[  584.575151] Modules linked in:
[  584.578643] Supported: Yes
[  584.581750] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.4.120-92.70-default #1
[  584.590000] Hardware name: Cisco Systems Inc UCSC-C460-M4/UCSC-C460-M4, BIOS C460M4.4.0.1b.0.0629181419 06/29/2018
[  584.601862]  0000000000000000 ffffffff813175a0 0000000000000000 ffffffff8184777c
[  584.610200]  ffffffff8107f4e1 ffff880487eb7000 ffff8801862b79c0 ffff88048608d290
[  584.618537]  0000000000487eb7 ffffea0000000201 ffffffff81009de7 ffffffff81068561
[  584.626876] Call Trace:
[  584.629699]  [<ffffffff81019ad9>] dump_trace+0x59/0x340
[  584.635645]  [<ffffffff81019eaa>] show_stack_log_lvl+0xea/0x170
[  584.642391]  [<ffffffff8101ac51>] show_stack+0x21/0x40
[  584.648238]  [<ffffffff813175a0>] dump_stack+0x5c/0x7c
[  584.654085]  [<ffffffff8107f4e1>] warn_slowpath_common+0x81/0xb0
[  584.660932]  [<ffffffff81009de7>] xen_alloc_pte+0x1c7/0x390
[  584.667289]  [<ffffffff810647f0>] pmd_populate_kernel.constprop.6+0x40/0x80
[  584.675241]  [<ffffffff815ecfe8>] phys_pmd_init+0x210/0x255
[  584.681587]  [<ffffffff815ed207>] phys_pud_init+0x1da/0x247
[  584.687931]  [<ffffffff815edb3b>] kernel_physical_mapping_init+0xf5/0x1d4
[  584.695682]  [<ffffffff815e9bdd>] init_memory_mapping+0x18d/0x380
[  584.702631]  [<ffffffff81064699>] arch_add_memory+0x59/0xf0

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/setup.c      | 13 +++++++++++++
 drivers/xen/xen-balloon.c | 11 +++++++++++
 include/xen/xen.h         |  4 ++++
 3 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index d5f303c0e656..548d1e0a5ba1 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -12,6 +12,7 @@
 #include <linux/memblock.h>
 #include <linux/cpuidle.h>
 #include <linux/cpufreq.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
@@ -589,6 +590,14 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
 	if (type == E820_TYPE_RAM) {
 		start = PAGE_ALIGN(start);
 		end &= ~((phys_addr_t)PAGE_SIZE - 1);
+#ifdef CONFIG_MEMORY_HOTPLUG
+		/*
+		 * Don't allow adding memory not in E820 map while booting the
+		 * system. Once the balloon driver is up it will remove that
+		 * restriction again.
+		 */
+		max_mem_size = end;
+#endif
 	}
 
 	e820__range_add(start, end - start, type);
@@ -748,6 +757,10 @@ char * __init xen_memory_setup(void)
 	memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
 	set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
 
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON)
+	xen_saved_max_mem_size = max_mem_size;
+#endif
+
 	op = xen_initial_domain() ?
 		XENMEM_machine_memory_map :
 		XENMEM_memory_map;
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
index 2acbfe104e46..a67236b02452 100644
--- a/drivers/xen/xen-balloon.c
+++ b/drivers/xen/xen-balloon.c
@@ -37,6 +37,7 @@
 #include <linux/mm_types.h>
 #include <linux/init.h>
 #include <linux/capability.h>
+#include <linux/memory_hotplug.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -50,6 +51,10 @@
 
 #define BALLOON_CLASS_NAME "xen_memory"
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+u64 xen_saved_max_mem_size = 0;
+#endif
+
 static struct device balloon_dev;
 
 static int register_balloon(struct device *dev);
@@ -63,6 +68,12 @@ static void watch_target(struct xenbus_watch *watch,
 	static bool watch_fired;
 	static long target_diff;
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* The balloon driver will take care of adding memory now. */
+	if (xen_saved_max_mem_size)
+		max_mem_size = xen_saved_max_mem_size;
+#endif
+
 	err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
 	if (err != 1) {
 		/* This is ok (for domain0 at least) - so just return */
diff --git a/include/xen/xen.h b/include/xen/xen.h
index 0e2156786ad2..19d032373de5 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -46,4 +46,8 @@ struct bio_vec;
 bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 		const struct bio_vec *vec2);
 
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON)
+extern u64 xen_saved_max_mem_size;
+#endif
+
 #endif	/* _XEN_XEN_H */
-- 
cgit v1.2.3


From 3d769df5fc32dc3cb12d6ccb61690b09371b8b3f Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Fri, 8 Feb 2019 03:49:23 -0500
Subject: media: v4l2-subdev.h: v4l2_subdev_call: use temp __sd variable

The sd argument of this macro can be a more complex expression. Since it
is used 5 times in the macro it can be evaluated that many times as well.

So assign it to a temp variable in the beginning and use that instead.

This also avoids any potential side-effects of evaluating sd.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/media/v4l2-subdev.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index 47af609dc8f1..34da094a3f40 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -1093,13 +1093,14 @@ void v4l2_subdev_init(struct v4l2_subdev *sd,
  */
 #define v4l2_subdev_call(sd, o, f, args...)				\
 	({								\
+		struct v4l2_subdev *__sd = (sd);			\
 		int __result;						\
-		if (!(sd))						\
+		if (!__sd)						\
 			__result = -ENODEV;				\
-		else if (!((sd)->ops->o && (sd)->ops->o->f))		\
+		else if (!(__sd->ops->o && __sd->ops->o->f))		\
 			__result = -ENOIOCTLCMD;			\
 		else							\
-			__result = (sd)->ops->o->f((sd), ##args);	\
+			__result = __sd->ops->o->f(__sd, ##args);	\
 		__result;						\
 	})
 
-- 
cgit v1.2.3


From a7fe4ca72b1fe7877de5672640d0b4e023d0fdca Mon Sep 17 00:00:00 2001
From: Vivek Kasireddy <vivek.kasireddy@intel.com>
Date: Thu, 7 Feb 2019 22:18:43 -0500
Subject: media: v4l: Add 32-bit packed YUV formats

The formats added in this patch include:
 V4L2_PIX_FMT_AYUV32
 V4L2_PIX_FMT_XYUV32
 V4L2_PIX_FMT_VUYA32
 V4L2_PIX_FMT_VUYX32

These formats enable the trasmission of alpha channel data to other
drivers and userspace applications in addition to YUV data. For
example, buffers generated by drivers in one of these formats
can be used by the Weston compositor to display as a texture or
flipped directly onto the overlay planes with the help of a DRM
driver.

Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst | 170 ++++++++++++++++++++-
 drivers/media/v4l2-core/v4l2-ioctl.c               |   4 +
 include/uapi/linux/videodev2.h                     |   4 +
 3 files changed, 177 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst b/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst
index f53e8f57a003..7fcee1c11ac4 100644
--- a/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst
@@ -190,6 +190,170 @@ component of each pixel in one 16 or 32 bit word.
       - Cr\ :sub:`2`
       - Cr\ :sub:`1`
       - Cr\ :sub:`0`
+      -
+    * .. _V4L2-PIX-FMT-AYUV32:
+
+      - ``V4L2_PIX_FMT_AYUV32``
+      - 'AYUV'
+
+      - a\ :sub:`7`
+      - a\ :sub:`6`
+      - a\ :sub:`5`
+      - a\ :sub:`4`
+      - a\ :sub:`3`
+      - a\ :sub:`2`
+      - a\ :sub:`1`
+      - a\ :sub:`0`
+
+      - Y'\ :sub:`7`
+      - Y'\ :sub:`6`
+      - Y'\ :sub:`5`
+      - Y'\ :sub:`4`
+      - Y'\ :sub:`3`
+      - Y'\ :sub:`2`
+      - Y'\ :sub:`1`
+      - Y'\ :sub:`0`
+
+      - Cb\ :sub:`7`
+      - Cb\ :sub:`6`
+      - Cb\ :sub:`5`
+      - Cb\ :sub:`4`
+      - Cb\ :sub:`3`
+      - Cb\ :sub:`2`
+      - Cb\ :sub:`1`
+      - Cb\ :sub:`0`
+
+      - Cr\ :sub:`7`
+      - Cr\ :sub:`6`
+      - Cr\ :sub:`5`
+      - Cr\ :sub:`4`
+      - Cr\ :sub:`3`
+      - Cr\ :sub:`2`
+      - Cr\ :sub:`1`
+      - Cr\ :sub:`0`
+      -
+    * .. _V4L2-PIX-FMT-XYUV32:
+
+      - ``V4L2_PIX_FMT_XYUV32``
+      - 'XYUV'
+
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+
+      - Y'\ :sub:`7`
+      - Y'\ :sub:`6`
+      - Y'\ :sub:`5`
+      - Y'\ :sub:`4`
+      - Y'\ :sub:`3`
+      - Y'\ :sub:`2`
+      - Y'\ :sub:`1`
+      - Y'\ :sub:`0`
+
+      - Cb\ :sub:`7`
+      - Cb\ :sub:`6`
+      - Cb\ :sub:`5`
+      - Cb\ :sub:`4`
+      - Cb\ :sub:`3`
+      - Cb\ :sub:`2`
+      - Cb\ :sub:`1`
+      - Cb\ :sub:`0`
+
+      - Cr\ :sub:`7`
+      - Cr\ :sub:`6`
+      - Cr\ :sub:`5`
+      - Cr\ :sub:`4`
+      - Cr\ :sub:`3`
+      - Cr\ :sub:`2`
+      - Cr\ :sub:`1`
+      - Cr\ :sub:`0`
+      -
+    * .. _V4L2-PIX-FMT-VUYA32:
+
+      - ``V4L2_PIX_FMT_VUYA32``
+      - 'VUYA'
+
+      - Cr\ :sub:`7`
+      - Cr\ :sub:`6`
+      - Cr\ :sub:`5`
+      - Cr\ :sub:`4`
+      - Cr\ :sub:`3`
+      - Cr\ :sub:`2`
+      - Cr\ :sub:`1`
+      - Cr\ :sub:`0`
+
+      - Cb\ :sub:`7`
+      - Cb\ :sub:`6`
+      - Cb\ :sub:`5`
+      - Cb\ :sub:`4`
+      - Cb\ :sub:`3`
+      - Cb\ :sub:`2`
+      - Cb\ :sub:`1`
+      - Cb\ :sub:`0`
+
+      - Y'\ :sub:`7`
+      - Y'\ :sub:`6`
+      - Y'\ :sub:`5`
+      - Y'\ :sub:`4`
+      - Y'\ :sub:`3`
+      - Y'\ :sub:`2`
+      - Y'\ :sub:`1`
+      - Y'\ :sub:`0`
+
+      - a\ :sub:`7`
+      - a\ :sub:`6`
+      - a\ :sub:`5`
+      - a\ :sub:`4`
+      - a\ :sub:`3`
+      - a\ :sub:`2`
+      - a\ :sub:`1`
+      - a\ :sub:`0`
+      -
+    * .. _V4L2-PIX-FMT-VUYX32:
+
+      - ``V4L2_PIX_FMT_VUYX32``
+      - 'VUYX'
+
+      - Cr\ :sub:`7`
+      - Cr\ :sub:`6`
+      - Cr\ :sub:`5`
+      - Cr\ :sub:`4`
+      - Cr\ :sub:`3`
+      - Cr\ :sub:`2`
+      - Cr\ :sub:`1`
+      - Cr\ :sub:`0`
+
+      - Cb\ :sub:`7`
+      - Cb\ :sub:`6`
+      - Cb\ :sub:`5`
+      - Cb\ :sub:`4`
+      - Cb\ :sub:`3`
+      - Cb\ :sub:`2`
+      - Cb\ :sub:`1`
+      - Cb\ :sub:`0`
+
+      - Y'\ :sub:`7`
+      - Y'\ :sub:`6`
+      - Y'\ :sub:`5`
+      - Y'\ :sub:`4`
+      - Y'\ :sub:`3`
+      - Y'\ :sub:`2`
+      - Y'\ :sub:`1`
+      - Y'\ :sub:`0`
+
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
 
 .. raw:: latex
 
@@ -202,4 +366,8 @@ component of each pixel in one 16 or 32 bit word.
     #) The value of a = alpha bits is undefined when reading from the driver,
        ignored when writing to the driver, except when alpha blending has
        been negotiated for a :ref:`Video Overlay <overlay>` or
-       :ref:`Video Output Overlay <osd>`.
+       :ref:`Video Output Overlay <osd>` for the formats Y444, YUV555 and
+       YUV4. However, for formats AYUV32 and VUYA32, the alpha component is
+       expected to contain a meaningful value that can be used by drivers
+       and applications. And, the formats XYUV32 and VUYX32 contain undefined
+       alpha values that must be ignored by all applications and drivers.
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 206b7348797e..6f707466b5d2 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1220,6 +1220,10 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_YUV555:	descr = "16-bit A/XYUV 1-5-5-5"; break;
 	case V4L2_PIX_FMT_YUV565:	descr = "16-bit YUV 5-6-5"; break;
 	case V4L2_PIX_FMT_YUV32:	descr = "32-bit A/XYUV 8-8-8-8"; break;
+	case V4L2_PIX_FMT_AYUV32:	descr = "32-bit AYUV 8-8-8-8"; break;
+	case V4L2_PIX_FMT_XYUV32:	descr = "32-bit XYUV 8-8-8-8"; break;
+	case V4L2_PIX_FMT_VUYA32:	descr = "32-bit VUYA 8-8-8-8"; break;
+	case V4L2_PIX_FMT_VUYX32:	descr = "32-bit VUYX 8-8-8-8"; break;
 	case V4L2_PIX_FMT_YUV410:	descr = "Planar YUV 4:1:0"; break;
 	case V4L2_PIX_FMT_YUV420:	descr = "Planar YUV 4:2:0"; break;
 	case V4L2_PIX_FMT_HI240:	descr = "8-bit Dithered RGB (BTTV)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 9a920f071ff9..1db220da3bcc 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -562,6 +562,10 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_YUV555  v4l2_fourcc('Y', 'U', 'V', 'O') /* 16  YUV-5-5-5     */
 #define V4L2_PIX_FMT_YUV565  v4l2_fourcc('Y', 'U', 'V', 'P') /* 16  YUV-5-6-5     */
 #define V4L2_PIX_FMT_YUV32   v4l2_fourcc('Y', 'U', 'V', '4') /* 32  YUV-8-8-8-8   */
+#define V4L2_PIX_FMT_AYUV32  v4l2_fourcc('A', 'Y', 'U', 'V') /* 32  AYUV-8-8-8-8  */
+#define V4L2_PIX_FMT_XYUV32  v4l2_fourcc('X', 'Y', 'U', 'V') /* 32  XYUV-8-8-8-8  */
+#define V4L2_PIX_FMT_VUYA32  v4l2_fourcc('V', 'U', 'Y', 'A') /* 32  VUYA-8-8-8-8  */
+#define V4L2_PIX_FMT_VUYX32  v4l2_fourcc('V', 'U', 'Y', 'X') /* 32  VUYX-8-8-8-8  */
 #define V4L2_PIX_FMT_HI240   v4l2_fourcc('H', 'I', '2', '4') /*  8  8-bit color   */
 #define V4L2_PIX_FMT_HM12    v4l2_fourcc('H', 'M', '1', '2') /*  8  YUV 4:2:0 16x16 macroblocks */
 #define V4L2_PIX_FMT_M420    v4l2_fourcc('M', '4', '2', '0') /* 12  YUV 4:2:0 2 lines y, 1 line uv interleaved */
-- 
cgit v1.2.3


From cfc7740835d09b98a304a4793c93b6074c04379e Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Mon, 4 Feb 2019 05:11:32 -0500
Subject: media: vb2: replace bool by bitfield in vb2_buffer

The bool type is not recommended for use in structs, so replace these
by bitfields.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/common/videobuf2/videobuf2-core.c | 12 ++++++------
 include/media/videobuf2-core.h                  |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 34cc87ca8d59..b4457edc3245 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -934,7 +934,7 @@ void vb2_buffer_done(struct vb2_buffer *vb, enum vb2_buffer_state state)
 		/* sync buffers */
 		for (plane = 0; plane < vb->num_planes; ++plane)
 			call_void_memop(vb, finish, vb->planes[plane].mem_priv);
-		vb->synced = false;
+		vb->synced = 0;
 	}
 
 	spin_lock_irqsave(&q->done_lock, flags);
@@ -1313,8 +1313,8 @@ static int __buf_prepare(struct vb2_buffer *vb)
 	for (plane = 0; plane < vb->num_planes; ++plane)
 		call_void_memop(vb, prepare, vb->planes[plane].mem_priv);
 
-	vb->synced = true;
-	vb->prepared = true;
+	vb->synced = 1;
+	vb->prepared = 1;
 	vb->state = orig_state;
 
 	return 0;
@@ -1802,7 +1802,7 @@ int vb2_core_dqbuf(struct vb2_queue *q, unsigned int *pindex, void *pb,
 	}
 
 	call_void_vb_qop(vb, buf_finish, vb);
-	vb->prepared = false;
+	vb->prepared = 0;
 
 	if (pindex)
 		*pindex = vb->index;
@@ -1926,12 +1926,12 @@ static void __vb2_queue_cancel(struct vb2_queue *q)
 			for (plane = 0; plane < vb->num_planes; ++plane)
 				call_void_memop(vb, finish,
 						vb->planes[plane].mem_priv);
-			vb->synced = false;
+			vb->synced = 0;
 		}
 
 		if (vb->prepared) {
 			call_void_vb_qop(vb, buf_finish, vb);
-			vb->prepared = false;
+			vb->prepared = 0;
 		}
 		__vb2_dqbuf(vb);
 
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 06142c1469cc..6d5490bb91d3 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -269,8 +269,8 @@ struct vb2_buffer {
 	 * vb2_plane:		per-plane information; do not change
 	 */
 	enum vb2_buffer_state	state;
-	bool			synced;
-	bool			prepared;
+	unsigned int		synced:1;
+	unsigned int		prepared:1;
 
 	struct vb2_plane	planes[VB2_MAX_PLANES];
 	struct list_head	queued_entry;
-- 
cgit v1.2.3


From 7e4e71624491d8a8befe62b43138beb0ab696006 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Mon, 4 Feb 2019 05:11:33 -0500
Subject: media: vb2: keep track of timestamp status

If a stream is stopped, or if a USERPTR/DMABUF buffer is queued
backed by a different user address or dmabuf fd, then the timestamp
should be skipped by vb2_find_timestamp since the memory it refers
to is no longer valid.

So keep track of a 'copied_timestamp' state: it is set when the
timestamp is copied from an output to a capture buffer, and is
cleared when it is no longer valid.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/common/videobuf2/videobuf2-core.c | 3 +++
 drivers/media/common/videobuf2/videobuf2-v4l2.c | 3 ++-
 drivers/media/v4l2-core/v4l2-mem2mem.c          | 1 +
 include/media/videobuf2-core.h                  | 3 +++
 4 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index b4457edc3245..275cddd8808c 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -1041,6 +1041,7 @@ static int __prepare_userptr(struct vb2_buffer *vb)
 		if (vb->planes[plane].mem_priv) {
 			if (!reacquired) {
 				reacquired = true;
+				vb->copied_timestamp = 0;
 				call_void_vb_qop(vb, buf_cleanup, vb);
 			}
 			call_void_memop(vb, put_userptr, vb->planes[plane].mem_priv);
@@ -1165,6 +1166,7 @@ static int __prepare_dmabuf(struct vb2_buffer *vb)
 
 		if (!reacquired) {
 			reacquired = true;
+			vb->copied_timestamp = 0;
 			call_void_vb_qop(vb, buf_cleanup, vb);
 		}
 
@@ -1942,6 +1944,7 @@ static void __vb2_queue_cancel(struct vb2_queue *q)
 		if (vb->request)
 			media_request_put(vb->request);
 		vb->request = NULL;
+		vb->copied_timestamp = 0;
 	}
 }
 
diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index 3aeaea3af42a..55277370c313 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -604,7 +604,8 @@ int vb2_find_timestamp(const struct vb2_queue *q, u64 timestamp,
 	unsigned int i;
 
 	for (i = start_idx; i < q->num_buffers; i++)
-		if (q->bufs[i]->timestamp == timestamp)
+		if (q->bufs[i]->copied_timestamp &&
+		    q->bufs[i]->timestamp == timestamp)
 			return i;
 	return -1;
 }
diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index 1494d0d5951a..add228c98a9a 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -992,6 +992,7 @@ void v4l2_m2m_buf_copy_metadata(const struct vb2_v4l2_buffer *out_vb,
 	cap_vb->field = out_vb->field;
 	cap_vb->flags &= ~mask;
 	cap_vb->flags |= out_vb->flags & mask;
+	cap_vb->vb2_buf.copied_timestamp = 1;
 }
 EXPORT_SYMBOL_GPL(v4l2_m2m_buf_copy_metadata);
 
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 6d5490bb91d3..a844abcae71e 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -262,6 +262,8 @@ struct vb2_buffer {
 	 * prepared:		this buffer has been prepared, i.e. the
 	 *			buf_prepare op was called. It is cleared again
 	 *			after the 'buf_finish' op is called.
+	 * copied_timestamp:	the timestamp of this capture buffer was copied
+	 *			from an output buffer.
 	 * queued_entry:	entry on the queued buffers list, which holds
 	 *			all buffers queued from userspace
 	 * done_entry:		entry on the list that stores all buffers ready
@@ -271,6 +273,7 @@ struct vb2_buffer {
 	enum vb2_buffer_state	state;
 	unsigned int		synced:1;
 	unsigned int		prepared:1;
+	unsigned int		copied_timestamp:1;
 
 	struct vb2_plane	planes[VB2_MAX_PLANES];
 	struct list_head	queued_entry;
-- 
cgit v1.2.3


From 8dd22b289c8fab1dcb13ab038c0228aa74f134ca Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@collabora.com>
Date: Fri, 8 Feb 2019 11:17:48 -0500
Subject: media: v4l2-mem2mem: Correct return type for mem2mem buffer helpers

This commit changes the return type of mem2mem buffer handling API.
Namely, these functions:

 v4l2_m2m_next_buf
 v4l2_m2m_last_buf
 v4l2_m2m_buf_remove
 v4l2_m2m_next_src_buf
 v4l2_m2m_next_dst_buf
 v4l2_m2m_last_src_buf
 v4l2_m2m_last_dst_buf
 v4l2_m2m_src_buf_remove
 v4l2_m2m_dst_buf_remove

which currently return void pointer.

In every case, the actual return type is a struct vb2_v4l2_buffer
pointer. Change the return type of the listed functions,
so type checking can be properly used.

Signed-off-by: Ezequiel Garcia <ezequiel@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
[hverkuil-cisco@xs4all.nl: clean up line-too-long checkpatch warnings]
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/v4l2-core/v4l2-mem2mem.c |  6 +++---
 include/media/v4l2-mem2mem.h           | 24 +++++++++++++++---------
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index 57eccbaa52de..3392833d9541 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -131,7 +131,7 @@ struct vb2_queue *v4l2_m2m_get_vq(struct v4l2_m2m_ctx *m2m_ctx,
 }
 EXPORT_SYMBOL(v4l2_m2m_get_vq);
 
-void *v4l2_m2m_next_buf(struct v4l2_m2m_queue_ctx *q_ctx)
+struct vb2_v4l2_buffer *v4l2_m2m_next_buf(struct v4l2_m2m_queue_ctx *q_ctx)
 {
 	struct v4l2_m2m_buffer *b;
 	unsigned long flags;
@@ -149,7 +149,7 @@ void *v4l2_m2m_next_buf(struct v4l2_m2m_queue_ctx *q_ctx)
 }
 EXPORT_SYMBOL_GPL(v4l2_m2m_next_buf);
 
-void *v4l2_m2m_last_buf(struct v4l2_m2m_queue_ctx *q_ctx)
+struct vb2_v4l2_buffer *v4l2_m2m_last_buf(struct v4l2_m2m_queue_ctx *q_ctx)
 {
 	struct v4l2_m2m_buffer *b;
 	unsigned long flags;
@@ -167,7 +167,7 @@ void *v4l2_m2m_last_buf(struct v4l2_m2m_queue_ctx *q_ctx)
 }
 EXPORT_SYMBOL_GPL(v4l2_m2m_last_buf);
 
-void *v4l2_m2m_buf_remove(struct v4l2_m2m_queue_ctx *q_ctx)
+struct vb2_v4l2_buffer *v4l2_m2m_buf_remove(struct v4l2_m2m_queue_ctx *q_ctx)
 {
 	struct v4l2_m2m_buffer *b;
 	unsigned long flags;
diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h
index 47c6d9aa0bf4..bb3e63d6bd1a 100644
--- a/include/media/v4l2-mem2mem.h
+++ b/include/media/v4l2-mem2mem.h
@@ -425,7 +425,7 @@ unsigned int v4l2_m2m_num_dst_bufs_ready(struct v4l2_m2m_ctx *m2m_ctx)
  *
  * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx
  */
-void *v4l2_m2m_next_buf(struct v4l2_m2m_queue_ctx *q_ctx);
+struct vb2_v4l2_buffer *v4l2_m2m_next_buf(struct v4l2_m2m_queue_ctx *q_ctx);
 
 /**
  * v4l2_m2m_next_src_buf() - return next source buffer from the list of ready
@@ -433,7 +433,8 @@ void *v4l2_m2m_next_buf(struct v4l2_m2m_queue_ctx *q_ctx);
  *
  * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx
  */
-static inline void *v4l2_m2m_next_src_buf(struct v4l2_m2m_ctx *m2m_ctx)
+static inline struct vb2_v4l2_buffer *
+v4l2_m2m_next_src_buf(struct v4l2_m2m_ctx *m2m_ctx)
 {
 	return v4l2_m2m_next_buf(&m2m_ctx->out_q_ctx);
 }
@@ -444,7 +445,8 @@ static inline void *v4l2_m2m_next_src_buf(struct v4l2_m2m_ctx *m2m_ctx)
  *
  * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx
  */
-static inline void *v4l2_m2m_next_dst_buf(struct v4l2_m2m_ctx *m2m_ctx)
+static inline struct vb2_v4l2_buffer *
+v4l2_m2m_next_dst_buf(struct v4l2_m2m_ctx *m2m_ctx)
 {
 	return v4l2_m2m_next_buf(&m2m_ctx->cap_q_ctx);
 }
@@ -454,7 +456,7 @@ static inline void *v4l2_m2m_next_dst_buf(struct v4l2_m2m_ctx *m2m_ctx)
  *
  * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx
  */
-void *v4l2_m2m_last_buf(struct v4l2_m2m_queue_ctx *q_ctx);
+struct vb2_v4l2_buffer *v4l2_m2m_last_buf(struct v4l2_m2m_queue_ctx *q_ctx);
 
 /**
  * v4l2_m2m_last_src_buf() - return last destination buffer from the list of
@@ -462,7 +464,8 @@ void *v4l2_m2m_last_buf(struct v4l2_m2m_queue_ctx *q_ctx);
  *
  * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx
  */
-static inline void *v4l2_m2m_last_src_buf(struct v4l2_m2m_ctx *m2m_ctx)
+static inline struct vb2_v4l2_buffer *
+v4l2_m2m_last_src_buf(struct v4l2_m2m_ctx *m2m_ctx)
 {
 	return v4l2_m2m_last_buf(&m2m_ctx->out_q_ctx);
 }
@@ -473,7 +476,8 @@ static inline void *v4l2_m2m_last_src_buf(struct v4l2_m2m_ctx *m2m_ctx)
  *
  * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx
  */
-static inline void *v4l2_m2m_last_dst_buf(struct v4l2_m2m_ctx *m2m_ctx)
+static inline struct vb2_v4l2_buffer *
+v4l2_m2m_last_dst_buf(struct v4l2_m2m_ctx *m2m_ctx)
 {
 	return v4l2_m2m_last_buf(&m2m_ctx->cap_q_ctx);
 }
@@ -547,7 +551,7 @@ struct vb2_queue *v4l2_m2m_get_dst_vq(struct v4l2_m2m_ctx *m2m_ctx)
  *
  * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx
  */
-void *v4l2_m2m_buf_remove(struct v4l2_m2m_queue_ctx *q_ctx);
+struct vb2_v4l2_buffer *v4l2_m2m_buf_remove(struct v4l2_m2m_queue_ctx *q_ctx);
 
 /**
  * v4l2_m2m_src_buf_remove() - take off a source buffer from the list of ready
@@ -555,7 +559,8 @@ void *v4l2_m2m_buf_remove(struct v4l2_m2m_queue_ctx *q_ctx);
  *
  * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx
  */
-static inline void *v4l2_m2m_src_buf_remove(struct v4l2_m2m_ctx *m2m_ctx)
+static inline struct vb2_v4l2_buffer *
+v4l2_m2m_src_buf_remove(struct v4l2_m2m_ctx *m2m_ctx)
 {
 	return v4l2_m2m_buf_remove(&m2m_ctx->out_q_ctx);
 }
@@ -566,7 +571,8 @@ static inline void *v4l2_m2m_src_buf_remove(struct v4l2_m2m_ctx *m2m_ctx)
  *
  * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx
  */
-static inline void *v4l2_m2m_dst_buf_remove(struct v4l2_m2m_ctx *m2m_ctx)
+static inline struct vb2_v4l2_buffer *
+v4l2_m2m_dst_buf_remove(struct v4l2_m2m_ctx *m2m_ctx)
 {
 	return v4l2_m2m_buf_remove(&m2m_ctx->cap_q_ctx);
 }
-- 
cgit v1.2.3


From 721074b03411327e7bf41555d4cc7c18f49313f7 Mon Sep 17 00:00:00 2001
From: Patrick Lerda <patrick9876@free.fr>
Date: Thu, 17 Jan 2019 03:50:13 -0500
Subject: media: rc: rcmm decoder and encoder

media: add support for RCMM infrared remote controls.

Signed-off-by: Patrick Lerda <patrick9876@free.fr>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/lirc.h.rst.exceptions |   3 +
 MAINTAINERS                               |   5 +
 drivers/media/rc/Kconfig                  |  13 ++
 drivers/media/rc/Makefile                 |   1 +
 drivers/media/rc/ir-rcmm-decoder.c        | 254 ++++++++++++++++++++++++++++++
 drivers/media/rc/rc-core-priv.h           |   5 +
 drivers/media/rc/rc-main.c                |   9 ++
 include/media/rc-map.h                    |  14 +-
 include/uapi/linux/lirc.h                 |   6 +
 tools/include/uapi/linux/lirc.h           |  12 ++
 tools/testing/selftests/ir/ir_loopback.c  |   9 ++
 11 files changed, 328 insertions(+), 3 deletions(-)
 create mode 100644 drivers/media/rc/ir-rcmm-decoder.c

(limited to 'include')

diff --git a/Documentation/media/lirc.h.rst.exceptions b/Documentation/media/lirc.h.rst.exceptions
index 379b9e7df5d0..7a8b8ff4f076 100644
--- a/Documentation/media/lirc.h.rst.exceptions
+++ b/Documentation/media/lirc.h.rst.exceptions
@@ -60,6 +60,9 @@ ignore symbol RC_PROTO_SHARP
 ignore symbol RC_PROTO_XMP
 ignore symbol RC_PROTO_CEC
 ignore symbol RC_PROTO_IMON
+ignore symbol RC_PROTO_RCMM12
+ignore symbol RC_PROTO_RCMM24
+ignore symbol RC_PROTO_RCMM32
 
 # Undocumented macros
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 914d333ee73d..9662ad6c58e4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16536,6 +16536,11 @@ M:	David Härdeman <david@hardeman.nu>
 S:	Maintained
 F:	drivers/media/rc/winbond-cir.c
 
+RCMM REMOTE CONTROLS DECODER
+M:	Patrick Lerda <patrick9876@free.fr>
+S:	Maintained
+F:	drivers/media/rc/ir-rcmm-decoder.c
+
 WINSYSTEMS EBC-C384 WATCHDOG DRIVER
 M:	William Breathitt Gray <vilhelm.gray@gmail.com>
 L:	linux-watchdog@vger.kernel.org
diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
index 8a216068a35a..8e95f6eac89d 100644
--- a/drivers/media/rc/Kconfig
+++ b/drivers/media/rc/Kconfig
@@ -133,6 +133,19 @@ config IR_IMON_DECODER
 	   remote control and you would like to use it with a raw IR
 	   receiver, or if you wish to use an encoder to transmit this IR.
 
+config IR_RCMM_DECODER
+	tristate "Enable IR raw decoder for the RC-MM protocol"
+	depends on RC_CORE
+	help
+	   Enable this option when you have IR with RC-MM protocol, and
+	   you need the software decoder. The driver supports 12,
+	   24 and 32 bits RC-MM variants. You can enable or disable the
+	   different modes using the following RC protocol keywords:
+	   'rc-mm-12', 'rc-mm-24' and 'rc-mm-32'.
+
+	   To compile this driver as a module, choose M here: the module
+	   will be called ir-rcmm-decoder.
+
 endif #RC_DECODERS
 
 menuconfig RC_DEVICES
diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
index 92c163816849..48d23433b3c0 100644
--- a/drivers/media/rc/Makefile
+++ b/drivers/media/rc/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_IR_SHARP_DECODER) += ir-sharp-decoder.o
 obj-$(CONFIG_IR_MCE_KBD_DECODER) += ir-mce_kbd-decoder.o
 obj-$(CONFIG_IR_XMP_DECODER) += ir-xmp-decoder.o
 obj-$(CONFIG_IR_IMON_DECODER) += ir-imon-decoder.o
+obj-$(CONFIG_IR_RCMM_DECODER) += ir-rcmm-decoder.o
 
 # stand-alone IR receivers/transmitters
 obj-$(CONFIG_RC_ATI_REMOTE) += ati_remote.o
diff --git a/drivers/media/rc/ir-rcmm-decoder.c b/drivers/media/rc/ir-rcmm-decoder.c
new file mode 100644
index 000000000000..f1096ac1e5c5
--- /dev/null
+++ b/drivers/media/rc/ir-rcmm-decoder.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0+
+// ir-rcmm-decoder.c - A decoder for the RCMM IR protocol
+//
+// Copyright (C) 2018 by Patrick Lerda <patrick9876@free.fr>
+
+#include "rc-core-priv.h"
+#include <linux/module.h>
+#include <linux/version.h>
+
+#define RCMM_UNIT		166667	/* nanosecs */
+#define RCMM_PREFIX_PULSE	416666  /* 166666.666666666*2.5 */
+#define RCMM_PULSE_0            277777  /* 166666.666666666*(1+2/3) */
+#define RCMM_PULSE_1            444444  /* 166666.666666666*(2+2/3) */
+#define RCMM_PULSE_2            611111  /* 166666.666666666*(3+2/3) */
+#define RCMM_PULSE_3            777778  /* 166666.666666666*(4+2/3) */
+
+enum rcmm_state {
+	STATE_INACTIVE,
+	STATE_LOW,
+	STATE_BUMP,
+	STATE_VALUE,
+	STATE_FINISHED,
+};
+
+static bool rcmm_mode(const struct rcmm_dec *data)
+{
+	return !((0x000c0000 & data->bits) == 0x000c0000);
+}
+
+static int rcmm_miscmode(struct rc_dev *dev, struct rcmm_dec *data)
+{
+	switch (data->count) {
+	case 24:
+		if (dev->enabled_protocols & RC_PROTO_BIT_RCMM24) {
+			rc_keydown(dev, RC_PROTO_RCMM24, data->bits, 0);
+			data->state = STATE_INACTIVE;
+			return 0;
+		}
+		return -1;
+
+	case 12:
+		if (dev->enabled_protocols & RC_PROTO_BIT_RCMM12) {
+			rc_keydown(dev, RC_PROTO_RCMM12, data->bits, 0);
+			data->state = STATE_INACTIVE;
+			return 0;
+		}
+		return -1;
+	}
+
+	return -1;
+}
+
+/**
+ * ir_rcmm_decode() - Decode one RCMM pulse or space
+ * @dev:	the struct rc_dev descriptor of the device
+ * @ev:		the struct ir_raw_event descriptor of the pulse/space
+ *
+ * This function returns -EINVAL if the pulse violates the state machine
+ */
+static int ir_rcmm_decode(struct rc_dev *dev, struct ir_raw_event ev)
+{
+	struct rcmm_dec *data = &dev->raw->rcmm;
+	u32 scancode;
+	u8 toggle;
+	int value;
+
+	if (!(dev->enabled_protocols & (RC_PROTO_BIT_RCMM32 |
+							RC_PROTO_BIT_RCMM24 |
+							RC_PROTO_BIT_RCMM12)))
+		return 0;
+
+	if (!is_timing_event(ev)) {
+		if (ev.reset)
+			data->state = STATE_INACTIVE;
+		return 0;
+	}
+
+	switch (data->state) {
+	case STATE_INACTIVE:
+		if (!ev.pulse)
+			break;
+
+		if (!eq_margin(ev.duration, RCMM_PREFIX_PULSE, RCMM_UNIT / 2))
+			break;
+
+		data->state = STATE_LOW;
+		data->count = 0;
+		data->bits  = 0;
+		return 0;
+
+	case STATE_LOW:
+		if (ev.pulse)
+			break;
+
+		if (!eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2))
+			break;
+
+		data->state = STATE_BUMP;
+		return 0;
+
+	case STATE_BUMP:
+		if (!ev.pulse)
+			break;
+
+		if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2))
+			break;
+
+		data->state = STATE_VALUE;
+		return 0;
+
+	case STATE_VALUE:
+		if (ev.pulse)
+			break;
+
+		if (eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2))
+			value = 0;
+		else if (eq_margin(ev.duration, RCMM_PULSE_1, RCMM_UNIT / 2))
+			value = 1;
+		else if (eq_margin(ev.duration, RCMM_PULSE_2, RCMM_UNIT / 2))
+			value = 2;
+		else if (eq_margin(ev.duration, RCMM_PULSE_3, RCMM_UNIT / 2))
+			value = 3;
+		else
+			value = -1;
+
+		if (value == -1) {
+			if (!rcmm_miscmode(dev, data))
+				return 0;
+			break;
+		}
+
+		data->bits <<= 2;
+		data->bits |= value;
+
+		data->count += 2;
+
+		if (data->count < 32)
+			data->state = STATE_BUMP;
+		else
+			data->state = STATE_FINISHED;
+
+		return 0;
+
+	case STATE_FINISHED:
+		if (!ev.pulse)
+			break;
+
+		if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2))
+			break;
+
+		if (rcmm_mode(data)) {
+			toggle = !!(0x8000 & data->bits);
+			scancode = data->bits & ~0x8000;
+		} else {
+			toggle = 0;
+			scancode = data->bits;
+		}
+
+		if (dev->enabled_protocols & RC_PROTO_BIT_RCMM32) {
+			rc_keydown(dev, RC_PROTO_RCMM32, scancode, toggle);
+			data->state = STATE_INACTIVE;
+			return 0;
+		}
+
+		break;
+	}
+
+	data->state = STATE_INACTIVE;
+	return -EINVAL;
+}
+
+static const int rcmmspace[] = {
+	RCMM_PULSE_0,
+	RCMM_PULSE_1,
+	RCMM_PULSE_2,
+	RCMM_PULSE_3,
+};
+
+static int ir_rcmm_rawencoder(struct ir_raw_event **ev, unsigned int max,
+			      unsigned int n, u32 data)
+{
+	int i;
+	int ret;
+
+	ret = ir_raw_gen_pulse_space(ev, &max, RCMM_PREFIX_PULSE, RCMM_PULSE_0);
+	if (ret)
+		return ret;
+
+	for (i = n - 2; i >= 0; i -= 2) {
+		const unsigned int space = rcmmspace[(data >> i) & 3];
+
+		ret = ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, space);
+		if (ret)
+			return ret;
+	}
+
+	return ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, RCMM_PULSE_3 * 2);
+}
+
+static int ir_rcmm_encode(enum rc_proto protocol, u32 scancode,
+			  struct ir_raw_event *events, unsigned int max)
+{
+	struct ir_raw_event *e = events;
+	int ret;
+
+	switch (protocol) {
+	case RC_PROTO_RCMM32:
+		ret = ir_rcmm_rawencoder(&e, max, 32, scancode);
+		break;
+	case RC_PROTO_RCMM24:
+		ret = ir_rcmm_rawencoder(&e, max, 24, scancode);
+		break;
+	case RC_PROTO_RCMM12:
+		ret = ir_rcmm_rawencoder(&e, max, 12, scancode);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	return e - events;
+}
+
+static struct ir_raw_handler rcmm_handler = {
+	.protocols	= RC_PROTO_BIT_RCMM32 |
+			  RC_PROTO_BIT_RCMM24 |
+			  RC_PROTO_BIT_RCMM12,
+	.decode		= ir_rcmm_decode,
+	.encode         = ir_rcmm_encode,
+	.carrier        = 36000,
+	.min_timeout	= RCMM_PULSE_3 + RCMM_UNIT,
+};
+
+static int __init ir_rcmm_decode_init(void)
+{
+	ir_raw_handler_register(&rcmm_handler);
+
+	pr_info("IR RCMM protocol handler initialized\n");
+	return 0;
+}
+
+static void __exit ir_rcmm_decode_exit(void)
+{
+	ir_raw_handler_unregister(&rcmm_handler);
+}
+
+module_init(ir_rcmm_decode_init);
+module_exit(ir_rcmm_decode_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick Lerda");
+MODULE_DESCRIPTION("RCMM IR protocol decoder");
diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h
index c2cbe7f6266c..9f21b3e8b377 100644
--- a/drivers/media/rc/rc-core-priv.h
+++ b/drivers/media/rc/rc-core-priv.h
@@ -131,6 +131,11 @@ struct ir_raw_event_ctrl {
 		unsigned int bits;
 		bool stick_keyboard;
 	} imon;
+	struct rcmm_dec {
+		int state;
+		unsigned int count;
+		u32 bits;
+	} rcmm;
 };
 
 /* Mutex for locking raw IR processing and handler change */
diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
index 66a174979b3c..dc38e9c0a2ff 100644
--- a/drivers/media/rc/rc-main.c
+++ b/drivers/media/rc/rc-main.c
@@ -70,6 +70,12 @@ static const struct {
 	[RC_PROTO_CEC] = { .name = "cec", .repeat_period = 0 },
 	[RC_PROTO_IMON] = { .name = "imon",
 		.scancode_bits = 0x7fffffff, .repeat_period = 114 },
+	[RC_PROTO_RCMM12] = { .name = "rc-mm-12",
+		.scancode_bits = 0x00000fff, .repeat_period = 114 },
+	[RC_PROTO_RCMM24] = { .name = "rc-mm-24",
+		.scancode_bits = 0x00ffffff, .repeat_period = 114 },
+	[RC_PROTO_RCMM32] = { .name = "rc-mm-32",
+		.scancode_bits = 0xffffffff, .repeat_period = 114 },
 };
 
 /* Used to keep track of known keymaps */
@@ -1006,6 +1012,9 @@ static const struct {
 	{ RC_PROTO_BIT_XMP,	"xmp",		"ir-xmp-decoder"	},
 	{ RC_PROTO_BIT_CEC,	"cec",		NULL			},
 	{ RC_PROTO_BIT_IMON,	"imon",		"ir-imon-decoder"	},
+	{ RC_PROTO_BIT_RCMM12 |
+	  RC_PROTO_BIT_RCMM24 |
+	  RC_PROTO_BIT_RCMM32,	"rc-mm",	"ir-rcmm-decoder"	},
 };
 
 /**
diff --git a/include/media/rc-map.h b/include/media/rc-map.h
index d621acadfbf3..e5e86d595645 100644
--- a/include/media/rc-map.h
+++ b/include/media/rc-map.h
@@ -37,6 +37,9 @@
 #define RC_PROTO_BIT_XMP		BIT_ULL(RC_PROTO_XMP)
 #define RC_PROTO_BIT_CEC		BIT_ULL(RC_PROTO_CEC)
 #define RC_PROTO_BIT_IMON		BIT_ULL(RC_PROTO_IMON)
+#define RC_PROTO_BIT_RCMM12		BIT_ULL(RC_PROTO_RCMM12)
+#define RC_PROTO_BIT_RCMM24		BIT_ULL(RC_PROTO_RCMM24)
+#define RC_PROTO_BIT_RCMM32		BIT_ULL(RC_PROTO_RCMM32)
 
 #define RC_PROTO_BIT_ALL \
 			(RC_PROTO_BIT_UNKNOWN | RC_PROTO_BIT_OTHER | \
@@ -51,7 +54,8 @@
 			 RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \
 			 RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \
 			 RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC | \
-			 RC_PROTO_BIT_IMON)
+			 RC_PROTO_BIT_IMON | RC_PROTO_BIT_RCMM12 | \
+			 RC_PROTO_BIT_RCMM24 | RC_PROTO_BIT_RCMM32)
 /* All rc protocols for which we have decoders */
 #define RC_PROTO_BIT_ALL_IR_DECODER \
 			(RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \
@@ -64,7 +68,9 @@
 			 RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \
 			 RC_PROTO_BIT_RC6_6A_24 |  RC_PROTO_BIT_RC6_6A_32 | \
 			 RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \
-			 RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON)
+			 RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON | \
+			 RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \
+			 RC_PROTO_BIT_RCMM32)
 
 #define RC_PROTO_BIT_ALL_IR_ENCODER \
 			(RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \
@@ -77,7 +83,9 @@
 			 RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \
 			 RC_PROTO_BIT_RC6_6A_24 | \
 			 RC_PROTO_BIT_RC6_6A_32 | RC_PROTO_BIT_RC6_MCE | \
-			 RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON)
+			 RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON | \
+			 RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \
+			 RC_PROTO_BIT_RCMM32)
 
 #define RC_SCANCODE_UNKNOWN(x)			(x)
 #define RC_SCANCODE_OTHER(x)			(x)
diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h
index 6b319581882f..45fcbf99d72e 100644
--- a/include/uapi/linux/lirc.h
+++ b/include/uapi/linux/lirc.h
@@ -192,6 +192,9 @@ struct lirc_scancode {
  * @RC_PROTO_XMP: XMP protocol
  * @RC_PROTO_CEC: CEC protocol
  * @RC_PROTO_IMON: iMon Pad protocol
+ * @RC_PROTO_RCMM12: RC-MM protocol 12 bits
+ * @RC_PROTO_RCMM24: RC-MM protocol 24 bits
+ * @RC_PROTO_RCMM32: RC-MM protocol 32 bits
  */
 enum rc_proto {
 	RC_PROTO_UNKNOWN	= 0,
@@ -218,6 +221,9 @@ enum rc_proto {
 	RC_PROTO_XMP		= 21,
 	RC_PROTO_CEC		= 22,
 	RC_PROTO_IMON		= 23,
+	RC_PROTO_RCMM12		= 24,
+	RC_PROTO_RCMM24		= 25,
+	RC_PROTO_RCMM32		= 26,
 };
 
 #endif
diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h
index f189931042a7..45fcbf99d72e 100644
--- a/tools/include/uapi/linux/lirc.h
+++ b/tools/include/uapi/linux/lirc.h
@@ -133,6 +133,12 @@
 
 #define LIRC_SET_WIDEBAND_RECEIVER     _IOW('i', 0x00000023, __u32)
 
+/*
+ * Return the recording timeout, which is either set by
+ * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols.
+ */
+#define LIRC_GET_REC_TIMEOUT	       _IOR('i', 0x00000024, __u32)
+
 /*
  * struct lirc_scancode - decoded scancode with protocol for use with
  *	LIRC_MODE_SCANCODE
@@ -186,6 +192,9 @@ struct lirc_scancode {
  * @RC_PROTO_XMP: XMP protocol
  * @RC_PROTO_CEC: CEC protocol
  * @RC_PROTO_IMON: iMon Pad protocol
+ * @RC_PROTO_RCMM12: RC-MM protocol 12 bits
+ * @RC_PROTO_RCMM24: RC-MM protocol 24 bits
+ * @RC_PROTO_RCMM32: RC-MM protocol 32 bits
  */
 enum rc_proto {
 	RC_PROTO_UNKNOWN	= 0,
@@ -212,6 +221,9 @@ enum rc_proto {
 	RC_PROTO_XMP		= 21,
 	RC_PROTO_CEC		= 22,
 	RC_PROTO_IMON		= 23,
+	RC_PROTO_RCMM12		= 24,
+	RC_PROTO_RCMM24		= 25,
+	RC_PROTO_RCMM32		= 26,
 };
 
 #endif
diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c
index 858c19caf224..570a7358942c 100644
--- a/tools/testing/selftests/ir/ir_loopback.c
+++ b/tools/testing/selftests/ir/ir_loopback.c
@@ -51,6 +51,10 @@ static const struct {
 	{ RC_PROTO_RC6_6A_32, "rc-6-6a-32", 0xffffffff, "rc-6" },
 	{ RC_PROTO_RC6_MCE, "rc-6-mce", 0x00007fff, "rc-6" },
 	{ RC_PROTO_SHARP, "sharp", 0x1fff, "sharp" },
+	{ RC_PROTO_IMON, "imon", 0x7fffffff, "imon" },
+	{ RC_PROTO_RCMM12, "rcmm-12", 0x00000fff, "rcmm" },
+	{ RC_PROTO_RCMM24, "rcmm-24", 0x00ffffff, "rcmm" },
+	{ RC_PROTO_RCMM32, "rcmm-32", 0xffffffff, "rcmm" },
 };
 
 int lirc_open(const char *rc)
@@ -139,6 +143,11 @@ int main(int argc, char **argv)
 			    (((scancode >> 8) ^ ~scancode) & 0xff) == 0)
 				continue;
 
+			if (rc_proto == RC_PROTO_RCMM32 &&
+			    (scancode & 0x000c0000) != 0x000c0000 &&
+			    scancode & 0x00008000)
+				continue;
+
 			struct lirc_scancode lsc = {
 				.rc_proto = rc_proto,
 				.scancode = scancode
-- 
cgit v1.2.3


From fd47c2f99f04249d1ba82c422d1818dcbe193908 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 18 Feb 2019 22:25:43 +0200
Subject: RDMA/restrack: Convert internal DB from hash to XArray

The additions of .doit callbacks posses new access pattern to the resource
entries by some user visible index. Back then, the legacy DB was
implemented as hash because per-index access wasn't needed and XArray
wasn't accepted yet.

Acceptance of XArray together with per-index access requires the refresh
of DB implementation.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c    |  13 ++---
 drivers/infiniband/core/restrack.c | 103 +++++++++++++++++++++++++------------
 include/rdma/restrack.h            |  22 +++++---
 3 files changed, 93 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 5601fa968244..4bf890ae6e28 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -970,6 +970,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 	int start = cb->args[0];
 	bool has_cap_net_admin;
 	struct nlmsghdr *nlh;
+	unsigned long id;
 	u32 index, port = 0;
 	bool filled = false;
 
@@ -1020,7 +1021,12 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 	has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN);
 
 	down_read(&device->res.rwsem);
-	hash_for_each_possible(device->res.hash, res, node, res_type) {
+	/*
+	 * FIXME: if the skip ahead is something common this loop should
+	 * use xas_for_each & xas_pause to optimize, we can have a lot of
+	 * objects.
+	 */
+	xa_for_each(&device->res.xa[res_type], id, res) {
 		if (idx < start)
 			goto next;
 
@@ -1047,11 +1053,6 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 		rdma_restrack_put(res);
 
 		if (ret == -EMSGSIZE)
-			/*
-			 * There is a chance to optimize here.
-			 * It can be done by using list_prepare_entry
-			 * and list_for_each_entry_continue afterwards.
-			 */
 			break;
 		if (ret)
 			goto res_err;
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index f80b37d437ac..b4f302811858 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -12,6 +12,28 @@
 
 #include "cma_priv.h"
 
+static int rt_xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
+			      u32 *next)
+{
+	int err;
+
+	*id = *next;
+	if (*next == U32_MAX)
+		*id = 0;
+
+	xa_lock(xa);
+	err = __xa_alloc(xa, id, U32_MAX, entry, GFP_KERNEL);
+	if (err && *next != U32_MAX) {
+		*id = 0;
+		err = __xa_alloc(xa, id, *next, entry, GFP_KERNEL);
+	}
+
+	if (!err)
+		*next = *id + 1;
+	xa_unlock(xa);
+	return err;
+}
+
 /**
  * rdma_restrack_init() - initialize resource tracking
  * @dev:  IB device
@@ -19,6 +41,10 @@
 void rdma_restrack_init(struct ib_device *dev)
 {
 	struct rdma_restrack_root *res = &dev->res;
+	int i;
+
+	for (i = 0 ; i < RDMA_RESTRACK_MAX; i++)
+		xa_init_flags(&res->xa[i], XA_FLAGS_ALLOC);
 
 	init_rwsem(&res->rwsem);
 }
@@ -46,33 +72,42 @@ void rdma_restrack_clean(struct ib_device *dev)
 	struct rdma_restrack_root *res = &dev->res;
 	struct rdma_restrack_entry *e;
 	char buf[TASK_COMM_LEN];
+	bool found = false;
 	const char *owner;
-	int bkt;
-
-	if (hash_empty(res->hash))
-		return;
-
-	dev = container_of(res, struct ib_device, res);
-	pr_err("restrack: %s", CUT_HERE);
-	dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
-	hash_for_each(res->hash, bkt, e, node) {
-		if (rdma_is_kernel_res(e)) {
-			owner = e->kern_name;
-		} else {
-			/*
-			 * There is no need to call get_task_struct here,
-			 * because we can be here only if there are more
-			 * get_task_struct() call than put_task_struct().
-			 */
-			get_task_comm(buf, e->task);
-			owner = buf;
+	int i;
+
+	for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) {
+		if (!xa_empty(&res->xa[i])) {
+			unsigned long index;
+
+			if (!found) {
+				pr_err("restrack: %s", CUT_HERE);
+				dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
+			}
+			xa_for_each(&res->xa[i], index, e) {
+				if (rdma_is_kernel_res(e)) {
+					owner = e->kern_name;
+				} else {
+					/*
+					 * There is no need to call get_task_struct here,
+					 * because we can be here only if there are more
+					 * get_task_struct() call than put_task_struct().
+					 */
+					get_task_comm(buf, e->task);
+					owner = buf;
+				}
+
+				pr_err("restrack: %s %s object allocated by %s is not freed\n",
+				       rdma_is_kernel_res(e) ? "Kernel" :
+							       "User",
+				       type2str(e->type), owner);
+			}
+			found = true;
 		}
-
-		pr_err("restrack: %s %s object allocated by %s is not freed\n",
-		       rdma_is_kernel_res(e) ? "Kernel" : "User",
-		       type2str(e->type), owner);
+		xa_destroy(&res->xa[i]);
 	}
-	pr_err("restrack: %s", CUT_HERE);
+	if (found)
+		pr_err("restrack: %s", CUT_HERE);
 }
 
 /**
@@ -86,10 +121,11 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
 {
 	struct rdma_restrack_root *res = &dev->res;
 	struct rdma_restrack_entry *e;
+	unsigned long index = 0;
 	u32 cnt = 0;
 
 	down_read(&res->rwsem);
-	hash_for_each_possible(res->hash, e, node, type) {
+	xa_for_each(&res->xa[type], index, e) {
 		if (ns == &init_pid_ns ||
 		    (!rdma_is_kernel_res(e) &&
 		     ns == task_active_pid_ns(e->task)))
@@ -166,16 +202,20 @@ EXPORT_SYMBOL(rdma_restrack_set_task);
 static void rdma_restrack_add(struct rdma_restrack_entry *res)
 {
 	struct ib_device *dev = res_to_dev(res);
+	int ret;
 
 	if (!dev)
 		return;
 
 	kref_init(&res->kref);
 	init_completion(&res->comp);
-	res->valid = true;
 
 	down_write(&dev->res.rwsem);
-	hash_add(dev->res.hash, &res->node, res->type);
+	ret = rt_xa_alloc_cyclic(&dev->res.xa[res->type], &res->id, res,
+				 &dev->res.next_id[res->type]);
+
+	if (!ret)
+		res->valid = true;
 	up_write(&dev->res.rwsem);
 }
 
@@ -241,15 +281,14 @@ void rdma_restrack_del(struct rdma_restrack_entry *res)
 	if (!dev)
 		return;
 
-	rdma_restrack_put(res);
-
-	wait_for_completion(&res->comp);
-
 	down_write(&dev->res.rwsem);
-	hash_del(&res->node);
+	xa_erase(&dev->res.xa[res->type], res->id);
 	res->valid = false;
 	up_write(&dev->res.rwsem);
 
+	rdma_restrack_put(res);
+	wait_for_completion(&res->comp);
+
 out:
 	if (res->task) {
 		put_task_struct(res->task);
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index cc66cc7a11d3..16e11b4c3ec3 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -13,6 +13,7 @@
 #include <linux/completion.h>
 #include <linux/sched/task.h>
 #include <uapi/rdma/rdma_netlink.h>
+#include <linux/xarray.h>
 
 /**
  * enum rdma_restrack_type - HW objects to track
@@ -48,7 +49,6 @@ enum rdma_restrack_type {
 	RDMA_RESTRACK_MAX
 };
 
-#define RDMA_RESTRACK_HASH_BITS	8
 struct ib_device;
 struct rdma_restrack_entry;
 
@@ -62,9 +62,17 @@ struct rdma_restrack_root {
 	 */
 	struct rw_semaphore	rwsem;
 	/**
-	 * @hash: global database for all resources per-device
+	 * @xa: Array of XArray structures to hold restrack entries.
+	 * We want to use array of XArrays because insertion is type
+	 * dependent. For types with xisiting unique ID (like QPN),
+	 * we will insert to that unique index. For other types,
+	 * we insert based on pointers and auto-allocate unique index.
 	 */
-	DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS);
+	struct xarray xa[RDMA_RESTRACK_MAX];
+	/**
+	 * @next_id: Next ID to support cyclic allocation
+	 */
+	u32 next_id[RDMA_RESTRACK_MAX];
 };
 
 /**
@@ -102,10 +110,6 @@ struct rdma_restrack_entry {
 	 * @kern_name: name of owner for the kernel created entities.
 	 */
 	const char		*kern_name;
-	/**
-	 * @node: hash table entry
-	 */
-	struct hlist_node	node;
 	/**
 	 * @type: various objects in restrack database
 	 */
@@ -114,6 +118,10 @@ struct rdma_restrack_entry {
 	 * @user: user resource
 	 */
 	bool			user;
+	/**
+	 * @id: ID to expose to users
+	 */
+	u32 id;
 };
 
 void rdma_restrack_init(struct ib_device *dev);
-- 
cgit v1.2.3


From 18c4c66f76d99df89ad682ba25bafb9227e8ec30 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 18 Feb 2019 22:25:44 +0200
Subject: RDMA/restrack: Translate from ID to restrack object

Add new general helper to get restrack entry given by ID and their
respective type.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/restrack.c | 25 +++++++++++++++++++++++++
 include/rdma/restrack.h            |  3 +++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index b4f302811858..ac97167da81c 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -256,6 +256,31 @@ int __must_check rdma_restrack_get(struct rdma_restrack_entry *res)
 }
 EXPORT_SYMBOL(rdma_restrack_get);
 
+/**
+ * rdma_restrack_get_byid() - translate from ID to restrack object
+ * @dev: IB device
+ * @type: resource track type
+ * @id: ID to take a look
+ *
+ * Return: Pointer to restrack entry or -ENOENT in case of error.
+ */
+struct rdma_restrack_entry *
+rdma_restrack_get_byid(struct ib_device *dev,
+		       enum rdma_restrack_type type, u32 id)
+{
+	struct rdma_restrack_root *rt = &dev->res;
+	struct rdma_restrack_entry *res;
+
+	down_read(&dev->res.rwsem);
+	res = xa_load(&rt->xa[type], id);
+	if (!res || !rdma_restrack_get(res))
+		res = ERR_PTR(-ENOENT);
+	up_read(&dev->res.rwsem);
+
+	return res;
+}
+EXPORT_SYMBOL(rdma_restrack_get_byid);
+
 static void restrack_release(struct kref *kref)
 {
 	struct rdma_restrack_entry *res;
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index 16e11b4c3ec3..44ce32cc0b51 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -179,4 +179,7 @@ int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name,
 int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value);
 int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name,
 			       u64 value);
+struct rdma_restrack_entry *rdma_restrack_get_byid(struct ib_device *dev,
+						   enum rdma_restrack_type type,
+						   u32 id);
 #endif /* _RDMA_RESTRACK_H_ */
-- 
cgit v1.2.3


From 48118527186fb255461ebf3685ab0f1c2680bd9c Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 18 Feb 2019 22:25:46 +0200
Subject: RDMA/restrack: Reduce scope of synchronization lock while updating DB

XArray uses internal lock for updates to XArray. This means that our
external RW lock is needed to ensure that entry is not deleted while we
are performing iteration over list.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/restrack.c | 2 --
 include/rdma/restrack.h            | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index ac97167da81c..076ef6475df8 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -210,13 +210,11 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res)
 	kref_init(&res->kref);
 	init_completion(&res->comp);
 
-	down_write(&dev->res.rwsem);
 	ret = rt_xa_alloc_cyclic(&dev->res.xa[res->type], &res->id, res,
 				 &dev->res.next_id[res->type]);
 
 	if (!ret)
 		res->valid = true;
-	up_write(&dev->res.rwsem);
 }
 
 /**
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index 44ce32cc0b51..53e1a7fb7355 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -58,7 +58,8 @@ struct rdma_restrack_entry;
  */
 struct rdma_restrack_root {
 	/*
-	 * @rwsem: Read/write lock to protect lists
+	 * @rwsem: Read/write lock to protect erase of entry.
+	 * Lists and insertions are protected by XArray internal lock.
 	 */
 	struct rw_semaphore	rwsem;
 	/**
-- 
cgit v1.2.3


From 41eda65c6100930d95bb854a0114f3544593070c Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 18 Feb 2019 22:25:47 +0200
Subject: RDMA/restrack: Hide restrack DB from IB/core

There is no need to expose internals of restrack DB to IB/core.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c   |  6 ++-
 drivers/infiniband/core/nldev.c    | 17 ++++----
 drivers/infiniband/core/restrack.c | 83 ++++++++++++++++++++++++++------------
 drivers/infiniband/core/restrack.h | 39 ++++++++++++++++++
 include/rdma/ib_verbs.h            |  7 ++--
 include/rdma/restrack.h            | 28 -------------
 6 files changed, 114 insertions(+), 66 deletions(-)
 create mode 100644 drivers/infiniband/core/restrack.h

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 3eddc6e67a16..f7e206033d39 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -45,6 +45,7 @@
 #include <rdma/ib_cache.h>
 
 #include "core_priv.h"
+#include "restrack.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("core kernel InfiniBand API");
@@ -338,7 +339,10 @@ struct ib_device *_ib_alloc_device(size_t size)
 	if (!device)
 		return NULL;
 
-	rdma_restrack_init(device);
+	if (rdma_restrack_init(device)) {
+		kfree(device);
+		return NULL;
+	}
 
 	device->dev.class = &ib_class;
 	device->groups[0] = &ib_dev_attr_group;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 0cd95f80f7b4..54312f9626a1 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -39,6 +39,7 @@
 
 #include "core_priv.h"
 #include "cma_priv.h"
+#include "restrack.h"
 
 static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_DEV_INDEX]     = { .type = NLA_U32 },
@@ -1027,6 +1028,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 	unsigned long id;
 	u32 index, port = 0;
 	bool filled = false;
+	struct xarray *xa;
 
 	err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
 			  nldev_policy, NULL);
@@ -1074,13 +1076,14 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 
 	has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN);
 
-	down_read(&device->res.rwsem);
+	xa = &device->res->xa[res_type];
+	down_read(&device->res->rwsem);
 	/*
 	 * FIXME: if the skip ahead is something common this loop should
 	 * use xas_for_each & xas_pause to optimize, we can have a lot of
 	 * objects.
 	 */
-	xa_for_each(&device->res.xa[res_type], id, res) {
+	xa_for_each(xa, id, res) {
 		if (idx < start)
 			goto next;
 
@@ -1101,13 +1104,13 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 		if (!entry_attr) {
 			ret = -EMSGSIZE;
 			rdma_restrack_put(res);
-			up_read(&device->res.rwsem);
+			up_read(&device->res->rwsem);
 			break;
 		}
-		up_read(&device->res.rwsem);
 
+		up_read(&device->res->rwsem);
 		ret = fe->fill_res_func(skb, has_cap_net_admin, res, port);
-		down_read(&device->res.rwsem);
+		down_read(&device->res->rwsem);
 		/*
 		 * Return resource back, but it won't be released till
 		 * the &device->res.rwsem will be released for write.
@@ -1125,7 +1128,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 		nla_nest_end(skb, entry_attr);
 next:		idx++;
 	}
-	up_read(&device->res.rwsem);
+	up_read(&device->res->rwsem);
 
 	nla_nest_end(skb, table_attr);
 	nlmsg_end(skb, nlh);
@@ -1143,7 +1146,7 @@ next:		idx++;
 
 res_err:
 	nla_nest_cancel(skb, table_attr);
-	up_read(&device->res.rwsem);
+	up_read(&device->res->rwsem);
 
 err:
 	nlmsg_cancel(skb, nlh);
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 076ef6475df8..6a4b76c66bcb 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -9,8 +9,10 @@
 #include <linux/mutex.h>
 #include <linux/sched/task.h>
 #include <linux/pid_namespace.h>
+#include <linux/rwsem.h>
 
 #include "cma_priv.h"
+#include "restrack.h"
 
 static int rt_xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
 			      u32 *next)
@@ -35,18 +37,27 @@ static int rt_xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
 }
 
 /**
- * rdma_restrack_init() - initialize resource tracking
+ * rdma_restrack_init() - initialize and allocate resource tracking
  * @dev:  IB device
+ *
+ * Return: 0 on success
  */
-void rdma_restrack_init(struct ib_device *dev)
+int rdma_restrack_init(struct ib_device *dev)
 {
-	struct rdma_restrack_root *res = &dev->res;
+	struct rdma_restrack_root *rt;
 	int i;
 
+	dev->res = kzalloc(sizeof(*rt), GFP_KERNEL);
+	if (!dev->res)
+		return -ENOMEM;
+
+	rt = dev->res;
+
 	for (i = 0 ; i < RDMA_RESTRACK_MAX; i++)
-		xa_init_flags(&res->xa[i], XA_FLAGS_ALLOC);
+		xa_init_flags(&rt->xa[i], XA_FLAGS_ALLOC);
+	init_rwsem(&rt->rwsem);
 
-	init_rwsem(&res->rwsem);
+	return 0;
 }
 
 static const char *type2str(enum rdma_restrack_type type)
@@ -69,7 +80,7 @@ static const char *type2str(enum rdma_restrack_type type)
  */
 void rdma_restrack_clean(struct ib_device *dev)
 {
-	struct rdma_restrack_root *res = &dev->res;
+	struct rdma_restrack_root *rt = dev->res;
 	struct rdma_restrack_entry *e;
 	char buf[TASK_COMM_LEN];
 	bool found = false;
@@ -77,14 +88,16 @@ void rdma_restrack_clean(struct ib_device *dev)
 	int i;
 
 	for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) {
-		if (!xa_empty(&res->xa[i])) {
+		struct xarray *xa = &dev->res->xa[i];
+
+		if (!xa_empty(xa)) {
 			unsigned long index;
 
 			if (!found) {
 				pr_err("restrack: %s", CUT_HERE);
 				dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
 			}
-			xa_for_each(&res->xa[i], index, e) {
+			xa_for_each(xa, index, e) {
 				if (rdma_is_kernel_res(e)) {
 					owner = e->kern_name;
 				} else {
@@ -104,10 +117,12 @@ void rdma_restrack_clean(struct ib_device *dev)
 			}
 			found = true;
 		}
-		xa_destroy(&res->xa[i]);
+		xa_destroy(xa);
 	}
 	if (found)
 		pr_err("restrack: %s", CUT_HERE);
+
+	kfree(rt);
 }
 
 /**
@@ -119,19 +134,19 @@ void rdma_restrack_clean(struct ib_device *dev)
 int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
 			struct pid_namespace *ns)
 {
-	struct rdma_restrack_root *res = &dev->res;
+	struct xarray *xa = &dev->res->xa[type];
 	struct rdma_restrack_entry *e;
 	unsigned long index = 0;
 	u32 cnt = 0;
 
-	down_read(&res->rwsem);
-	xa_for_each(&res->xa[type], index, e) {
+	down_read(&dev->res->rwsem);
+	xa_for_each(xa, index, e) {
 		if (ns == &init_pid_ns ||
 		    (!rdma_is_kernel_res(e) &&
 		     ns == task_active_pid_ns(e->task)))
 			cnt++;
 	}
-	up_read(&res->rwsem);
+	up_read(&dev->res->rwsem);
 	return cnt;
 }
 EXPORT_SYMBOL(rdma_restrack_count);
@@ -202,17 +217,19 @@ EXPORT_SYMBOL(rdma_restrack_set_task);
 static void rdma_restrack_add(struct rdma_restrack_entry *res)
 {
 	struct ib_device *dev = res_to_dev(res);
+	struct rdma_restrack_root *rt;
+	struct xarray *xa;
 	int ret;
 
 	if (!dev)
 		return;
 
+	rt = dev->res;
+	xa = &dev->res->xa[res->type];
+
 	kref_init(&res->kref);
 	init_completion(&res->comp);
-
-	ret = rt_xa_alloc_cyclic(&dev->res.xa[res->type], &res->id, res,
-				 &dev->res.next_id[res->type]);
-
+	ret = rt_xa_alloc_cyclic(xa, &res->id, res, &rt->next_id[res->type]);
 	if (!ret)
 		res->valid = true;
 }
@@ -266,14 +283,14 @@ struct rdma_restrack_entry *
 rdma_restrack_get_byid(struct ib_device *dev,
 		       enum rdma_restrack_type type, u32 id)
 {
-	struct rdma_restrack_root *rt = &dev->res;
+	struct xarray *xa = &dev->res->xa[type];
 	struct rdma_restrack_entry *res;
 
-	down_read(&dev->res.rwsem);
-	res = xa_load(&rt->xa[type], id);
+	down_read(&dev->res->rwsem);
+	res = xa_load(xa, id);
 	if (!res || !rdma_restrack_get(res))
 		res = ERR_PTR(-ENOENT);
-	up_read(&dev->res.rwsem);
+	up_read(&dev->res->rwsem);
 
 	return res;
 }
@@ -295,19 +312,33 @@ EXPORT_SYMBOL(rdma_restrack_put);
 
 void rdma_restrack_del(struct rdma_restrack_entry *res)
 {
-	struct ib_device *dev;
+	struct ib_device *dev = res_to_dev(res);
+	struct xarray *xa;
 
 	if (!res->valid)
 		goto out;
 
-	dev = res_to_dev(res);
+	/*
+	 * All objects except CM_ID set valid device immediately
+	 * after new object is created, it means that for not valid
+	 * objects will still have "dev".
+	 *
+	 * It is not the case for CM_ID, newly created object has
+	 * this field set to NULL and it is set in _cma_attach_to_dev()
+	 * only.
+	 *
+	 * Because we don't want to add any conditions on call
+	 * to rdma_restrack_del(), the check below protects from
+	 * NULL-dereference.
+	 */
 	if (!dev)
 		return;
 
-	down_write(&dev->res.rwsem);
-	xa_erase(&dev->res.xa[res->type], res->id);
+	xa = &dev->res->xa[res->type];
+	down_write(&dev->res->rwsem);
+	xa_erase(xa, res->id);
 	res->valid = false;
-	up_write(&dev->res.rwsem);
+	up_write(&dev->res->rwsem);
 
 	rdma_restrack_put(res);
 	wait_for_completion(&res->comp);
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
new file mode 100644
index 000000000000..cf89ef0b8ed5
--- /dev/null
+++ b/drivers/infiniband/core/restrack.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_CORE_RESTRACK_H_
+#define _RDMA_CORE_RESTRACK_H_
+
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+
+/**
+ * struct rdma_restrack_root - main resource tracking management
+ * entity, per-device
+ */
+struct rdma_restrack_root {
+	/*
+	 * @rwsem: Read/write lock to protect erase of entry.
+	 * Lists and insertions are protected by XArray internal lock.
+	 */
+	struct rw_semaphore	rwsem;
+	/**
+	 * @xa: Array of XArray structures to hold restrack entries.
+	 * We want to use array of XArrays because insertion is type
+	 * dependent. For types with xisiting unique ID (like QPN),
+	 * we will insert to that unique index. For other types,
+	 * we insert based on pointers and auto-allocate unique index.
+	 */
+	struct xarray xa[RDMA_RESTRACK_MAX];
+	/**
+	 * @next_id: Next ID to support cyclic allocation
+	 */
+	u32 next_id[RDMA_RESTRACK_MAX];
+};
+
+
+int rdma_restrack_init(struct ib_device *dev);
+void rdma_restrack_clean(struct ib_device *dev);
+#endif /* _RDMA_CORE_RESTRACK_H_ */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 64ee7c08be22..2a17c2b30073 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2533,6 +2533,8 @@ struct ib_device_ops {
 	DECLARE_RDMA_OBJ_SIZE(ib_pd);
 };
 
+struct rdma_restrack_root;
+
 struct ib_device {
 	/* Do not access @dma_device directly from ULP nor from HW drivers. */
 	struct device                *dma_device;
@@ -2589,10 +2591,7 @@ struct ib_device {
 #endif
 
 	u32                          index;
-	/*
-	 * Implementation details of the RDMA core, don't use in drivers
-	 */
-	struct rdma_restrack_root     res;
+	struct rdma_restrack_root *res;
 
 	const struct uapi_definition   *driver_def;
 	enum rdma_driver_id		driver_id;
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index 53e1a7fb7355..ecf3c7702a4f 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -7,7 +7,6 @@
 #define _RDMA_RESTRACK_H_
 
 #include <linux/typecheck.h>
-#include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/kref.h>
 #include <linux/completion.h>
@@ -50,31 +49,6 @@ enum rdma_restrack_type {
 };
 
 struct ib_device;
-struct rdma_restrack_entry;
-
-/**
- * struct rdma_restrack_root - main resource tracking management
- * entity, per-device
- */
-struct rdma_restrack_root {
-	/*
-	 * @rwsem: Read/write lock to protect erase of entry.
-	 * Lists and insertions are protected by XArray internal lock.
-	 */
-	struct rw_semaphore	rwsem;
-	/**
-	 * @xa: Array of XArray structures to hold restrack entries.
-	 * We want to use array of XArrays because insertion is type
-	 * dependent. For types with xisiting unique ID (like QPN),
-	 * we will insert to that unique index. For other types,
-	 * we insert based on pointers and auto-allocate unique index.
-	 */
-	struct xarray xa[RDMA_RESTRACK_MAX];
-	/**
-	 * @next_id: Next ID to support cyclic allocation
-	 */
-	u32 next_id[RDMA_RESTRACK_MAX];
-};
 
 /**
  * struct rdma_restrack_entry - metadata per-entry
@@ -125,8 +99,6 @@ struct rdma_restrack_entry {
 	u32 id;
 };
 
-void rdma_restrack_init(struct ib_device *dev);
-void rdma_restrack_clean(struct ib_device *dev);
 int rdma_restrack_count(struct ib_device *dev,
 			enum rdma_restrack_type type,
 			struct pid_namespace *ns);
-- 
cgit v1.2.3


From 517b773e0f612d608cbc62a08c55601bd56f73f6 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 18 Feb 2019 22:25:49 +0200
Subject: RDMA/nldev: Share with user-space object IDs

Give to the user space tools unique identifier for PD, MR, CQ and CM_ID
objects, so they can be able to query on them with .doit callbacks.

QP .doit is not supported yet, till all drivers will be updated to provide
their LQPN to be equal to their restrack ID.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 20 ++++++++++++++++++++
 include/uapi/rdma/rdma_netlink.h |  9 +++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 9b4f891771c4..81d7ee3dcb20 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -108,6 +108,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
 	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_RES_PDN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -466,6 +470,9 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
 		    &cm_id->route.addr.dst_addr))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -494,6 +501,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -522,6 +532,9 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			      RDMA_NLDEV_ATTR_PAD))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -552,6 +565,9 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			      atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -893,6 +909,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
 		.entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_CM_IDN,
 	},
 	[RDMA_RESTRACK_CQ] = {
 		.fill_res_func = fill_res_cq_entry,
@@ -900,6 +917,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_CQN,
 	},
 	[RDMA_RESTRACK_MR] = {
 		.fill_res_func = fill_res_mr_entry,
@@ -907,6 +925,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_MRN,
 	},
 	[RDMA_RESTRACK_PD] = {
 		.fill_res_func = fill_res_pd_entry,
@@ -914,6 +933,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_PDN,
 	},
 };
 
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 3a9e681e4257..43362132e0d7 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -456,6 +456,15 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_DRIVER_S64,		/* s64 */
 	RDMA_NLDEV_ATTR_DRIVER_U64,		/* u64 */
 
+	/*
+	 * Indexes to get/set secific entry,
+	 * for QP use RDMA_NLDEV_ATTR_RES_LQPN
+	 */
+	RDMA_NLDEV_ATTR_RES_PDN,               /* u32 */
+	RDMA_NLDEV_ATTR_RES_CQN,               /* u32 */
+	RDMA_NLDEV_ATTR_RES_MRN,               /* u32 */
+	RDMA_NLDEV_ATTR_RES_CM_IDN,            /* u32 */
+
 	/*
 	 * Always the end
 	 */
-- 
cgit v1.2.3


From c3d02788b45ab4a2d8f243b98c04b549c8193af6 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 18 Feb 2019 22:25:50 +0200
Subject: RDMA/nldev: Provide parent IDs for PD, MR and QP objects

PD, MR and QP objects have parents objects: contexts and PDs.  The exposed
parent IDs allow to correlate various objects and simplify debug
investigation.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 18 ++++++++++++++++++
 include/uapi/rdma/rdma_netlink.h |  1 +
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 81d7ee3dcb20..e6c7cc510556 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -112,6 +112,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -420,6 +421,10 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
 		goto err;
 
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -503,6 +508,10 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
 		goto err;
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+			cq->uobject->context->res.id))
+		goto err;
 
 	if (fill_res_name_pid(msg, res))
 		goto err;
@@ -535,6 +544,10 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
 		goto err;
 
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -568,6 +581,11 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id))
 		goto err;
 
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+			pd->uobject->context->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 43362132e0d7..4ebbcfb2c6ef 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -464,6 +464,7 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_RES_CQN,               /* u32 */
 	RDMA_NLDEV_ATTR_RES_MRN,               /* u32 */
 	RDMA_NLDEV_ATTR_RES_CM_IDN,            /* u32 */
+	RDMA_NLDEV_ATTR_RES_CTXN,	       /* u32 */
 
 	/*
 	 * Always the end
-- 
cgit v1.2.3


From ea1075edcbab7d92f4e4ccf5490043f796bf78be Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 12 Feb 2019 21:12:47 -0700
Subject: RDMA: Add and use rdma_for_each_port

We have many loops iterating over all of the end port numbers on a struct
ib_device, simplify them with a for_each helper.

Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 .clang-format                             |  1 +
 drivers/infiniband/core/cache.c           |  6 +++---
 drivers/infiniband/core/cma.c             |  7 +++----
 drivers/infiniband/core/device.c          | 26 ++++++++++++--------------
 drivers/infiniband/core/mad.c             |  4 ++--
 drivers/infiniband/core/nldev.c           |  4 ++--
 drivers/infiniband/core/security.c        | 11 +++++++----
 drivers/infiniband/core/sysfs.c           | 12 +++---------
 drivers/infiniband/core/user_mad.c        |  9 +++++----
 drivers/infiniband/ulp/ipoib/ipoib_main.c |  4 ++--
 drivers/infiniband/ulp/srp/ib_srp.c       |  5 +++--
 include/rdma/ib_verbs.h                   | 10 ++++++++++
 12 files changed, 53 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index 335ce29ab813..201a4f531b90 100644
--- a/.clang-format
+++ b/.clang-format
@@ -361,6 +361,7 @@ ForEachMacros:
   - 'radix_tree_for_each_slot'
   - 'radix_tree_for_each_tagged'
   - 'rbtree_postorder_for_each_entry_safe'
+  - 'rdma_for_each_port'
   - 'resource_list_for_each_entry'
   - 'resource_list_for_each_entry_safe'
   - 'rhl_for_each_entry_rcu'
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 2338d0b3a0ca..3d137d8381a9 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -1428,7 +1428,7 @@ static void ib_cache_event(struct ib_event_handler *handler,
 
 int ib_cache_setup_one(struct ib_device *device)
 {
-	int p;
+	unsigned int p;
 	int err;
 
 	rwlock_init(&device->cache.lock);
@@ -1447,8 +1447,8 @@ int ib_cache_setup_one(struct ib_device *device)
 		return err;
 	}
 
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
-		ib_cache_update(device, p + rdma_start_port(device), true);
+	rdma_for_each_port (device, p)
+		ib_cache_update(device, p, true);
 
 	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
 			      device, ib_cache_event);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index c43512752b8a..68c997be2429 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -659,7 +659,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
 	struct cma_device *cma_dev;
 	enum ib_gid_type gid_type;
 	int ret = -ENODEV;
-	u8 port;
+	unsigned int port;
 
 	if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
 	    id_priv->id.ps == RDMA_PS_IPOIB)
@@ -673,8 +673,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
 
 	mutex_lock(&lock);
 	list_for_each_entry(cma_dev, &dev_list, list) {
-		for (port = rdma_start_port(cma_dev->device);
-		     port <= rdma_end_port(cma_dev->device); port++) {
+		rdma_for_each_port (cma_dev->device, port) {
 			gidp = rdma_protocol_roce(cma_dev->device, port) ?
 			       &iboe_gid : &gid;
 			gid_type = cma_dev->default_gid_type[port - 1];
@@ -4548,7 +4547,7 @@ static void cma_add_one(struct ib_device *device)
 	if (!cma_dev->default_roce_tos)
 		goto free_gid_type;
 
-	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+	rdma_for_each_port (device, i) {
 		supported_gids = roce_gid_type_mask_support(device, i);
 		WARN_ON(!supported_gids);
 		if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE))
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f7e206033d39..71582f848a9c 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -470,10 +470,8 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
 
 static int read_port_immutable(struct ib_device *device)
 {
+	unsigned int port;
 	int ret;
-	u8 start_port = rdma_start_port(device);
-	u8 end_port = rdma_end_port(device);
-	u8 port;
 
 	/**
 	 * device->port_immutable is indexed directly by the port number to make
@@ -482,13 +480,13 @@ static int read_port_immutable(struct ib_device *device)
 	 * Therefore port_immutable is declared as a 1 based array with
 	 * potential empty slots at the beginning.
 	 */
-	device->port_immutable = kcalloc(end_port + 1,
-					 sizeof(*device->port_immutable),
-					 GFP_KERNEL);
+	device->port_immutable =
+		kcalloc(rdma_end_port(device) + 1,
+			sizeof(*device->port_immutable), GFP_KERNEL);
 	if (!device->port_immutable)
 		return -ENOMEM;
 
-	for (port = start_port; port <= end_port; ++port) {
+	rdma_for_each_port (device, port) {
 		ret = device->ops.get_port_immutable(
 			device, port, &device->port_immutable[port]);
 		if (ret)
@@ -540,9 +538,9 @@ static void ib_policy_change_task(struct work_struct *work)
 
 	down_read(&devices_rwsem);
 	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
-		int i;
+		unsigned int i;
 
-		for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
+		rdma_for_each_port (dev, i) {
 			u64 sp;
 			int ret = ib_get_cached_subnet_prefix(dev,
 							      i,
@@ -1060,10 +1058,9 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
 			 roce_netdev_callback cb,
 			 void *cookie)
 {
-	u8 port;
+	unsigned int port;
 
-	for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
-	     port++)
+	rdma_for_each_port (ib_dev, port)
 		if (rdma_protocol_roce(ib_dev, port)) {
 			struct net_device *idev = NULL;
 
@@ -1217,9 +1214,10 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
 		u8 *port_num, u16 *index)
 {
 	union ib_gid tmp_gid;
-	int ret, port, i;
+	unsigned int port;
+	int ret, i;
 
-	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+	rdma_for_each_port (device, port) {
 		if (!rdma_protocol_ib(device, port))
 			continue;
 
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 7870823bac47..e742a6a2c138 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -3326,9 +3326,9 @@ error:
 
 static void ib_mad_remove_device(struct ib_device *device, void *client_data)
 {
-	int i;
+	unsigned int i;
 
-	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+	rdma_for_each_port (device, i) {
 		if (!rdma_cap_ib_mad(device, i))
 			continue;
 
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index e7350d9d60e9..85f6f2bcce40 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -774,7 +774,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
 	u32 idx = 0;
 	u32 ifindex;
 	int err;
-	u32 p;
+	unsigned int p;
 
 	err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
 			  nldev_policy, NULL);
@@ -786,7 +786,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
 	if (!device)
 		return -EINVAL;
 
-	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+	rdma_for_each_port (device, p) {
 		/*
 		 * The dumpit function returns all information from specific
 		 * index. This specific index is taken from the netlink
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index dad6a94a43f3..492702b83600 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -422,12 +422,15 @@ void ib_close_shared_qp_security(struct ib_qp_security *sec)
 
 int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
 {
-	u8 i = rdma_start_port(dev);
+	unsigned int i;
 	bool is_ib = false;
 	int ret;
 
-	while (i <= rdma_end_port(dev) && !is_ib)
+	rdma_for_each_port (dev, i) {
 		is_ib = rdma_protocol_ib(dev, i++);
+		if (is_ib)
+			break;
+	}
 
 	/* If this isn't an IB device don't create the security context */
 	if (!is_ib)
@@ -561,9 +564,9 @@ void ib_security_cache_change(struct ib_device *device,
 void ib_security_release_port_pkey_list(struct ib_device *device)
 {
 	struct pkey_index_qp_list *pkey, *tmp_pkey;
-	int i;
+	unsigned int i;
 
-	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+	rdma_for_each_port (device, i) {
 		list_for_each_entry_safe(pkey,
 					 tmp_pkey,
 					 &device->port_pkey_list[i].pkey_list,
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 9335b15c2e38..9b6a065bdfa5 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -1308,23 +1308,17 @@ static void ib_free_port_attrs(struct ib_device *device)
 
 static int ib_setup_port_attrs(struct ib_device *device)
 {
+	unsigned int port;
 	int ret;
-	int i;
 
 	device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj);
 	if (!device->ports_kobj)
 		return -ENOMEM;
 
-	if (rdma_cap_ib_switch(device)) {
-		ret = add_port(device, 0);
+	rdma_for_each_port (device, port) {
+		ret = add_port(device, port);
 		if (ret)
 			goto err_put;
-	} else {
-		for (i = 1; i <= device->phys_port_cnt; ++i) {
-			ret = add_port(device, i);
-			if (ret)
-				goto err_put;
-		}
 	}
 
 	return 0;
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 3ebd211a87ed..02b7947ab215 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -1323,14 +1323,15 @@ free:
 static void ib_umad_remove_one(struct ib_device *device, void *client_data)
 {
 	struct ib_umad_device *umad_dev = client_data;
-	int i;
+	unsigned int i;
 
 	if (!umad_dev)
 		return;
 
-	for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) {
-		if (rdma_cap_ib_mad(device, i + rdma_start_port(device)))
-			ib_umad_kill_port(&umad_dev->ports[i]);
+	rdma_for_each_port (device, i) {
+		if (rdma_cap_ib_mad(device, i))
+			ib_umad_kill_port(
+				&umad_dev->ports[i - rdma_start_port(device)]);
 	}
 	/* balances kref_init() */
 	ib_umad_dev_put(umad_dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index ee4cca80f00b..48eda16db1a7 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -2495,7 +2495,7 @@ static void ipoib_add_one(struct ib_device *device)
 	struct list_head *dev_list;
 	struct net_device *dev;
 	struct ipoib_dev_priv *priv;
-	int p;
+	unsigned int p;
 	int count = 0;
 
 	dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);
@@ -2504,7 +2504,7 @@ static void ipoib_add_one(struct ib_device *device)
 
 	INIT_LIST_HEAD(dev_list);
 
-	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+	rdma_for_each_port (device, p) {
 		if (!rdma_protocol_ib(device, p))
 			continue;
 		dev = ipoib_add_port("ib%d", device, p);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 84184910f038..151f4eba84b8 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -4127,7 +4127,8 @@ static void srp_add_one(struct ib_device *device)
 	struct srp_device *srp_dev;
 	struct ib_device_attr *attr = &device->attrs;
 	struct srp_host *host;
-	int mr_page_shift, p;
+	int mr_page_shift;
+	unsigned int p;
 	u64 max_pages_per_mr;
 	unsigned int flags = 0;
 
@@ -4194,7 +4195,7 @@ static void srp_add_one(struct ib_device *device)
 		WARN_ON_ONCE(srp_dev->global_rkey == 0);
 	}
 
-	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+	rdma_for_each_port (device, p) {
 		host = srp_add_port(srp_dev, p);
 		if (host)
 			list_add_tail(&host->list, &srp_dev->dev_list);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 2a17c2b30073..fa0edd6ae33c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2827,6 +2827,16 @@ static inline u8 rdma_start_port(const struct ib_device *device)
 	return rdma_cap_ib_switch(device) ? 0 : 1;
 }
 
+/**
+ * rdma_for_each_port - Iterate over all valid port numbers of the IB device
+ * @device - The struct ib_device * to iterate over
+ * @iter - The unsigned int to store the port number
+ */
+#define rdma_for_each_port(device, iter)                                       \
+	for (iter = rdma_start_port(device + BUILD_BUG_ON_ZERO(!__same_type(   \
+						     unsigned int, iter)));    \
+	     iter <= rdma_end_port(device); (iter)++)
+
 /**
  * rdma_end_port - Return the last valid port number for the device
  * specified
-- 
cgit v1.2.3


From 8ceb1357b33790193e9d55d2d09bcfd6bd59dd6d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 12 Feb 2019 21:12:48 -0700
Subject: RDMA/device: Consolidate ib_device per_port data into one place

There is no reason to have three allocations of per-port data. Combine
them together and make the lifetime for all the per-port data match the
struct ib_device.

Following patches will require more port-specific data, now there is a
good place to put it.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cache.c    |  4 +--
 drivers/infiniband/core/device.c   | 70 ++++++++++++------------------------
 drivers/infiniband/core/security.c | 24 ++++++-------
 include/rdma/ib_verbs.h            | 74 ++++++++++++++++++++++----------------
 4 files changed, 78 insertions(+), 94 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 3d137d8381a9..9d0e8aca741a 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -881,8 +881,8 @@ static int _gid_table_setup_one(struct ib_device *ib_dev)
 	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
 		u8 rdma_port = port + rdma_start_port(ib_dev);
 
-		table =	alloc_gid_table(
-				ib_dev->port_immutable[rdma_port].gid_tbl_len);
+		table = alloc_gid_table(
+			ib_dev->port_data[rdma_port].immutable.gid_tbl_len);
 		if (!table)
 			goto rollback_table_setup;
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 71582f848a9c..8d7d63a60ef5 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -293,8 +293,7 @@ static void ib_device_release(struct device *device)
 	WARN_ON(refcount_read(&dev->refcount));
 	ib_cache_release_one(dev);
 	ib_security_release_port_pkey_list(dev);
-	kfree(dev->port_pkey_list);
-	kfree(dev->port_immutable);
+	kfree(dev->port_data);
 	xa_destroy(&dev->client_data);
 	kfree(dev);
 }
@@ -468,27 +467,31 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
 			    rdma_max_mad_size(dev, port) != 0);
 }
 
-static int read_port_immutable(struct ib_device *device)
+static int setup_port_data(struct ib_device *device)
 {
 	unsigned int port;
 	int ret;
 
-	/**
-	 * device->port_immutable is indexed directly by the port number to make
+	/*
+	 * device->port_data is indexed directly by the port number to make
 	 * access to this data as efficient as possible.
 	 *
-	 * Therefore port_immutable is declared as a 1 based array with
-	 * potential empty slots at the beginning.
+	 * Therefore port_data is declared as a 1 based array with potential
+	 * empty slots at the beginning.
 	 */
-	device->port_immutable =
-		kcalloc(rdma_end_port(device) + 1,
-			sizeof(*device->port_immutable), GFP_KERNEL);
-	if (!device->port_immutable)
+	device->port_data = kcalloc(rdma_end_port(device) + 1,
+				    sizeof(*device->port_data), GFP_KERNEL);
+	if (!device->port_data)
 		return -ENOMEM;
 
 	rdma_for_each_port (device, port) {
-		ret = device->ops.get_port_immutable(
-			device, port, &device->port_immutable[port]);
+		struct ib_port_data *pdata = &device->port_data[port];
+
+		spin_lock_init(&pdata->pkey_list_lock);
+		INIT_LIST_HEAD(&pdata->pkey_list);
+
+		ret = device->ops.get_port_immutable(device, port,
+						     &pdata->immutable);
 		if (ret)
 			return ret;
 
@@ -507,30 +510,6 @@ void ib_get_device_fw_str(struct ib_device *dev, char *str)
 }
 EXPORT_SYMBOL(ib_get_device_fw_str);
 
-static int setup_port_pkey_list(struct ib_device *device)
-{
-	int i;
-
-	/**
-	 * device->port_pkey_list is indexed directly by the port number,
-	 * Therefore it is declared as a 1 based array with potential empty
-	 * slots at the beginning.
-	 */
-	device->port_pkey_list = kcalloc(rdma_end_port(device) + 1,
-					 sizeof(*device->port_pkey_list),
-					 GFP_KERNEL);
-
-	if (!device->port_pkey_list)
-		return -ENOMEM;
-
-	for (i = 0; i < (rdma_end_port(device) + 1); i++) {
-		spin_lock_init(&device->port_pkey_list[i].list_lock);
-		INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list);
-	}
-
-	return 0;
-}
-
 static void ib_policy_change_task(struct work_struct *work)
 {
 	struct ib_device *dev;
@@ -668,10 +647,9 @@ static int setup_device(struct ib_device *device)
 	if (ret)
 		return ret;
 
-	ret = read_port_immutable(device);
+	ret = setup_port_data(device);
 	if (ret) {
-		dev_warn(&device->dev,
-			 "Couldn't create per port immutable data\n");
+		dev_warn(&device->dev, "Couldn't create per-port data\n");
 		return ret;
 	}
 
@@ -683,12 +661,6 @@ static int setup_device(struct ib_device *device)
 		return ret;
 	}
 
-	ret = setup_port_pkey_list(device);
-	if (ret) {
-		dev_warn(&device->dev, "Couldn't create per port_pkey_list\n");
-		return ret;
-	}
-
 	return 0;
 }
 
@@ -1221,7 +1193,8 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
 		if (!rdma_protocol_ib(device, port))
 			continue;
 
-		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
+		for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
+		     ++i) {
 			ret = rdma_query_gid(device, port, i, &tmp_gid);
 			if (ret)
 				return ret;
@@ -1253,7 +1226,8 @@ int ib_find_pkey(struct ib_device *device,
 	u16 tmp_pkey;
 	int partial_ix = -1;
 
-	for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
+	for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
+	     ++i) {
 		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
 		if (ret)
 			return ret;
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index 492702b83600..1ab423b19f77 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -49,16 +49,15 @@ static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp)
 	struct pkey_index_qp_list *tmp_pkey;
 	struct ib_device *dev = pp->sec->dev;
 
-	spin_lock(&dev->port_pkey_list[pp->port_num].list_lock);
-	list_for_each_entry(tmp_pkey,
-			    &dev->port_pkey_list[pp->port_num].pkey_list,
-			    pkey_index_list) {
+	spin_lock(&dev->port_data[pp->port_num].pkey_list_lock);
+	list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list,
+			     pkey_index_list) {
 		if (tmp_pkey->pkey_index == pp->pkey_index) {
 			pkey = tmp_pkey;
 			break;
 		}
 	}
-	spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock);
+	spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock);
 	return pkey;
 }
 
@@ -263,12 +262,12 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp)
 		if (!pkey)
 			return -ENOMEM;
 
-		spin_lock(&dev->port_pkey_list[port_num].list_lock);
+		spin_lock(&dev->port_data[port_num].pkey_list_lock);
 		/* Check for the PKey again.  A racing process may
 		 * have created it.
 		 */
 		list_for_each_entry(tmp_pkey,
-				    &dev->port_pkey_list[port_num].pkey_list,
+				    &dev->port_data[port_num].pkey_list,
 				    pkey_index_list) {
 			if (tmp_pkey->pkey_index == pp->pkey_index) {
 				kfree(pkey);
@@ -283,9 +282,9 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp)
 			spin_lock_init(&pkey->qp_list_lock);
 			INIT_LIST_HEAD(&pkey->qp_list);
 			list_add(&pkey->pkey_index_list,
-				 &dev->port_pkey_list[port_num].pkey_list);
+				 &dev->port_data[port_num].pkey_list);
 		}
-		spin_unlock(&dev->port_pkey_list[port_num].list_lock);
+		spin_unlock(&dev->port_data[port_num].pkey_list_lock);
 	}
 
 	spin_lock(&pkey->qp_list_lock);
@@ -551,9 +550,8 @@ void ib_security_cache_change(struct ib_device *device,
 {
 	struct pkey_index_qp_list *pkey;
 
-	list_for_each_entry(pkey,
-			    &device->port_pkey_list[port_num].pkey_list,
-			    pkey_index_list) {
+	list_for_each_entry (pkey, &device->port_data[port_num].pkey_list,
+			     pkey_index_list) {
 		check_pkey_qps(pkey,
 			       device,
 			       port_num,
@@ -569,7 +567,7 @@ void ib_security_release_port_pkey_list(struct ib_device *device)
 	rdma_for_each_port (device, i) {
 		list_for_each_entry_safe(pkey,
 					 tmp_pkey,
-					 &device->port_pkey_list[i].pkey_list,
+					 &device->port_data[i].pkey_list,
 					 pkey_index_list) {
 			list_del(&pkey->pkey_index_list);
 			kfree(pkey);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index fa0edd6ae33c..b42e257814f7 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2198,6 +2198,13 @@ struct ib_port_immutable {
 	u32                           max_mad_size;
 };
 
+struct ib_port_data {
+	struct ib_port_immutable immutable;
+
+	spinlock_t pkey_list_lock;
+	struct list_head pkey_list;
+};
+
 /* rdma netdev type - specifies protocol type */
 enum rdma_netdev_t {
 	RDMA_NETDEV_OPA_VNIC,
@@ -2243,12 +2250,6 @@ struct rdma_netdev_alloc_params {
 				      struct net_device *netdev, void *param);
 };
 
-struct ib_port_pkey_list {
-	/* Lock to hold while modifying the list. */
-	spinlock_t                    list_lock;
-	struct list_head              pkey_list;
-};
-
 struct ib_counters {
 	struct ib_device	*device;
 	struct ib_uobject	*uobject;
@@ -2549,14 +2550,12 @@ struct ib_device {
 
 	struct ib_cache               cache;
 	/**
-	 * port_immutable is indexed by port number
+	 * port_data is indexed by port number
 	 */
-	struct ib_port_immutable     *port_immutable;
+	struct ib_port_data *port_data;
 
 	int			      num_comp_vectors;
 
-	struct ib_port_pkey_list     *port_pkey_list;
-
 	struct iw_cm_verbs	     *iwcm;
 
 	struct module               *owner;
@@ -2860,34 +2859,38 @@ static inline int rdma_is_port_valid(const struct ib_device *device,
 static inline bool rdma_is_grh_required(const struct ib_device *device,
 					u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags &
-		RDMA_CORE_PORT_IB_GRH_REQUIRED;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_PORT_IB_GRH_REQUIRED;
 }
 
 static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_PROT_IB;
 }
 
 static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags &
-		(RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP);
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP);
 }
 
 static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP;
 }
 
 static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_PROT_ROCE;
 }
 
 static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_PROT_IWARP;
 }
 
 static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num)
@@ -2898,12 +2901,14 @@ static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num)
 
 static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_PROT_RAW_PACKET;
 }
 
 static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_USNIC;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_PROT_USNIC;
 }
 
 /**
@@ -2920,7 +2925,8 @@ static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_n
  */
 static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_IB_MAD;
 }
 
 /**
@@ -2944,8 +2950,8 @@ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num)
  */
 static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num)
 {
-	return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD)
-		== RDMA_CORE_CAP_OPA_MAD;
+	return (device->port_data[port_num].immutable.core_cap_flags &
+		RDMA_CORE_CAP_OPA_MAD) == RDMA_CORE_CAP_OPA_MAD;
 }
 
 /**
@@ -2970,7 +2976,8 @@ static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num)
  */
 static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_IB_SMI;
 }
 
 /**
@@ -2990,7 +2997,8 @@ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num)
  */
 static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_IB_CM;
 }
 
 /**
@@ -3007,7 +3015,8 @@ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num)
  */
 static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_IW_CM;
 }
 
 /**
@@ -3027,7 +3036,8 @@ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num)
  */
 static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_IB_SA;
 }
 
 /**
@@ -3067,7 +3077,8 @@ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num
  */
 static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_AF_IB;
 }
 
 /**
@@ -3088,7 +3099,8 @@ static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num)
  */
 static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH;
+	return device->port_data[port_num].immutable.core_cap_flags &
+	       RDMA_CORE_CAP_ETH_AH;
 }
 
 /**
@@ -3102,7 +3114,7 @@ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
  */
 static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num)
 {
-	return (device->port_immutable[port_num].core_cap_flags &
+	return (device->port_data[port_num].immutable.core_cap_flags &
 		RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH;
 }
 
@@ -3120,7 +3132,7 @@ static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num)
  */
 static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num)
 {
-	return device->port_immutable[port_num].max_mad_size;
+	return device->port_data[port_num].immutable.max_mad_size;
 }
 
 /**
-- 
cgit v1.2.3


From 8faea9fd4a3914f12cd343e10810ec5f4215ddd6 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 12 Feb 2019 21:12:49 -0700
Subject: RDMA/cache: Move the cache per-port data into the main ib_port_data

Like the other cases there no real reason to have another array just for
the cache. This larger conversion gets its own patch.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cache.c | 89 ++++++++++++++---------------------------
 include/rdma/ib_verbs.h         |  3 +-
 2 files changed, 33 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 9d0e8aca741a..a28dc1901c80 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -185,7 +185,7 @@ EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
 
 static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
 {
-	return device->cache.ports[port - rdma_start_port(device)].gid;
+	return device->port_data[port].cache.gid;
 }
 
 static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
@@ -765,7 +765,7 @@ err_free_table:
 	return NULL;
 }
 
-static void release_gid_table(struct ib_device *device, u8 port,
+static void release_gid_table(struct ib_device *device,
 			      struct ib_gid_table *table)
 {
 	bool leak = false;
@@ -863,31 +863,27 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
 
 static void gid_table_release_one(struct ib_device *ib_dev)
 {
-	struct ib_gid_table *table;
-	u8 port;
+	unsigned int p;
 
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		table = ib_dev->cache.ports[port].gid;
-		release_gid_table(ib_dev, port, table);
-		ib_dev->cache.ports[port].gid = NULL;
+	rdma_for_each_port (ib_dev, p) {
+		release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid);
+		ib_dev->port_data[p].cache.gid = NULL;
 	}
 }
 
 static int _gid_table_setup_one(struct ib_device *ib_dev)
 {
-	u8 port;
 	struct ib_gid_table *table;
+	unsigned int rdma_port;
 
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		u8 rdma_port = port + rdma_start_port(ib_dev);
-
+	rdma_for_each_port (ib_dev, rdma_port) {
 		table = alloc_gid_table(
 			ib_dev->port_data[rdma_port].immutable.gid_tbl_len);
 		if (!table)
 			goto rollback_table_setup;
 
 		gid_table_reserve_default(ib_dev, rdma_port, table);
-		ib_dev->cache.ports[port].gid = table;
+		ib_dev->port_data[rdma_port].cache.gid = table;
 	}
 	return 0;
 
@@ -898,14 +894,11 @@ rollback_table_setup:
 
 static void gid_table_cleanup_one(struct ib_device *ib_dev)
 {
-	struct ib_gid_table *table;
-	u8 port;
+	unsigned int p;
 
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		table = ib_dev->cache.ports[port].gid;
-		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
-				       table);
-	}
+	rdma_for_each_port (ib_dev, p)
+		cleanup_gid_table_port(ib_dev, p,
+				       ib_dev->port_data[p].cache.gid);
 }
 
 static int gid_table_setup_one(struct ib_device *ib_dev)
@@ -983,17 +976,17 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
 	unsigned long mask = GID_ATTR_FIND_MASK_GID |
 			     GID_ATTR_FIND_MASK_GID_TYPE;
 	struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
-	u8 p;
+	unsigned int p;
 
 	if (ndev)
 		mask |= GID_ATTR_FIND_MASK_NETDEV;
 
-	for (p = 0; p < device->phys_port_cnt; p++) {
+	rdma_for_each_port(device, p) {
 		struct ib_gid_table *table;
 		unsigned long flags;
 		int index;
 
-		table = device->cache.ports[p].gid;
+		table = device->port_data[p].cache.gid;
 		read_lock_irqsave(&table->rwlock, flags);
 		index = find_gid(table, gid, &gid_attr_val, false, mask, NULL);
 		if (index >= 0) {
@@ -1025,7 +1018,7 @@ int ib_get_cached_pkey(struct ib_device *device,
 
 	read_lock_irqsave(&device->cache.lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	cache = device->port_data[port_num].cache.pkey;
 
 	if (index < 0 || index >= cache->table_len)
 		ret = -EINVAL;
@@ -1043,14 +1036,12 @@ int ib_get_cached_subnet_prefix(struct ib_device *device,
 				u64              *sn_pfx)
 {
 	unsigned long flags;
-	int p;
 
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	p = port_num - rdma_start_port(device);
 	read_lock_irqsave(&device->cache.lock, flags);
-	*sn_pfx = device->cache.ports[p].subnet_prefix;
+	*sn_pfx = device->port_data[port_num].cache.subnet_prefix;
 	read_unlock_irqrestore(&device->cache.lock, flags);
 
 	return 0;
@@ -1073,7 +1064,7 @@ int ib_find_cached_pkey(struct ib_device *device,
 
 	read_lock_irqsave(&device->cache.lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	cache = device->port_data[port_num].cache.pkey;
 
 	*index = -1;
 
@@ -1113,7 +1104,7 @@ int ib_find_exact_cached_pkey(struct ib_device *device,
 
 	read_lock_irqsave(&device->cache.lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	cache = device->port_data[port_num].cache.pkey;
 
 	*index = -1;
 
@@ -1141,7 +1132,7 @@ int ib_get_cached_lmc(struct ib_device *device,
 		return -EINVAL;
 
 	read_lock_irqsave(&device->cache.lock, flags);
-	*lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc;
+	*lmc = device->port_data[port_num].cache.lmc;
 	read_unlock_irqrestore(&device->cache.lock, flags);
 
 	return ret;
@@ -1159,8 +1150,7 @@ int ib_get_cached_port_state(struct ib_device   *device,
 		return -EINVAL;
 
 	read_lock_irqsave(&device->cache.lock, flags);
-	*port_state = device->cache.ports[port_num
-		- rdma_start_port(device)].port_state;
+	*port_state = device->port_data[port_num].cache.port_state;
 	read_unlock_irqrestore(&device->cache.lock, flags);
 
 	return ret;
@@ -1361,16 +1351,13 @@ static void ib_cache_update(struct ib_device *device,
 
 	write_lock_irq(&device->cache.lock);
 
-	old_pkey_cache = device->cache.ports[port -
-		rdma_start_port(device)].pkey;
+	old_pkey_cache = device->port_data[port].cache.pkey;
 
-	device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache;
-	device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc;
-	device->cache.ports[port - rdma_start_port(device)].port_state =
-		tprops->state;
+	device->port_data[port].cache.pkey = pkey_cache;
+	device->port_data[port].cache.lmc = tprops->lmc;
+	device->port_data[port].cache.port_state = tprops->state;
 
-	device->cache.ports[port - rdma_start_port(device)].subnet_prefix =
-							tprops->subnet_prefix;
+	device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
 	write_unlock_irq(&device->cache.lock);
 
 	if (enforce_security)
@@ -1433,19 +1420,9 @@ int ib_cache_setup_one(struct ib_device *device)
 
 	rwlock_init(&device->cache.lock);
 
-	device->cache.ports =
-		kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1,
-			sizeof(*device->cache.ports),
-			GFP_KERNEL);
-	if (!device->cache.ports)
-		return -ENOMEM;
-
 	err = gid_table_setup_one(device);
-	if (err) {
-		kfree(device->cache.ports);
-		device->cache.ports = NULL;
+	if (err)
 		return err;
-	}
 
 	rdma_for_each_port (device, p)
 		ib_cache_update(device, p, true);
@@ -1458,10 +1435,7 @@ int ib_cache_setup_one(struct ib_device *device)
 
 void ib_cache_release_one(struct ib_device *device)
 {
-	int p;
-
-	if (!device->cache.ports)
-		return;
+	unsigned int p;
 
 	/*
 	 * The release function frees all the cache elements.
@@ -1469,11 +1443,10 @@ void ib_cache_release_one(struct ib_device *device)
 	 * all the device's resources when the cache could no
 	 * longer be accessed.
 	 */
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
-		kfree(device->cache.ports[p].pkey);
+	rdma_for_each_port (device, p)
+		kfree(device->port_data[p].cache.pkey);
 
 	gid_table_release_one(device);
-	kfree(device->cache.ports);
 }
 
 void ib_cache_cleanup_one(struct ib_device *device)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b42e257814f7..50b7ebc2885e 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2186,7 +2186,6 @@ struct ib_port_cache {
 struct ib_cache {
 	rwlock_t                lock;
 	struct ib_event_handler event_handler;
-	struct ib_port_cache   *ports;
 };
 
 struct iw_cm_verbs;
@@ -2203,6 +2202,8 @@ struct ib_port_data {
 
 	spinlock_t pkey_list_lock;
 	struct list_head pkey_list;
+
+	struct ib_port_cache cache;
 };
 
 /* rdma netdev type - specifies protocol type */
-- 
cgit v1.2.3


From ee7930490a8f84570e96268f5736e99570b58307 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 6 Jul 2018 09:11:50 +0100
Subject: clocksource/arm_arch_timer: Store physical timer IRQ number for KVM
 on VHE

A host running in VHE mode gets the EL2 physical timer as its time
source (accessed using the EL1 sysreg accessors, which get re-directed
to the EL2 sysregs by VHE).

The EL1 physical timer remains unused by the host kernel, allowing us to
pass that on directly to a KVM guest and saves us from emulating this
timer for the guest on VHE systems.

Store the EL1 Physical Timer's IRQ number in
struct arch_timer_kvm_info on VHE systems to allow KVM to use it.

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
---
 drivers/clocksource/arm_arch_timer.c | 11 +++++++++--
 include/clocksource/arm_arch_timer.h |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 9a7d4dc00b6e..b9243e2328b4 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -1206,6 +1206,13 @@ static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void)
 	return ARCH_TIMER_PHYS_SECURE_PPI;
 }
 
+static void __init arch_timer_populate_kvm_info(void)
+{
+	arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
+	if (is_kernel_in_hyp_mode())
+		arch_timer_kvm_info.physical_irq = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI];
+}
+
 static int __init arch_timer_of_init(struct device_node *np)
 {
 	int i, ret;
@@ -1220,7 +1227,7 @@ static int __init arch_timer_of_init(struct device_node *np)
 	for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++)
 		arch_timer_ppi[i] = irq_of_parse_and_map(np, i);
 
-	arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
+	arch_timer_populate_kvm_info();
 
 	rate = arch_timer_get_cntfrq();
 	arch_timer_of_configure_rate(rate, np);
@@ -1550,7 +1557,7 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table)
 	arch_timer_ppi[ARCH_TIMER_HYP_PPI] =
 		acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI);
 
-	arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
+	arch_timer_populate_kvm_info();
 
 	/*
 	 * When probing via ACPI, we have no mechanism to override the sysreg
diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h
index 349e5957c949..702967d996bb 100644
--- a/include/clocksource/arm_arch_timer.h
+++ b/include/clocksource/arm_arch_timer.h
@@ -74,6 +74,7 @@ enum arch_timer_spi_nr {
 struct arch_timer_kvm_info {
 	struct timecounter timecounter;
 	int virtual_irq;
+	int physical_irq;
 };
 
 struct arch_timer_mem_frame {
-- 
cgit v1.2.3


From accb99bcd0ca6d3ee412557b0c3f583a3abc0eb6 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@arm.com>
Date: Mon, 26 Nov 2018 18:21:22 +0100
Subject: KVM: arm/arm64: Simplify bg_timer programming

Instead of calling into kvm_timer_[un]schedule from the main kvm
blocking path, test if the VCPU is on the wait queue from the load/put
path and perform the background timer setup/cancel in this path.

This has the distinct advantage that we no longer race between load/put
and schedule/unschedule and programming and canceling of the bg_timer
always happens when the timer state is not loaded.

Note that we must now remove the checks in kvm_timer_blocking that do
not schedule a background timer if one of the timers can fire, because
we no longer have a guarantee that kvm_vcpu_check_block() will be called
before kvm_timer_blocking.

Reported-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_arch_timer.h |  3 ---
 virt/kvm/arm/arch_timer.c    | 35 ++++++++++++++---------------------
 virt/kvm/arm/arm.c           |  2 --
 3 files changed, 14 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 33771352dcd6..d6e6a45d1d24 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -76,9 +76,6 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
 
 bool kvm_timer_is_pending(struct kvm_vcpu *vcpu);
 
-void kvm_timer_schedule(struct kvm_vcpu *vcpu);
-void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
-
 u64 kvm_phys_timer_read(void);
 
 void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index b07ac4614e1c..4986028d9829 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -349,22 +349,12 @@ out:
  * thread is removed from its waitqueue and made runnable when there's a timer
  * interrupt to handle.
  */
-void kvm_timer_schedule(struct kvm_vcpu *vcpu)
+static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
-	vtimer_save_state(vcpu);
-
-	/*
-	 * No need to schedule a background timer if any guest timer has
-	 * already expired, because kvm_vcpu_block will return before putting
-	 * the thread to sleep.
-	 */
-	if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer))
-		return;
-
 	/*
 	 * If both timers are not capable of raising interrupts (disabled or
 	 * masked), then there's no more work for us to do.
@@ -373,12 +363,19 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu)
 		return;
 
 	/*
-	 * The guest timers have not yet expired, schedule a background timer.
+	 * At least one guest time will expire. Schedule a background timer.
 	 * Set the earliest expiration time among the guest timers.
 	 */
 	soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
 }
 
+static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	soft_timer_cancel(&timer->bg_timer);
+}
+
 static void vtimer_restore_state(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -401,15 +398,6 @@ out:
 	local_irq_restore(flags);
 }
 
-void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	vtimer_restore_state(vcpu);
-
-	soft_timer_cancel(&timer->bg_timer);
-}
-
 static void set_cntvoff(u64 cntvoff)
 {
 	u32 low = lower_32_bits(cntvoff);
@@ -485,6 +473,8 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
 	/* Set the background timer for the physical timer emulation. */
 	phys_timer_emulate(vcpu);
 
+	kvm_timer_unblocking(vcpu);
+
 	/* If the timer fired while we weren't running, inject it now */
 	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
 		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
@@ -527,6 +517,9 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	 */
 	soft_timer_cancel(&timer->phys_timer);
 
+	if (swait_active(kvm_arch_vcpu_wq(vcpu)))
+		kvm_timer_blocking(vcpu);
+
 	/*
 	 * The kernel may decide to run userspace after calling vcpu_put, so
 	 * we reset cntvoff to 0 to ensure a consistent read between user
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index b77db673bb03..9fbdb9e1c51f 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -335,13 +335,11 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
-	kvm_timer_schedule(vcpu);
 	kvm_vgic_v4_enable_doorbell(vcpu);
 }
 
 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
 {
-	kvm_timer_unschedule(vcpu);
 	kvm_vgic_v4_disable_doorbell(vcpu);
 }
 
-- 
cgit v1.2.3


From 84135d3d18da2ff17d3ad1a609b2818cc3049552 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Thu, 5 Jul 2018 16:48:23 +0100
Subject: KVM: arm/arm64: consolidate arch timer trap handlers

At the moment we have separate system register emulation handlers for
each timer register. Actually they are quite similar, and we rely on
kvm_arm_timer_[gs]et_reg() for the actual emulation anyways, so let's
just merge all of those handlers into one function, which just marshalls
the arguments and then hands off to a set of common accessors.
This makes extending the emulation to include EL2 timers much easier.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
[Fixed 32-bit VM breakage and reduced to reworking existing code]
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
[Fixed 32bit host, general cleanup]
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm/kvm/coproc.c           |  23 ++++---
 arch/arm64/include/asm/sysreg.h |   4 ++
 arch/arm64/kvm/sys_regs.c       |  73 ++++++++++------------
 include/kvm/arm_arch_timer.h    |  23 +++++++
 virt/kvm/arm/arch_timer.c       | 130 ++++++++++++++++++++++++++++++++++------
 5 files changed, 187 insertions(+), 66 deletions(-)

(limited to 'include')

diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 222c1635bc7a..51863364f8d1 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -293,15 +293,16 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu,
 			     const struct coproc_params *p,
 			     const struct coproc_reg *r)
 {
-	u64 now = kvm_phys_timer_read();
-	u64 val;
+	u32 val;
 
 	if (p->is_write) {
 		val = *vcpu_reg(vcpu, p->Rt1);
-		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val + now);
+		kvm_arm_timer_write_sysreg(vcpu,
+					   TIMER_PTIMER, TIMER_REG_TVAL, val);
 	} else {
-		val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
-		*vcpu_reg(vcpu, p->Rt1) = val - now;
+		val = kvm_arm_timer_read_sysreg(vcpu,
+						TIMER_PTIMER, TIMER_REG_TVAL);
+		*vcpu_reg(vcpu, p->Rt1) = val;
 	}
 
 	return true;
@@ -315,9 +316,11 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
 
 	if (p->is_write) {
 		val = *vcpu_reg(vcpu, p->Rt1);
-		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, val);
+		kvm_arm_timer_write_sysreg(vcpu,
+					   TIMER_PTIMER, TIMER_REG_CTL, val);
 	} else {
-		val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
+		val = kvm_arm_timer_read_sysreg(vcpu,
+						TIMER_PTIMER, TIMER_REG_CTL);
 		*vcpu_reg(vcpu, p->Rt1) = val;
 	}
 
@@ -333,9 +336,11 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu,
 	if (p->is_write) {
 		val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32;
 		val |= *vcpu_reg(vcpu, p->Rt1);
-		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val);
+		kvm_arm_timer_write_sysreg(vcpu,
+					   TIMER_PTIMER, TIMER_REG_CVAL, val);
 	} else {
-		val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
+		val = kvm_arm_timer_read_sysreg(vcpu,
+						TIMER_PTIMER, TIMER_REG_CVAL);
 		*vcpu_reg(vcpu, p->Rt1) = val;
 		*vcpu_reg(vcpu, p->Rt2) = val >> 32;
 	}
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 3e5650903d6d..6482e8bcf1b8 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -392,6 +392,10 @@
 #define SYS_CNTP_CTL_EL0		sys_reg(3, 3, 14, 2, 1)
 #define SYS_CNTP_CVAL_EL0		sys_reg(3, 3, 14, 2, 2)
 
+#define SYS_AARCH32_CNTP_TVAL		sys_reg(0, 0, 14, 2, 0)
+#define SYS_AARCH32_CNTP_CTL		sys_reg(0, 0, 14, 2, 1)
+#define SYS_AARCH32_CNTP_CVAL		sys_reg(0, 2, 0, 14, 0)
+
 #define __PMEV_op2(n)			((n) & 0x7)
 #define __CNTR_CRm(n)			(0x8 | (((n) >> 3) & 0x3))
 #define SYS_PMEVCNTRn_EL0(n)		sys_reg(3, 3, 14, __CNTR_CRm(n), __PMEV_op2(n))
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1a5bea4285e4..0d348500b24b 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -990,44 +990,38 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 	{ SYS_DESC(SYS_PMEVTYPERn_EL0(n)),					\
 	  access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), }
 
-static bool access_cntp_tval(struct kvm_vcpu *vcpu,
-		struct sys_reg_params *p,
-		const struct sys_reg_desc *r)
+static bool access_arch_timer(struct kvm_vcpu *vcpu,
+			      struct sys_reg_params *p,
+			      const struct sys_reg_desc *r)
 {
-	u64 now = kvm_phys_timer_read();
-	u64 cval;
+	enum kvm_arch_timers tmr;
+	enum kvm_arch_timer_regs treg;
+	u64 reg = reg_to_encoding(r);
 
-	if (p->is_write) {
-		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL,
-				      p->regval + now);
-	} else {
-		cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
-		p->regval = cval - now;
+	switch (reg) {
+	case SYS_CNTP_TVAL_EL0:
+	case SYS_AARCH32_CNTP_TVAL:
+		tmr = TIMER_PTIMER;
+		treg = TIMER_REG_TVAL;
+		break;
+	case SYS_CNTP_CTL_EL0:
+	case SYS_AARCH32_CNTP_CTL:
+		tmr = TIMER_PTIMER;
+		treg = TIMER_REG_CTL;
+		break;
+	case SYS_CNTP_CVAL_EL0:
+	case SYS_AARCH32_CNTP_CVAL:
+		tmr = TIMER_PTIMER;
+		treg = TIMER_REG_CVAL;
+		break;
+	default:
+		BUG();
 	}
 
-	return true;
-}
-
-static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
-		struct sys_reg_params *p,
-		const struct sys_reg_desc *r)
-{
-	if (p->is_write)
-		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval);
-	else
-		p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
-
-	return true;
-}
-
-static bool access_cntp_cval(struct kvm_vcpu *vcpu,
-		struct sys_reg_params *p,
-		const struct sys_reg_desc *r)
-{
 	if (p->is_write)
-		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval);
+		kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval);
 	else
-		p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
+		p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg);
 
 	return true;
 }
@@ -1392,9 +1386,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 },
 	{ SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 },
 
-	{ SYS_DESC(SYS_CNTP_TVAL_EL0), access_cntp_tval },
-	{ SYS_DESC(SYS_CNTP_CTL_EL0), access_cntp_ctl },
-	{ SYS_DESC(SYS_CNTP_CVAL_EL0), access_cntp_cval },
+	{ SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
+	{ SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },
+	{ SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer },
 
 	/* PMEVCNTRn_EL0 */
 	PMU_PMEVCNTR_EL0(0),
@@ -1715,10 +1709,9 @@ static const struct sys_reg_desc cp15_regs[] = {
 
 	{ Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
 
-	/* CNTP_TVAL */
-	{ Op1( 0), CRn(14), CRm( 2), Op2( 0), access_cntp_tval },
-	/* CNTP_CTL */
-	{ Op1( 0), CRn(14), CRm( 2), Op2( 1), access_cntp_ctl },
+	/* Arch Tmers */
+	{ SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer },
+	{ SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer },
 
 	/* PMEVCNTRn */
 	PMU_PMEVCNTR(0),
@@ -1795,7 +1788,7 @@ static const struct sys_reg_desc cp15_64_regs[] = {
 	{ Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
 	{ Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */
 	{ Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */
-	{ Op1( 2), CRn( 0), CRm(14), Op2( 0), access_cntp_cval },
+	{ SYS_DESC(SYS_AARCH32_CNTP_CVAL),    access_arch_timer },
 };
 
 /* Target specific emulation tables */
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index d6e6a45d1d24..d26b7fde9935 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -22,6 +22,19 @@
 #include <linux/clocksource.h>
 #include <linux/hrtimer.h>
 
+enum kvm_arch_timers {
+	TIMER_PTIMER,
+	TIMER_VTIMER,
+	NR_KVM_TIMERS
+};
+
+enum kvm_arch_timer_regs {
+	TIMER_REG_CNT,
+	TIMER_REG_CVAL,
+	TIMER_REG_TVAL,
+	TIMER_REG_CTL,
+};
+
 struct arch_timer_context {
 	/* Registers: control register, timer value */
 	u32				cnt_ctl;
@@ -87,5 +100,15 @@ bool kvm_arch_timer_get_input_level(int vintid);
 
 #define vcpu_vtimer(v)	(&(v)->arch.timer_cpu.vtimer)
 #define vcpu_ptimer(v)	(&(v)->arch.timer_cpu.ptimer)
+#define vcpu_get_timer(v,t)					\
+	(t == TIMER_VTIMER ? vcpu_vtimer(v) : vcpu_ptimer(v))
+
+u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
+			      enum kvm_arch_timers tmr,
+			      enum kvm_arch_timer_regs treg);
+void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
+				enum kvm_arch_timers tmr,
+				enum kvm_arch_timer_regs treg,
+				u64 val);
 
 #endif
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 4986028d9829..f7d377448438 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -25,6 +25,7 @@
 
 #include <clocksource/arm_arch_timer.h>
 #include <asm/arch_timer.h>
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 
 #include <kvm/arm_vgic.h>
@@ -52,6 +53,13 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
 static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 				 struct arch_timer_context *timer_ctx);
 static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+				struct arch_timer_context *timer,
+				enum kvm_arch_timer_regs treg,
+				u64 val);
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+			      struct arch_timer_context *timer,
+			      enum kvm_arch_timer_regs treg);
 
 u64 kvm_phys_timer_read(void)
 {
@@ -628,24 +636,25 @@ static void kvm_timer_init_interrupt(void *info)
 
 int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 {
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
 	switch (regid) {
 	case KVM_REG_ARM_TIMER_CTL:
-		vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT;
+		kvm_arm_timer_write(vcpu,
+				    vcpu_vtimer(vcpu), TIMER_REG_CTL, value);
 		break;
 	case KVM_REG_ARM_TIMER_CNT:
 		update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
 		break;
 	case KVM_REG_ARM_TIMER_CVAL:
-		vtimer->cnt_cval = value;
+		kvm_arm_timer_write(vcpu,
+				    vcpu_vtimer(vcpu), TIMER_REG_CVAL, value);
 		break;
 	case KVM_REG_ARM_PTIMER_CTL:
-		ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT;
+		kvm_arm_timer_write(vcpu,
+				    vcpu_ptimer(vcpu), TIMER_REG_CTL, value);
 		break;
 	case KVM_REG_ARM_PTIMER_CVAL:
-		ptimer->cnt_cval = value;
+		kvm_arm_timer_write(vcpu,
+				    vcpu_ptimer(vcpu), TIMER_REG_CVAL, value);
 		break;
 
 	default:
@@ -672,26 +681,113 @@ static u64 read_timer_ctl(struct arch_timer_context *timer)
 
 u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
 {
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
 	switch (regid) {
 	case KVM_REG_ARM_TIMER_CTL:
-		return read_timer_ctl(vtimer);
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_vtimer(vcpu), TIMER_REG_CTL);
 	case KVM_REG_ARM_TIMER_CNT:
-		return kvm_phys_timer_read() - vtimer->cntvoff;
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_vtimer(vcpu), TIMER_REG_CNT);
 	case KVM_REG_ARM_TIMER_CVAL:
-		return vtimer->cnt_cval;
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_vtimer(vcpu), TIMER_REG_CVAL);
 	case KVM_REG_ARM_PTIMER_CTL:
-		return read_timer_ctl(ptimer);
-	case KVM_REG_ARM_PTIMER_CVAL:
-		return ptimer->cnt_cval;
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_ptimer(vcpu), TIMER_REG_CTL);
 	case KVM_REG_ARM_PTIMER_CNT:
-		return kvm_phys_timer_read();
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_vtimer(vcpu), TIMER_REG_CNT);
+	case KVM_REG_ARM_PTIMER_CVAL:
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_ptimer(vcpu), TIMER_REG_CVAL);
 	}
 	return (u64)-1;
 }
 
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+			      struct arch_timer_context *timer,
+			      enum kvm_arch_timer_regs treg)
+{
+	u64 val;
+
+	switch (treg) {
+	case TIMER_REG_TVAL:
+		val = kvm_phys_timer_read() - timer->cntvoff - timer->cnt_cval;
+		break;
+
+	case TIMER_REG_CTL:
+		val = read_timer_ctl(timer);
+		break;
+
+	case TIMER_REG_CVAL:
+		val = timer->cnt_cval;
+		break;
+
+	case TIMER_REG_CNT:
+		val = kvm_phys_timer_read() - timer->cntvoff;
+		break;
+
+	default:
+		BUG();
+	}
+
+	return val;
+}
+
+u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
+			      enum kvm_arch_timers tmr,
+			      enum kvm_arch_timer_regs treg)
+{
+	u64 val;
+
+	preempt_disable();
+	kvm_timer_vcpu_put(vcpu);
+
+	val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
+
+	kvm_timer_vcpu_load(vcpu);
+	preempt_enable();
+
+	return val;
+}
+
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+				struct arch_timer_context *timer,
+				enum kvm_arch_timer_regs treg,
+				u64 val)
+{
+	switch (treg) {
+	case TIMER_REG_TVAL:
+		timer->cnt_cval = val - kvm_phys_timer_read() - timer->cntvoff;
+		break;
+
+	case TIMER_REG_CTL:
+		timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT;
+		break;
+
+	case TIMER_REG_CVAL:
+		timer->cnt_cval = val;
+		break;
+
+	default:
+		BUG();
+	}
+}
+
+void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
+				enum kvm_arch_timers tmr,
+				enum kvm_arch_timer_regs treg,
+				u64 val)
+{
+	preempt_disable();
+	kvm_timer_vcpu_put(vcpu);
+
+	kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
+
+	kvm_timer_vcpu_load(vcpu);
+	preempt_enable();
+}
+
 static int kvm_timer_starting_cpu(unsigned int cpu)
 {
 	kvm_timer_init_interrupt(NULL);
-- 
cgit v1.2.3


From e604dd5d45c75c2112424dec74853efb708f4fa6 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@arm.com>
Date: Tue, 18 Sep 2018 10:08:18 -0700
Subject: KVM: arm/arm64: timer: Rework data structures for multiple timers

Prepare for having 4 timer data structures (2 for now).

Move loaded to the cpu data structure and not the individual timer
structure, in preparation for assigning the EL1 phys timer as well.

Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_arch_timer.h | 41 ++++++++++++++++---------------
 virt/kvm/arm/arch_timer.c    | 58 +++++++++++++++++++++++---------------------
 2 files changed, 51 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index d26b7fde9935..ab835112204d 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -36,6 +36,8 @@ enum kvm_arch_timer_regs {
 };
 
 struct arch_timer_context {
+	struct kvm_vcpu			*vcpu;
+
 	/* Registers: control register, timer value */
 	u32				cnt_ctl;
 	u64				cnt_cval;
@@ -43,32 +45,31 @@ struct arch_timer_context {
 	/* Timer IRQ */
 	struct kvm_irq_level		irq;
 
-	/*
-	 * We have multiple paths which can save/restore the timer state
-	 * onto the hardware, so we need some way of keeping track of
-	 * where the latest state is.
-	 *
-	 * loaded == true:  State is loaded on the hardware registers.
-	 * loaded == false: State is stored in memory.
-	 */
-	bool			loaded;
-
 	/* Virtual offset */
-	u64			cntvoff;
+	u64				cntvoff;
+
+	/* Emulated Timer (may be unused) */
+	struct hrtimer			hrtimer;
 };
 
 struct arch_timer_cpu {
-	struct arch_timer_context	vtimer;
-	struct arch_timer_context	ptimer;
+	struct arch_timer_context timers[NR_KVM_TIMERS];
 
 	/* Background timer used when the guest is not running */
 	struct hrtimer			bg_timer;
 
-	/* Physical timer emulation */
-	struct hrtimer			phys_timer;
-
 	/* Is the timer enabled */
 	bool			enabled;
+
+	/*
+	 * We have multiple paths which can save/restore the timer state
+	 * onto the hardware, so we need some way of keeping track of
+	 * where the latest state is.
+	 *
+	 * loaded == true:  State is loaded on the hardware registers.
+	 * loaded == false: State is stored in memory.
+	 */
+	bool			loaded;
 };
 
 int kvm_timer_hyp_init(bool);
@@ -98,10 +99,10 @@ void kvm_timer_init_vhe(void);
 
 bool kvm_arch_timer_get_input_level(int vintid);
 
-#define vcpu_vtimer(v)	(&(v)->arch.timer_cpu.vtimer)
-#define vcpu_ptimer(v)	(&(v)->arch.timer_cpu.ptimer)
-#define vcpu_get_timer(v,t)					\
-	(t == TIMER_VTIMER ? vcpu_vtimer(v) : vcpu_ptimer(v))
+#define vcpu_timer(v)	(&(v)->arch.timer_cpu)
+#define vcpu_get_timer(v,t)	(&vcpu_timer(v)->timers[(t)])
+#define vcpu_vtimer(v)	(&(v)->arch.timer_cpu.timers[TIMER_VTIMER])
+#define vcpu_ptimer(v)	(&(v)->arch.timer_cpu.timers[TIMER_PTIMER])
 
 u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
 			      enum kvm_arch_timers tmr,
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index f7d377448438..471f9fd004c9 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -184,13 +184,11 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
 static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt)
 {
 	struct arch_timer_context *ptimer;
-	struct arch_timer_cpu *timer;
 	struct kvm_vcpu *vcpu;
 	u64 ns;
 
-	timer = container_of(hrt, struct arch_timer_cpu, phys_timer);
-	vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
-	ptimer = vcpu_ptimer(vcpu);
+	ptimer = container_of(hrt, struct arch_timer_context, hrtimer);
+	vcpu = ptimer->vcpu;
 
 	/*
 	 * Check that the timer has really expired from the guest's
@@ -209,9 +207,10 @@ static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt)
 
 static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
 {
+	struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu);
 	u64 cval, now;
 
-	if (timer_ctx->loaded) {
+	if (timer->loaded) {
 		u32 cnt_ctl;
 
 		/* Only the virtual timer can be loaded so far */
@@ -280,7 +279,6 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 /* Schedule the background timer for the emulated timer. */
 static void phys_timer_emulate(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
 	/*
@@ -289,11 +287,11 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu)
 	 * then we also don't need a soft timer.
 	 */
 	if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) {
-		soft_timer_cancel(&timer->phys_timer);
+		soft_timer_cancel(&ptimer->hrtimer);
 		return;
 	}
 
-	soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer));
+	soft_timer_start(&ptimer->hrtimer, kvm_timer_compute_delta(ptimer));
 }
 
 /*
@@ -303,7 +301,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu)
  */
 static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 	bool level;
@@ -329,13 +327,13 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 
 static void vtimer_save_state(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	unsigned long flags;
 
 	local_irq_save(flags);
 
-	if (!vtimer->loaded)
+	if (!timer->loaded)
 		goto out;
 
 	if (timer->enabled) {
@@ -347,7 +345,7 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu)
 	write_sysreg_el0(0, cntv_ctl);
 	isb();
 
-	vtimer->loaded = false;
+	timer->loaded = false;
 out:
 	local_irq_restore(flags);
 }
@@ -359,7 +357,7 @@ out:
  */
 static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
@@ -379,20 +377,20 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
 
 static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 
 	soft_timer_cancel(&timer->bg_timer);
 }
 
 static void vtimer_restore_state(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	unsigned long flags;
 
 	local_irq_save(flags);
 
-	if (vtimer->loaded)
+	if (timer->loaded)
 		goto out;
 
 	if (timer->enabled) {
@@ -401,7 +399,7 @@ static void vtimer_restore_state(struct kvm_vcpu *vcpu)
 		write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl);
 	}
 
-	vtimer->loaded = true;
+	timer->loaded = true;
 out:
 	local_irq_restore(flags);
 }
@@ -462,7 +460,7 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
 
 void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
@@ -507,7 +505,8 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
 
 void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
 	if (unlikely(!timer->enabled))
 		return;
@@ -523,7 +522,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	 * In any case, we re-schedule the hrtimer for the physical timer when
 	 * coming back to the VCPU thread in kvm_timer_vcpu_load().
 	 */
-	soft_timer_cancel(&timer->phys_timer);
+	soft_timer_cancel(&ptimer->hrtimer);
 
 	if (swait_active(kvm_arch_vcpu_wq(vcpu)))
 		kvm_timer_blocking(vcpu);
@@ -559,7 +558,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
 
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 
 	if (unlikely(!timer->enabled))
 		return;
@@ -570,7 +569,7 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
@@ -611,22 +610,25 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
 
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
 	/* Synchronize cntvoff across all vtimers of a VM. */
 	update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
-	vcpu_ptimer(vcpu)->cntvoff = 0;
+	ptimer->cntvoff = 0;
 
 	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	timer->bg_timer.function = kvm_bg_timer_expire;
 
-	hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	timer->phys_timer.function = kvm_phys_timer_expire;
+	hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	ptimer->hrtimer.function = kvm_phys_timer_expire;
 
 	vtimer->irq.irq = default_vtimer_irq.irq;
 	ptimer->irq.irq = default_ptimer_irq.irq;
+
+	vtimer->vcpu = vcpu;
+	ptimer->vcpu = vcpu;
 }
 
 static void kvm_timer_init_interrupt(void *info)
@@ -860,7 +862,7 @@ out_free_irq:
 
 void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 
 	soft_timer_cancel(&timer->bg_timer);
 }
@@ -904,7 +906,7 @@ bool kvm_arch_timer_get_input_level(int vintid)
 
 int kvm_timer_enable(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	int ret;
 
-- 
cgit v1.2.3


From 9e01dc76be6a3b5768cb02130d2ff0055a68809a Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@arm.com>
Date: Tue, 19 Feb 2019 14:04:30 +0100
Subject: KVM: arm/arm64: arch_timer: Assign the phys timer on VHE systems

VHE systems don't have to emulate the physical timer, we can simply
assign the EL1 physical timer directly to the VM as the host always
uses the EL2 timers.

In order to minimize the amount of cruft, AArch32 gets definitions for
the physical timer too, but is should be generally unused on this
architecture.

Co-written with Marc Zyngier <marc.zyngier@arm.com>

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
---
 arch/arm/include/asm/kvm_hyp.h |   4 +
 include/kvm/arm_arch_timer.h   |   6 ++
 virt/kvm/arm/arch_timer.c      | 219 ++++++++++++++++++++++++++++++++---------
 3 files changed, 180 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index e93a0cac9add..87bcd18df8d5 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -40,6 +40,7 @@
 #define TTBR1		__ACCESS_CP15_64(1, c2)
 #define VTTBR		__ACCESS_CP15_64(6, c2)
 #define PAR		__ACCESS_CP15_64(0, c7)
+#define CNTP_CVAL	__ACCESS_CP15_64(2, c14)
 #define CNTV_CVAL	__ACCESS_CP15_64(3, c14)
 #define CNTVOFF		__ACCESS_CP15_64(4, c14)
 
@@ -85,6 +86,7 @@
 #define TID_PRIV	__ACCESS_CP15(c13, 0, c0, 4)
 #define HTPIDR		__ACCESS_CP15(c13, 4, c0, 2)
 #define CNTKCTL		__ACCESS_CP15(c14, 0, c1, 0)
+#define CNTP_CTL	__ACCESS_CP15(c14, 0, c2, 1)
 #define CNTV_CTL	__ACCESS_CP15(c14, 0, c3, 1)
 #define CNTHCTL		__ACCESS_CP15(c14, 4, c1, 0)
 
@@ -94,6 +96,8 @@
 #define read_sysreg_el0(r)		read_sysreg(r##_el0)
 #define write_sysreg_el0(v, r)		write_sysreg(v, r##_el0)
 
+#define cntp_ctl_el0			CNTP_CTL
+#define cntp_cval_el0			CNTP_CVAL
 #define cntv_ctl_el0			CNTV_CTL
 #define cntv_cval_el0			CNTV_CVAL
 #define cntvoff_el2			CNTVOFF
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index ab835112204d..6d4a33a9c45a 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -50,6 +50,10 @@ struct arch_timer_context {
 
 	/* Emulated Timer (may be unused) */
 	struct hrtimer			hrtimer;
+
+	/* Duplicated state from arch_timer.c for convenience */
+	u32				host_timer_irq;
+	u32				host_timer_irq_flags;
 };
 
 struct arch_timer_cpu {
@@ -104,6 +108,8 @@ bool kvm_arch_timer_get_input_level(int vintid);
 #define vcpu_vtimer(v)	(&(v)->arch.timer_cpu.timers[TIMER_VTIMER])
 #define vcpu_ptimer(v)	(&(v)->arch.timer_cpu.timers[TIMER_PTIMER])
 
+#define arch_timer_ctx_index(ctx)	((ctx) - vcpu_timer((ctx)->vcpu)->timers)
+
 u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
 			      enum kvm_arch_timers tmr,
 			      enum kvm_arch_timer_regs treg);
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 471f9fd004c9..10c15151c87e 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -35,7 +35,9 @@
 
 static struct timecounter *timecounter;
 static unsigned int host_vtimer_irq;
+static unsigned int host_ptimer_irq;
 static u32 host_vtimer_irq_flags;
+static u32 host_ptimer_irq_flags;
 
 static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
 
@@ -86,20 +88,24 @@ static void soft_timer_cancel(struct hrtimer *hrt)
 static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 {
 	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
-	struct arch_timer_context *vtimer;
+	struct arch_timer_context *ctx;
 
 	/*
 	 * We may see a timer interrupt after vcpu_put() has been called which
 	 * sets the CPU's vcpu pointer to NULL, because even though the timer
-	 * has been disabled in vtimer_save_state(), the hardware interrupt
+	 * has been disabled in timer_save_state(), the hardware interrupt
 	 * signal may not have been retired from the interrupt controller yet.
 	 */
 	if (!vcpu)
 		return IRQ_HANDLED;
 
-	vtimer = vcpu_vtimer(vcpu);
-	if (kvm_timer_should_fire(vtimer))
-		kvm_timer_update_irq(vcpu, true, vtimer);
+	if (irq == host_vtimer_irq)
+		ctx = vcpu_vtimer(vcpu);
+	else
+		ctx = vcpu_ptimer(vcpu);
+
+	if (kvm_timer_should_fire(ctx))
+		kvm_timer_update_irq(vcpu, true, ctx);
 
 	if (userspace_irqchip(vcpu->kvm) &&
 	    !static_branch_unlikely(&has_gic_active_state))
@@ -208,13 +214,25 @@ static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt)
 static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu);
+	enum kvm_arch_timers index = arch_timer_ctx_index(timer_ctx);
 	u64 cval, now;
 
 	if (timer->loaded) {
-		u32 cnt_ctl;
+		u32 cnt_ctl = 0;
+
+		switch (index) {
+		case TIMER_VTIMER:
+			cnt_ctl = read_sysreg_el0(cntv_ctl);
+			break;
+		case TIMER_PTIMER:
+			cnt_ctl = read_sysreg_el0(cntp_ctl);
+			break;
+		case NR_KVM_TIMERS:
+			/* GCC is braindead */
+			cnt_ctl = 0;
+			break;
+		}
 
-		/* Only the virtual timer can be loaded so far */
-		cnt_ctl = read_sysreg_el0(cntv_ctl);
 		return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
 		        (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
 		       !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
@@ -310,7 +328,7 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 		return;
 
 	/*
-	 * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part
+	 * If the timer virtual interrupt is a 'mapped' interrupt, part
 	 * of its lifecycle is offloaded to the hardware, and we therefore may
 	 * not have lowered the irq.level value before having to signal a new
 	 * interrupt, but have to signal an interrupt every time the level is
@@ -319,31 +337,55 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 	level = kvm_timer_should_fire(vtimer);
 	kvm_timer_update_irq(vcpu, level, vtimer);
 
+	if (has_vhe()) {
+		level = kvm_timer_should_fire(ptimer);
+		kvm_timer_update_irq(vcpu, level, ptimer);
+
+		return;
+	}
+
 	phys_timer_emulate(vcpu);
 
 	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
 		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
 }
 
-static void vtimer_save_state(struct kvm_vcpu *vcpu)
+static void timer_save_state(struct arch_timer_context *ctx)
 {
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+	enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
 	unsigned long flags;
 
+	if (!timer->enabled)
+		return;
+
 	local_irq_save(flags);
 
 	if (!timer->loaded)
 		goto out;
 
-	if (timer->enabled) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
-	}
+	switch (index) {
+	case TIMER_VTIMER:
+		ctx->cnt_ctl = read_sysreg_el0(cntv_ctl);
+		ctx->cnt_cval = read_sysreg_el0(cntv_cval);
 
-	/* Disable the virtual timer */
-	write_sysreg_el0(0, cntv_ctl);
-	isb();
+		/* Disable the timer */
+		write_sysreg_el0(0, cntv_ctl);
+		isb();
+
+		break;
+	case TIMER_PTIMER:
+		ctx->cnt_ctl = read_sysreg_el0(cntp_ctl);
+		ctx->cnt_cval = read_sysreg_el0(cntp_cval);
+
+		/* Disable the timer */
+		write_sysreg_el0(0, cntp_ctl);
+		isb();
+
+		break;
+	case NR_KVM_TIMERS:
+		break; /* GCC is braindead */
+	}
 
 	timer->loaded = false;
 out:
@@ -382,21 +424,33 @@ static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
 	soft_timer_cancel(&timer->bg_timer);
 }
 
-static void vtimer_restore_state(struct kvm_vcpu *vcpu)
+static void timer_restore_state(struct arch_timer_context *ctx)
 {
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+	enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
 	unsigned long flags;
 
+	if (!timer->enabled)
+		return;
+
 	local_irq_save(flags);
 
 	if (timer->loaded)
 		goto out;
 
-	if (timer->enabled) {
-		write_sysreg_el0(vtimer->cnt_cval, cntv_cval);
+	switch (index) {
+	case TIMER_VTIMER:
+		write_sysreg_el0(ctx->cnt_cval, cntv_cval);
+		isb();
+		write_sysreg_el0(ctx->cnt_ctl, cntv_ctl);
+		break;
+	case TIMER_PTIMER:
+		write_sysreg_el0(ctx->cnt_cval, cntp_cval);
 		isb();
-		write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl);
+		write_sysreg_el0(ctx->cnt_ctl, cntp_ctl);
+		break;
+	case NR_KVM_TIMERS:
+		break; /* GCC is braindead */
 	}
 
 	timer->loaded = true;
@@ -419,23 +473,23 @@ static void set_cntvoff(u64 cntvoff)
 	kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
 }
 
-static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active)
+static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
 {
 	int r;
-	r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active);
+	r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active);
 	WARN_ON(r);
 }
 
-static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu)
+static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
 {
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct kvm_vcpu *vcpu = ctx->vcpu;
 	bool phys_active;
 
 	if (irqchip_in_kernel(vcpu->kvm))
-		phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
+		phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
 	else
-		phys_active = vtimer->irq.level;
-	set_vtimer_irq_phys_active(vcpu, phys_active);
+		phys_active = ctx->irq.level;
+	set_timer_irq_phys_active(ctx, phys_active);
 }
 
 static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
@@ -467,14 +521,22 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
 	if (unlikely(!timer->enabled))
 		return;
 
-	if (static_branch_likely(&has_gic_active_state))
-		kvm_timer_vcpu_load_gic(vcpu);
-	else
+	if (static_branch_likely(&has_gic_active_state)) {
+		kvm_timer_vcpu_load_gic(vtimer);
+		if (has_vhe())
+			kvm_timer_vcpu_load_gic(ptimer);
+	} else {
 		kvm_timer_vcpu_load_nogic(vcpu);
+	}
 
 	set_cntvoff(vtimer->cntvoff);
 
-	vtimer_restore_state(vcpu);
+	timer_restore_state(vtimer);
+
+	if (has_vhe()) {
+		timer_restore_state(ptimer);
+		return;
+	}
 
 	/* Set the background timer for the physical timer emulation. */
 	phys_timer_emulate(vcpu);
@@ -506,12 +568,17 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
 void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
 	if (unlikely(!timer->enabled))
 		return;
 
-	vtimer_save_state(vcpu);
+	timer_save_state(vtimer);
+	if (has_vhe()) {
+		timer_save_state(ptimer);
+		return;
+	}
 
 	/*
 	 * Cancel the physical timer emulation, because the only case where we
@@ -534,8 +601,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	 * counter of non-VHE case. For VHE, the virtual counter uses a fixed
 	 * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
 	 */
-	if (!has_vhe())
-		set_cntvoff(0);
+	set_cntvoff(0);
 }
 
 /*
@@ -550,7 +616,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
 	if (!kvm_timer_should_fire(vtimer)) {
 		kvm_timer_update_irq(vcpu, false, vtimer);
 		if (static_branch_likely(&has_gic_active_state))
-			set_vtimer_irq_phys_active(vcpu, false);
+			set_timer_irq_phys_active(vtimer, false);
 		else
 			enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
 	}
@@ -625,7 +691,12 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 	ptimer->hrtimer.function = kvm_phys_timer_expire;
 
 	vtimer->irq.irq = default_vtimer_irq.irq;
+	vtimer->host_timer_irq = host_vtimer_irq;
+	vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
+
 	ptimer->irq.irq = default_ptimer_irq.irq;
+	ptimer->host_timer_irq = host_ptimer_irq;
+	ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
 
 	vtimer->vcpu = vcpu;
 	ptimer->vcpu = vcpu;
@@ -634,6 +705,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 static void kvm_timer_init_interrupt(void *info)
 {
 	enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+	enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
 }
 
 int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
@@ -815,6 +887,8 @@ int kvm_timer_hyp_init(bool has_gic)
 		return -ENODEV;
 	}
 
+	/* First, do the virtual EL1 timer irq */
+
 	if (info->virtual_irq <= 0) {
 		kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
 			info->virtual_irq);
@@ -825,15 +899,15 @@ int kvm_timer_hyp_init(bool has_gic)
 	host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
 	if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
 	    host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
-		kvm_err("Invalid trigger for IRQ%d, assuming level low\n",
+		kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
 			host_vtimer_irq);
 		host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
 	}
 
 	err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
-				 "kvm guest timer", kvm_get_running_vcpus());
+				 "kvm guest vtimer", kvm_get_running_vcpus());
 	if (err) {
-		kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n",
+		kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
 			host_vtimer_irq, err);
 		return err;
 	}
@@ -851,6 +925,43 @@ int kvm_timer_hyp_init(bool has_gic)
 
 	kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
 
+	/* Now let's do the physical EL1 timer irq */
+
+	if (info->physical_irq > 0) {
+		host_ptimer_irq = info->physical_irq;
+		host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
+		if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
+		    host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
+			kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
+				host_ptimer_irq);
+			host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
+		}
+
+		err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
+					 "kvm guest ptimer", kvm_get_running_vcpus());
+		if (err) {
+			kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
+				host_ptimer_irq, err);
+			return err;
+		}
+
+		if (has_gic) {
+			err = irq_set_vcpu_affinity(host_ptimer_irq,
+						    kvm_get_running_vcpus());
+			if (err) {
+				kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+				goto out_free_irq;
+			}
+		}
+
+		kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
+	} else if (has_vhe()) {
+		kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
+			info->physical_irq);
+		err = -ENODEV;
+		goto out_free_irq;
+	}
+
 	cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
 			  "kvm/arm/timer:starting", kvm_timer_starting_cpu,
 			  kvm_timer_dying_cpu);
@@ -898,8 +1009,10 @@ bool kvm_arch_timer_get_input_level(int vintid)
 
 	if (vintid == vcpu_vtimer(vcpu)->irq.irq)
 		timer = vcpu_vtimer(vcpu);
+	else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
+		timer = vcpu_ptimer(vcpu);
 	else
-		BUG(); /* We only map the vtimer so far */
+		BUG();
 
 	return kvm_timer_should_fire(timer);
 }
@@ -908,6 +1021,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 	int ret;
 
 	if (timer->enabled)
@@ -930,14 +1044,21 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 	if (ret)
 		return ret;
 
+	if (has_vhe()) {
+		ret = kvm_vgic_map_phys_irq(vcpu, host_ptimer_irq, ptimer->irq.irq,
+					    kvm_arch_timer_get_input_level);
+		if (ret)
+			return ret;
+	}
+
 no_vgic:
 	timer->enabled = 1;
 	return 0;
 }
 
 /*
- * On VHE system, we only need to configure trap on physical timer and counter
- * accesses in EL0 and EL1 once, not for every world switch.
+ * On VHE system, we only need to configure the EL2 timer trap register once,
+ * not for every world switch.
  * The host kernel runs at EL2 with HCR_EL2.TGE == 1,
  * and this makes those bits have no effect for the host kernel execution.
  */
@@ -948,11 +1069,11 @@ void kvm_timer_init_vhe(void)
 	u64 val;
 
 	/*
-	 * Disallow physical timer access for the guest.
-	 * Physical counter access is allowed.
+	 * VHE systems allow the guest direct access to the EL1 physical
+	 * timer/counter.
 	 */
 	val = read_sysreg(cnthctl_el2);
-	val &= ~(CNTHCTL_EL1PCEN << cnthctl_shift);
+	val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
 	val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
 	write_sysreg(val, cnthctl_el2);
 }
-- 
cgit v1.2.3


From bee038a67487598ebbe995f85bf60c3a5b2e9099 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@arm.com>
Date: Fri, 4 Jan 2019 13:31:22 +0100
Subject: KVM: arm/arm64: Rework the timer code to use a timer_map

We are currently emulating two timers in two different ways.  When we
add support for nested virtualization in the future, we are going to be
emulating either two timers in two diffferent ways, or four timers in a
single way.

We need a unified data structure to keep track of how we map virtual
state to physical state and we need to cleanup some of the timer code to
operate more independently on a struct arch_timer_context instead of
trying to consider the global state of the VCPU and recomputing all
state.

Co-written with Marc Zyngier <marc.zyngier@arm.com>

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
---
 include/kvm/arm_arch_timer.h |  23 ++--
 virt/kvm/arm/arch_timer.c    | 295 +++++++++++++++++++++++--------------------
 virt/kvm/arm/trace.h         | 105 +++++++++++++++
 3 files changed, 278 insertions(+), 145 deletions(-)

(limited to 'include')

diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 6d4a33a9c45a..05a18dd265b5 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -51,11 +51,24 @@ struct arch_timer_context {
 	/* Emulated Timer (may be unused) */
 	struct hrtimer			hrtimer;
 
+	/*
+	 * We have multiple paths which can save/restore the timer state onto
+	 * the hardware, so we need some way of keeping track of where the
+	 * latest state is.
+	 */
+	bool				loaded;
+
 	/* Duplicated state from arch_timer.c for convenience */
 	u32				host_timer_irq;
 	u32				host_timer_irq_flags;
 };
 
+struct timer_map {
+	struct arch_timer_context *direct_vtimer;
+	struct arch_timer_context *direct_ptimer;
+	struct arch_timer_context *emul_ptimer;
+};
+
 struct arch_timer_cpu {
 	struct arch_timer_context timers[NR_KVM_TIMERS];
 
@@ -64,16 +77,6 @@ struct arch_timer_cpu {
 
 	/* Is the timer enabled */
 	bool			enabled;
-
-	/*
-	 * We have multiple paths which can save/restore the timer state
-	 * onto the hardware, so we need some way of keeping track of
-	 * where the latest state is.
-	 *
-	 * loaded == true:  State is loaded on the hardware registers.
-	 * loaded == false: State is stored in memory.
-	 */
-	bool			loaded;
 };
 
 int kvm_timer_hyp_init(bool);
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 10c15151c87e..17f9de73cc8a 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -68,6 +68,21 @@ u64 kvm_phys_timer_read(void)
 	return timecounter->cc->read(timecounter->cc);
 }
 
+static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
+{
+	if (has_vhe()) {
+		map->direct_vtimer = vcpu_vtimer(vcpu);
+		map->direct_ptimer = vcpu_ptimer(vcpu);
+		map->emul_ptimer = NULL;
+	} else {
+		map->direct_vtimer = vcpu_vtimer(vcpu);
+		map->direct_ptimer = NULL;
+		map->emul_ptimer = vcpu_ptimer(vcpu);
+	}
+
+	trace_kvm_get_timer_map(vcpu->vcpu_id, map);
+}
+
 static inline bool userspace_irqchip(struct kvm *kvm)
 {
 	return static_branch_unlikely(&userspace_irqchip_in_use) &&
@@ -89,6 +104,7 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 {
 	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
 	struct arch_timer_context *ctx;
+	struct timer_map map;
 
 	/*
 	 * We may see a timer interrupt after vcpu_put() has been called which
@@ -99,10 +115,12 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 	if (!vcpu)
 		return IRQ_HANDLED;
 
+	get_timer_map(vcpu, &map);
+
 	if (irq == host_vtimer_irq)
-		ctx = vcpu_vtimer(vcpu);
+		ctx = map.direct_vtimer;
 	else
-		ctx = vcpu_ptimer(vcpu);
+		ctx = map.direct_ptimer;
 
 	if (kvm_timer_should_fire(ctx))
 		kvm_timer_update_irq(vcpu, true, ctx);
@@ -136,7 +154,9 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
 
 static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
 {
-	return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
+	WARN_ON(timer_ctx && timer_ctx->loaded);
+	return timer_ctx &&
+	       !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
 		(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
 }
 
@@ -146,21 +166,22 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
  */
 static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
 {
-	u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	u64 min_delta = ULLONG_MAX;
+	int i;
 
-	if (kvm_timer_irq_can_fire(vtimer))
-		min_virt = kvm_timer_compute_delta(vtimer);
+	for (i = 0; i < NR_KVM_TIMERS; i++) {
+		struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
 
-	if (kvm_timer_irq_can_fire(ptimer))
-		min_phys = kvm_timer_compute_delta(ptimer);
+		WARN(ctx->loaded, "timer %d loaded\n", i);
+		if (kvm_timer_irq_can_fire(ctx))
+			min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
+	}
 
 	/* If none of timers can fire, then return 0 */
-	if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX))
+	if (min_delta == ULLONG_MAX)
 		return 0;
 
-	return min(min_virt, min_phys);
+	return min_delta;
 }
 
 static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
@@ -187,37 +208,45 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
 	return HRTIMER_NORESTART;
 }
 
-static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt)
+static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
 {
-	struct arch_timer_context *ptimer;
+	struct arch_timer_context *ctx;
 	struct kvm_vcpu *vcpu;
 	u64 ns;
 
-	ptimer = container_of(hrt, struct arch_timer_context, hrtimer);
-	vcpu = ptimer->vcpu;
+	ctx = container_of(hrt, struct arch_timer_context, hrtimer);
+	vcpu = ctx->vcpu;
+
+	trace_kvm_timer_hrtimer_expire(ctx);
 
 	/*
 	 * Check that the timer has really expired from the guest's
 	 * PoV (NTP on the host may have forced it to expire
 	 * early). If not ready, schedule for a later time.
 	 */
-	ns = kvm_timer_compute_delta(ptimer);
+	ns = kvm_timer_compute_delta(ctx);
 	if (unlikely(ns)) {
 		hrtimer_forward_now(hrt, ns_to_ktime(ns));
 		return HRTIMER_RESTART;
 	}
 
-	kvm_timer_update_irq(vcpu, true, ptimer);
+	kvm_timer_update_irq(vcpu, true, ctx);
 	return HRTIMER_NORESTART;
 }
 
 static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
 {
-	struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu);
-	enum kvm_arch_timers index = arch_timer_ctx_index(timer_ctx);
+	struct arch_timer_cpu *timer;
+	enum kvm_arch_timers index;
 	u64 cval, now;
 
-	if (timer->loaded) {
+	if (!timer_ctx)
+		return false;
+
+	timer = vcpu_timer(timer_ctx->vcpu);
+	index = arch_timer_ctx_index(timer_ctx);
+
+	if (timer_ctx->loaded) {
 		u32 cnt_ctl = 0;
 
 		switch (index) {
@@ -249,13 +278,13 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
 
 bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	struct timer_map map;
 
-	if (kvm_timer_should_fire(vtimer))
-		return true;
+	get_timer_map(vcpu, &map);
 
-	return kvm_timer_should_fire(ptimer);
+	return kvm_timer_should_fire(map.direct_vtimer) ||
+	       kvm_timer_should_fire(map.direct_ptimer) ||
+	       kvm_timer_should_fire(map.emul_ptimer);
 }
 
 /*
@@ -294,60 +323,28 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 	}
 }
 
-/* Schedule the background timer for the emulated timer. */
-static void phys_timer_emulate(struct kvm_vcpu *vcpu)
+static void timer_emulate(struct arch_timer_context *ctx)
 {
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	bool should_fire = kvm_timer_should_fire(ctx);
 
-	/*
-	 * If the timer can fire now, we don't need to have a soft timer
-	 * scheduled for the future.  If the timer cannot fire at all,
-	 * then we also don't need a soft timer.
-	 */
-	if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) {
-		soft_timer_cancel(&ptimer->hrtimer);
-		return;
-	}
-
-	soft_timer_start(&ptimer->hrtimer, kvm_timer_compute_delta(ptimer));
-}
-
-/*
- * Check if there was a change in the timer state, so that we should either
- * raise or lower the line level to the GIC or schedule a background timer to
- * emulate the physical timer.
- */
-static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	bool level;
+	trace_kvm_timer_emulate(ctx, should_fire);
 
-	if (unlikely(!timer->enabled))
+	if (should_fire) {
+		kvm_timer_update_irq(ctx->vcpu, true, ctx);
 		return;
+	}
 
 	/*
-	 * If the timer virtual interrupt is a 'mapped' interrupt, part
-	 * of its lifecycle is offloaded to the hardware, and we therefore may
-	 * not have lowered the irq.level value before having to signal a new
-	 * interrupt, but have to signal an interrupt every time the level is
-	 * asserted.
+	 * If the timer can fire now, we don't need to have a soft timer
+	 * scheduled for the future.  If the timer cannot fire at all,
+	 * then we also don't need a soft timer.
 	 */
-	level = kvm_timer_should_fire(vtimer);
-	kvm_timer_update_irq(vcpu, level, vtimer);
-
-	if (has_vhe()) {
-		level = kvm_timer_should_fire(ptimer);
-		kvm_timer_update_irq(vcpu, level, ptimer);
-
+	if (!kvm_timer_irq_can_fire(ctx)) {
+		soft_timer_cancel(&ctx->hrtimer);
 		return;
 	}
 
-	phys_timer_emulate(vcpu);
-
-	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
-		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
+	soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
 }
 
 static void timer_save_state(struct arch_timer_context *ctx)
@@ -361,7 +358,7 @@ static void timer_save_state(struct arch_timer_context *ctx)
 
 	local_irq_save(flags);
 
-	if (!timer->loaded)
+	if (!ctx->loaded)
 		goto out;
 
 	switch (index) {
@@ -384,10 +381,12 @@ static void timer_save_state(struct arch_timer_context *ctx)
 
 		break;
 	case NR_KVM_TIMERS:
-		break; /* GCC is braindead */
+		BUG();
 	}
 
-	timer->loaded = false;
+	trace_kvm_timer_save_state(ctx);
+
+	ctx->loaded = false;
 out:
 	local_irq_restore(flags);
 }
@@ -400,14 +399,17 @@ out:
 static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	struct timer_map map;
+
+	get_timer_map(vcpu, &map);
 
 	/*
-	 * If both timers are not capable of raising interrupts (disabled or
+	 * If no timers are capable of raising interrupts (disabled or
 	 * masked), then there's no more work for us to do.
 	 */
-	if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer))
+	if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
+	    !kvm_timer_irq_can_fire(map.direct_ptimer) &&
+	    !kvm_timer_irq_can_fire(map.emul_ptimer))
 		return;
 
 	/*
@@ -435,7 +437,7 @@ static void timer_restore_state(struct arch_timer_context *ctx)
 
 	local_irq_save(flags);
 
-	if (timer->loaded)
+	if (ctx->loaded)
 		goto out;
 
 	switch (index) {
@@ -450,10 +452,12 @@ static void timer_restore_state(struct arch_timer_context *ctx)
 		write_sysreg_el0(ctx->cnt_ctl, cntp_ctl);
 		break;
 	case NR_KVM_TIMERS:
-		break; /* GCC is braindead */
+		BUG();
 	}
 
-	timer->loaded = true;
+	trace_kvm_timer_restore_state(ctx);
+
+	ctx->loaded = true;
 out:
 	local_irq_restore(flags);
 }
@@ -515,37 +519,31 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
 void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	struct timer_map map;
 
 	if (unlikely(!timer->enabled))
 		return;
 
+	get_timer_map(vcpu, &map);
+
 	if (static_branch_likely(&has_gic_active_state)) {
-		kvm_timer_vcpu_load_gic(vtimer);
-		if (has_vhe())
-			kvm_timer_vcpu_load_gic(ptimer);
+		kvm_timer_vcpu_load_gic(map.direct_vtimer);
+		if (map.direct_ptimer)
+			kvm_timer_vcpu_load_gic(map.direct_ptimer);
 	} else {
 		kvm_timer_vcpu_load_nogic(vcpu);
 	}
 
-	set_cntvoff(vtimer->cntvoff);
-
-	timer_restore_state(vtimer);
-
-	if (has_vhe()) {
-		timer_restore_state(ptimer);
-		return;
-	}
-
-	/* Set the background timer for the physical timer emulation. */
-	phys_timer_emulate(vcpu);
+	set_cntvoff(map.direct_vtimer->cntvoff);
 
 	kvm_timer_unblocking(vcpu);
 
-	/* If the timer fired while we weren't running, inject it now */
-	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
-		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
+	timer_restore_state(map.direct_vtimer);
+	if (map.direct_ptimer)
+		timer_restore_state(map.direct_ptimer);
+
+	if (map.emul_ptimer)
+		timer_emulate(map.emul_ptimer);
 }
 
 bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
@@ -568,20 +566,19 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
 void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	struct timer_map map;
 
 	if (unlikely(!timer->enabled))
 		return;
 
-	timer_save_state(vtimer);
-	if (has_vhe()) {
-		timer_save_state(ptimer);
-		return;
-	}
+	get_timer_map(vcpu, &map);
+
+	timer_save_state(map.direct_vtimer);
+	if (map.direct_ptimer)
+		timer_save_state(map.direct_ptimer);
 
 	/*
-	 * Cancel the physical timer emulation, because the only case where we
+	 * Cancel soft timer emulation, because the only case where we
 	 * need it after a vcpu_put is in the context of a sleeping VCPU, and
 	 * in that case we already factor in the deadline for the physical
 	 * timer when scheduling the bg_timer.
@@ -589,7 +586,8 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	 * In any case, we re-schedule the hrtimer for the physical timer when
 	 * coming back to the VCPU thread in kvm_timer_vcpu_load().
 	 */
-	soft_timer_cancel(&ptimer->hrtimer);
+	if (map.emul_ptimer)
+		soft_timer_cancel(&map.emul_ptimer->hrtimer);
 
 	if (swait_active(kvm_arch_vcpu_wq(vcpu)))
 		kvm_timer_blocking(vcpu);
@@ -636,8 +634,9 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	struct timer_map map;
+
+	get_timer_map(vcpu, &map);
 
 	/*
 	 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@@ -645,12 +644,22 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 	 * resets the timer to be disabled and unmasked and is compliant with
 	 * the ARMv7 architecture.
 	 */
-	vtimer->cnt_ctl = 0;
-	ptimer->cnt_ctl = 0;
-	kvm_timer_update_state(vcpu);
+	vcpu_vtimer(vcpu)->cnt_ctl = 0;
+	vcpu_ptimer(vcpu)->cnt_ctl = 0;
+
+	if (timer->enabled) {
+		kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
+		kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
+
+		if (irqchip_in_kernel(vcpu->kvm)) {
+			kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
+			if (map.direct_ptimer)
+				kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
+		}
+	}
 
-	if (timer->enabled && irqchip_in_kernel(vcpu->kvm))
-		kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq);
+	if (map.emul_ptimer)
+		soft_timer_cancel(&map.emul_ptimer->hrtimer);
 
 	return 0;
 }
@@ -687,15 +696,18 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	timer->bg_timer.function = kvm_bg_timer_expire;
 
+	hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	ptimer->hrtimer.function = kvm_phys_timer_expire;
+	vtimer->hrtimer.function = kvm_hrtimer_expire;
+	ptimer->hrtimer.function = kvm_hrtimer_expire;
 
 	vtimer->irq.irq = default_vtimer_irq.irq;
-	vtimer->host_timer_irq = host_vtimer_irq;
-	vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
-
 	ptimer->irq.irq = default_ptimer_irq.irq;
+
+	vtimer->host_timer_irq = host_vtimer_irq;
 	ptimer->host_timer_irq = host_ptimer_irq;
+
+	vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
 	ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
 
 	vtimer->vcpu = vcpu;
@@ -710,32 +722,39 @@ static void kvm_timer_init_interrupt(void *info)
 
 int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 {
+	struct arch_timer_context *timer;
+	bool level;
+
 	switch (regid) {
 	case KVM_REG_ARM_TIMER_CTL:
-		kvm_arm_timer_write(vcpu,
-				    vcpu_vtimer(vcpu), TIMER_REG_CTL, value);
+		timer = vcpu_vtimer(vcpu);
+		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
 		break;
 	case KVM_REG_ARM_TIMER_CNT:
+		timer = vcpu_vtimer(vcpu);
 		update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
 		break;
 	case KVM_REG_ARM_TIMER_CVAL:
-		kvm_arm_timer_write(vcpu,
-				    vcpu_vtimer(vcpu), TIMER_REG_CVAL, value);
+		timer = vcpu_vtimer(vcpu);
+		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
 		break;
 	case KVM_REG_ARM_PTIMER_CTL:
-		kvm_arm_timer_write(vcpu,
-				    vcpu_ptimer(vcpu), TIMER_REG_CTL, value);
+		timer = vcpu_ptimer(vcpu);
+		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
 		break;
 	case KVM_REG_ARM_PTIMER_CVAL:
-		kvm_arm_timer_write(vcpu,
-				    vcpu_ptimer(vcpu), TIMER_REG_CVAL, value);
+		timer = vcpu_ptimer(vcpu);
+		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
 		break;
 
 	default:
 		return -1;
 	}
 
-	kvm_timer_update_state(vcpu);
+	level = kvm_timer_should_fire(timer);
+	kvm_timer_update_irq(vcpu, level, timer);
+	timer_emulate(timer);
+
 	return 0;
 }
 
@@ -1020,8 +1039,7 @@ bool kvm_arch_timer_get_input_level(int vintid)
 int kvm_timer_enable(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	struct timer_map map;
 	int ret;
 
 	if (timer->enabled)
@@ -1039,18 +1057,25 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 		return -EINVAL;
 	}
 
-	ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq,
+	get_timer_map(vcpu, &map);
+
+	ret = kvm_vgic_map_phys_irq(vcpu,
+				    map.direct_vtimer->host_timer_irq,
+				    map.direct_vtimer->irq.irq,
 				    kvm_arch_timer_get_input_level);
 	if (ret)
 		return ret;
 
-	if (has_vhe()) {
-		ret = kvm_vgic_map_phys_irq(vcpu, host_ptimer_irq, ptimer->irq.irq,
+	if (map.direct_ptimer) {
+		ret = kvm_vgic_map_phys_irq(vcpu,
+					    map.direct_ptimer->host_timer_irq,
+					    map.direct_ptimer->irq.irq,
 					    kvm_arch_timer_get_input_level);
-		if (ret)
-			return ret;
 	}
 
+	if (ret)
+		return ret;
+
 no_vgic:
 	timer->enabled = 1;
 	return 0;
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
index 3828beab93f2..54bb059243b9 100644
--- a/virt/kvm/arm/trace.h
+++ b/virt/kvm/arm/trace.h
@@ -2,6 +2,7 @@
 #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_KVM_H
 
+#include <kvm/arm_arch_timer.h>
 #include <linux/tracepoint.h>
 
 #undef TRACE_SYSTEM
@@ -262,6 +263,110 @@ TRACE_EVENT(kvm_timer_update_irq,
 		  __entry->vcpu_id, __entry->irq, __entry->level)
 );
 
+TRACE_EVENT(kvm_get_timer_map,
+	TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
+	TP_ARGS(vcpu_id, map),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,		vcpu_id	)
+		__field(	int,			direct_vtimer	)
+		__field(	int,			direct_ptimer	)
+		__field(	int,			emul_ptimer	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id		= vcpu_id;
+		__entry->direct_vtimer		= arch_timer_ctx_index(map->direct_vtimer);
+		__entry->direct_ptimer =
+			(map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
+		__entry->emul_ptimer =
+			(map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
+	),
+
+	TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
+		  __entry->vcpu_id,
+		  __entry->direct_vtimer,
+		  __entry->direct_ptimer,
+		  __entry->emul_ptimer)
+);
+
+TRACE_EVENT(kvm_timer_save_state,
+	TP_PROTO(struct arch_timer_context *ctx),
+	TP_ARGS(ctx),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,		ctl		)
+		__field(	unsigned long long,	cval		)
+		__field(	int,			timer_idx	)
+	),
+
+	TP_fast_assign(
+		__entry->ctl			= ctx->cnt_ctl;
+		__entry->cval			= ctx->cnt_cval;
+		__entry->timer_idx		= arch_timer_ctx_index(ctx);
+	),
+
+	TP_printk("   CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+		  __entry->ctl,
+		  __entry->cval,
+		  __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_restore_state,
+	TP_PROTO(struct arch_timer_context *ctx),
+	TP_ARGS(ctx),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,		ctl		)
+		__field(	unsigned long long,	cval		)
+		__field(	int,			timer_idx	)
+	),
+
+	TP_fast_assign(
+		__entry->ctl			= ctx->cnt_ctl;
+		__entry->cval			= ctx->cnt_cval;
+		__entry->timer_idx		= arch_timer_ctx_index(ctx);
+	),
+
+	TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+		  __entry->ctl,
+		  __entry->cval,
+		  __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_hrtimer_expire,
+	TP_PROTO(struct arch_timer_context *ctx),
+	TP_ARGS(ctx),
+
+	TP_STRUCT__entry(
+		__field(	int,			timer_idx	)
+	),
+
+	TP_fast_assign(
+		__entry->timer_idx		= arch_timer_ctx_index(ctx);
+	),
+
+	TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_emulate,
+	TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
+	TP_ARGS(ctx, should_fire),
+
+	TP_STRUCT__entry(
+		__field(	int,			timer_idx	)
+		__field(	bool,			should_fire	)
+	),
+
+	TP_fast_assign(
+		__entry->timer_idx		= arch_timer_ctx_index(ctx);
+		__entry->should_fire		= should_fire;
+	),
+
+	TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
+		  __entry->timer_idx, __entry->should_fire)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3


From 04fb53101edef67517f2d5dc00c1a5eb707fe101 Mon Sep 17 00:00:00 2001
From: Artur Rojek <contact@artur-rojek.eu>
Date: Sun, 17 Feb 2019 15:29:11 +0100
Subject: power: supply: core: Add a field to support battery max voltage

Add a field for "voltage_max_design_uv" to present fully charged
battery voltage.

Signed-off-by: Artur Rojek <contact@artur-rojek.eu>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply_core.c | 3 +++
 include/linux/power_supply.h             | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index 07a85e19615c..c917a8b43b2b 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -573,6 +573,7 @@ int power_supply_get_battery_info(struct power_supply *psy,
 	info->energy_full_design_uwh         = -EINVAL;
 	info->charge_full_design_uah         = -EINVAL;
 	info->voltage_min_design_uv          = -EINVAL;
+	info->voltage_max_design_uv          = -EINVAL;
 	info->precharge_current_ua           = -EINVAL;
 	info->charge_term_current_ua         = -EINVAL;
 	info->constant_charge_current_max_ua = -EINVAL;
@@ -613,6 +614,8 @@ int power_supply_get_battery_info(struct power_supply *psy,
 			     &info->charge_full_design_uah);
 	of_property_read_u32(battery_np, "voltage-min-design-microvolt",
 			     &info->voltage_min_design_uv);
+	of_property_read_u32(battery_np, "voltage-max-design-microvolt",
+			     &info->voltage_max_design_uv);
 	of_property_read_u32(battery_np, "precharge-current-microamp",
 			     &info->precharge_current_ua);
 	of_property_read_u32(battery_np, "charge-term-current-microamp",
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 57b2ab82b951..2f9c201a54d1 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -332,6 +332,7 @@ struct power_supply_battery_info {
 	int energy_full_design_uwh;	    /* microWatt-hours */
 	int charge_full_design_uah;	    /* microAmp-hours */
 	int voltage_min_design_uv;	    /* microVolts */
+	int voltage_max_design_uv;	    /* microVolts */
 	int precharge_current_ua;	    /* microAmps */
 	int charge_term_current_ua;	    /* microAmps */
 	int constant_charge_current_max_ua; /* microAmps */
-- 
cgit v1.2.3


From 8beb90aaf334a6efa3e924339926b5f93a234dbb Mon Sep 17 00:00:00 2001
From: Sedat Dilek <sedat.dilek@gmail.com>
Date: Fri, 15 Feb 2019 13:19:20 +0100
Subject: scsi: fcoe: make use of fip_mode enum complete

commit 1917d42d14b7 ("fcoe: use enum for fip_mode") introduces a separate
enum for the fip_mode that shall be used during initialisation handling
until it is passed to fcoe_ctrl_link_up to set the initial fip_state.  That
change was incomplete and gcc quietly converted in various places between
the fip_mode and the fip_state enum values with implicit enum conversions,
which fortunately cannot cause any issues in the actual code's execution.

clang however warns about these implicit enum conversions in the scsi
drivers. This commit consolidates the use of the two enums, guided by
clang's enum-conversion warnings.

This commit now completes the use of the fip_mode: It expects and uses
fip_mode in {bnx2fc,fcoe}_interface_create and fcoe_ctlr_init, and it calls
fcoe_ctrl_set_set() with the correct values in fcoe_ctlr_link_up().  It
also breaks the association between FIP_MODE_AUTO and FIP_ST_AUTO to
indicate these two enums are distinct.

Link: https://github.com/ClangBuiltLinux/linux/issues/151
Fixes: 1917d42d14b7 ("fcoe: use enum for fip_mode")
Reported-by: Dmitry Golovin <dima@golovin.in>
Original-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
CC: Lukas Bulwahn <lukas.bulwahn@gmail.com>
CC: Nick Desaulniers <ndesaulniers@google.com>
CC: Nathan Chancellor <natechancellor@gmail.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Tested-by: Nathan Chancellor <natechancellor@gmail.com>
Suggested-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/bnx2fc/bnx2fc_fcoe.c  | 2 +-
 drivers/scsi/fcoe/fcoe.c           | 2 +-
 drivers/scsi/fcoe/fcoe_ctlr.c      | 7 +++++--
 drivers/scsi/fcoe/fcoe_transport.c | 2 +-
 drivers/scsi/qedf/qedf_main.c      | 2 +-
 include/scsi/libfcoe.h             | 4 ++--
 6 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
index 2e4e7159ebf9..a75e74ad1698 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
@@ -1438,7 +1438,7 @@ bind_err:
 static struct bnx2fc_interface *
 bnx2fc_interface_create(struct bnx2fc_hba *hba,
 			struct net_device *netdev,
-			enum fip_state fip_mode)
+			enum fip_mode fip_mode)
 {
 	struct fcoe_ctlr_device *ctlr_dev;
 	struct bnx2fc_interface *interface;
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index cd19be3f3405..8ba8862d3292 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -389,7 +389,7 @@ static int fcoe_interface_setup(struct fcoe_interface *fcoe,
  * Returns: pointer to a struct fcoe_interface or NULL on error
  */
 static struct fcoe_interface *fcoe_interface_create(struct net_device *netdev,
-						    enum fip_state fip_mode)
+						    enum fip_mode fip_mode)
 {
 	struct fcoe_ctlr_device *ctlr_dev;
 	struct fcoe_ctlr *ctlr;
diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
index 54da3166da8d..7dc4ffa24430 100644
--- a/drivers/scsi/fcoe/fcoe_ctlr.c
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
@@ -147,7 +147,7 @@ static void fcoe_ctlr_map_dest(struct fcoe_ctlr *fip)
  * fcoe_ctlr_init() - Initialize the FCoE Controller instance
  * @fip: The FCoE controller to initialize
  */
-void fcoe_ctlr_init(struct fcoe_ctlr *fip, enum fip_state mode)
+void fcoe_ctlr_init(struct fcoe_ctlr *fip, enum fip_mode mode)
 {
 	fcoe_ctlr_set_state(fip, FIP_ST_LINK_WAIT);
 	fip->mode = mode;
@@ -454,7 +454,10 @@ void fcoe_ctlr_link_up(struct fcoe_ctlr *fip)
 		mutex_unlock(&fip->ctlr_mutex);
 		fc_linkup(fip->lp);
 	} else if (fip->state == FIP_ST_LINK_WAIT) {
-		fcoe_ctlr_set_state(fip, fip->mode);
+		if (fip->mode == FIP_MODE_NON_FIP)
+			fcoe_ctlr_set_state(fip, FIP_ST_NON_FIP);
+		else
+			fcoe_ctlr_set_state(fip, FIP_ST_AUTO);
 		switch (fip->mode) {
 		default:
 			LIBFCOE_FIP_DBG(fip, "invalid mode %d\n", fip->mode);
diff --git a/drivers/scsi/fcoe/fcoe_transport.c b/drivers/scsi/fcoe/fcoe_transport.c
index 6d3949b687ef..29fe3426f9f2 100644
--- a/drivers/scsi/fcoe/fcoe_transport.c
+++ b/drivers/scsi/fcoe/fcoe_transport.c
@@ -872,7 +872,7 @@ static int fcoe_transport_create(const char *buffer,
 	int rc = -ENODEV;
 	struct net_device *netdev = NULL;
 	struct fcoe_transport *ft = NULL;
-	enum fip_state fip_mode = (enum fip_state)(long)kp->arg;
+	enum fip_mode fip_mode = (enum fip_mode)kp->arg;
 
 	mutex_lock(&ft_mutex);
 
diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c
index edcaf4b0cb0b..d43342b327ac 100644
--- a/drivers/scsi/qedf/qedf_main.c
+++ b/drivers/scsi/qedf/qedf_main.c
@@ -1417,7 +1417,7 @@ static struct libfc_function_template qedf_lport_template = {
 
 static void qedf_fcoe_ctlr_setup(struct qedf_ctx *qedf)
 {
-	fcoe_ctlr_init(&qedf->ctlr, FIP_ST_AUTO);
+	fcoe_ctlr_init(&qedf->ctlr, FIP_MODE_AUTO);
 
 	qedf->ctlr.send = qedf_fip_send;
 	qedf->ctlr.get_src_addr = qedf_get_src_mac;
diff --git a/include/scsi/libfcoe.h b/include/scsi/libfcoe.h
index cb8a273732cf..bb8092fa1e36 100644
--- a/include/scsi/libfcoe.h
+++ b/include/scsi/libfcoe.h
@@ -79,7 +79,7 @@ enum fip_state {
  * It must not change after fcoe_ctlr_init() sets it.
  */
 enum fip_mode {
-	FIP_MODE_AUTO = FIP_ST_AUTO,
+	FIP_MODE_AUTO,
 	FIP_MODE_NON_FIP,
 	FIP_MODE_FABRIC,
 	FIP_MODE_VN2VN,
@@ -250,7 +250,7 @@ struct fcoe_rport {
 };
 
 /* FIP API functions */
-void fcoe_ctlr_init(struct fcoe_ctlr *, enum fip_state);
+void fcoe_ctlr_init(struct fcoe_ctlr *, enum fip_mode);
 void fcoe_ctlr_destroy(struct fcoe_ctlr *);
 void fcoe_ctlr_link_up(struct fcoe_ctlr *);
 int fcoe_ctlr_link_down(struct fcoe_ctlr *);
-- 
cgit v1.2.3


From c2261dd76b549754c14c8ac7cadadd0993b182d6 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 12 Feb 2019 21:12:50 -0700
Subject: RDMA/device: Add ib_device_set_netdev() as an alternative to
 get_netdev

The associated netdev should not actually be very dynamic, so for most
drivers there is no reason for a callback like this. Provide an API to
inform the core code about the net dev affiliation and use a core
maintained data structure instead.

This allows the core code to be more aware of the ndev relationship which
will allow some new APIs based around this.

This also uses locking that makes some kind of sense, many drivers had a
confusing RCU lock, or missing locking which isn't right.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cache.c     |  24 +++---
 drivers/infiniband/core/core_priv.h |   3 +
 drivers/infiniband/core/device.c    | 166 ++++++++++++++++++++++++++++++++----
 drivers/infiniband/core/nldev.c     |   4 +-
 drivers/infiniband/core/verbs.c     |   5 +-
 include/rdma/ib_verbs.h             |   7 ++
 6 files changed, 171 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index a28dc1901c80..43c67e5f43c6 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -547,21 +547,19 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
 	unsigned long mask;
 	int ret;
 
-	if (ib_dev->ops.get_netdev) {
-		idev = ib_dev->ops.get_netdev(ib_dev, port);
-		if (idev && attr->ndev != idev) {
-			union ib_gid default_gid;
-
-			/* Adding default GIDs in not permitted */
-			make_default_gid(idev, &default_gid);
-			if (!memcmp(gid, &default_gid, sizeof(*gid))) {
-				dev_put(idev);
-				return -EPERM;
-			}
-		}
-		if (idev)
+	idev = ib_device_get_netdev(ib_dev, port);
+	if (idev && attr->ndev != idev) {
+		union ib_gid default_gid;
+
+		/* Adding default GIDs is not permitted */
+		make_default_gid(idev, &default_gid);
+		if (!memcmp(gid, &default_gid, sizeof(*gid))) {
 			dev_put(idev);
+			return -EPERM;
+		}
 	}
+	if (idev)
+		dev_put(idev);
 
 	mask = GID_ATTR_FIND_MASK_GID |
 	       GID_ATTR_FIND_MASK_GID_TYPE |
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index eeabe9ca8427..08c690249594 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -66,6 +66,9 @@ typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
 typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port,
 				   struct net_device *idev, void *cookie);
 
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+					unsigned int port);
+
 void ib_enum_roce_netdev(struct ib_device *ib_dev,
 			 roce_netdev_filter filter,
 			 void *filter_cookie,
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8d7d63a60ef5..7680a64a98bc 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -134,6 +134,7 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
 	     !xa_is_err(entry);                                                \
 	     (index)++, entry = xan_find_marked(xa, &(index), filter))
 
+static void free_netdevs(struct ib_device *ib_dev);
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
 static void ib_policy_change_task(struct work_struct *work);
@@ -290,6 +291,7 @@ static void ib_device_release(struct device *device)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 
+	free_netdevs(dev);
 	WARN_ON(refcount_read(&dev->refcount));
 	ib_cache_release_one(dev);
 	ib_security_release_port_pkey_list(dev);
@@ -371,6 +373,9 @@ EXPORT_SYMBOL(_ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
+	/* Expedite releasing netdev references */
+	free_netdevs(device);
+
 	WARN_ON(!xa_empty(&device->client_data));
 	WARN_ON(refcount_read(&device->refcount));
 	rdma_restrack_clean(device);
@@ -461,16 +466,16 @@ static void remove_client_context(struct ib_device *device,
 	up_read(&device->client_data_rwsem);
 }
 
-static int verify_immutable(const struct ib_device *dev, u8 port)
-{
-	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
-			    rdma_max_mad_size(dev, port) != 0);
-}
-
-static int setup_port_data(struct ib_device *device)
+static int alloc_port_data(struct ib_device *device)
 {
 	unsigned int port;
-	int ret;
+
+	if (device->port_data)
+		return 0;
+
+	/* This can only be called once the physical port range is defined */
+	if (WARN_ON(!device->phys_port_cnt))
+		return -EINVAL;
 
 	/*
 	 * device->port_data is indexed directly by the port number to make
@@ -489,6 +494,28 @@ static int setup_port_data(struct ib_device *device)
 
 		spin_lock_init(&pdata->pkey_list_lock);
 		INIT_LIST_HEAD(&pdata->pkey_list);
+		spin_lock_init(&pdata->netdev_lock);
+	}
+	return 0;
+}
+
+static int verify_immutable(const struct ib_device *dev, u8 port)
+{
+	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
+			    rdma_max_mad_size(dev, port) != 0);
+}
+
+static int setup_port_data(struct ib_device *device)
+{
+	unsigned int port;
+	int ret;
+
+	ret = alloc_port_data(device);
+	if (ret)
+		return ret;
+
+	rdma_for_each_port (device, port) {
+		struct ib_port_data *pdata = &device->port_data[port];
 
 		ret = device->ops.get_port_immutable(device, port,
 						     &pdata->immutable);
@@ -682,6 +709,9 @@ static void disable_device(struct ib_device *device)
 	/* Pairs with refcount_set in enable_device */
 	ib_device_put(device);
 	wait_for_completion(&device->unreg_completion);
+
+	/* Expedite removing unregistered pointers from the hash table */
+	free_netdevs(device);
 }
 
 /*
@@ -1012,6 +1042,114 @@ int ib_query_port(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_query_port);
 
+/**
+ * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
+ * @ib_dev: Device to modify
+ * @ndev: net_device to affiliate, may be NULL
+ * @port: IB port the net_device is connected to
+ *
+ * Drivers should use this to link the ib_device to a netdev so the netdev
+ * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
+ * affiliated with any port.
+ *
+ * The caller must ensure that the given ndev is not unregistered or
+ * unregistering, and that either the ib_device is unregistered or
+ * ib_device_set_netdev() is called with NULL when the ndev sends a
+ * NETDEV_UNREGISTER event.
+ */
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+			 unsigned int port)
+{
+	struct net_device *old_ndev;
+	struct ib_port_data *pdata;
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * Drivers wish to call this before ib_register_driver, so we have to
+	 * setup the port data early.
+	 */
+	ret = alloc_port_data(ib_dev);
+	if (ret)
+		return ret;
+
+	if (!rdma_is_port_valid(ib_dev, port))
+		return -EINVAL;
+
+	pdata = &ib_dev->port_data[port];
+	spin_lock_irqsave(&pdata->netdev_lock, flags);
+	if (pdata->netdev == ndev) {
+		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+		return 0;
+	}
+	old_ndev = pdata->netdev;
+
+	if (ndev)
+		dev_hold(ndev);
+	pdata->netdev = ndev;
+	spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+
+	if (old_ndev)
+		dev_put(old_ndev);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_device_set_netdev);
+
+static void free_netdevs(struct ib_device *ib_dev)
+{
+	unsigned long flags;
+	unsigned int port;
+
+	rdma_for_each_port (ib_dev, port) {
+		struct ib_port_data *pdata = &ib_dev->port_data[port];
+
+		spin_lock_irqsave(&pdata->netdev_lock, flags);
+		if (pdata->netdev) {
+			dev_put(pdata->netdev);
+			pdata->netdev = NULL;
+		}
+		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+	}
+}
+
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+					unsigned int port)
+{
+	struct ib_port_data *pdata;
+	struct net_device *res;
+
+	if (!rdma_is_port_valid(ib_dev, port))
+		return NULL;
+
+	pdata = &ib_dev->port_data[port];
+
+	/*
+	 * New drivers should use ib_device_set_netdev() not the legacy
+	 * get_netdev().
+	 */
+	if (ib_dev->ops.get_netdev)
+		res = ib_dev->ops.get_netdev(ib_dev, port);
+	else {
+		spin_lock(&pdata->netdev_lock);
+		res = pdata->netdev;
+		if (res)
+			dev_hold(res);
+		spin_unlock(&pdata->netdev_lock);
+	}
+
+	/*
+	 * If we are starting to unregister expedite things by preventing
+	 * propagation of an unregistering netdev.
+	 */
+	if (res && res->reg_state != NETREG_REGISTERED) {
+		dev_put(res);
+		return NULL;
+	}
+
+	return res;
+}
+
 /**
  * ib_enum_roce_netdev - enumerate all RoCE ports
  * @ib_dev : IB device we want to query
@@ -1034,16 +1172,8 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
 
 	rdma_for_each_port (ib_dev, port)
 		if (rdma_protocol_roce(ib_dev, port)) {
-			struct net_device *idev = NULL;
-
-			if (ib_dev->ops.get_netdev)
-				idev = ib_dev->ops.get_netdev(ib_dev, port);
-
-			if (idev &&
-			    idev->reg_state >= NETREG_UNREGISTERED) {
-				dev_put(idev);
-				idev = NULL;
-			}
+			struct net_device *idev =
+				ib_device_get_netdev(ib_dev, port);
 
 			if (filter(ib_dev, port, idev, filter_cookie))
 				cb(ib_dev, port, idev, cookie);
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 85f6f2bcce40..1980ddc5f7bc 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -268,9 +268,7 @@ static int fill_port_info(struct sk_buff *msg,
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state))
 		return -EMSGSIZE;
 
-	if (device->ops.get_netdev)
-		netdev = device->ops.get_netdev(device, port);
-
+	netdev = ib_device_get_netdev(device, port);
 	if (netdev && net_eq(dev_net(netdev), net)) {
 		ret = nla_put_u32(msg,
 				  RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index de5d895a5054..5a5e83f5f0fc 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1723,10 +1723,7 @@ int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width)
 	if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
 		return -EINVAL;
 
-	if (!dev->ops.get_netdev)
-		return -EOPNOTSUPP;
-
-	netdev = dev->ops.get_netdev(dev, port_num);
+	netdev = ib_device_get_netdev(dev, port_num);
 	if (!netdev)
 		return -ENODEV;
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 50b7ebc2885e..7f81a313c01b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2204,6 +2204,9 @@ struct ib_port_data {
 	struct list_head pkey_list;
 
 	struct ib_port_cache cache;
+
+	spinlock_t netdev_lock;
+	struct net_device *netdev;
 };
 
 /* rdma netdev type - specifies protocol type */
@@ -3996,6 +3999,10 @@ void ib_device_put(struct ib_device *device);
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
 					    u16 pkey, const union ib_gid *gid,
 					    const struct sockaddr *addr);
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+			 unsigned int port);
+struct net_device *ib_device_netdev(struct ib_device *dev, u8 port);
+
 struct ib_wq *ib_create_wq(struct ib_pd *pd,
 			   struct ib_wq_init_attr *init_attr);
 int ib_destroy_wq(struct ib_wq *wq);
-- 
cgit v1.2.3


From 324e227ea7c952626abafe72db42ae0d70220a6e Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 12 Feb 2019 21:12:51 -0700
Subject: RDMA/device: Add ib_device_get_by_netdev()

Several drivers need to find the ib_device from a given netdev. rxe needs
this at speed in an unsleepable context, so choose to implement the
translation using a RCU safe hash table.

The hash table can have a many to one mapping. This is intended to support
some future case where multiple IB drivers (ie iWarp and RoCE) connect to
the same netdevs. driver_ids will need to be different to support this.

In the process this makes the struct ib_device and ib_port_data RCU safe
by deferring their kfrees.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c | 119 +++++++++++++++++++++++++++++++++++----
 include/rdma/ib_verbs.h          |  10 +++-
 2 files changed, 116 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7680a64a98bc..f6795ad7ca98 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -40,6 +40,7 @@
 #include <linux/netdevice.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
+#include <linux/hashtable.h>
 #include <rdma/rdma_netlink.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
@@ -134,6 +135,10 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
 	     !xa_is_err(entry);                                                \
 	     (index)++, entry = xan_find_marked(xa, &(index), filter))
 
+/* RCU hash table mapping netdevice pointers to struct ib_port_data */
+static DEFINE_SPINLOCK(ndev_hash_lock);
+static DECLARE_HASHTABLE(ndev_hash, 5);
+
 static void free_netdevs(struct ib_device *ib_dev);
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
@@ -144,6 +149,12 @@ static struct notifier_block ibdev_lsm_nb = {
 	.notifier_call = ib_security_change,
 };
 
+/* Pointer to the RCU head at the start of the ib_port_data array */
+struct ib_port_data_rcu {
+	struct rcu_head rcu_head;
+	struct ib_port_data pdata[];
+};
+
 static int ib_device_check_mandatory(struct ib_device *device)
 {
 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
@@ -295,9 +306,12 @@ static void ib_device_release(struct device *device)
 	WARN_ON(refcount_read(&dev->refcount));
 	ib_cache_release_one(dev);
 	ib_security_release_port_pkey_list(dev);
-	kfree(dev->port_data);
 	xa_destroy(&dev->client_data);
-	kfree(dev);
+	if (dev->port_data)
+		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
+				       pdata[0]),
+			  rcu_head);
+	kfree_rcu(dev, rcu_head);
 }
 
 static int ib_device_uevent(struct device *device,
@@ -468,6 +482,7 @@ static void remove_client_context(struct ib_device *device,
 
 static int alloc_port_data(struct ib_device *device)
 {
+	struct ib_port_data_rcu *pdata_rcu;
 	unsigned int port;
 
 	if (device->port_data)
@@ -484,17 +499,26 @@ static int alloc_port_data(struct ib_device *device)
 	 * Therefore port_data is declared as a 1 based array with potential
 	 * empty slots at the beginning.
 	 */
-	device->port_data = kcalloc(rdma_end_port(device) + 1,
-				    sizeof(*device->port_data), GFP_KERNEL);
-	if (!device->port_data)
+	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
+					rdma_end_port(device) + 1),
+			    GFP_KERNEL);
+	if (!pdata_rcu)
 		return -ENOMEM;
+	/*
+	 * The rcu_head is put in front of the port data array and the stored
+	 * pointer is adjusted since we never need to see that member until
+	 * kfree_rcu.
+	 */
+	device->port_data = pdata_rcu->pdata;
 
 	rdma_for_each_port (device, port) {
 		struct ib_port_data *pdata = &device->port_data[port];
 
+		pdata->ib_dev = device;
 		spin_lock_init(&pdata->pkey_list_lock);
 		INIT_LIST_HEAD(&pdata->pkey_list);
 		spin_lock_init(&pdata->netdev_lock);
+		INIT_HLIST_NODE(&pdata->ndev_hash_link);
 	}
 	return 0;
 }
@@ -1042,6 +1066,29 @@ int ib_query_port(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_query_port);
 
+static void add_ndev_hash(struct ib_port_data *pdata)
+{
+	unsigned long flags;
+
+	might_sleep();
+
+	spin_lock_irqsave(&ndev_hash_lock, flags);
+	if (hash_hashed(&pdata->ndev_hash_link)) {
+		hash_del_rcu(&pdata->ndev_hash_link);
+		spin_unlock_irqrestore(&ndev_hash_lock, flags);
+		/*
+		 * We cannot do hash_add_rcu after a hash_del_rcu until the
+		 * grace period
+		 */
+		synchronize_rcu();
+		spin_lock_irqsave(&ndev_hash_lock, flags);
+	}
+	if (pdata->netdev)
+		hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
+			     (uintptr_t)pdata->netdev);
+	spin_unlock_irqrestore(&ndev_hash_lock, flags);
+}
+
 /**
  * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
  * @ib_dev: Device to modify
@@ -1078,17 +1125,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
 
 	pdata = &ib_dev->port_data[port];
 	spin_lock_irqsave(&pdata->netdev_lock, flags);
-	if (pdata->netdev == ndev) {
+	old_ndev = rcu_dereference_protected(
+		pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+	if (old_ndev == ndev) {
 		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
 		return 0;
 	}
-	old_ndev = pdata->netdev;
 
 	if (ndev)
 		dev_hold(ndev);
-	pdata->netdev = ndev;
+	rcu_assign_pointer(pdata->netdev, ndev);
 	spin_unlock_irqrestore(&pdata->netdev_lock, flags);
 
+	add_ndev_hash(pdata);
 	if (old_ndev)
 		dev_put(old_ndev);
 
@@ -1103,11 +1152,24 @@ static void free_netdevs(struct ib_device *ib_dev)
 
 	rdma_for_each_port (ib_dev, port) {
 		struct ib_port_data *pdata = &ib_dev->port_data[port];
+		struct net_device *ndev;
 
 		spin_lock_irqsave(&pdata->netdev_lock, flags);
-		if (pdata->netdev) {
-			dev_put(pdata->netdev);
-			pdata->netdev = NULL;
+		ndev = rcu_dereference_protected(
+			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+		if (ndev) {
+			spin_lock(&ndev_hash_lock);
+			hash_del_rcu(&pdata->ndev_hash_link);
+			spin_unlock(&ndev_hash_lock);
+
+			/*
+			 * If this is the last dev_put there is still a
+			 * synchronize_rcu before the netdev is kfreed, so we
+			 * can continue to rely on unlocked pointer
+			 * comparisons after the put
+			 */
+			rcu_assign_pointer(pdata->netdev, NULL);
+			dev_put(ndev);
 		}
 		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
 	}
@@ -1132,7 +1194,8 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
 		res = ib_dev->ops.get_netdev(ib_dev, port);
 	else {
 		spin_lock(&pdata->netdev_lock);
-		res = pdata->netdev;
+		res = rcu_dereference_protected(
+			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
 		if (res)
 			dev_hold(res);
 		spin_unlock(&pdata->netdev_lock);
@@ -1150,6 +1213,38 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
 	return res;
 }
 
+/**
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
+ * @ndev: netdev to locate
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device that is associated with a netdev via
+ * ib_device_set_netdev(). The caller must call ib_device_put() on the
+ * returned pointer.
+ */
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id)
+{
+	struct ib_device *res = NULL;
+	struct ib_port_data *cur;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
+				    (uintptr_t)ndev) {
+		if (rcu_access_pointer(cur->netdev) == ndev &&
+		    (driver_id == RDMA_DRIVER_UNKNOWN ||
+		     cur->ib_dev->driver_id == driver_id) &&
+		    ib_device_try_get(cur->ib_dev)) {
+			res = cur->ib_dev;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return res;
+}
+EXPORT_SYMBOL(ib_device_get_by_netdev);
+
 /**
  * ib_enum_roce_netdev - enumerate all RoCE ports
  * @ib_dev : IB device we want to query
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 7f81a313c01b..3aa802b65cf3 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2198,6 +2198,8 @@ struct ib_port_immutable {
 };
 
 struct ib_port_data {
+	struct ib_device *ib_dev;
+
 	struct ib_port_immutable immutable;
 
 	spinlock_t pkey_list_lock;
@@ -2206,7 +2208,8 @@ struct ib_port_data {
 	struct ib_port_cache cache;
 
 	spinlock_t netdev_lock;
-	struct net_device *netdev;
+	struct net_device __rcu *netdev;
+	struct hlist_node ndev_hash_link;
 };
 
 /* rdma netdev type - specifies protocol type */
@@ -2545,6 +2548,7 @@ struct ib_device {
 	struct device                *dma_device;
 	struct ib_device_ops	     ops;
 	char                          name[IB_DEVICE_NAME_MAX];
+	struct rcu_head rcu_head;
 
 	struct list_head              event_handler_list;
 	spinlock_t                    event_handler_lock;
@@ -3996,6 +4000,10 @@ static inline bool ib_device_try_get(struct ib_device *dev)
 }
 
 void ib_device_put(struct ib_device *device);
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id);
+struct ib_device *ib_device_get_by_name(const char *name,
+					enum rdma_driver_id driver_id);
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
 					    u16 pkey, const union ib_gid *gid,
 					    const struct sockaddr *addr);
-- 
cgit v1.2.3


From d0899892edd089790eb17943ecf28254a909deae Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 12 Feb 2019 21:12:53 -0700
Subject: RDMA/device: Provide APIs from the core code to help unregistration

These APIs are intended to support drivers that exist outside the usual
driver core probe()/remove() callbacks. Normally the driver core will
prevent remove() from running concurrently with probe(), once this safety
is lost drivers need more support to get the locking and lifetimes right.

ib_unregister_driver() is intended to be used during module_exit of a
driver using these APIs. It unregisters all the associated ib_devices.

ib_unregister_device_and_put() is to be used by a driver-specific removal
function (ie removal by name, removal from a netdev notifier, removal from
netlink)

ib_unregister_queued() is to be used from netdev notifier chains where
RTNL is held.

The locking is tricky here since once things become async it is possible
to race unregister with registration. This is largely solved by relying on
the registration refcount, unregistration will only ever work on something
that has a positive registration refcount - and then an unregistration
mutex serializes all competing unregistrations of the same device.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c | 242 +++++++++++++++++++++++++++++++++------
 include/rdma/ib_verbs.h          |  11 ++
 2 files changed, 217 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f6795ad7ca98..e470fa651961 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -140,6 +140,8 @@ static DEFINE_SPINLOCK(ndev_hash_lock);
 static DECLARE_HASHTABLE(ndev_hash, 5);
 
 static void free_netdevs(struct ib_device *ib_dev);
+static void ib_unregister_work(struct work_struct *work);
+static void __ib_unregister_device(struct ib_device *device);
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
 static void ib_policy_change_task(struct work_struct *work);
@@ -366,6 +368,7 @@ struct ib_device *_ib_alloc_device(size_t size)
 
 	INIT_LIST_HEAD(&device->event_handler_list);
 	spin_lock_init(&device->event_handler_lock);
+	mutex_init(&device->unregistration_lock);
 	/*
 	 * client_data needs to be alloc because we don't want our mark to be
 	 * destroyed if the user stores NULL in the client data.
@@ -374,6 +377,7 @@ struct ib_device *_ib_alloc_device(size_t size)
 	init_rwsem(&device->client_data_rwsem);
 	INIT_LIST_HEAD(&device->port_list);
 	init_completion(&device->unreg_completion);
+	INIT_WORK(&device->unregistration_work, ib_unregister_work);
 
 	return device;
 }
@@ -387,6 +391,20 @@ EXPORT_SYMBOL(_ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
+	if (device->ops.dealloc_driver)
+		device->ops.dealloc_driver(device);
+
+	/*
+	 * ib_unregister_driver() requires all devices to remain in the xarray
+	 * while their ops are callable. The last op we call is dealloc_driver
+	 * above.  This is needed to create a fence on op callbacks prior to
+	 * allowing the driver module to unload.
+	 */
+	down_write(&devices_rwsem);
+	if (xa_load(&devices, device->index) == device)
+		xa_erase(&devices, device->index);
+	up_write(&devices_rwsem);
+
 	/* Expedite releasing netdev references */
 	free_netdevs(device);
 
@@ -599,7 +617,8 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
 }
 
 /*
- * Assign the unique string device name and the unique device index.
+ * Assign the unique string device name and the unique device index. This is
+ * undone by ib_dealloc_device.
  */
 static int assign_name(struct ib_device *device, const char *name)
 {
@@ -640,13 +659,6 @@ out:
 	return ret;
 }
 
-static void release_name(struct ib_device *device)
-{
-	down_write(&devices_rwsem);
-	xa_erase(&devices, device->index);
-	up_write(&devices_rwsem);
-}
-
 static void setup_dma_device(struct ib_device *device)
 {
 	struct device *parent = device->dev.parent;
@@ -740,30 +752,38 @@ static void disable_device(struct ib_device *device)
 
 /*
  * An enabled device is visible to all clients and to all the public facing
- * APIs that return a device pointer.
+ * APIs that return a device pointer. This always returns with a new get, even
+ * if it fails.
  */
-static int enable_device(struct ib_device *device)
+static int enable_device_and_get(struct ib_device *device)
 {
 	struct ib_client *client;
 	unsigned long index;
-	int ret;
+	int ret = 0;
 
-	refcount_set(&device->refcount, 1);
+	/*
+	 * One ref belongs to the xa and the other belongs to this
+	 * thread. This is needed to guard against parallel unregistration.
+	 */
+	refcount_set(&device->refcount, 2);
 	down_write(&devices_rwsem);
 	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
-	up_write(&devices_rwsem);
+
+	/*
+	 * By using downgrade_write() we ensure that no other thread can clear
+	 * DEVICE_REGISTERED while we are completing the client setup.
+	 */
+	downgrade_write(&devices_rwsem);
 
 	down_read(&clients_rwsem);
 	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
 		ret = add_client_context(device, client);
-		if (ret) {
-			up_read(&clients_rwsem);
-			disable_device(device);
-			return ret;
-		}
+		if (ret)
+			break;
 	}
 	up_read(&clients_rwsem);
-	return 0;
+	up_read(&devices_rwsem);
+	return ret;
 }
 
 /**
@@ -774,6 +794,10 @@ static int enable_device(struct ib_device *device)
  * devices with the IB core.  All registered clients will receive a
  * callback for each device that is added. @device must be allocated
  * with ib_alloc_device().
+ *
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
+ * asynchronously then the device pointer may become freed as soon as this
+ * function returns.
  */
 int ib_register_device(struct ib_device *device, const char *name)
 {
@@ -785,13 +809,13 @@ int ib_register_device(struct ib_device *device, const char *name)
 
 	ret = setup_device(device);
 	if (ret)
-		goto out;
+		return ret;
 
 	ret = ib_cache_setup_one(device);
 	if (ret) {
 		dev_warn(&device->dev,
 			 "Couldn't set up InfiniBand P_Key/GID cache\n");
-		goto out;
+		return ret;
 	}
 
 	ib_device_register_rdmacg(device);
@@ -807,42 +831,186 @@ int ib_register_device(struct ib_device *device, const char *name)
 		goto dev_cleanup;
 	}
 
-	ret = enable_device(device);
-	if (ret)
-		goto sysfs_cleanup;
+	ret = enable_device_and_get(device);
+	if (ret) {
+		void (*dealloc_fn)(struct ib_device *);
+
+		/*
+		 * If we hit this error flow then we don't want to
+		 * automatically dealloc the device since the caller is
+		 * expected to call ib_dealloc_device() after
+		 * ib_register_device() fails. This is tricky due to the
+		 * possibility for a parallel unregistration along with this
+		 * error flow. Since we have a refcount here we know any
+		 * parallel flow is stopped in disable_device and will see the
+		 * NULL pointers, causing the responsibility to
+		 * ib_dealloc_device() to revert back to this thread.
+		 */
+		dealloc_fn = device->ops.dealloc_driver;
+		device->ops.dealloc_driver = NULL;
+		ib_device_put(device);
+		__ib_unregister_device(device);
+		device->ops.dealloc_driver = dealloc_fn;
+		return ret;
+	}
+	ib_device_put(device);
 
 	return 0;
 
-sysfs_cleanup:
-	ib_device_unregister_sysfs(device);
 dev_cleanup:
 	device_del(&device->dev);
 cg_cleanup:
 	ib_device_unregister_rdmacg(device);
 	ib_cache_cleanup_one(device);
-out:
-	release_name(device);
 	return ret;
 }
 EXPORT_SYMBOL(ib_register_device);
 
+/* Callers must hold a get on the device. */
+static void __ib_unregister_device(struct ib_device *ib_dev)
+{
+	/*
+	 * We have a registration lock so that all the calls to unregister are
+	 * fully fenced, once any unregister returns the device is truely
+	 * unregistered even if multiple callers are unregistering it at the
+	 * same time. This also interacts with the registration flow and
+	 * provides sane semantics if register and unregister are racing.
+	 */
+	mutex_lock(&ib_dev->unregistration_lock);
+	if (!refcount_read(&ib_dev->refcount))
+		goto out;
+
+	disable_device(ib_dev);
+	ib_device_unregister_sysfs(ib_dev);
+	device_del(&ib_dev->dev);
+	ib_device_unregister_rdmacg(ib_dev);
+	ib_cache_cleanup_one(ib_dev);
+
+	/*
+	 * Drivers using the new flow may not call ib_dealloc_device except
+	 * in error unwind prior to registration success.
+	 */
+	if (ib_dev->ops.dealloc_driver) {
+		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
+		ib_dealloc_device(ib_dev);
+	}
+out:
+	mutex_unlock(&ib_dev->unregistration_lock);
+}
+
 /**
  * ib_unregister_device - Unregister an IB device
- * @device:Device to unregister
+ * @device: The device to unregister
  *
  * Unregister an IB device.  All clients will receive a remove callback.
+ *
+ * Callers should call this routine only once, and protect against races with
+ * registration. Typically it should only be called as part of a remove
+ * callback in an implementation of driver core's struct device_driver and
+ * related.
+ *
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
+ * this function.
  */
-void ib_unregister_device(struct ib_device *device)
+void ib_unregister_device(struct ib_device *ib_dev)
 {
-	disable_device(device);
-	ib_device_unregister_sysfs(device);
-	device_del(&device->dev);
-	ib_device_unregister_rdmacg(device);
-	ib_cache_cleanup_one(device);
-	release_name(device);
+	get_device(&ib_dev->dev);
+	__ib_unregister_device(ib_dev);
+	put_device(&ib_dev->dev);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
+/**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * device: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get'. When the function returns the device is fully
+ * unregistered.
+ *
+ * Drivers using this flow MUST use the driver_unregister callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
+{
+	WARN_ON(!ib_dev->ops.dealloc_driver);
+	get_device(&ib_dev->dev);
+	ib_device_put(ib_dev);
+	__ib_unregister_device(ib_dev);
+	put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
+
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If device's are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * This does not block creation of new devices with the given driver_id, that
+ * is the responsibility of the caller.
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+	struct ib_device *ib_dev;
+	unsigned long index;
+
+	down_read(&devices_rwsem);
+	xa_for_each (&devices, index, ib_dev) {
+		if (ib_dev->driver_id != driver_id)
+			continue;
+
+		get_device(&ib_dev->dev);
+		up_read(&devices_rwsem);
+
+		WARN_ON(!ib_dev->ops.dealloc_driver);
+		__ib_unregister_device(ib_dev);
+
+		put_device(&ib_dev->dev);
+		down_read(&devices_rwsem);
+	}
+	up_read(&devices_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
+
+static void ib_unregister_work(struct work_struct *work)
+{
+	struct ib_device *ib_dev =
+		container_of(work, struct ib_device, unregistration_work);
+
+	__ib_unregister_device(ib_dev);
+	put_device(&ib_dev->dev);
+}
+
+/**
+ * ib_unregister_device_queued - Unregister a device using a work queue
+ * device: The device to unregister
+ *
+ * This schedules an asynchronous unregistration using a WQ for the device. A
+ * driver should use this to avoid holding locks while doing unregistration,
+ * such as holding the RTNL lock.
+ *
+ * Drivers using this API must use ib_unregister_driver before module unload
+ * to ensure that all scheduled unregistrations have completed.
+ */
+void ib_unregister_device_queued(struct ib_device *ib_dev)
+{
+	WARN_ON(!refcount_read(&ib_dev->refcount));
+	WARN_ON(!ib_dev->ops.dealloc_driver);
+	get_device(&ib_dev->dev);
+	if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
+		put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_queued);
+
 static int assign_client_id(struct ib_client *client)
 {
 	int ret;
@@ -1558,6 +1726,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, create_srq);
 	SET_DEVICE_OP(dev_ops, create_wq);
 	SET_DEVICE_OP(dev_ops, dealloc_dm);
+	SET_DEVICE_OP(dev_ops, dealloc_driver);
 	SET_DEVICE_OP(dev_ops, dealloc_fmr);
 	SET_DEVICE_OP(dev_ops, dealloc_mw);
 	SET_DEVICE_OP(dev_ops, dealloc_pd);
@@ -1744,6 +1913,7 @@ static void __exit ib_core_cleanup(void)
 	destroy_workqueue(ib_comp_wq);
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
+	flush_workqueue(system_unbound_wq);
 	WARN_ON(!xa_empty(&clients));
 	WARN_ON(!xa_empty(&devices));
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 3aa802b65cf3..ad83f8c38dc8 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2538,6 +2538,12 @@ struct ib_device_ops {
 	int (*fill_res_entry)(struct sk_buff *msg,
 			      struct rdma_restrack_entry *entry);
 
+	/* Device lifecycle callbacks */
+	/*
+	 * This is called as part of ib_dealloc_device().
+	 */
+	void (*dealloc_driver)(struct ib_device *dev);
+
 	DECLARE_RDMA_OBJ_SIZE(ib_pd);
 };
 
@@ -2555,6 +2561,7 @@ struct ib_device {
 
 	struct rw_semaphore	      client_data_rwsem;
 	struct xarray                 client_data;
+	struct mutex                  unregistration_lock;
 
 	struct ib_cache               cache;
 	/**
@@ -2609,6 +2616,7 @@ struct ib_device {
 	 */
 	refcount_t refcount;
 	struct completion unreg_completion;
+	struct work_struct unregistration_work;
 };
 
 struct ib_client {
@@ -2658,6 +2666,9 @@ void ib_get_device_fw_str(struct ib_device *device, char *str);
 
 int ib_register_device(struct ib_device *device, const char *name);
 void ib_unregister_device(struct ib_device *device);
+void ib_unregister_driver(enum rdma_driver_id driver_id);
+void ib_unregister_device_and_put(struct ib_device *device);
+void ib_unregister_device_queued(struct ib_device *ib_dev);
 
 int ib_register_client   (struct ib_client *client);
 void ib_unregister_client(struct ib_client *client);
-- 
cgit v1.2.3


From ca22354b140853b8155692d5b2bc0110aa54e937 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 12 Feb 2019 21:12:56 -0700
Subject: RDMA/rxe: Close a race after ib_register_device

Since rxe allows unregistration from other threads the rxe pointer can
become invalid any moment after ib_register_driver returns. This could
cause a user triggered use after free.

Add another driver callback to be called right after the device becomes
registered to complete any device setup required post-registration.  This
callback has enough core locking to prevent the device from becoming
unregistered.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c      |  9 +++++++++
 drivers/infiniband/sw/rxe/rxe_net.c   |  8 ++++----
 drivers/infiniband/sw/rxe/rxe_net.h   |  2 +-
 drivers/infiniband/sw/rxe/rxe_sysfs.c |  9 ++-------
 drivers/infiniband/sw/rxe/rxe_verbs.c | 14 ++++++++++++++
 include/rdma/ib_verbs.h               |  5 +++++
 6 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 2a7d54794ee3..bf2a215d94dd 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -803,6 +803,12 @@ static int enable_device_and_get(struct ib_device *device)
 	 */
 	downgrade_write(&devices_rwsem);
 
+	if (device->ops.enable_driver) {
+		ret = device->ops.enable_driver(device);
+		if (ret)
+			goto out;
+	}
+
 	down_read(&clients_rwsem);
 	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
 		ret = add_client_context(device, client);
@@ -810,6 +816,8 @@ static int enable_device_and_get(struct ib_device *device)
 			break;
 	}
 	up_read(&clients_rwsem);
+
+out:
 	up_read(&devices_rwsem);
 	return ret;
 }
@@ -1775,6 +1783,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, disassociate_ucontext);
 	SET_DEVICE_OP(dev_ops, drain_rq);
 	SET_DEVICE_OP(dev_ops, drain_sq);
+	SET_DEVICE_OP(dev_ops, enable_driver);
 	SET_DEVICE_OP(dev_ops, fill_res_entry);
 	SET_DEVICE_OP(dev_ops, get_dev_fw_str);
 	SET_DEVICE_OP(dev_ops, get_dma_mr);
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index d6dfbcf6a47e..fb792f5bc0b7 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -517,24 +517,24 @@ enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num)
 	return IB_LINK_LAYER_ETHERNET;
 }
 
-struct rxe_dev *rxe_net_add(struct net_device *ndev)
+int rxe_net_add(struct net_device *ndev)
 {
 	int err;
 	struct rxe_dev *rxe = NULL;
 
 	rxe = ib_alloc_device(rxe_dev, ib_dev);
 	if (!rxe)
-		return NULL;
+		return -ENOMEM;
 
 	rxe->ndev = ndev;
 
 	err = rxe_add(rxe, ndev->mtu);
 	if (err) {
 		ib_dealloc_device(&rxe->ib_dev);
-		return NULL;
+		return err;
 	}
 
-	return rxe;
+	return 0;
 }
 
 static void rxe_port_event(struct rxe_dev *rxe,
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index 106c586dbb26..ad79514191bb 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -43,7 +43,7 @@ struct rxe_recv_sockets {
 	struct socket *sk6;
 };
 
-struct rxe_dev *rxe_net_add(struct net_device *ndev);
+int rxe_net_add(struct net_device *ndev);
 
 int rxe_net_init(void);
 void rxe_net_exit(void);
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
index d51b55b0a311..46587eb0da0e 100644
--- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
+++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
@@ -60,7 +60,6 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
 	char intf[32];
 	struct net_device *ndev;
 	struct rxe_dev *exists;
-	struct rxe_dev *rxe;
 
 	len = sanitize_arg(val, intf, sizeof(intf));
 	if (!len) {
@@ -82,16 +81,12 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
 		goto err;
 	}
 
-	rxe = rxe_net_add(ndev);
-	if (!rxe) {
+	err = rxe_net_add(ndev);
+	if (err) {
 		pr_err("failed to add %s\n", intf);
-		err = -EINVAL;
 		goto err;
 	}
 
-	rxe_set_port_state(rxe);
-	dev_info(&rxe->ib_dev.dev, "added %s\n", intf);
-
 err:
 	dev_put(ndev);
 	return err;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 76da8a142bf7..79ad93b4140c 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1125,6 +1125,15 @@ static const struct attribute_group rxe_attr_group = {
 	.attrs = rxe_dev_attributes,
 };
 
+static int rxe_enable_driver(struct ib_device *ib_dev)
+{
+	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
+	rxe_set_port_state(rxe);
+	dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev));
+	return 0;
+}
+
 static const struct ib_device_ops rxe_dev_ops = {
 	.alloc_hw_stats = rxe_ib_alloc_hw_stats,
 	.alloc_mr = rxe_alloc_mr,
@@ -1144,6 +1153,7 @@ static const struct ib_device_ops rxe_dev_ops = {
 	.destroy_qp = rxe_destroy_qp,
 	.destroy_srq = rxe_destroy_srq,
 	.detach_mcast = rxe_detach_mcast,
+	.enable_driver = rxe_enable_driver,
 	.get_dma_mr = rxe_get_dma_mr,
 	.get_hw_stats = rxe_ib_get_hw_stats,
 	.get_link_layer = rxe_get_link_layer,
@@ -1245,5 +1255,9 @@ int rxe_register_device(struct rxe_dev *rxe)
 	if (err)
 		pr_warn("%s failed with error %d\n", __func__, err);
 
+	/*
+	 * Note that rxe may be invalid at this point if another thread
+	 * unregistered it.
+	 */
 	return err;
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index ad83f8c38dc8..640263289ab9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2539,6 +2539,11 @@ struct ib_device_ops {
 			      struct rdma_restrack_entry *entry);
 
 	/* Device lifecycle callbacks */
+	/*
+	 * Called after the device becomes registered, before clients are
+	 * attached
+	 */
+	int (*enable_driver)(struct ib_device *dev);
 	/*
 	 * This is called as part of ib_dealloc_device().
 	 */
-- 
cgit v1.2.3


From 3856ec4b93c9463d36ee39098dde1fbbd29ec6dd Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Fri, 15 Feb 2019 11:03:53 -0800
Subject: RDMA/core: Add RDMA_NLDEV_CMD_NEWLINK/DELLINK support

Add support for new LINK messages to allow adding and deleting rdma
interfaces.  This will be used initially for soft rdma drivers which
instantiate device instances dynamically by the admin specifying a netdev
device to use.  The rdma_rxe module will be the first user of these
messages.

The design is modeled after RTNL_NEWLINK/DELLINK: rdma drivers register
with the rdma core if they provide link add/delete functions.  Each driver
registers with a unique "type" string, that is used to dispatch messages
coming from user space.  A new RDMA_NLDEV_ATTR is defined for the "type"
string.  User mode will pass 3 attributes in a NEWLINK message:
RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma device name to be created,
RDMA_NLDEV_ATTR_LINK_TYPE for the "type" of link being added, and
RDMA_NLDEV_ATTR_NDEV_NAME for the net_device interface to use for this
link.  The DELLINK message will contain the RDMA_NLDEV_ATTR_DEV_INDEX of
the device to delete.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 122 +++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h          |   3 +
 include/rdma/rdma_netlink.h      |  11 ++++
 include/uapi/rdma/rdma_netlink.h |  10 +++-
 4 files changed, 144 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 1980ddc5f7bc..5e94dc87f04f 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -33,6 +33,7 @@
 #include <linux/module.h>
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/mutex.h>
 #include <net/netlink.h>
 #include <rdma/rdma_cm.h>
 #include <rdma/rdma_netlink.h>
@@ -113,6 +114,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
+				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -1200,6 +1203,117 @@ RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
 RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
 RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
 
+static LIST_HEAD(link_ops);
+static DECLARE_RWSEM(link_ops_rwsem);
+
+static const struct rdma_link_ops *link_ops_get(const char *type)
+{
+	const struct rdma_link_ops *ops;
+
+	list_for_each_entry(ops, &link_ops, list) {
+		if (!strcmp(ops->type, type))
+			goto out;
+	}
+	ops = NULL;
+out:
+	return ops;
+}
+
+void rdma_link_register(struct rdma_link_ops *ops)
+{
+	down_write(&link_ops_rwsem);
+	if (link_ops_get(ops->type)) {
+		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
+		goto out;
+	}
+	list_add(&ops->list, &link_ops);
+out:
+	up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_register);
+
+void rdma_link_unregister(struct rdma_link_ops *ops)
+{
+	down_write(&link_ops_rwsem);
+	list_del(&ops->list);
+	up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_unregister);
+
+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	char ibdev_name[IB_DEVICE_NAME_MAX];
+	const struct rdma_link_ops *ops;
+	char ndev_name[IFNAMSIZ];
+	struct net_device *ndev;
+	char type[IFNAMSIZ];
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
+		return -EINVAL;
+
+	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+		    sizeof(ibdev_name));
+	if (strchr(ibdev_name, '%'))
+		return -EINVAL;
+
+	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
+	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
+		    sizeof(ndev_name));
+
+	ndev = dev_get_by_name(&init_net, ndev_name);
+	if (!ndev)
+		return -ENODEV;
+
+	down_read(&link_ops_rwsem);
+	ops = link_ops_get(type);
+#ifdef CONFIG_MODULES
+	if (!ops) {
+		up_read(&link_ops_rwsem);
+		request_module("rdma-link-%s", type);
+		down_read(&link_ops_rwsem);
+		ops = link_ops_get(type);
+	}
+#endif
+	err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL;
+	up_read(&link_ops_rwsem);
+	dev_put(ndev);
+
+	return err;
+}
+
+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) {
+		ib_device_put(device);
+		return -EINVAL;
+	}
+
+	ib_unregister_device_and_put(device);
+	return 0;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 	[RDMA_NLDEV_CMD_GET] = {
 		.doit = nldev_get_doit,
@@ -1209,6 +1323,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_set_doit,
 		.flags = RDMA_NL_ADMIN_PERM,
 	},
+	[RDMA_NLDEV_CMD_NEWLINK] = {
+		.doit = nldev_newlink,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
+	[RDMA_NLDEV_CMD_DELLINK] = {
+		.doit = nldev_dellink,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 	[RDMA_NLDEV_CMD_PORT_GET] = {
 		.doit = nldev_port_get_doit,
 		.dump = nldev_port_get_dumpit,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 640263289ab9..225cb76d469f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -238,6 +238,7 @@ enum ib_device_cap_flags {
 	IB_DEVICE_RDMA_NETDEV_OPA_VNIC		= (1ULL << 35),
 	/* The device supports padding incoming writes to cacheline. */
 	IB_DEVICE_PCI_WRITE_END_PADDING		= (1ULL << 36),
+	IB_DEVICE_ALLOW_USER_UNREG		= (1ULL << 37),
 };
 
 enum ib_signature_prot_cap {
@@ -2622,6 +2623,8 @@ struct ib_device {
 	refcount_t refcount;
 	struct completion unreg_completion;
 	struct work_struct unregistration_work;
+
+	const struct rdma_link_ops *link_ops;
 };
 
 struct ib_client {
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index 70218e6b5187..10732ab31ba2 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -99,4 +99,15 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags);
  * Returns true on success or false if no listeners.
  */
 bool rdma_nl_chk_listeners(unsigned int group);
+
+struct rdma_link_ops {
+	struct list_head list;
+	const char *type;
+	int (*newlink)(const char *ibdev_name, struct net_device *ndev);
+};
+
+void rdma_link_register(struct rdma_link_ops *ops);
+void rdma_link_unregister(struct rdma_link_ops *ops);
+
+#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type)
 #endif /* _RDMA_NETLINK_H */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 4ebbcfb2c6ef..5cc592728071 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -255,9 +255,11 @@ enum rdma_nldev_command {
 	RDMA_NLDEV_CMD_GET, /* can dump */
 	RDMA_NLDEV_CMD_SET,
 
-	/* 3 - 4 are free to use */
+	RDMA_NLDEV_CMD_NEWLINK,
 
-	RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */
+	RDMA_NLDEV_CMD_DELLINK,
+
+	RDMA_NLDEV_CMD_PORT_GET, /* can dump */
 
 	/* 6 - 8 are free to use */
 
@@ -465,6 +467,10 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_RES_MRN,               /* u32 */
 	RDMA_NLDEV_ATTR_RES_CM_IDN,            /* u32 */
 	RDMA_NLDEV_ATTR_RES_CTXN,	       /* u32 */
+	/*
+	 * Identifies the rdma driver. eg: "rxe" or "siw"
+	 */
+	RDMA_NLDEV_ATTR_LINK_TYPE,		/* string */
 
 	/*
 	 * Always the end
-- 
cgit v1.2.3


From fadccd8fc2d06cf7fd222245d7e04b00fae946cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Feb 2019 09:37:13 +0100
Subject: nvme_ioctl.h: remove duplicate GPL boilerplate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We already have a ЅPDX header, so no need to duplicate the information.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 include/linux/nvme.h            | 10 +---------
 include/uapi/linux/nvme_ioctl.h |  9 ---------
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index bbcc83886899..baa49e6a23cc 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Definitions for the NVM Express interface
  * Copyright (c) 2011-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _LINUX_NVME_H
diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h
index 6e74b1eaf541..1c215ea1798e 100644
--- a/include/uapi/linux/nvme_ioctl.h
+++ b/include/uapi/linux/nvme_ioctl.h
@@ -2,15 +2,6 @@
 /*
  * Definitions for the NVM Express ioctl interface
  * Copyright (c) 2011-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _UAPI_LINUX_NVME_IOCTL_H
-- 
cgit v1.2.3


From 055d045a7aaeef326f8ab6845519da3157887830 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Feb 2019 09:37:42 +0100
Subject: nvme-tcp.h: fix SPDX header

For .h files we need to use /* */ style comments.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 include/linux/nvme-tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
index 03d87c0550a9..959e0bd9a913 100644
--- a/include/linux/nvme-tcp.h
+++ b/include/linux/nvme-tcp.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * NVMe over Fabrics TCP protocol header.
  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
-- 
cgit v1.2.3


From 8638b2461475ad4c35a957156ecf2425b9b82e85 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Feb 2019 09:33:28 +0100
Subject: nvme-fc: convert to SPDX identifiers

Update license to use SPDX-License-Identifier instead of verbose license
text.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/fc.c         | 14 +-------------
 include/linux/nvme-fc-driver.h | 10 +---------
 include/linux/nvme-fc.h        | 14 +-------------
 3 files changed, 3 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 89accc76d71c..b29b12498a1a 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (c) 2016 Avago Technologies.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful.
- * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
- * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
- * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
- * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
- * See the GNU General Public License for more details, a copy of which
- * can be found in the file COPYING included with this package
- *
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 91745cc3704c..2bb349035431 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -1,14 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (c) 2016, Avago Technologies
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _NVME_FC_DRIVER_H
diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
index 36cca93a5ff2..067c9fea64fe 100644
--- a/include/linux/nvme-fc.h
+++ b/include/linux/nvme-fc.h
@@ -1,18 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (c) 2016 Avago Technologies.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful.
- * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
- * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
- * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
- * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
- * See the GNU General Public License for more details, a copy of which
- * can be found in the file COPYING included with this package
- *
  */
 
 /*
-- 
cgit v1.2.3


From 5d8762d5684ab997c7ccf2457c8beec7ef972ceb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Feb 2019 09:34:21 +0100
Subject: nvme-rdma: convert to SPDX identifiers

Update license to use SPDX-License-Identifier instead of verbose license
text.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/rdma.c  | 10 +---------
 include/linux/nvme-rdma.h | 10 +---------
 2 files changed, 2 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ac365366c2ec..7c0d29185249 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * NVMe over Fabrics RDMA host code.
  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index a72fd04aa5e1..3aa97b98dc89 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -1,14 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _LINUX_NVME_RDMA_H
-- 
cgit v1.2.3


From ff4c25f26a71b79c70ea03b3935a1297439a8a85 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 3 Feb 2019 20:12:02 +0100
Subject: dma-mapping: improve selection of dma_declare_coherent availability

This API is primarily used through DT entries, but two architectures
and two drivers call it directly.  So instead of selecting the config
symbol for random architectures pull it in implicitly for the actual
users.  Also rename the Kconfig option to describe the feature better.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Paul Burton <paul.burton@mips.com> # MIPS
Acked-by: Lee Jones <lee.jones@linaro.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arc/Kconfig            | 1 -
 arch/arm/Kconfig            | 2 +-
 arch/arm64/Kconfig          | 1 -
 arch/csky/Kconfig           | 1 -
 arch/mips/Kconfig           | 1 -
 arch/riscv/Kconfig          | 1 -
 arch/sh/Kconfig             | 2 +-
 arch/unicore32/Kconfig      | 1 -
 arch/x86/Kconfig            | 1 -
 drivers/mfd/Kconfig         | 2 ++
 drivers/of/Kconfig          | 3 ++-
 include/linux/device.h      | 2 +-
 include/linux/dma-mapping.h | 8 ++++----
 kernel/dma/Kconfig          | 2 +-
 kernel/dma/Makefile         | 2 +-
 15 files changed, 13 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ab8d6131c954..728a0f6f838c 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -31,7 +31,6 @@ config ARC
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_FUTEX_CMPXCHG if FUTEX
-	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_IOREMAP_PROT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZMA
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e07e5c184d2f..33612e6da19a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -32,6 +32,7 @@ config ARM
 	select CLONE_BACKWARDS
 	select CPU_PM if SUSPEND || CPU_IDLE
 	select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select DMA_DECLARE_COHERENT
 	select DMA_REMAP if MMU
 	select EDAC_SUPPORT
 	select EDAC_ATOMIC_SCRUB
@@ -74,7 +75,6 @@ config ARM
 	select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL
 	select HAVE_FUNCTION_TRACER if !XIP_KERNEL
 	select HAVE_GCC_PLUGINS
-	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)
 	select HAVE_IDE if PCI || ISA || PCMCIA
 	select HAVE_IRQ_TIME_ACCOUNTING
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index fbcf521e1c9f..e86fac1e6b03 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -139,7 +139,6 @@ config ARM64
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_GCC_PLUGINS
-	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS
 	select HAVE_IRQ_TIME_ACCOUNTING
 	select HAVE_MEMBLOCK_NODE_MAP if NUMA
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 0a9595afe9be..c009a8c63946 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -30,7 +30,6 @@ config CSKY
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
-	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_LZMA
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index dc5d70f674e0..433b9dd35824 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -56,7 +56,6 @@ config MIPS
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
-	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_IDE
 	select HAVE_IOREMAP_PROT
 	select HAVE_IRQ_EXIT_ON_IRQ_STACK
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index feeeaa60697c..51b9c97751bf 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -32,7 +32,6 @@ config RISCV
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_FUTEX_CMPXCHG if FUTEX
-	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_PERF_EVENTS
 	select HAVE_SYSCALL_TRACEPOINTS
 	select IRQ_DOMAIN
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index a9c36f95744a..a3d2a24e75c7 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -7,11 +7,11 @@ config SUPERH
 	select ARCH_NO_COHERENT_DMA_MMAP if !MMU
 	select HAVE_PATA_PLATFORM
 	select CLKDEV_LOOKUP
+	select DMA_DECLARE_COHERENT
 	select HAVE_IDE if HAS_IOPORT_MAP
 	select HAVE_MEMBLOCK_NODE_MAP
 	select ARCH_DISCARD_MEMBLOCK
 	select HAVE_OPROFILE
-	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_PERF_EVENTS
 	select HAVE_DEBUG_BUGVERBOSE
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index c3a41bfe161b..6d2891d37e32 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -4,7 +4,6 @@ config UNICORE32
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
-	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select GENERIC_ATOMIC64
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 26387c7bf305..0e33dede053e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -15,7 +15,6 @@ config X86_32
 	select CLKSRC_I8253
 	select CLONE_BACKWARDS
 	select HAVE_AOUT
-	select HAVE_GENERIC_DMA_COHERENT
 	select MODULES_USE_ELF_REL
 	select OLD_SIGACTION
 
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index f15f6489803d..c3ccf2c7b3ef 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1067,6 +1067,7 @@ config MFD_SI476X_CORE
 config MFD_SM501
 	tristate "Silicon Motion SM501"
 	depends on HAS_DMA
+	select DMA_DECLARE_COHERENT
 	 ---help---
 	  This is the core driver for the Silicon Motion SM501 multimedia
 	  companion chip. This device is a multifunction device which may
@@ -1675,6 +1676,7 @@ config MFD_TC6393XB
 	select GPIOLIB
 	select MFD_CORE
 	select MFD_TMIO
+	select DMA_DECLARE_COHERENT
 	help
 	  Support for Toshiba Mobile IO Controller TC6393XB
 
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index 3607fd2810e4..37c2ccbefecd 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -43,6 +43,7 @@ config OF_FLATTREE
 
 config OF_EARLY_FLATTREE
 	bool
+	select DMA_DECLARE_COHERENT if HAS_DMA
 	select OF_FLATTREE
 
 config OF_PROMTREE
@@ -83,7 +84,7 @@ config OF_MDIO
 config OF_RESERVED_MEM
 	bool
 	depends on OF_EARLY_FLATTREE
-	default y if HAVE_GENERIC_DMA_COHERENT || DMA_CMA
+	default y if DMA_DECLARE_COHERENT || DMA_CMA
 
 config OF_RESOLVE
 	bool
diff --git a/include/linux/device.h b/include/linux/device.h
index be544400acdd..c52d90348cef 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1017,7 +1017,7 @@ struct device {
 
 	struct list_head	dma_pools;	/* dma pools (if dma'ble) */
 
-#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
+#ifdef CONFIG_DMA_DECLARE_COHERENT
 	struct dma_coherent_mem	*dma_mem; /* internal for coherent mem
 					     override */
 #endif
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4210c5c1dd21..e29441b8b3b7 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -153,7 +153,7 @@ static inline int is_device_dma_capable(struct device *dev)
 	return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE;
 }
 
-#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
+#ifdef CONFIG_DMA_DECLARE_COHERENT
 /*
  * These three functions are only for dma allocator.
  * Don't use them in device drivers.
@@ -192,7 +192,7 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma,
 {
 	return 0;
 }
-#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
+#endif /* CONFIG_DMA_DECLARE_COHERENT */
 
 static inline bool dma_is_direct(const struct dma_map_ops *ops)
 {
@@ -739,7 +739,7 @@ static inline int dma_get_cache_alignment(void)
 /* flags for the coherent memory api */
 #define DMA_MEMORY_EXCLUSIVE		0x01
 
-#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
+#ifdef CONFIG_DMA_DECLARE_COHERENT
 int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 				dma_addr_t device_addr, size_t size, int flags);
 void dma_release_declared_memory(struct device *dev);
@@ -764,7 +764,7 @@ dma_mark_declared_memory_occupied(struct device *dev,
 {
 	return ERR_PTR(-EBUSY);
 }
-#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
+#endif /* CONFIG_DMA_DECLARE_COHERENT */
 
 static inline void *dmam_alloc_coherent(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp)
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index bde9179c6ed7..24d45c78c671 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -16,7 +16,7 @@ config ARCH_DMA_ADDR_T_64BIT
 config ARCH_HAS_DMA_COHERENCE_H
 	bool
 
-config HAVE_GENERIC_DMA_COHERENT
+config DMA_DECLARE_COHERENT
 	bool
 
 config ARCH_HAS_SETUP_DMA_OPS
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 72ff6e46aa86..d237cf3dc181 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -2,7 +2,7 @@
 
 obj-$(CONFIG_HAS_DMA)			+= mapping.o direct.o dummy.o
 obj-$(CONFIG_DMA_CMA)			+= contiguous.o
-obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
+obj-$(CONFIG_DMA_DECLARE_COHERENT)	+= coherent.o
 obj-$(CONFIG_DMA_VIRT_OPS)		+= virt.o
 obj-$(CONFIG_DMA_API_DEBUG)		+= debug.o
 obj-$(CONFIG_SWIOTLB)			+= swiotlb.o
-- 
cgit v1.2.3


From 91a6fda95cb67c94b887355690d1923a7eb6f630 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 Dec 2018 17:27:14 +0100
Subject: dma-mapping: remove dma_mark_declared_memory_occupied

This API is not used anywhere, so remove it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/DMA-API.txt   | 17 -----------------
 include/linux/dma-mapping.h |  9 ---------
 kernel/dma/coherent.c       | 23 -----------------------
 3 files changed, 49 deletions(-)

(limited to 'include')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 78114ee63057..b9d0cba83877 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -605,23 +605,6 @@ unconditionally having removed all the required structures.  It is the
 driver's job to ensure that no parts of this memory region are
 currently in use.
 
-::
-
-	void *
-	dma_mark_declared_memory_occupied(struct device *dev,
-					  dma_addr_t device_addr, size_t size)
-
-This is used to occupy specific regions of the declared space
-(dma_alloc_coherent() will hand out the first free region it finds).
-
-device_addr is the *device* address of the region requested.
-
-size is the size (and should be a page-sized multiple).
-
-The return value will be either a pointer to the processor virtual
-address of the memory, or an error (via PTR_ERR()) if any part of the
-region is occupied.
-
 Part III - Debug drivers use of the DMA-API
 -------------------------------------------
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index e29441b8b3b7..d29faadf6ef2 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -743,8 +743,6 @@ static inline int dma_get_cache_alignment(void)
 int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 				dma_addr_t device_addr, size_t size, int flags);
 void dma_release_declared_memory(struct device *dev);
-void *dma_mark_declared_memory_occupied(struct device *dev,
-					dma_addr_t device_addr, size_t size);
 #else
 static inline int
 dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
@@ -757,13 +755,6 @@ static inline void
 dma_release_declared_memory(struct device *dev)
 {
 }
-
-static inline void *
-dma_mark_declared_memory_occupied(struct device *dev,
-				  dma_addr_t device_addr, size_t size)
-{
-	return ERR_PTR(-EBUSY);
-}
 #endif /* CONFIG_DMA_DECLARE_COHERENT */
 
 static inline void *dmam_alloc_coherent(struct device *dev, size_t size,
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 4b76aba574c2..1d12a31af6d7 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -137,29 +137,6 @@ void dma_release_declared_memory(struct device *dev)
 }
 EXPORT_SYMBOL(dma_release_declared_memory);
 
-void *dma_mark_declared_memory_occupied(struct device *dev,
-					dma_addr_t device_addr, size_t size)
-{
-	struct dma_coherent_mem *mem = dev->dma_mem;
-	unsigned long flags;
-	int pos, err;
-
-	size += device_addr & ~PAGE_MASK;
-
-	if (!mem)
-		return ERR_PTR(-EINVAL);
-
-	spin_lock_irqsave(&mem->spinlock, flags);
-	pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem));
-	err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
-	spin_unlock_irqrestore(&mem->spinlock, flags);
-
-	if (err != 0)
-		return ERR_PTR(err);
-	return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
 static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
 		ssize_t size, dma_addr_t *dma_handle)
 {
-- 
cgit v1.2.3


From 82c5de0ab8dbd6035223ad69e76bd8a88a0a9399 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 Dec 2018 13:29:54 +0100
Subject: dma-mapping: remove the DMA_MEMORY_EXCLUSIVE flag

All users of dma_declare_coherent want their allocations to be
exclusive, so default to exclusive allocations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/DMA-API.txt                          |  9 +-------
 arch/arm/mach-imx/mach-imx27_visstrim_m10.c        | 12 ++++-------
 arch/arm/mach-imx/mach-mx31moboard.c               |  3 +--
 arch/sh/boards/mach-ap325rxa/setup.c               |  5 ++---
 arch/sh/boards/mach-ecovec24/setup.c               |  6 ++----
 arch/sh/boards/mach-kfr2r09/setup.c                |  5 ++---
 arch/sh/boards/mach-migor/setup.c                  |  5 ++---
 arch/sh/boards/mach-se/7724/setup.c                |  6 ++----
 arch/sh/drivers/pci/fixups-dreamcast.c             |  3 +--
 .../platform/soc_camera/sh_mobile_ceu_camera.c     |  3 +--
 drivers/usb/host/ohci-sm501.c                      |  3 +--
 drivers/usb/host/ohci-tmio.c                       |  2 +-
 include/linux/dma-mapping.h                        |  7 ++----
 kernel/dma/coherent.c                              | 25 ++++++----------------
 14 files changed, 29 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index b9d0cba83877..38e561b773b4 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -566,8 +566,7 @@ boundaries when doing this.
 
 	int
 	dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
-				    dma_addr_t device_addr, size_t size, int
-				    flags)
+				    dma_addr_t device_addr, size_t size);
 
 Declare region of memory to be handed out by dma_alloc_coherent() when
 it's asked for coherent memory for this device.
@@ -581,12 +580,6 @@ dma_addr_t in dma_alloc_coherent()).
 
 size is the size of the area (must be multiples of PAGE_SIZE).
 
-flags can be ORed together and are:
-
-- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
-  Do not allow dma_alloc_coherent() to fall back to system memory when
-  it's out of memory in the declared region.
-
 As a simplification for the platforms, only *one* such region of
 memory may be declared per device.
 
diff --git a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c
index 5169dfba9718..07d4fcfe5c2e 100644
--- a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c
+++ b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c
@@ -258,8 +258,7 @@ static void __init visstrim_analog_camera_init(void)
 		return;
 
 	dma_declare_coherent_memory(&pdev->dev, mx2_camera_base,
-				    mx2_camera_base, MX2_CAMERA_BUF_SIZE,
-				    DMA_MEMORY_EXCLUSIVE);
+				    mx2_camera_base, MX2_CAMERA_BUF_SIZE);
 }
 
 static void __init visstrim_reserve(void)
@@ -445,8 +444,7 @@ static void __init visstrim_coda_init(void)
 	dma_declare_coherent_memory(&pdev->dev,
 				    mx2_camera_base + MX2_CAMERA_BUF_SIZE,
 				    mx2_camera_base + MX2_CAMERA_BUF_SIZE,
-				    MX2_CAMERA_BUF_SIZE,
-				    DMA_MEMORY_EXCLUSIVE);
+				    MX2_CAMERA_BUF_SIZE);
 }
 
 /* DMA deinterlace */
@@ -465,8 +463,7 @@ static void __init visstrim_deinterlace_init(void)
 	dma_declare_coherent_memory(&pdev->dev,
 				    mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE,
 				    mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE,
-				    MX2_CAMERA_BUF_SIZE,
-				    DMA_MEMORY_EXCLUSIVE);
+				    MX2_CAMERA_BUF_SIZE);
 }
 
 /* Emma-PrP for format conversion */
@@ -485,8 +482,7 @@ static void __init visstrim_emmaprp_init(void)
 	 */
 	ret = dma_declare_coherent_memory(&pdev->dev,
 				mx2_camera_base, mx2_camera_base,
-				MX2_CAMERA_BUF_SIZE,
-				DMA_MEMORY_EXCLUSIVE);
+				MX2_CAMERA_BUF_SIZE);
 	if (ret)
 		pr_err("Failed to declare memory for emmaprp\n");
 }
diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c
index 643a3d749703..fe50f4cf00a7 100644
--- a/arch/arm/mach-imx/mach-mx31moboard.c
+++ b/arch/arm/mach-imx/mach-mx31moboard.c
@@ -475,8 +475,7 @@ static int __init mx31moboard_init_cam(void)
 
 	ret = dma_declare_coherent_memory(&pdev->dev,
 					  mx3_camera_base, mx3_camera_base,
-					  MX3_CAMERA_BUF_SIZE,
-					  DMA_MEMORY_EXCLUSIVE);
+					  MX3_CAMERA_BUF_SIZE);
 	if (ret)
 		goto err;
 
diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c
index 8f234d0435aa..7899b4f51fdd 100644
--- a/arch/sh/boards/mach-ap325rxa/setup.c
+++ b/arch/sh/boards/mach-ap325rxa/setup.c
@@ -529,9 +529,8 @@ static int __init ap325rxa_devices_setup(void)
 	device_initialize(&ap325rxa_ceu_device.dev);
 	arch_setup_pdev_archdata(&ap325rxa_ceu_device);
 	dma_declare_coherent_memory(&ap325rxa_ceu_device.dev,
-				    ceu_dma_membase, ceu_dma_membase,
-				    ceu_dma_membase + CEU_BUFFER_MEMORY_SIZE - 1,
-				    DMA_MEMORY_EXCLUSIVE);
+			ceu_dma_membase, ceu_dma_membase,
+			ceu_dma_membase + CEU_BUFFER_MEMORY_SIZE - 1);
 
 	platform_device_add(&ap325rxa_ceu_device);
 
diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c
index 22b4106b8084..eb66754cfb8c 100644
--- a/arch/sh/boards/mach-ecovec24/setup.c
+++ b/arch/sh/boards/mach-ecovec24/setup.c
@@ -1440,8 +1440,7 @@ static int __init arch_setup(void)
 	dma_declare_coherent_memory(&ecovec_ceu_devices[0]->dev,
 				    ceu0_dma_membase, ceu0_dma_membase,
 				    ceu0_dma_membase +
-				    CEU_BUFFER_MEMORY_SIZE - 1,
-				    DMA_MEMORY_EXCLUSIVE);
+				    CEU_BUFFER_MEMORY_SIZE - 1);
 	platform_device_add(ecovec_ceu_devices[0]);
 
 	device_initialize(&ecovec_ceu_devices[1]->dev);
@@ -1449,8 +1448,7 @@ static int __init arch_setup(void)
 	dma_declare_coherent_memory(&ecovec_ceu_devices[1]->dev,
 				    ceu1_dma_membase, ceu1_dma_membase,
 				    ceu1_dma_membase +
-				    CEU_BUFFER_MEMORY_SIZE - 1,
-				    DMA_MEMORY_EXCLUSIVE);
+				    CEU_BUFFER_MEMORY_SIZE - 1);
 	platform_device_add(ecovec_ceu_devices[1]);
 
 	gpiod_add_lookup_table(&cn12_power_gpiod_table);
diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c
index 203d249a0a2b..b8bf67c86eab 100644
--- a/arch/sh/boards/mach-kfr2r09/setup.c
+++ b/arch/sh/boards/mach-kfr2r09/setup.c
@@ -603,9 +603,8 @@ static int __init kfr2r09_devices_setup(void)
 	device_initialize(&kfr2r09_ceu_device.dev);
 	arch_setup_pdev_archdata(&kfr2r09_ceu_device);
 	dma_declare_coherent_memory(&kfr2r09_ceu_device.dev,
-				    ceu_dma_membase, ceu_dma_membase,
-				    ceu_dma_membase + CEU_BUFFER_MEMORY_SIZE - 1,
-				    DMA_MEMORY_EXCLUSIVE);
+			ceu_dma_membase, ceu_dma_membase,
+			ceu_dma_membase + CEU_BUFFER_MEMORY_SIZE - 1);
 
 	platform_device_add(&kfr2r09_ceu_device);
 
diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c
index f4ad33c6d2aa..bcd249e6cfcc 100644
--- a/arch/sh/boards/mach-migor/setup.c
+++ b/arch/sh/boards/mach-migor/setup.c
@@ -603,9 +603,8 @@ static int __init migor_devices_setup(void)
 	device_initialize(&migor_ceu_device.dev);
 	arch_setup_pdev_archdata(&migor_ceu_device);
 	dma_declare_coherent_memory(&migor_ceu_device.dev,
-				    ceu_dma_membase, ceu_dma_membase,
-				    ceu_dma_membase + CEU_BUFFER_MEMORY_SIZE - 1,
-				    DMA_MEMORY_EXCLUSIVE);
+			ceu_dma_membase, ceu_dma_membase,
+			ceu_dma_membase + CEU_BUFFER_MEMORY_SIZE - 1);
 
 	platform_device_add(&migor_ceu_device);
 
diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c
index fdbec22ae687..13c2d3ce78f4 100644
--- a/arch/sh/boards/mach-se/7724/setup.c
+++ b/arch/sh/boards/mach-se/7724/setup.c
@@ -941,8 +941,7 @@ static int __init devices_setup(void)
 	dma_declare_coherent_memory(&ms7724se_ceu_devices[0]->dev,
 				    ceu0_dma_membase, ceu0_dma_membase,
 				    ceu0_dma_membase +
-				    CEU_BUFFER_MEMORY_SIZE - 1,
-				    DMA_MEMORY_EXCLUSIVE);
+				    CEU_BUFFER_MEMORY_SIZE - 1);
 	platform_device_add(ms7724se_ceu_devices[0]);
 
 	device_initialize(&ms7724se_ceu_devices[1]->dev);
@@ -950,8 +949,7 @@ static int __init devices_setup(void)
 	dma_declare_coherent_memory(&ms7724se_ceu_devices[1]->dev,
 				    ceu1_dma_membase, ceu1_dma_membase,
 				    ceu1_dma_membase +
-				    CEU_BUFFER_MEMORY_SIZE - 1,
-				    DMA_MEMORY_EXCLUSIVE);
+				    CEU_BUFFER_MEMORY_SIZE - 1);
 	platform_device_add(ms7724se_ceu_devices[1]);
 
 	return platform_add_devices(ms7724se_devices,
diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c
index dfdbd05b6eb1..7be8694c0d13 100644
--- a/arch/sh/drivers/pci/fixups-dreamcast.c
+++ b/arch/sh/drivers/pci/fixups-dreamcast.c
@@ -63,8 +63,7 @@ static void gapspci_fixup_resources(struct pci_dev *dev)
 		BUG_ON(dma_declare_coherent_memory(&dev->dev,
 						res.start,
 						region.start,
-						resource_size(&res),
-						DMA_MEMORY_EXCLUSIVE));
+						resource_size(&res)));
 		break;
 	default:
 		printk("PCI: Failed resource fixup\n");
diff --git a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
index 6803f744e307..cc357b8db1dc 100644
--- a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
+++ b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
@@ -1708,8 +1708,7 @@ static int sh_mobile_ceu_probe(struct platform_device *pdev)
 	if (res) {
 		err = dma_declare_coherent_memory(&pdev->dev, res->start,
 						  res->start,
-						  resource_size(res),
-						  DMA_MEMORY_EXCLUSIVE);
+						  resource_size(res));
 		if (err) {
 			dev_err(&pdev->dev, "Unable to declare CEU memory.\n");
 			return err;
diff --git a/drivers/usb/host/ohci-sm501.c b/drivers/usb/host/ohci-sm501.c
index c9233cddf9a2..c26228c25f99 100644
--- a/drivers/usb/host/ohci-sm501.c
+++ b/drivers/usb/host/ohci-sm501.c
@@ -126,8 +126,7 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev)
 
 	retval = dma_declare_coherent_memory(dev, mem->start,
 					 mem->start - mem->parent->start,
-					 resource_size(mem),
-					 DMA_MEMORY_EXCLUSIVE);
+					 resource_size(mem));
 	if (retval) {
 		dev_err(dev, "cannot declare coherent memory\n");
 		goto err1;
diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c
index a631dbb369d7..f88a0370659f 100644
--- a/drivers/usb/host/ohci-tmio.c
+++ b/drivers/usb/host/ohci-tmio.c
@@ -225,7 +225,7 @@ static int ohci_hcd_tmio_drv_probe(struct platform_device *dev)
 	}
 
 	ret = dma_declare_coherent_memory(&dev->dev, sram->start, sram->start,
-				resource_size(sram), DMA_MEMORY_EXCLUSIVE);
+				resource_size(sram));
 	if (ret)
 		goto err_dma_declare;
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index d29faadf6ef2..70ad15758a70 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -736,17 +736,14 @@ static inline int dma_get_cache_alignment(void)
 	return 1;
 }
 
-/* flags for the coherent memory api */
-#define DMA_MEMORY_EXCLUSIVE		0x01
-
 #ifdef CONFIG_DMA_DECLARE_COHERENT
 int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
-				dma_addr_t device_addr, size_t size, int flags);
+				dma_addr_t device_addr, size_t size);
 void dma_release_declared_memory(struct device *dev);
 #else
 static inline int
 dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
-			    dma_addr_t device_addr, size_t size, int flags)
+			    dma_addr_t device_addr, size_t size)
 {
 	return -ENOSYS;
 }
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 1d12a31af6d7..29fd6590dc1e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -14,7 +14,6 @@ struct dma_coherent_mem {
 	dma_addr_t	device_base;
 	unsigned long	pfn_base;
 	int		size;
-	int		flags;
 	unsigned long	*bitmap;
 	spinlock_t	spinlock;
 	bool		use_dev_dma_pfn_offset;
@@ -38,9 +37,9 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
 		return mem->device_base;
 }
 
-static int dma_init_coherent_memory(
-	phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags,
-	struct dma_coherent_mem **mem)
+static int dma_init_coherent_memory(phys_addr_t phys_addr,
+		dma_addr_t device_addr, size_t size,
+		struct dma_coherent_mem **mem)
 {
 	struct dma_coherent_mem *dma_mem = NULL;
 	void *mem_base = NULL;
@@ -73,7 +72,6 @@ static int dma_init_coherent_memory(
 	dma_mem->device_base = device_addr;
 	dma_mem->pfn_base = PFN_DOWN(phys_addr);
 	dma_mem->size = pages;
-	dma_mem->flags = flags;
 	spin_lock_init(&dma_mem->spinlock);
 
 	*mem = dma_mem;
@@ -110,12 +108,12 @@ static int dma_assign_coherent_memory(struct device *dev,
 }
 
 int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
-				dma_addr_t device_addr, size_t size, int flags)
+				dma_addr_t device_addr, size_t size)
 {
 	struct dma_coherent_mem *mem;
 	int ret;
 
-	ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem);
+	ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem);
 	if (ret)
 		return ret;
 
@@ -190,15 +188,7 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
 		return 0;
 
 	*ret = __dma_alloc_from_coherent(mem, size, dma_handle);
-	if (*ret)
-		return 1;
-
-	/*
-	 * In the case where the allocation can not be satisfied from the
-	 * per-device area, try to fall back to generic memory if the
-	 * constraints allow it.
-	 */
-	return mem->flags & DMA_MEMORY_EXCLUSIVE;
+	return 1;
 }
 
 void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
@@ -327,8 +317,7 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
 
 	if (!mem) {
 		ret = dma_init_coherent_memory(rmem->base, rmem->base,
-					       rmem->size,
-					       DMA_MEMORY_EXCLUSIVE, &mem);
+					       rmem->size, &mem);
 		if (ret) {
 			pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n",
 				&rmem->base, (unsigned long)rmem->size / SZ_1M);
-- 
cgit v1.2.3


From 078b5fd92c4913dd367361db6c28568386077c89 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 18 Feb 2019 11:35:54 -0500
Subject: NFS: Clean up list moves of struct nfs_page

In several places we're just moving the struct nfs_page from one list to
another by first removing from the existing list, then adding to the new
one.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/direct.c          |  3 +--
 fs/nfs/pagelist.c        | 12 ++++--------
 include/linux/nfs_page.h | 10 ++++++++++
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 33824a0a57bf..1377ee20ecf9 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -664,8 +664,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
 	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
 		if (!nfs_pageio_add_request(&desc, req)) {
-			nfs_list_remove_request(req);
-			nfs_list_add_request(req, &failed);
+			nfs_list_move_request(req, &failed);
 			spin_lock(&cinfo.inode->i_lock);
 			dreq->flags = 0;
 			if (desc.pg_error < 0)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a8951f1f7b4e..9cbfdb979992 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -768,8 +768,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 	pageused = 0;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_list_add_request(req, &hdr->pages);
+		nfs_list_move_request(req, &hdr->pages);
 
 		if (!last_page || last_page != req->wb_page) {
 			pageused++;
@@ -961,8 +960,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
 	}
 	if (!nfs_can_coalesce_requests(prev, req, desc))
 		return 0;
-	nfs_list_remove_request(req);
-	nfs_list_add_request(req, &mirror->pg_list);
+	nfs_list_move_request(req, &mirror->pg_list);
 	mirror->pg_count += req->wb_bytes;
 	return 1;
 }
@@ -994,8 +992,7 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
 {
 	LIST_HEAD(head);
 
-	nfs_list_remove_request(req);
-	nfs_list_add_request(req, &head);
+	nfs_list_move_request(req, &head);
 	desc->pg_completion_ops->error_cleanup(&head);
 }
 
@@ -1237,9 +1234,8 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
 	while (!list_empty(&hdr->pages)) {
 		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 
-		nfs_list_remove_request(req);
 		if (!nfs_pageio_add_request(desc, req))
-			nfs_list_add_request(req, &failed);
+			nfs_list_move_request(req, &failed);
 	}
 	nfs_pageio_complete(desc);
 	if (!list_empty(&failed)) {
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index e27572d30d97..ad69430fd0eb 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -164,6 +164,16 @@ nfs_list_add_request(struct nfs_page *req, struct list_head *head)
 	list_add_tail(&req->wb_list, head);
 }
 
+/**
+ * nfs_list_move_request - Move a request to a new list
+ * @req: request
+ * @head: head of list into which to insert the request.
+ */
+static inline void
+nfs_list_move_request(struct nfs_page *req, struct list_head *head)
+{
+	list_move_tail(&req->wb_list, head);
+}
 
 /**
  * nfs_list_remove_request - Remove a request from its wb_list
-- 
cgit v1.2.3


From df3accb849607a86278a37c35e6b313635ccc48b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 13 Feb 2019 10:39:39 -0500
Subject: NFS: Pass error information to the pgio error cleanup routine

Allow the caller to pass error information when cleaning up a failed
I/O request so that we can conditionally take action to cancel the
request altogether if the error turned out to be fatal.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/direct.c         |  4 ++--
 fs/nfs/pagelist.c       |  5 +++--
 fs/nfs/read.c           |  2 +-
 fs/nfs/write.c          | 11 +++++++++--
 include/linux/nfs_xdr.h |  2 +-
 5 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1377ee20ecf9..0fd811ac08b5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -428,7 +428,7 @@ out_put:
 	hdr->release(hdr);
 }
 
-static void nfs_read_sync_pgio_error(struct list_head *head)
+static void nfs_read_sync_pgio_error(struct list_head *head, int error)
 {
 	struct nfs_page *req;
 
@@ -820,7 +820,7 @@ out_put:
 	hdr->release(hdr);
 }
 
-static void nfs_write_sync_pgio_error(struct list_head *head)
+static void nfs_write_sync_pgio_error(struct list_head *head, int error)
 {
 	struct nfs_page *req;
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 9cbfdb979992..695afb7de3a7 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -993,7 +993,7 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
 	LIST_HEAD(head);
 
 	nfs_list_move_request(req, &head);
-	desc->pg_completion_ops->error_cleanup(&head);
+	desc->pg_completion_ops->error_cleanup(&head, desc->pg_error);
 }
 
 /**
@@ -1129,7 +1129,8 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
 
 	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
 		mirror = &desc->pg_mirrors[midx];
-		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+		desc->pg_completion_ops->error_cleanup(&mirror->pg_list,
+				desc->pg_error);
 	}
 }
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f9f19784db82..1d95a60b2586 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -205,7 +205,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
 }
 
 static void
-nfs_async_read_error(struct list_head *head)
+nfs_async_read_error(struct list_head *head, int error)
 {
 	struct nfs_page	*req;
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d09c9f878141..11df9f03245f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1412,20 +1412,27 @@ static void nfs_redirty_request(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
-static void nfs_async_write_error(struct list_head *head)
+static void nfs_async_write_error(struct list_head *head, int error)
 {
 	struct nfs_page	*req;
 
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
+		if (nfs_error_is_fatal(error)) {
+			nfs_context_set_write_error(req->wb_context, error);
+			if (nfs_error_is_fatal_on_server(error)) {
+				nfs_write_error_remove_page(req);
+				continue;
+			}
+		}
 		nfs_redirty_request(req);
 	}
 }
 
 static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
 {
-	nfs_async_write_error(&hdr->pages);
+	nfs_async_write_error(&hdr->pages, 0);
 	filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset,
 			hdr->args.offset + hdr->args.count - 1);
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 441a93ebcac0..b4bd2bf5f585 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1549,7 +1549,7 @@ struct nfs_commit_data {
 };
 
 struct nfs_pgio_completion_ops {
-	void	(*error_cleanup)(struct list_head *head);
+	void	(*error_cleanup)(struct list_head *head, int);
 	void	(*init_hdr)(struct nfs_pgio_header *hdr);
 	void	(*completion)(struct nfs_pgio_header *hdr);
 	void	(*reschedule_io)(struct nfs_pgio_header *hdr);
-- 
cgit v1.2.3


From 152482580a1b0accb60676063a1ac57b2d12daf6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 12:54:17 -0800
Subject: KVM: Call kvm_arch_memslots_updated() before updating memslots

kvm_arch_memslots_updated() is at this point in time an x86-specific
hook for handling MMIO generation wraparound.  x86 stashes 19 bits of
the memslots generation number in its MMIO sptes in order to avoid
full page fault walks for repeat faults on emulated MMIO addresses.
Because only 19 bits are used, wrapping the MMIO generation number is
possible, if unlikely.  kvm_arch_memslots_updated() alerts x86 that
the generation has changed so that it can invalidate all MMIO sptes in
case the effective MMIO generation has wrapped so as to avoid using a
stale spte, e.g. a (very) old spte that was created with generation==0.

Given that the purpose of kvm_arch_memslots_updated() is to prevent
consuming stale entries, it needs to be called before the new generation
is propagated to memslots.  Invalidating the MMIO sptes after updating
memslots means that there is a window where a vCPU could dereference
the new memslots generation, e.g. 0, and incorrectly reuse an old MMIO
spte that was created with (pre-wrap) generation==0.

Fixes: e59dbe09f8e6 ("KVM: Introduce kvm_arch_memslots_updated()")
Cc: <stable@vger.kernel.org>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h    | 2 +-
 arch/powerpc/include/asm/kvm_host.h | 2 +-
 arch/s390/include/asm/kvm_host.h    | 2 +-
 arch/x86/include/asm/kvm_host.h     | 2 +-
 arch/x86/kvm/mmu.c                  | 4 ++--
 arch/x86/kvm/x86.c                  | 4 ++--
 include/linux/kvm_host.h            | 2 +-
 virt/kvm/arm/mmu.c                  | 2 +-
 virt/kvm/kvm_main.c                 | 7 +++++--
 9 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index d2abd98471e8..41204a49cf95 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -1134,7 +1134,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0f98f00da2ea..19693b8add93 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -837,7 +837,7 @@ struct kvm_vcpu_arch {
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_exit(void) {}
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d5d24889c3bc..c2b8c8c6c9be 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -878,7 +878,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0e2ef41efb9d..c4758e1a8843 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				   struct kvm_memory_slot *slot,
 				   gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 415d0e62cb3e..a53a0e7ad9e6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5893,13 +5893,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
 }
 
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
 	/*
 	 * The very rare case: if the generation-number is round,
 	 * zap all shadow pages.
 	 */
-	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
+	if (unlikely((gen & MMIO_GEN_MASK) == 0)) {
 		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
 		kvm_mmu_invalidate_zap_all_pages(kvm);
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3de586f89730..03d26ffb29cd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9357,13 +9357,13 @@ out_free:
 	return -ENOMEM;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 {
 	/*
 	 * memslots->generation has been incremented.
 	 * mmio generation may have reached its maximum value.
 	 */
-	kvm_mmu_invalidate_mmio_sptes(kvm, slots);
+	kvm_mmu_invalidate_mmio_sptes(kvm, gen);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c38cc5eb7e73..cf761ff58224 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -634,7 +634,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			    unsigned long npages);
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots);
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				const struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index fbdf3ac2f001..e0355e0f8712 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -2350,7 +2350,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 	return 0;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 {
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0a0ea8f4bb1b..d54f6578a849 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -874,6 +874,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 		int as_id, struct kvm_memslots *slots)
 {
 	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
+	u64 gen;
 
 	/*
 	 * Set the low bit in the generation, which disables SPTE caching
@@ -896,9 +897,11 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	 * space 0 will use generations 0, 4, 8, ... while * address space 1 will
 	 * use generations 2, 6, 10, 14, ...
 	 */
-	slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
+	gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
 
-	kvm_arch_memslots_updated(kvm, slots);
+	kvm_arch_memslots_updated(kvm, gen);
+
+	slots->generation = gen;
 
 	return old_memslots;
 }
-- 
cgit v1.2.3


From 361209e054a2c9f34da090ee1ee4c1e8bfe76a64 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:14 -0800
Subject: KVM: Explicitly define the "memslot update in-progress" bit

KVM uses bit 0 of the memslots generation as an "update in-progress"
flag, which is used by x86 to prevent caching MMIO access while the
memslots are changing.  Although the intended behavior is flag-like,
e.g. MMIO sptes intentionally drop the in-progress bit so as to avoid
caching data from in-flux memslots, the implementation oftentimes treats
the bit as part of the generation number itself, e.g. incrementing the
generation increments twice, once to set the flag and once to clear it.

Prior to commit 4bd518f1598d ("KVM: use separate generations for
each address space"), incorporating the "update in-progress" bit into
the generation number largely made sense, e.g. "real" generations are
even, "bogus" generations are odd, most code doesn't need to be aware of
the bit, etc...

Now that unique memslots generation numbers are assigned to each address
space, stealthing the in-progress status into the generation number
results in a wide variety of subtle code, e.g. kvm_create_vm() jumps
over bit 0 when initializing the memslots generation without any hint as
to why.

Explicitly define the flag and convert as much code as possible (which
isn't much) to actually treat it like a flag.  This paves the way for
eventually using a different bit for "update in-progress" so that it can
be a flag in truth instead of a awkward extension to the generation
number.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.h       |  2 +-
 include/linux/kvm_host.h | 21 +++++++++++++++++++++
 virt/kvm/kvm_main.c      | 26 +++++++++++++-------------
 3 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 20ede17202bf..28406aa1136d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -183,7 +183,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
 {
 	u64 gen = kvm_memslots(vcpu->kvm)->generation;
 
-	if (unlikely(gen & 1))
+	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
 		return;
 
 	/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cf761ff58224..5e1cb74922b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -48,6 +48,27 @@
  */
 #define KVM_MEMSLOT_INVALID	(1UL << 16)
 
+/*
+ * Bit 0 of the memslot generation number is an "update in-progress flag",
+ * e.g. is temporarily set for the duration of install_new_memslots().
+ * This flag effectively creates a unique generation number that is used to
+ * mark cached memslot data, e.g. MMIO accesses, as potentially being stale,
+ * i.e. may (or may not) have come from the previous memslots generation.
+ *
+ * This is necessary because the actual memslots update is not atomic with
+ * respect to the generation number update.  Updating the generation number
+ * first would allow a vCPU to cache a spte from the old memslots using the
+ * new generation number, and updating the generation number after switching
+ * to the new memslots would allow cache hits using the old generation number
+ * to reference the defunct memslots.
+ *
+ * This mechanism is used to prevent getting hits in KVM's caches while a
+ * memslot update is in-progress, and to prevent cache hits *after* updating
+ * the actual generation number against accesses that were inserted into the
+ * cache *before* the memslots were updated.
+ */
+#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS	BIT_ULL(0)
+
 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS	2
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d54f6578a849..0f1f1c7c7a36 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -874,30 +874,30 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 		int as_id, struct kvm_memslots *slots)
 {
 	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
-	u64 gen;
+	u64 gen = old_memslots->generation;
 
-	/*
-	 * Set the low bit in the generation, which disables SPTE caching
-	 * until the end of synchronize_srcu_expedited.
-	 */
-	WARN_ON(old_memslots->generation & 1);
-	slots->generation = old_memslots->generation + 1;
+	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
 	rcu_assign_pointer(kvm->memslots[as_id], slots);
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*
-	 * Increment the new memslot generation a second time. This prevents
-	 * vm exits that race with memslot updates from caching a memslot
-	 * generation that will (potentially) be valid forever.
-	 *
+	 * Increment the new memslot generation a second time, dropping the
+	 * update in-progress flag and incrementing then generation based on
+	 * the number of address spaces.  This provides a unique and easily
+	 * identifiable generation number while the memslots are in flux.
+	 */
+	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
+
+	/*
 	 * Generations must be unique even across address spaces.  We do not need
 	 * a global counter for that, instead the generation space is evenly split
 	 * across address spaces.  For example, with two address spaces, address
-	 * space 0 will use generations 0, 4, 8, ... while * address space 1 will
+	 * space 0 will use generations 0, 4, 8, ... while address space 1 will
 	 * use generations 2, 6, 10, 14, ...
 	 */
-	gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
+	gen += KVM_ADDRESS_SPACE_NUM * 2;
 
 	kvm_arch_memslots_updated(kvm, gen);
 
-- 
cgit v1.2.3


From 164bf7e56c5a73f2f819c39ba7e0f20e0f97dc7b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:18 -0800
Subject: KVM: Move the memslot update in-progress flag to bit 63

...now that KVM won't explode by moving it out of bit 0.  Using bit 63
eliminates the need to jump over bit 0, e.g. when calculating a new
memslots generation or when propagating the memslots generation to an
MMIO spte.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/mmu.txt | 13 ++++++++-----
 arch/x86/kvm/mmu.c                | 31 ++++++++++++-------------------
 include/linux/kvm_host.h          |  4 ++--
 virt/kvm/kvm_main.c               |  8 ++++----
 4 files changed, 26 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index e507a9e0421e..367a952f50ab 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -452,13 +452,16 @@ stored into the MMIO spte.  Thus, the MMIO spte might be created based on
 out-of-date information, but with an up-to-date generation number.
 
 To avoid this, the generation number is incremented again after synchronize_srcu
-returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a
+returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a
 memslot update, while some SRCU readers might be using the old copy.  We do not
 want to use an MMIO sptes created with an odd generation number, and we can do
-this without losing a bit in the MMIO spte.  The low bit of the generation
-is not stored in MMIO spte, and presumed zero when it is extracted out of the
-spte.  If KVM is unlucky and creates an MMIO spte while the low bit is 1,
-the next access to the spte will always be a cache miss.
+this without losing a bit in the MMIO spte.  The "update in-progress" bit of the
+generation is not stored in MMIO spte, and is so is implicitly zero when the
+generation is extracted out of the spte.  If KVM is unlucky and creates an MMIO
+spte while an update is in-progress, the next access to the spte will always be
+a cache miss.  For example, a subsequent access during the update window will
+miss due to the in-progress flag diverging, while an access after the update
+window closes will have a higher generation number (as compared to the spte).
 
 
 Further reading
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 364b2a737d94..bcf62e1e1ff7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -335,18 +335,17 @@ static inline bool is_access_track_spte(u64 spte)
  * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
  * the memslots generation and is derived as follows:
  *
- * Bits 1-9 of the memslot generation are propagated to spte bits 3-11
- * Bits 10-19 of the memslot generation are propagated to spte bits 52-61
+ * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
+ * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
  *
- * The MMIO generation starts at bit 1 of the memslots generation in order to
- * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag.  Including
- * the flag would require stealing a bit from the "real" generation number and
- * thus effectively halve the maximum number of MMIO generations that can be
- * handled before encountering a wrap (which requires a full MMU zap).  The
- * flag is instead explicitly queried when checking for MMIO spte cache hits.
+ * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
+ * the MMIO generation number, as doing so would require stealing a bit from
+ * the "real" generation number and thus effectively halve the maximum number
+ * of MMIO generations that can be handled before encountering a wrap (which
+ * requires a full MMU zap).  The flag is instead explicitly queried when
+ * checking for MMIO spte cache hits.
  */
-#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(19, 1)
-#define MMIO_SPTE_GEN_SHIFT		1
+#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(18, 0)
 
 #define MMIO_SPTE_GEN_LOW_START		3
 #define MMIO_SPTE_GEN_LOW_END		11
@@ -363,8 +362,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
 
 	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
 
-	gen >>= MMIO_SPTE_GEN_SHIFT;
-
 	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
 	mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
 	return mask;
@@ -378,7 +375,7 @@ static u64 get_mmio_spte_generation(u64 spte)
 
 	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
 	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
-	return gen << MMIO_SPTE_GEN_SHIFT;
+	return gen;
 }
 
 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
@@ -5905,13 +5902,9 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
-	gen &= MMIO_SPTE_GEN_MASK;
+	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
 
-	/*
-	 * Shift to adjust for the "update in-progress" flag, which isn't
-	 * included in the MMIO generation number.
-	 */
-	gen >>= MMIO_SPTE_GEN_SHIFT;
+	gen &= MMIO_SPTE_GEN_MASK;
 
 	/*
 	 * Generation numbers are incremented in multiples of the number of
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5e1cb74922b3..85c0c00d5159 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -49,7 +49,7 @@
 #define KVM_MEMSLOT_INVALID	(1UL << 16)
 
 /*
- * Bit 0 of the memslot generation number is an "update in-progress flag",
+ * Bit 63 of the memslot generation number is an "update in-progress flag",
  * e.g. is temporarily set for the duration of install_new_memslots().
  * This flag effectively creates a unique generation number that is used to
  * mark cached memslot data, e.g. MMIO accesses, as potentially being stale,
@@ -67,7 +67,7 @@
  * the actual generation number against accesses that were inserted into the
  * cache *before* the memslots were updated.
  */
-#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS	BIT_ULL(0)
+#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS	BIT_ULL(63)
 
 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS	2
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5c2e7e173a46..c9d0bc01f8cb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -657,7 +657,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 		if (!slots)
 			goto out_err_no_srcu;
 		/* Generations must be different for each address space. */
-		slots->generation = i * 2;
+		slots->generation = i;
 		rcu_assign_pointer(kvm->memslots[i], slots);
 	}
 
@@ -890,10 +890,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	 * Generations must be unique even across address spaces.  We do not need
 	 * a global counter for that, instead the generation space is evenly split
 	 * across address spaces.  For example, with two address spaces, address
-	 * space 0 will use generations 0, 4, 8, ... while address space 1 will
-	 * use generations 2, 6, 10, 14, ...
+	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
+	 * use generations 1, 3, 5, ...
 	 */
-	gen += KVM_ADDRESS_SPACE_NUM * 2;
+	gen += KVM_ADDRESS_SPACE_NUM;
 
 	kvm_arch_memslots_updated(kvm, gen);
 
-- 
cgit v1.2.3


From 49113d360bdeb4dd916fb6bffbcc3e157422b6fd Mon Sep 17 00:00:00 2001
From: Nir Weiner <nir.weiner@oracle.com>
Date: Sun, 27 Jan 2019 12:17:15 +0200
Subject: KVM: Expose the initial start value in grow_halt_poll_ns() as a
 module parameter

The hard-coded value 10000 in grow_halt_poll_ns() stands for the initial
start value when raising up vcpu->halt_poll_ns.
It actually sets the first timeout to the first polling session.
This value has significant effect on how tolerant we are to outliers.
On the standard case, higher value is better - we will spend more time
in the polling busyloop, handle events/interrupts faster and result
in better performance.
But on outliers it puts us in a busy loop that does nothing.
Even if the shrink factor is zero, we will still waste time on the first
iteration.
The optimal value changes between different workloads. It depends on
outliers rate and polling sessions length.
As this value has significant effect on the dynamic halt-polling
algorithm, it should be configurable and exposed.

Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Nir Weiner <nir.weiner@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/halt-polling.txt | 37 +++++++++++++++++++-----------
 arch/powerpc/kvm/book3s_hv.c               |  3 +--
 include/linux/kvm_host.h                   |  1 +
 virt/kvm/kvm_main.c                        |  8 +++++--
 4 files changed, 31 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virtual/kvm/halt-polling.txt
index 4a8418318769..4f791b128dd2 100644
--- a/Documentation/virtual/kvm/halt-polling.txt
+++ b/Documentation/virtual/kvm/halt-polling.txt
@@ -53,7 +53,8 @@ the global max polling interval then the polling interval can be increased in
 the hope that next time during the longer polling interval the wake up source
 will be received while the host is polling and the latency benefits will be
 received. The polling interval is grown in the function grow_halt_poll_ns() and
-is multiplied by the module parameter halt_poll_ns_grow.
+is multiplied by the module parameters halt_poll_ns_grow and
+halt_poll_ns_grow_start.
 
 In the event that the total block time was greater than the global max polling
 interval then the host will never poll for long enough (limited by the global
@@ -80,22 +81,30 @@ shrunk. These variables are defined in include/linux/kvm_host.h and as module
 parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the
 powerpc kvm-hv case.
 
-Module Parameter    |	     Description	      |	     Default Value
+Module Parameter	|   Description		    |	     Default Value
 --------------------------------------------------------------------------------
-halt_poll_ns	    | The global max polling interval | KVM_HALT_POLL_NS_DEFAULT
-		    | which defines the ceiling value |
-		    | of the polling interval for     | (per arch value)
-		    | each vcpu. 		      |
+halt_poll_ns		| The global max polling    | KVM_HALT_POLL_NS_DEFAULT
+			| interval which defines    |
+			| the ceiling value of the  |
+			| polling interval for      | (per arch value)
+			| each vcpu.		    |
 --------------------------------------------------------------------------------
-halt_poll_ns_grow   | The value by which the halt     |	2
-		    | polling interval is multiplied  |
-		    | in the grow_halt_poll_ns()      |
-		    | function.			      |
+halt_poll_ns_grow	| The value by which the    | 2
+			| halt polling interval is  |
+			| multiplied in the	    |
+			| grow_halt_poll_ns()	    |
+			| function.		    |
 --------------------------------------------------------------------------------
-halt_poll_ns_shrink | The value by which the halt     |	0
-		    | polling interval is divided in  |
-		    | the shrink_halt_poll_ns()	      |
-		    | function.			      |
+halt_poll_ns_grow_start | The initial value to grow | 10000
+			| to from zero in the	    |
+			| grow_halt_poll_ns()	    |
+			| function.		    |
+--------------------------------------------------------------------------------
+halt_poll_ns_shrink	| The value by which the    | 0
+			| halt polling interval is  |
+			| divided in the	    |
+			| shrink_halt_poll_ns()	    |
+			| function.		    |
 --------------------------------------------------------------------------------
 
 These module parameters can be set from the debugfs files in:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e316a2ddb70b..29ffc99bd79b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3634,9 +3634,8 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
 	if (!halt_poll_ns_grow)
 		return;
 
-	/* 10us base */
 	if (vc->halt_poll_ns == 0)
-		vc->halt_poll_ns = 10000;
+		vc->halt_poll_ns = halt_poll_ns_grow_start;
 	else
 		vc->halt_poll_ns *= halt_poll_ns_grow;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 85c0c00d5159..9d55c63db09b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1203,6 +1203,7 @@ extern bool kvm_rebooting;
 
 extern unsigned int halt_poll_ns;
 extern unsigned int halt_poll_ns_grow;
+extern unsigned int halt_poll_ns_grow_start;
 extern unsigned int halt_poll_ns_shrink;
 
 struct kvm_device {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9c8a8bf6e686..ae818d27a1a4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -81,6 +81,11 @@ unsigned int halt_poll_ns_grow = 2;
 module_param(halt_poll_ns_grow, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
 
+/* The start value to grow halt_poll_ns from */
+unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
+module_param(halt_poll_ns_grow_start, uint, 0644);
+EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
+
 /* Default resets per-vcpu halt_poll_ns . */
 unsigned int halt_poll_ns_shrink;
 module_param(halt_poll_ns_shrink, uint, 0644);
@@ -2191,9 +2196,8 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 	if (!grow)
 		goto out;
 
-	/* 10us base */
 	if (val == 0)
-		val = 10000;
+		val = halt_poll_ns_grow_start;
 	else
 		val *= grow;
 
-- 
cgit v1.2.3


From b38f6c50270683abf35a388f82cafecce971a003 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Wed, 20 Feb 2019 11:30:49 -0500
Subject: XArray: Fix xa_release in allocating arrays

xa_cmpxchg() was a little too magic in turning ZERO entries into NULL,
and would leave the entry set to the ZERO entry instead of releasing
it for future use.  After careful review of existing users of
xa_cmpxchg(), change the semantics so that it does not translate either
incoming argument from NULL into ZERO entries.

Add several tests to the test-suite to make sure this problem doesn't
come back.

Reported-by: Jason Gunthorpe <jgg@ziepe.ca>
Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 include/linux/xarray.h | 36 +++++++++++++++++++++++-------------
 lib/test_xarray.c      | 28 ++++++++++++++++++++++++----
 lib/xarray.c           |  6 +-----
 3 files changed, 48 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 687c150071a5..588733abd19d 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -131,6 +131,12 @@ static inline unsigned int xa_pointer_tag(void *entry)
  * xa_mk_internal() - Create an internal entry.
  * @v: Value to turn into an internal entry.
  *
+ * Internal entries are used for a number of purposes.  Entries 0-255 are
+ * used for sibling entries (only 0-62 are used by the current code).  256
+ * is used for the retry entry.  257 is used for the reserved / zero entry.
+ * Negative internal entries are used to represent errnos.  Node pointers
+ * are also tagged as internal entries in some situations.
+ *
  * Context: Any context.
  * Return: An XArray internal entry corresponding to this value.
  */
@@ -163,6 +169,22 @@ static inline bool xa_is_internal(const void *entry)
 	return ((unsigned long)entry & 3) == 2;
 }
 
+#define XA_ZERO_ENTRY		xa_mk_internal(257)
+
+/**
+ * xa_is_zero() - Is the entry a zero entry?
+ * @entry: Entry retrieved from the XArray
+ *
+ * The normal API will return NULL as the contents of a slot containing
+ * a zero entry.  You can only see zero entries by using the advanced API.
+ *
+ * Return: %true if the entry is a zero entry.
+ */
+static inline bool xa_is_zero(const void *entry)
+{
+	return unlikely(entry == XA_ZERO_ENTRY);
+}
+
 /**
  * xa_is_err() - Report whether an XArray operation returned an error
  * @entry: Result from calling an XArray function
@@ -1050,7 +1072,7 @@ int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
  */
 static inline void xa_release(struct xarray *xa, unsigned long index)
 {
-	xa_cmpxchg(xa, index, NULL, NULL, 0);
+	xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0);
 }
 
 /* Everything below here is the Advanced API.  Proceed with caution. */
@@ -1210,18 +1232,6 @@ static inline bool xa_is_sibling(const void *entry)
 }
 
 #define XA_RETRY_ENTRY		xa_mk_internal(256)
-#define XA_ZERO_ENTRY		xa_mk_internal(257)
-
-/**
- * xa_is_zero() - Is the entry a zero entry?
- * @entry: Entry retrieved from the XArray
- *
- * Return: %true if the entry is a zero entry.
- */
-static inline bool xa_is_zero(const void *entry)
-{
-	return unlikely(entry == XA_ZERO_ENTRY);
-}
 
 /**
  * xa_is_retry() - Is the entry a retry entry?
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 3eaa40ddc390..52f8ecff8c0c 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -361,6 +361,7 @@ static noinline void check_reserve(struct xarray *xa)
 {
 	void *entry;
 	unsigned long index;
+	int count;
 
 	/* An array with a reserved entry is not empty */
 	XA_BUG_ON(xa, !xa_empty(xa));
@@ -377,15 +378,15 @@ static noinline void check_reserve(struct xarray *xa)
 	xa_erase_index(xa, 12345678);
 	XA_BUG_ON(xa, !xa_empty(xa));
 
-	/* cmpxchg sees a reserved entry as NULL */
+	/* cmpxchg sees a reserved entry as ZERO */
 	XA_BUG_ON(xa, xa_reserve(xa, 12345678, GFP_KERNEL) != 0);
-	XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, NULL, xa_mk_value(12345678),
-				GFP_NOWAIT) != NULL);
+	XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, XA_ZERO_ENTRY,
+				xa_mk_value(12345678), GFP_NOWAIT) != NULL);
 	xa_release(xa, 12345678);
 	xa_erase_index(xa, 12345678);
 	XA_BUG_ON(xa, !xa_empty(xa));
 
-	/* But xa_insert does not */
+	/* xa_insert treats it as busy */
 	XA_BUG_ON(xa, xa_reserve(xa, 12345678, GFP_KERNEL) != 0);
 	XA_BUG_ON(xa, xa_insert(xa, 12345678, xa_mk_value(12345678), 0) !=
 			-EBUSY);
@@ -398,9 +399,27 @@ static noinline void check_reserve(struct xarray *xa)
 	XA_BUG_ON(xa, xa_reserve(xa, 6, GFP_KERNEL) != 0);
 	xa_store_index(xa, 7, GFP_KERNEL);
 
+	count = 0;
 	xa_for_each(xa, index, entry) {
 		XA_BUG_ON(xa, index != 5 && index != 7);
+		count++;
+	}
+	XA_BUG_ON(xa, count != 2);
+
+	/* If we free a reserved entry, we should be able to allocate it */
+	if (xa->xa_flags & XA_FLAGS_ALLOC) {
+		u32 id;
+
+		XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_value(8),
+					XA_LIMIT(5, 10), GFP_KERNEL) != 0);
+		XA_BUG_ON(xa, id != 8);
+
+		xa_release(xa, 6);
+		XA_BUG_ON(xa, xa_alloc(xa, &id, xa_mk_value(6),
+					XA_LIMIT(5, 10), GFP_KERNEL) != 0);
+		XA_BUG_ON(xa, id != 6);
 	}
+
 	xa_destroy(xa);
 }
 
@@ -1486,6 +1505,7 @@ static int xarray_checks(void)
 	check_xas_erase(&array);
 	check_cmpxchg(&array);
 	check_reserve(&array);
+	check_reserve(&xa0);
 	check_multi_store(&array);
 	check_xa_alloc();
 	check_find(&array);
diff --git a/lib/xarray.c b/lib/xarray.c
index 89e37ac50850..b9a6cf42feee 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1429,16 +1429,12 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
 
 	if (WARN_ON_ONCE(xa_is_advanced(entry)))
 		return XA_ERROR(-EINVAL);
-	if (xa_track_free(xa) && !entry)
-		entry = XA_ZERO_ENTRY;
 
 	do {
 		curr = xas_load(&xas);
-		if (curr == XA_ZERO_ENTRY)
-			curr = NULL;
 		if (curr == old) {
 			xas_store(&xas, entry);
-			if (xa_track_free(xa))
+			if (xa_track_free(xa) && entry && !curr)
 				xas_clear_mark(&xas, XA_FREE_MARK);
 		}
 	} while (__xas_nomem(&xas, gfp));
-- 
cgit v1.2.3


From 962033d55d0761e0716a01a715c6659c8c8dfc41 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Wed, 20 Feb 2019 11:51:22 -0500
Subject: XArray: Use xa_cmpxchg to implement xa_reserve

Jason feels this is clearer, and it saves a function and an exported
symbol.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 Documentation/core-api/xarray.rst |  1 -
 include/linux/xarray.h            | 25 +++----------------------
 lib/xarray.c                      | 36 ------------------------------------
 3 files changed, 3 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index c7436da5c4ad..ef6f9f98f595 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -215,7 +215,6 @@ Assumes xa_lock held on entry:
  * :c:func:`__xa_erase`
  * :c:func:`__xa_cmpxchg`
  * :c:func:`__xa_alloc`
- * :c:func:`__xa_reserve`
  * :c:func:`__xa_set_mark`
  * :c:func:`__xa_clear_mark`
 
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 588733abd19d..0e01e6129145 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -525,7 +525,6 @@ int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
 		struct xa_limit, gfp_t);
 int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
 		struct xa_limit, u32 *next, gfp_t);
-int __must_check __xa_reserve(struct xarray *, unsigned long index, gfp_t);
 void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
 void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
 
@@ -1004,13 +1003,7 @@ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
 static inline __must_check
 int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
 {
-	int ret;
-
-	xa_lock(xa);
-	ret = __xa_reserve(xa, index, gfp);
-	xa_unlock(xa);
-
-	return ret;
+	return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp));
 }
 
 /**
@@ -1028,13 +1021,7 @@ int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
 static inline __must_check
 int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
 {
-	int ret;
-
-	xa_lock_bh(xa);
-	ret = __xa_reserve(xa, index, gfp);
-	xa_unlock_bh(xa);
-
-	return ret;
+	return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp));
 }
 
 /**
@@ -1052,13 +1039,7 @@ int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
 static inline __must_check
 int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
 {
-	int ret;
-
-	xa_lock_irq(xa);
-	ret = __xa_reserve(xa, index, gfp);
-	xa_unlock_irq(xa);
-
-	return ret;
+	return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp));
 }
 
 /**
diff --git a/lib/xarray.c b/lib/xarray.c
index b9a6cf42feee..3f10198f00b7 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1484,42 +1484,6 @@ int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
 }
 EXPORT_SYMBOL(__xa_insert);
 
-/**
- * __xa_reserve() - Reserve this index in the XArray.
- * @xa: XArray.
- * @index: Index into array.
- * @gfp: Memory allocation flags.
- *
- * Ensures there is somewhere to store an entry at @index in the array.
- * If there is already something stored at @index, this function does
- * nothing.  If there was nothing there, the entry is marked as reserved.
- * Loading from a reserved entry returns a %NULL pointer.
- *
- * If you do not use the entry that you have reserved, call xa_release()
- * or xa_erase() to free any unnecessary memory.
- *
- * Context: Any context.  Expects the xa_lock to be held on entry.  May
- * release the lock, sleep and reacquire the lock if the @gfp flags permit.
- * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
- */
-int __xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
-{
-	XA_STATE(xas, xa, index);
-	void *curr;
-
-	do {
-		curr = xas_load(&xas);
-		if (!curr) {
-			xas_store(&xas, XA_ZERO_ENTRY);
-			if (xa_track_free(xa))
-				xas_clear_mark(&xas, XA_FREE_MARK);
-		}
-	} while (__xas_nomem(&xas, gfp));
-
-	return xas_error(&xas);
-}
-EXPORT_SYMBOL(__xa_reserve);
-
 #ifdef CONFIG_XARRAY_MULTI
 static void xas_set_range(struct xa_state *xas, unsigned long first,
 		unsigned long last)
-- 
cgit v1.2.3


From 61697a6abd24acba941359c6268a94f4afe4a53d Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 18 Jan 2019 14:19:26 -0500
Subject: dm: eliminate 'split_discard_bios' flag from DM target interface

There is no need to have DM core split discards on behalf of a DM target
now that blk_queue_split() handles splitting discards based on the
queue_limits.  A DM target just needs to set max_discard_sectors,
discard_granularity, etc, in queue_limits.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c  |  1 -
 drivers/md/dm-raid.c          | 14 +++++++++-----
 drivers/md/dm-thin.c          |  1 -
 drivers/md/dm-zoned-target.c  |  1 -
 drivers/md/dm.c               | 25 ++++++-------------------
 include/linux/device-mapper.h |  6 ------
 include/uapi/linux/dm-ioctl.h |  4 ++--
 7 files changed, 17 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index b29a8327eed1..adc529f12b6b 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2496,7 +2496,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
-	ti->split_discard_bios = false;
 
 	ti->per_io_data_size = sizeof(struct per_bio_data);
 
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index adcfe8ae10aa..9fdef6897316 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2986,11 +2986,6 @@ static void configure_discard_support(struct raid_set *rs)
 		}
 	}
 
-	/*
-	 * RAID1 and RAID10 personalities require bio splitting,
-	 * RAID0/4/5/6 don't and process large discard bios properly.
-	 */
-	ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs));
 	ti->num_discard_bios = 1;
 }
 
@@ -3747,6 +3742,15 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	blk_limits_io_min(limits, chunk_size);
 	blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
+
+	/*
+	 * RAID1 and RAID10 personalities require bio splitting,
+	 * RAID0/4/5/6 don't and process large discard bios properly.
+	 */
+	if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
+		limits->discard_granularity = chunk_size;
+		limits->max_discard_sectors = chunk_size;
+	}
 }
 
 static void raid_postsuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e83b63608262..0d9ded0f5e50 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -4227,7 +4227,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
-		ti->split_discard_bios = false;
 	}
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 6af5babe6837..8865c1709e16 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -727,7 +727,6 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->per_io_data_size = sizeof(struct dmz_bioctx);
 	ti->flush_supported = true;
 	ti->discards_supported = true;
-	ti->split_discard_bios = true;
 
 	/* The exposed capacity is the number of chunks that can be mapped */
 	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7a774fcd0194..55f12df3589d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1478,17 +1478,10 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
 	return ti->num_write_zeroes_bios;
 }
 
-typedef bool (*is_split_required_fn)(struct dm_target *ti);
-
-static bool is_split_required_for_discard(struct dm_target *ti)
-{
-	return ti->split_discard_bios;
-}
-
 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
-				       unsigned num_bios, bool is_split_required)
+				       unsigned num_bios)
 {
-	unsigned len;
+	unsigned len = ci->sector_count;
 
 	/*
 	 * Even though the device advertised support for this type of
@@ -1499,11 +1492,6 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
 	if (!num_bios)
 		return -EOPNOTSUPP;
 
-	if (!is_split_required)
-		len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
-	else
-		len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
-
 	__send_duplicate_bios(ci, ti, num_bios, &len);
 
 	ci->sector += len;
@@ -1514,23 +1502,22 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
 
 static int __send_discard(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti),
-					   is_split_required_for_discard(ti));
+	return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti));
 }
 
 static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti));
 }
 
 static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti));
 }
 
 static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti));
 }
 
 static bool is_abnormal_io(struct bio *bio)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e528baebad69..0f5b3d7c6cb3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -315,12 +315,6 @@ struct dm_target {
 	 * whether or not its underlying devices have support.
 	 */
 	bool discards_supported:1;
-
-	/*
-	 * Set if the target required discard bios to be split
-	 * on max_io_len boundary.
-	 */
-	bool split_discard_bios:1;
 };
 
 /* Each target can link one of these into the table */
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index d1e49514977b..f396a82dfd3e 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -270,9 +270,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	39
+#define DM_VERSION_MINOR	40
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2018-04-03)"
+#define DM_VERSION_EXTRA	"-ioctl (2019-01-18)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From 086d08725d34c6b3333db710344ae9c4fdafb2d5 Mon Sep 17 00:00:00 2001
From: Loic Pallardy <loic.pallardy@st.com>
Date: Thu, 10 Jan 2019 14:50:49 +0100
Subject: remoteproc: create vdev subdevice with specific dma memory pool

This patch creates a dedicated vdev subdevice for each vdev declared
in firmware resource table and associates carveout named "vdev%dbuffer"
(with %d vdev index in resource table) if any as dma coherent memory pool.

Then vdev subdevice is used as parent for virtio device.

Signed-off-by: Loic Pallardy <loic.pallardy@st.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c     | 47 ++++++++++++++++++++++++++++++--
 drivers/remoteproc/remoteproc_internal.h |  1 +
 drivers/remoteproc/remoteproc_virtio.c   | 42 +++++++++++++++++++++++++++-
 include/linux/remoteproc.h               |  1 +
 4 files changed, 87 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 54ec38fc5dca..821dbedef18e 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -39,9 +39,11 @@
 #include <linux/idr.h>
 #include <linux/elf.h>
 #include <linux/crc32.h>
+#include <linux/of_reserved_mem.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_ring.h>
 #include <asm/byteorder.h>
+#include <linux/platform_device.h>
 
 #include "remoteproc_internal.h"
 
@@ -145,7 +147,7 @@ static void rproc_disable_iommu(struct rproc *rproc)
 	iommu_domain_free(domain);
 }
 
-static phys_addr_t rproc_va_to_pa(void *cpu_addr)
+phys_addr_t rproc_va_to_pa(void *cpu_addr)
 {
 	/*
 	 * Return physical address according to virtual address location
@@ -160,6 +162,7 @@ static phys_addr_t rproc_va_to_pa(void *cpu_addr)
 	WARN_ON(!virt_addr_valid(cpu_addr));
 	return virt_to_phys(cpu_addr);
 }
+EXPORT_SYMBOL(rproc_va_to_pa);
 
 /**
  * rproc_da_to_va() - lookup the kernel virtual address for a remoteproc address
@@ -422,6 +425,20 @@ static void rproc_vdev_do_stop(struct rproc_subdev *subdev, bool crashed)
 	rproc_remove_virtio_dev(rvdev);
 }
 
+/**
+ * rproc_rvdev_release() - release the existence of a rvdev
+ *
+ * @dev: the subdevice's dev
+ */
+static void rproc_rvdev_release(struct device *dev)
+{
+	struct rproc_vdev *rvdev = container_of(dev, struct rproc_vdev, dev);
+
+	of_reserved_mem_device_release(dev);
+
+	kfree(rvdev);
+}
+
 /**
  * rproc_handle_vdev() - handle a vdev fw resource
  * @rproc: the remote processor
@@ -455,6 +472,7 @@ static int rproc_handle_vdev(struct rproc *rproc, struct fw_rsc_vdev *rsc,
 	struct device *dev = &rproc->dev;
 	struct rproc_vdev *rvdev;
 	int i, ret;
+	char name[16];
 
 	/* make sure resource isn't truncated */
 	if (sizeof(*rsc) + rsc->num_of_vrings * sizeof(struct fw_rsc_vdev_vring)
@@ -488,6 +506,29 @@ static int rproc_handle_vdev(struct rproc *rproc, struct fw_rsc_vdev *rsc,
 	rvdev->rproc = rproc;
 	rvdev->index = rproc->nb_vdev++;
 
+	/* Initialise vdev subdevice */
+	snprintf(name, sizeof(name), "vdev%dbuffer", rvdev->index);
+	rvdev->dev.parent = rproc->dev.parent;
+	rvdev->dev.release = rproc_rvdev_release;
+	dev_set_name(&rvdev->dev, "%s#%s", dev_name(rvdev->dev.parent), name);
+	dev_set_drvdata(&rvdev->dev, rvdev);
+
+	ret = device_register(&rvdev->dev);
+	if (ret) {
+		put_device(&rvdev->dev);
+		return ret;
+	}
+	/* Make device dma capable by inheriting from parent's capabilities */
+	set_dma_ops(&rvdev->dev, get_dma_ops(rproc->dev.parent));
+
+	ret = dma_coerce_mask_and_coherent(&rvdev->dev,
+					   dma_get_mask(rproc->dev.parent));
+	if (ret) {
+		dev_warn(dev,
+			 "Failed to set DMA mask %llx. Trying to continue... %x\n",
+			 dma_get_mask(rproc->dev.parent), ret);
+	}
+
 	/* parse the vrings */
 	for (i = 0; i < rsc->num_of_vrings; i++) {
 		ret = rproc_parse_vring(rvdev, rsc, i);
@@ -518,7 +559,7 @@ unwind_vring_allocations:
 	for (i--; i >= 0; i--)
 		rproc_free_vring(&rvdev->vring[i]);
 free_rvdev:
-	kfree(rvdev);
+	device_unregister(&rvdev->dev);
 	return ret;
 }
 
@@ -536,7 +577,7 @@ void rproc_vdev_release(struct kref *ref)
 
 	rproc_remove_subdev(rproc, &rvdev->subdev);
 	list_del(&rvdev->node);
-	kfree(rvdev);
+	device_unregister(&rvdev->dev);
 }
 
 /**
diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h
index f6cad243d7ca..bfeacfd40947 100644
--- a/drivers/remoteproc/remoteproc_internal.h
+++ b/drivers/remoteproc/remoteproc_internal.h
@@ -52,6 +52,7 @@ void rproc_free_vring(struct rproc_vring *rvring);
 int rproc_alloc_vring(struct rproc_vdev *rvdev, int i);
 
 void *rproc_da_to_va(struct rproc *rproc, u64 da, int len);
+phys_addr_t rproc_va_to_pa(void *cpu_addr);
 int rproc_trigger_recovery(struct rproc *rproc);
 
 int rproc_elf_sanity_check(struct rproc *rproc, const struct firmware *fw);
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 183fc42a510a..d08b2cfd875b 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -17,7 +17,9 @@
  * GNU General Public License for more details.
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/export.h>
+#include <linux/of_reserved_mem.h>
 #include <linux/remoteproc.h>
 #include <linux/virtio.h>
 #include <linux/virtio_config.h>
@@ -328,10 +330,48 @@ static void rproc_virtio_dev_release(struct device *dev)
 int rproc_add_virtio_dev(struct rproc_vdev *rvdev, int id)
 {
 	struct rproc *rproc = rvdev->rproc;
-	struct device *dev = &rproc->dev;
+	struct device *dev = &rvdev->dev;
 	struct virtio_device *vdev = &rvdev->vdev;
+	struct rproc_mem_entry *mem;
 	int ret;
 
+	/* Try to find dedicated vdev buffer carveout */
+	mem = rproc_find_carveout_by_name(rproc, "vdev%dbuffer", rvdev->index);
+	if (mem) {
+		phys_addr_t pa;
+
+		if (mem->of_resm_idx != -1) {
+			struct device_node *np = rproc->dev.parent->of_node;
+
+			/* Associate reserved memory to vdev device */
+			ret = of_reserved_mem_device_init_by_idx(dev, np,
+								 mem->of_resm_idx);
+			if (ret) {
+				dev_err(dev, "Can't associate reserved memory\n");
+				goto out;
+			}
+		} else {
+			if (mem->va) {
+				dev_warn(dev, "vdev %d buffer already mapped\n",
+					 rvdev->index);
+				pa = rproc_va_to_pa(mem->va);
+			} else {
+				/* Use dma address as carveout no memmapped yet */
+				pa = (phys_addr_t)mem->dma;
+			}
+
+			/* Associate vdev buffer memory pool to vdev subdev */
+			ret = dma_declare_coherent_memory(dev, pa,
+							   mem->da,
+							   mem->len,
+							   DMA_MEMORY_EXCLUSIVE);
+			if (ret < 0) {
+				dev_err(dev, "Failed to associate buffer\n");
+				goto out;
+			}
+		}
+	}
+
 	vdev->id.device	= id,
 	vdev->config = &rproc_virtio_config_ops,
 	vdev->dev.parent = dev;
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 68e72f33c705..82cb77ad37c7 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -554,6 +554,7 @@ struct rproc_vdev {
 	struct kref refcount;
 
 	struct rproc_subdev subdev;
+	struct device dev;
 
 	unsigned int id;
 	struct list_head node;
-- 
cgit v1.2.3


From d4c036fec321341f378ca95d3e99976e835a7404 Mon Sep 17 00:00:00 2001
From: Loic Pallardy <loic.pallardy@st.com>
Date: Mon, 21 Jan 2019 14:55:15 +0100
Subject: remoteproc: fix recovery procedure

Commit 7e83cab824a87e83cab824a8 ("remoteproc: Modify recovery path
to use rproc_{start,stop}()") replaces rproc_{shutdown,boot}() with
rproc_{stop,start}(), which skips destroy the virtio device at stop
but re-initializes it again at start.

Issue is that struct virtio_dev is not correctly reinitialized like done
at initial allocation thanks to kzalloc() and kobject is considered as
already initialized by kernel. That is due to the fact struct virtio_dev
is allocated and released at vdev resource handling level managed and
virtio device is registered and unregistered at rproc subdevices level.

Moreover kernel documentation mentions that device struct must be
zero initialized before calling device_initialize().

This patch disentangles struct virtio_dev from struct rproc_vdev as
the two struct don't have the same life-cycle.

struct virtio_dev is now allocated on rproc_start() and released
on rproc_stop().

This patch applies on top of patch
remoteproc: create vdev subdevice with specific dma memory pool [1]

[1]: https://patchwork.kernel.org/patch/10755781/

Fixes: 7e83cab824a8 ("remoteproc: Modify recovery path to use rproc_{start,stop}()")

Reported-by: Xiang Xiao <xiaoxiang781216@gmail.com>
Signed-off-by: Loic Pallardy <loic.pallardy@st.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c     |  5 ++++-
 drivers/remoteproc/remoteproc_internal.h |  2 +-
 drivers/remoteproc/remoteproc_virtio.c   | 20 ++++++++++++++++----
 include/linux/remoteproc.h               |  3 +--
 4 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 821dbedef18e..454a601d63c9 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -421,8 +421,11 @@ static int rproc_vdev_do_start(struct rproc_subdev *subdev)
 static void rproc_vdev_do_stop(struct rproc_subdev *subdev, bool crashed)
 {
 	struct rproc_vdev *rvdev = container_of(subdev, struct rproc_vdev, subdev);
+	int ret;
 
-	rproc_remove_virtio_dev(rvdev);
+	ret = device_for_each_child(&rvdev->dev, NULL, rproc_remove_virtio_dev);
+	if (ret)
+		dev_warn(&rvdev->dev, "can't remove vdev child device: %d\n", ret);
 }
 
 /**
diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h
index bfeacfd40947..2698775c5005 100644
--- a/drivers/remoteproc/remoteproc_internal.h
+++ b/drivers/remoteproc/remoteproc_internal.h
@@ -32,7 +32,7 @@ void rproc_vdev_release(struct kref *ref);
 
 /* from remoteproc_virtio.c */
 int rproc_add_virtio_dev(struct rproc_vdev *rvdev, int id);
-void rproc_remove_virtio_dev(struct rproc_vdev *rvdev);
+int rproc_remove_virtio_dev(struct device *dev, void *data);
 
 /* from remoteproc_debugfs.c */
 void rproc_remove_trace_file(struct dentry *tfile);
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index d08b2cfd875b..b7a987d1b962 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -313,6 +313,8 @@ static void rproc_virtio_dev_release(struct device *dev)
 	struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
 	struct rproc *rproc = vdev_to_rproc(vdev);
 
+	kfree(vdev);
+
 	kref_put(&rvdev->refcount, rproc_vdev_release);
 
 	put_device(&rproc->dev);
@@ -331,7 +333,7 @@ int rproc_add_virtio_dev(struct rproc_vdev *rvdev, int id)
 {
 	struct rproc *rproc = rvdev->rproc;
 	struct device *dev = &rvdev->dev;
-	struct virtio_device *vdev = &rvdev->vdev;
+	struct virtio_device *vdev;
 	struct rproc_mem_entry *mem;
 	int ret;
 
@@ -372,6 +374,12 @@ int rproc_add_virtio_dev(struct rproc_vdev *rvdev, int id)
 		}
 	}
 
+	/* Allocate virtio device */
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	vdev->id.device	= id,
 	vdev->config = &rproc_virtio_config_ops,
 	vdev->dev.parent = dev;
@@ -405,11 +413,15 @@ out:
 
 /**
  * rproc_remove_virtio_dev() - remove an rproc-induced virtio device
- * @rvdev: the remote vdev
+ * @dev: the virtio device
+ * @data: must be null
  *
  * This function unregisters an existing virtio device.
  */
-void rproc_remove_virtio_dev(struct rproc_vdev *rvdev)
+int rproc_remove_virtio_dev(struct device *dev, void *data)
 {
-	unregister_virtio_device(&rvdev->vdev);
+	struct virtio_device *vdev = dev_to_virtio(dev);
+
+	unregister_virtio_device(vdev);
+	return 0;
 }
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 82cb77ad37c7..04d04709f2bd 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -559,7 +559,6 @@ struct rproc_vdev {
 	unsigned int id;
 	struct list_head node;
 	struct rproc *rproc;
-	struct virtio_device vdev;
 	struct rproc_vring vring[RVDEV_NUM_VRINGS];
 	u32 rsc_offset;
 	u32 index;
@@ -602,7 +601,7 @@ int rproc_coredump_add_custom_segment(struct rproc *rproc,
 
 static inline struct rproc_vdev *vdev_to_rvdev(struct virtio_device *vdev)
 {
-	return container_of(vdev, struct rproc_vdev, vdev);
+	return container_of(vdev->dev.parent, struct rproc_vdev, dev);
 }
 
 static inline struct rproc *vdev_to_rproc(struct virtio_device *vdev)
-- 
cgit v1.2.3


From 7945f929f1a77a1c8887a97ca07f87626858ff42 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Wed, 20 Feb 2019 11:12:39 +0000
Subject: drivers: provide devm_platform_ioremap_resource()

There are currently 1200+ instances of using platform_get_resource()
and devm_ioremap_resource() together in the kernel tree.

This patch wraps these two calls in a single helper. Thanks to that
we don't have to declare a local variable for struct resource * and can
omit the redundant argument for resource type. We also have one
function call less.

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/base/platform.c         | 18 ++++++++++++++++++
 include/linux/platform_device.h |  3 +++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 1c958eb33ef4..f82691e1c26c 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -79,6 +79,24 @@ struct resource *platform_get_resource(struct platform_device *dev,
 }
 EXPORT_SYMBOL_GPL(platform_get_resource);
 
+/**
+ * devm_platform_ioremap_resource - call devm_ioremap_resource() for a platform
+ *				    device
+ *
+ * @pdev: platform device to use both for memory resource lookup as well as
+ *        resource managemend
+ * @index: resource index
+ */
+void __iomem *devm_platform_ioremap_resource(struct platform_device *pdev,
+					     unsigned int index)
+{
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, index);
+	return devm_ioremap_resource(&pdev->dev, res);
+}
+EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource);
+
 /**
  * platform_get_irq - get an IRQ for a device
  * @dev: platform device
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index c7c081dc6034..b126b73ed8ef 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -52,6 +52,9 @@ extern struct device platform_bus;
 extern void arch_setup_pdev_archdata(struct platform_device *);
 extern struct resource *platform_get_resource(struct platform_device *,
 					      unsigned int, unsigned int);
+extern void __iomem *
+devm_platform_ioremap_resource(struct platform_device *pdev,
+			       unsigned int index);
 extern int platform_get_irq(struct platform_device *, unsigned int);
 extern int platform_irq_count(struct platform_device *);
 extern struct resource *platform_get_resource_byname(struct platform_device *,
-- 
cgit v1.2.3


From 7b3d4f44abf0e7a1ba762c8a9c99a8b39ee0c8b1 Mon Sep 17 00:00:00 2001
From: Nick Crews <ncrews@chromium.org>
Date: Fri, 8 Feb 2019 17:37:17 -0700
Subject: platform/chrome: Add new driver for Wilco EC

This EC is an incompatible variant of the typical Chrome OS embedded
controller.  It uses the same low-level communication and a similar
protocol with some significant differences.  The EC firmware does
not support the same mailbox commands so it is not registered as a
cros_ec device type.  This commit exports the wilco_ec_mailbox()
function so that other modules can use it to communicate with the EC.

Signed-off-by: Duncan Laurie <dlaurie@google.com>
Signed-off-by: Nick Crews <ncrews@chromium.org>
[Fix the sparse warning: symbol 'wilco_ec_transfer' was not declared]
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
[Fix Kconfig dependencies for wilco_ec]
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 drivers/platform/chrome/Kconfig            |   2 +
 drivers/platform/chrome/Makefile           |   2 +
 drivers/platform/chrome/wilco_ec/Kconfig   |  10 ++
 drivers/platform/chrome/wilco_ec/Makefile  |   4 +
 drivers/platform/chrome/wilco_ec/core.c    | 104 +++++++++++++
 drivers/platform/chrome/wilco_ec/mailbox.c | 237 +++++++++++++++++++++++++++++
 include/linux/platform_data/wilco-ec.h     | 140 +++++++++++++++++
 7 files changed, 499 insertions(+)
 create mode 100644 drivers/platform/chrome/wilco_ec/Kconfig
 create mode 100644 drivers/platform/chrome/wilco_ec/Makefile
 create mode 100644 drivers/platform/chrome/wilco_ec/core.c
 create mode 100644 drivers/platform/chrome/wilco_ec/mailbox.c
 create mode 100644 include/linux/platform_data/wilco-ec.h

(limited to 'include')

diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig
index 5e2fde5ff63d..9186d81a51cc 100644
--- a/drivers/platform/chrome/Kconfig
+++ b/drivers/platform/chrome/Kconfig
@@ -152,4 +152,6 @@ config CROS_EC_SYSFS
 	  To compile this driver as a module, choose M here: the
 	  module will be called cros_ec_sysfs.
 
+source "drivers/platform/chrome/wilco_ec/Kconfig"
+
 endif # CHROMEOS_PLATFORMS
diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index fdbee501931b..1e2f0029b597 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -14,3 +14,5 @@ obj-$(CONFIG_CROS_EC_LIGHTBAR)		+= cros_ec_lightbar.o
 obj-$(CONFIG_CROS_EC_VBC)		+= cros_ec_vbc.o
 obj-$(CONFIG_CROS_EC_DEBUGFS)		+= cros_ec_debugfs.o
 obj-$(CONFIG_CROS_EC_SYSFS)		+= cros_ec_sysfs.o
+
+obj-$(CONFIG_WILCO_EC)			+= wilco_ec/
diff --git a/drivers/platform/chrome/wilco_ec/Kconfig b/drivers/platform/chrome/wilco_ec/Kconfig
new file mode 100644
index 000000000000..c6bc4e8f3062
--- /dev/null
+++ b/drivers/platform/chrome/wilco_ec/Kconfig
@@ -0,0 +1,10 @@
+config WILCO_EC
+	tristate "ChromeOS Wilco Embedded Controller"
+	depends on ACPI && X86 && CROS_EC_LPC_MEC
+	help
+	  If you say Y here, you get support for talking to the ChromeOS
+	  Wilco EC over an eSPI bus. This uses a simple byte-level protocol
+	  with a checksum.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called wilco_ec.
diff --git a/drivers/platform/chrome/wilco_ec/Makefile b/drivers/platform/chrome/wilco_ec/Makefile
new file mode 100644
index 000000000000..03b32301dc61
--- /dev/null
+++ b/drivers/platform/chrome/wilco_ec/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+wilco_ec-objs				:= core.o mailbox.o
+obj-$(CONFIG_WILCO_EC)			+= wilco_ec.o
diff --git a/drivers/platform/chrome/wilco_ec/core.c b/drivers/platform/chrome/wilco_ec/core.c
new file mode 100644
index 000000000000..20ecc580d108
--- /dev/null
+++ b/drivers/platform/chrome/wilco_ec/core.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Core driver for Wilco Embedded Controller
+ *
+ * Copyright 2018 Google LLC
+ *
+ * This is the entry point for the drivers that control the Wilco EC.
+ * This driver is responsible for several tasks:
+ * - Initialize the register interface that is used by wilco_ec_mailbox()
+ * - Create a platform device which is picked up by the debugfs driver
+ * - Create a platform device which is picked up by the RTC driver
+ */
+
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/platform_data/wilco-ec.h>
+#include <linux/platform_device.h>
+
+#include "../cros_ec_lpc_mec.h"
+
+#define DRV_NAME "wilco-ec"
+
+static struct resource *wilco_get_resource(struct platform_device *pdev,
+					   int index)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, index);
+	if (!res) {
+		dev_dbg(dev, "Couldn't find IO resource %d\n", index);
+		return res;
+	}
+
+	return devm_request_region(dev, res->start, resource_size(res),
+				   dev_name(dev));
+}
+
+static int wilco_ec_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct wilco_ec_device *ec;
+
+	ec = devm_kzalloc(dev, sizeof(*ec), GFP_KERNEL);
+	if (!ec)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, ec);
+	ec->dev = dev;
+	mutex_init(&ec->mailbox_lock);
+
+	/* Largest data buffer size requirement is extended data response */
+	ec->data_size = sizeof(struct wilco_ec_response) +
+		EC_MAILBOX_DATA_SIZE_EXTENDED;
+	ec->data_buffer = devm_kzalloc(dev, ec->data_size, GFP_KERNEL);
+	if (!ec->data_buffer)
+		return -ENOMEM;
+
+	/* Prepare access to IO regions provided by ACPI */
+	ec->io_data = wilco_get_resource(pdev, 0);	/* Host Data */
+	ec->io_command = wilco_get_resource(pdev, 1);	/* Host Command */
+	ec->io_packet = wilco_get_resource(pdev, 2);	/* MEC EMI */
+	if (!ec->io_data || !ec->io_command || !ec->io_packet)
+		return -ENODEV;
+
+	/* Initialize cros_ec register interface for communication */
+	cros_ec_lpc_mec_init(ec->io_packet->start,
+			     ec->io_packet->start + EC_MAILBOX_DATA_SIZE);
+
+	return 0;
+}
+
+static int wilco_ec_remove(struct platform_device *pdev)
+{
+	/* Teardown cros_ec interface */
+	cros_ec_lpc_mec_destroy();
+
+	return 0;
+}
+
+static const struct acpi_device_id wilco_ec_acpi_device_ids[] = {
+	{ "GOOG000C", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(acpi, wilco_ec_acpi_device_ids);
+
+static struct platform_driver wilco_ec_driver = {
+	.driver = {
+		.name = DRV_NAME,
+		.acpi_match_table = wilco_ec_acpi_device_ids,
+	},
+	.probe = wilco_ec_probe,
+	.remove = wilco_ec_remove,
+};
+
+module_platform_driver(wilco_ec_driver);
+
+MODULE_AUTHOR("Nick Crews <ncrews@chromium.org>");
+MODULE_AUTHOR("Duncan Laurie <dlaurie@chromium.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("ChromeOS Wilco Embedded Controller driver");
+MODULE_ALIAS("platform:" DRV_NAME);
diff --git a/drivers/platform/chrome/wilco_ec/mailbox.c b/drivers/platform/chrome/wilco_ec/mailbox.c
new file mode 100644
index 000000000000..f6ff29a11f1a
--- /dev/null
+++ b/drivers/platform/chrome/wilco_ec/mailbox.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Mailbox interface for Wilco Embedded Controller
+ *
+ * Copyright 2018 Google LLC
+ *
+ * The Wilco EC is similar to a typical ChromeOS embedded controller.
+ * It uses the same MEC based low-level communication and a similar
+ * protocol, but with some important differences.  The EC firmware does
+ * not support the same mailbox commands so it is not registered as a
+ * cros_ec device type.
+ *
+ * Most messages follow a standard format, but there are some exceptions
+ * and an interface is provided to do direct/raw transactions that do not
+ * make assumptions about byte placement.
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/platform_data/wilco-ec.h>
+#include <linux/platform_device.h>
+
+#include "../cros_ec_lpc_mec.h"
+
+/* Version of mailbox interface */
+#define EC_MAILBOX_VERSION		0
+
+/* Command to start mailbox transaction */
+#define EC_MAILBOX_START_COMMAND	0xda
+
+/* Version of EC protocol */
+#define EC_MAILBOX_PROTO_VERSION	3
+
+/* Number of header bytes to be counted as data bytes */
+#define EC_MAILBOX_DATA_EXTRA		2
+
+/* Maximum timeout */
+#define EC_MAILBOX_TIMEOUT		HZ
+
+/* EC response flags */
+#define EC_CMDR_DATA		BIT(0)	/* Data ready for host to read */
+#define EC_CMDR_PENDING		BIT(1)	/* Write pending to EC */
+#define EC_CMDR_BUSY		BIT(2)	/* EC is busy processing a command */
+#define EC_CMDR_CMD		BIT(3)	/* Last host write was a command */
+
+/**
+ * wilco_ec_response_timed_out() - Wait for EC response.
+ * @ec: EC device.
+ *
+ * Return: true if EC timed out, false if EC did not time out.
+ */
+static bool wilco_ec_response_timed_out(struct wilco_ec_device *ec)
+{
+	unsigned long timeout = jiffies + EC_MAILBOX_TIMEOUT;
+
+	do {
+		if (!(inb(ec->io_command->start) &
+		      (EC_CMDR_PENDING | EC_CMDR_BUSY)))
+			return false;
+		usleep_range(100, 200);
+	} while (time_before(jiffies, timeout));
+
+	return true;
+}
+
+/**
+ * wilco_ec_checksum() - Compute 8-bit checksum over data range.
+ * @data: Data to checksum.
+ * @size: Number of bytes to checksum.
+ *
+ * Return: 8-bit checksum of provided data.
+ */
+static u8 wilco_ec_checksum(const void *data, size_t size)
+{
+	u8 *data_bytes = (u8 *)data;
+	u8 checksum = 0;
+	size_t i;
+
+	for (i = 0; i < size; i++)
+		checksum += data_bytes[i];
+
+	return checksum;
+}
+
+/**
+ * wilco_ec_prepare() - Prepare the request structure for the EC.
+ * @msg: EC message with request information.
+ * @rq: EC request structure to fill.
+ */
+static void wilco_ec_prepare(struct wilco_ec_message *msg,
+			     struct wilco_ec_request *rq)
+{
+	memset(rq, 0, sizeof(*rq));
+
+	/* Handle messages without trimming bytes from the request */
+	if (msg->request_size && msg->flags & WILCO_EC_FLAG_RAW_REQUEST) {
+		rq->reserved_raw = *(u8 *)msg->request_data;
+		msg->request_size--;
+		memmove(msg->request_data, msg->request_data + 1,
+			msg->request_size);
+	}
+
+	/* Fill in request packet */
+	rq->struct_version = EC_MAILBOX_PROTO_VERSION;
+	rq->mailbox_id = msg->type;
+	rq->mailbox_version = EC_MAILBOX_VERSION;
+	rq->data_size = msg->request_size + EC_MAILBOX_DATA_EXTRA;
+	rq->command = msg->command;
+
+	/* Checksum header and data */
+	rq->checksum = wilco_ec_checksum(rq, sizeof(*rq));
+	rq->checksum += wilco_ec_checksum(msg->request_data, msg->request_size);
+	rq->checksum = -rq->checksum;
+}
+
+/**
+ * wilco_ec_transfer() - Perform actual data transfer.
+ * @ec: EC device.
+ * @msg: EC message data for request and response.
+ * @rq: Filled in request structure
+ *
+ * Context: ec->mailbox_lock should be held while using this function.
+ * Return: number of bytes received or negative error code on failure.
+ */
+static int wilco_ec_transfer(struct wilco_ec_device *ec,
+			     struct wilco_ec_message *msg,
+			     struct wilco_ec_request *rq)
+{
+	struct wilco_ec_response *rs;
+	u8 checksum;
+	u8 flag;
+	size_t size;
+
+	/* Write request header, then data */
+	cros_ec_lpc_io_bytes_mec(MEC_IO_WRITE, 0, sizeof(*rq), (u8 *)rq);
+	cros_ec_lpc_io_bytes_mec(MEC_IO_WRITE, sizeof(*rq), msg->request_size,
+				 msg->request_data);
+
+	/* Start the command */
+	outb(EC_MAILBOX_START_COMMAND, ec->io_command->start);
+
+	/* For some commands (eg shutdown) the EC will not respond, that's OK */
+	if (msg->flags & WILCO_EC_FLAG_NO_RESPONSE) {
+		dev_dbg(ec->dev, "EC does not respond to this command\n");
+		return 0;
+	}
+
+	/* Wait for it to complete */
+	if (wilco_ec_response_timed_out(ec)) {
+		dev_dbg(ec->dev, "response timed out\n");
+		return -ETIMEDOUT;
+	}
+
+	/* Check result */
+	flag = inb(ec->io_data->start);
+	if (flag) {
+		dev_dbg(ec->dev, "bad response: 0x%02x\n", flag);
+		return -EIO;
+	}
+
+	if (msg->flags & WILCO_EC_FLAG_EXTENDED_DATA)
+		size = EC_MAILBOX_DATA_SIZE_EXTENDED;
+	else
+		size = EC_MAILBOX_DATA_SIZE;
+
+	/* Read back response */
+	rs = ec->data_buffer;
+	checksum = cros_ec_lpc_io_bytes_mec(MEC_IO_READ, 0,
+					    sizeof(*rs) + size, (u8 *)rs);
+	if (checksum) {
+		dev_dbg(ec->dev, "bad packet checksum 0x%02x\n", rs->checksum);
+		return -EBADMSG;
+	}
+
+	/* Check that the EC reported success */
+	msg->result = rs->result;
+	if (msg->result) {
+		dev_dbg(ec->dev, "bad response: 0x%02x\n", msg->result);
+		return -EBADMSG;
+	}
+
+	/* Check the returned data size, skipping the header */
+	if (rs->data_size != size) {
+		dev_dbg(ec->dev, "unexpected packet size (%u != %zu)",
+			rs->data_size, size);
+		return -EMSGSIZE;
+	}
+
+	/* Skip 1 response data byte unless specified */
+	size = (msg->flags & WILCO_EC_FLAG_RAW_RESPONSE) ? 0 : 1;
+	if ((ssize_t) rs->data_size - size < msg->response_size) {
+		dev_dbg(ec->dev, "response data too short (%zd < %zu)",
+			(ssize_t) rs->data_size - size, msg->response_size);
+		return -EMSGSIZE;
+	}
+
+	/* Ignore response data bytes as requested */
+	memcpy(msg->response_data, rs->data + size, msg->response_size);
+
+	/* Return actual amount of data received */
+	return msg->response_size;
+}
+
+/**
+ * wilco_ec_mailbox() - Send EC request and receive EC response.
+ * @ec: EC device.
+ * @msg: EC message data for request and response.
+ *
+ * On entry msg->type, msg->flags, msg->command, msg->request_size,
+ * msg->response_size, and msg->request_data should all be filled in.
+ *
+ * On exit msg->result and msg->response_data will be filled.
+ *
+ * Return: number of bytes received or negative error code on failure.
+ */
+int wilco_ec_mailbox(struct wilco_ec_device *ec, struct wilco_ec_message *msg)
+{
+	struct wilco_ec_request *rq;
+	int ret;
+
+	dev_dbg(ec->dev, "cmd=%02x type=%04x flags=%02x rslen=%zu rqlen=%zu\n",
+		msg->command, msg->type, msg->flags, msg->response_size,
+		msg->request_size);
+
+	/* Prepare request packet */
+	rq = ec->data_buffer;
+	wilco_ec_prepare(msg, rq);
+
+	mutex_lock(&ec->mailbox_lock);
+	ret = wilco_ec_transfer(ec, msg, rq);
+	mutex_unlock(&ec->mailbox_lock);
+
+	return ret;
+
+}
+EXPORT_SYMBOL_GPL(wilco_ec_mailbox);
diff --git a/include/linux/platform_data/wilco-ec.h b/include/linux/platform_data/wilco-ec.h
new file mode 100644
index 000000000000..0feb4b520a54
--- /dev/null
+++ b/include/linux/platform_data/wilco-ec.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChromeOS Wilco Embedded Controller
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#ifndef WILCO_EC_H
+#define WILCO_EC_H
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+
+/* Message flags for using the mailbox() interface */
+#define WILCO_EC_FLAG_NO_RESPONSE	BIT(0) /* EC does not respond */
+#define WILCO_EC_FLAG_EXTENDED_DATA	BIT(1) /* EC returns 256 data bytes */
+#define WILCO_EC_FLAG_RAW_REQUEST	BIT(2) /* Do not trim request data */
+#define WILCO_EC_FLAG_RAW_RESPONSE	BIT(3) /* Do not trim response data */
+#define WILCO_EC_FLAG_RAW		(WILCO_EC_FLAG_RAW_REQUEST | \
+					 WILCO_EC_FLAG_RAW_RESPONSE)
+
+/* Normal commands have a maximum 32 bytes of data */
+#define EC_MAILBOX_DATA_SIZE		32
+/* Extended commands have 256 bytes of response data */
+#define EC_MAILBOX_DATA_SIZE_EXTENDED	256
+
+/**
+ * struct wilco_ec_device - Wilco Embedded Controller handle.
+ * @dev: Device handle.
+ * @mailbox_lock: Mutex to ensure one mailbox command at a time.
+ * @io_command: I/O port for mailbox command.  Provided by ACPI.
+ * @io_data: I/O port for mailbox data.  Provided by ACPI.
+ * @io_packet: I/O port for mailbox packet data.  Provided by ACPI.
+ * @data_buffer: Buffer used for EC communication.  The same buffer
+ *               is used to hold the request and the response.
+ * @data_size: Size of the data buffer used for EC communication.
+ */
+struct wilco_ec_device {
+	struct device *dev;
+	struct mutex mailbox_lock;
+	struct resource *io_command;
+	struct resource *io_data;
+	struct resource *io_packet;
+	void *data_buffer;
+	size_t data_size;
+};
+
+/**
+ * struct wilco_ec_request - Mailbox request message format.
+ * @struct_version: Should be %EC_MAILBOX_PROTO_VERSION
+ * @checksum: Sum of all bytes must be 0.
+ * @mailbox_id: Mailbox identifier, specifies the command set.
+ * @mailbox_version: Mailbox interface version %EC_MAILBOX_VERSION
+ * @reserved: Set to zero.
+ * @data_size: Length of request, data + last 2 bytes of the header.
+ * @command: Mailbox command code, unique for each mailbox_id set.
+ * @reserved_raw: Set to zero for most commands, but is used by
+ *                some command types and for raw commands.
+ */
+struct wilco_ec_request {
+	u8 struct_version;
+	u8 checksum;
+	u16 mailbox_id;
+	u8 mailbox_version;
+	u8 reserved;
+	u16 data_size;
+	u8 command;
+	u8 reserved_raw;
+} __packed;
+
+/**
+ * struct wilco_ec_response - Mailbox response message format.
+ * @struct_version: Should be %EC_MAILBOX_PROTO_VERSION
+ * @checksum: Sum of all bytes must be 0.
+ * @result: Result code from the EC.  Non-zero indicates an error.
+ * @data_size: Length of the response data buffer.
+ * @reserved: Set to zero.
+ * @mbox0: EC returned data at offset 0 is unused (always 0) so this byte
+ *         is treated as part of the header instead of the data.
+ * @data: Response data buffer.  Max size is %EC_MAILBOX_DATA_SIZE_EXTENDED.
+ */
+struct wilco_ec_response {
+	u8 struct_version;
+	u8 checksum;
+	u16 result;
+	u16 data_size;
+	u8 reserved[2];
+	u8 mbox0;
+	u8 data[0];
+} __packed;
+
+/**
+ * enum wilco_ec_msg_type - Message type to select a set of command codes.
+ * @WILCO_EC_MSG_LEGACY: Legacy EC messages for standard EC behavior.
+ * @WILCO_EC_MSG_PROPERTY: Get/Set/Sync EC controlled NVRAM property.
+ * @WILCO_EC_MSG_TELEMETRY_SHORT: 32 bytes of telemetry data provided by the EC.
+ * @WILCO_EC_MSG_TELEMETRY_LONG: 256 bytes of telemetry data provided by the EC.
+ */
+enum wilco_ec_msg_type {
+	WILCO_EC_MSG_LEGACY = 0x00f0,
+	WILCO_EC_MSG_PROPERTY = 0x00f2,
+	WILCO_EC_MSG_TELEMETRY_SHORT = 0x00f5,
+	WILCO_EC_MSG_TELEMETRY_LONG = 0x00f6,
+};
+
+/**
+ * struct wilco_ec_message - Request and response message.
+ * @type: Mailbox message type.
+ * @flags: Message flags, e.g. %WILCO_EC_FLAG_NO_RESPONSE.
+ * @command: Mailbox command code.
+ * @result: Result code from the EC.  Non-zero indicates an error.
+ * @request_size: Number of bytes to send to the EC.
+ * @request_data: Buffer containing the request data.
+ * @response_size: Number of bytes expected from the EC.
+ *                 This is 32 by default and 256 if the flag
+ *                 is set for %WILCO_EC_FLAG_EXTENDED_DATA
+ * @response_data: Buffer containing the response data, should be
+ *                 response_size bytes and allocated by caller.
+ */
+struct wilco_ec_message {
+	enum wilco_ec_msg_type type;
+	u8 flags;
+	u8 command;
+	u8 result;
+	size_t request_size;
+	void *request_data;
+	size_t response_size;
+	void *response_data;
+};
+
+/**
+ * wilco_ec_mailbox() - Send request to the EC and receive the response.
+ * @ec: Wilco EC device.
+ * @msg: Wilco EC message.
+ *
+ * Return: Number of bytes received or negative error code on failure.
+ */
+int wilco_ec_mailbox(struct wilco_ec_device *ec, struct wilco_ec_message *msg);
+
+#endif /* WILCO_EC_H */
-- 
cgit v1.2.3


From b787bb126cbcd73754bcbc055ae9f804ac576e4a Mon Sep 17 00:00:00 2001
From: Nick Crews <ncrews@chromium.org>
Date: Fri, 8 Feb 2019 17:37:18 -0700
Subject: platform/chrome: wilco_ec: Add support for raw commands in debugfs

Add a debugfs attribute that allows sending raw commands to the EC.
This is useful for development and debug but should not be enabled
in a production environment.

To test:
Get the EC firmware build date
First send the request command
> echo 00 f0 38 00 03 00 > raw
Then read the result. "12/21/18" is in the middle of the response
> cat raw
00 31 32 2f 32 31 2f 31 38 00 00 0f 01 00 01 00  .12/21/18.......

Get the EC firmware build date
First send the request command
> echo 00 f0 38 00 03 00 > raw
Then read the result. "12/21/18" is in the middle of the response
> cat raw
00 31 32 2f 32 31 2f 31 38 00 00 0f 01 00 01 00  .12/21/18.......

Signed-off-by: Duncan Laurie <dlaurie@google.com>
Signed-off-by: Nick Crews <ncrews@chromium.org>
[Fix off-by-one error in wilco_ec/debugfs.c]
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 Documentation/ABI/testing/debugfs-wilco-ec |  23 +++
 drivers/platform/chrome/wilco_ec/Kconfig   |  10 ++
 drivers/platform/chrome/wilco_ec/Makefile  |   2 +
 drivers/platform/chrome/wilco_ec/core.c    |  14 ++
 drivers/platform/chrome/wilco_ec/debugfs.c | 238 +++++++++++++++++++++++++++++
 include/linux/platform_data/wilco-ec.h     |   2 +
 6 files changed, 289 insertions(+)
 create mode 100644 Documentation/ABI/testing/debugfs-wilco-ec
 create mode 100644 drivers/platform/chrome/wilco_ec/debugfs.c

(limited to 'include')

diff --git a/Documentation/ABI/testing/debugfs-wilco-ec b/Documentation/ABI/testing/debugfs-wilco-ec
new file mode 100644
index 000000000000..f814f112e213
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-wilco-ec
@@ -0,0 +1,23 @@
+What:		/sys/kernel/debug/wilco_ec/raw
+Date:		January 2019
+KernelVersion:	5.1
+Description:
+		Write and read raw mailbox commands to the EC.
+
+		For writing:
+		Bytes 0-1 indicate the message type:
+			00 F0 = Execute Legacy Command
+			00 F2 = Read/Write NVRAM Property
+		Byte 2 provides the command code
+		Bytes 3+ consist of the data passed in the request
+
+		At least three bytes are required, for the msg type and command,
+		with additional bytes optional for additional data.
+
+		Example:
+		// Request EC info type 3 (EC firmware build date)
+		$ echo 00 f0 38 00 03 00 > raw
+		// View the result. The decoded ASCII result "12/21/18" is
+		// included after the raw hex.
+		$ cat raw
+		00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00  .12/21/18.8...
diff --git a/drivers/platform/chrome/wilco_ec/Kconfig b/drivers/platform/chrome/wilco_ec/Kconfig
index c6bc4e8f3062..4a119ced4d0c 100644
--- a/drivers/platform/chrome/wilco_ec/Kconfig
+++ b/drivers/platform/chrome/wilco_ec/Kconfig
@@ -8,3 +8,13 @@ config WILCO_EC
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called wilco_ec.
+
+config WILCO_EC_DEBUGFS
+	tristate "Enable raw access to EC via debugfs"
+	depends on WILCO_EC
+	help
+	  If you say Y here, you get support for sending raw commands to
+	  the Wilco EC via debugfs.  These commands do not do any byte
+	  manipulation and allow for testing arbitrary commands.  This
+	  interface is intended for debug only and will not be present
+	  on production devices.
diff --git a/drivers/platform/chrome/wilco_ec/Makefile b/drivers/platform/chrome/wilco_ec/Makefile
index 03b32301dc61..063e7fb4ea17 100644
--- a/drivers/platform/chrome/wilco_ec/Makefile
+++ b/drivers/platform/chrome/wilco_ec/Makefile
@@ -2,3 +2,5 @@
 
 wilco_ec-objs				:= core.o mailbox.o
 obj-$(CONFIG_WILCO_EC)			+= wilco_ec.o
+wilco_ec_debugfs-objs			:= debugfs.o
+obj-$(CONFIG_WILCO_EC_DEBUGFS)		+= wilco_ec_debugfs.o
diff --git a/drivers/platform/chrome/wilco_ec/core.c b/drivers/platform/chrome/wilco_ec/core.c
index 20ecc580d108..af5fd288b63b 100644
--- a/drivers/platform/chrome/wilco_ec/core.c
+++ b/drivers/platform/chrome/wilco_ec/core.c
@@ -69,11 +69,25 @@ static int wilco_ec_probe(struct platform_device *pdev)
 	cros_ec_lpc_mec_init(ec->io_packet->start,
 			     ec->io_packet->start + EC_MAILBOX_DATA_SIZE);
 
+	/*
+	 * Register a child device that will be found by the debugfs driver.
+	 * Ignore failure.
+	 */
+	ec->debugfs_pdev = platform_device_register_data(dev,
+							 "wilco-ec-debugfs",
+							 PLATFORM_DEVID_AUTO,
+							 NULL, 0);
+
 	return 0;
 }
 
 static int wilco_ec_remove(struct platform_device *pdev)
 {
+	struct wilco_ec_device *ec = platform_get_drvdata(pdev);
+
+	if (ec->debugfs_pdev)
+		platform_device_unregister(ec->debugfs_pdev);
+
 	/* Teardown cros_ec interface */
 	cros_ec_lpc_mec_destroy();
 
diff --git a/drivers/platform/chrome/wilco_ec/debugfs.c b/drivers/platform/chrome/wilco_ec/debugfs.c
new file mode 100644
index 000000000000..c090db2cd5be
--- /dev/null
+++ b/drivers/platform/chrome/wilco_ec/debugfs.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * debugfs attributes for Wilco EC
+ *
+ * Copyright 2019 Google LLC
+ *
+ * There is only one attribute used for debugging, called raw.
+ * You can write a hexadecimal sentence to raw, and that series of bytes
+ * will be sent to the EC. Then, you can read the bytes of response
+ * by reading from raw.
+ *
+ * For writing:
+ * Bytes 0-1 indicate the message type:
+ *         00 F0 = Execute Legacy Command
+ *         00 F2 = Read/Write NVRAM Property
+ * Byte 2 provides the command code
+ * Bytes 3+ consist of the data passed in the request
+ *
+ * When referencing the EC interface spec, byte 2 corresponds to MBOX[0],
+ * byte 3 corresponds to MBOX[1], etc.
+ *
+ * At least three bytes are required, for the msg type and command,
+ * with additional bytes optional for additional data.
+ *
+ * Example:
+ * // Request EC info type 3 (EC firmware build date)
+ * $ echo 00 f0 38 00 03 00 > raw
+ * // View the result. The decoded ASCII result "12/21/18" is
+ * // included after the raw hex.
+ * $ cat raw
+ * 00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00  .12/21/18.8...
+ */
+
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/platform_data/wilco-ec.h>
+#include <linux/platform_device.h>
+
+#define DRV_NAME "wilco-ec-debugfs"
+
+/* The 256 raw bytes will take up more space when represented as a hex string */
+#define FORMATTED_BUFFER_SIZE (EC_MAILBOX_DATA_SIZE_EXTENDED * 4)
+
+struct wilco_ec_debugfs {
+	struct wilco_ec_device *ec;
+	struct dentry *dir;
+	size_t response_size;
+	u8 raw_data[EC_MAILBOX_DATA_SIZE_EXTENDED];
+	u8 formatted_data[FORMATTED_BUFFER_SIZE];
+};
+static struct wilco_ec_debugfs *debug_info;
+
+/**
+ * parse_hex_sentence() - Convert a ascii hex representation into byte array.
+ * @in: Input buffer of ascii.
+ * @isize: Length of input buffer.
+ * @out: Output buffer.
+ * @osize: Length of output buffer, e.g. max number of bytes to parse.
+ *
+ * An valid input is a series of ascii hexadecimal numbers, separated by spaces.
+ * An example valid input is
+ * "   00 f2 0    000076 6 0  ff"
+ *
+ * If an individual "word" within the hex sentence is longer than MAX_WORD_SIZE,
+ * then the sentence is illegal, and parsing will fail.
+ *
+ * Return: Number of bytes parsed, or negative error code on failure.
+ */
+static int parse_hex_sentence(const char *in, int isize, u8 *out, int osize)
+{
+	int n_parsed = 0;
+	int word_start = 0;
+	int word_end;
+	int word_len;
+	/* Temp buffer for holding a "word" of chars that represents one byte */
+	#define MAX_WORD_SIZE 16
+	char tmp[MAX_WORD_SIZE + 1];
+	u8 byte;
+
+	while (word_start < isize && n_parsed < osize) {
+		/* Find the start of the next word */
+		while (word_start < isize && isspace(in[word_start]))
+			word_start++;
+		 /* reached the end of the input before next word? */
+		if (word_start >= isize)
+			break;
+
+		/* Find the end of this word */
+		word_end = word_start;
+		while (word_end < isize && !isspace(in[word_end]))
+			word_end++;
+
+		/* Copy to a tmp NULL terminated string */
+		word_len = word_end - word_start;
+		if (word_len > MAX_WORD_SIZE)
+			return -EINVAL;
+		memcpy(tmp, in + word_start, word_len);
+		tmp[word_len] = '\0';
+
+		/*
+		 * Convert from hex string, place in output. If fails to parse,
+		 * just return -EINVAL because specific error code is only
+		 * relevant for this one word, returning it would be confusing.
+		 */
+		if (kstrtou8(tmp, 16, &byte))
+			return -EINVAL;
+		out[n_parsed++] = byte;
+
+		word_start = word_end;
+	}
+	return n_parsed;
+}
+
+/* The message type takes up two bytes*/
+#define TYPE_AND_DATA_SIZE ((EC_MAILBOX_DATA_SIZE) + 2)
+
+static ssize_t raw_write(struct file *file, const char __user *user_buf,
+			 size_t count, loff_t *ppos)
+{
+	char *buf = debug_info->formatted_data;
+	struct wilco_ec_message msg;
+	u8 request_data[TYPE_AND_DATA_SIZE];
+	ssize_t kcount;
+	int ret;
+
+	if (count > FORMATTED_BUFFER_SIZE)
+		return -EINVAL;
+
+	kcount = simple_write_to_buffer(buf, FORMATTED_BUFFER_SIZE, ppos,
+					user_buf, count);
+	if (kcount < 0)
+		return kcount;
+
+	ret = parse_hex_sentence(buf, kcount, request_data, TYPE_AND_DATA_SIZE);
+	if (ret < 0)
+		return ret;
+	/* Need at least two bytes for message type and one for command */
+	if (ret < 3)
+		return -EINVAL;
+
+	/* Clear response data buffer */
+	memset(debug_info->raw_data, '\0', EC_MAILBOX_DATA_SIZE_EXTENDED);
+
+	msg.type = request_data[0] << 8 | request_data[1];
+	msg.flags = WILCO_EC_FLAG_RAW;
+	msg.command = request_data[2];
+	msg.request_data = ret > 3 ? request_data + 3 : 0;
+	msg.request_size = ret - 3;
+	msg.response_data = debug_info->raw_data;
+	msg.response_size = EC_MAILBOX_DATA_SIZE;
+
+	/* Telemetry commands use extended response data */
+	if (msg.type == WILCO_EC_MSG_TELEMETRY_LONG) {
+		msg.flags |= WILCO_EC_FLAG_EXTENDED_DATA;
+		msg.response_size = EC_MAILBOX_DATA_SIZE_EXTENDED;
+	}
+
+	ret = wilco_ec_mailbox(debug_info->ec, &msg);
+	if (ret < 0)
+		return ret;
+	debug_info->response_size = ret;
+
+	return count;
+}
+
+static ssize_t raw_read(struct file *file, char __user *user_buf, size_t count,
+			loff_t *ppos)
+{
+	int fmt_len = 0;
+
+	if (debug_info->response_size) {
+		fmt_len = hex_dump_to_buffer(debug_info->raw_data,
+					     debug_info->response_size,
+					     16, 1, debug_info->formatted_data,
+					     FORMATTED_BUFFER_SIZE, true);
+		/* Only return response the first time it is read */
+		debug_info->response_size = 0;
+	}
+
+	return simple_read_from_buffer(user_buf, count, ppos,
+				       debug_info->formatted_data, fmt_len);
+}
+
+static const struct file_operations fops_raw = {
+	.owner = THIS_MODULE,
+	.read = raw_read,
+	.write = raw_write,
+	.llseek = no_llseek,
+};
+
+/**
+ * wilco_ec_debugfs_probe() - Create the debugfs node
+ * @pdev: The platform device, probably created in core.c
+ *
+ * Try to create a debugfs node. If it fails, then we don't want to change
+ * behavior at all, this is for debugging after all. Just fail silently.
+ *
+ * Return: 0 always.
+ */
+static int wilco_ec_debugfs_probe(struct platform_device *pdev)
+{
+	struct wilco_ec_device *ec = dev_get_drvdata(pdev->dev.parent);
+
+	debug_info = devm_kzalloc(&pdev->dev, sizeof(*debug_info), GFP_KERNEL);
+	if (!debug_info)
+		return 0;
+	debug_info->ec = ec;
+	debug_info->dir = debugfs_create_dir("wilco_ec", NULL);
+	if (!debug_info->dir)
+		return 0;
+	debugfs_create_file("raw", 0644, debug_info->dir, NULL, &fops_raw);
+
+	return 0;
+}
+
+static int wilco_ec_debugfs_remove(struct platform_device *pdev)
+{
+	debugfs_remove_recursive(debug_info->dir);
+
+	return 0;
+}
+
+static struct platform_driver wilco_ec_debugfs_driver = {
+	.driver = {
+		.name = DRV_NAME,
+	},
+	.probe = wilco_ec_debugfs_probe,
+	.remove = wilco_ec_debugfs_remove,
+};
+
+module_platform_driver(wilco_ec_debugfs_driver);
+
+MODULE_ALIAS("platform:" DRV_NAME);
+MODULE_AUTHOR("Nick Crews <ncrews@chromium.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Wilco EC debugfs driver");
diff --git a/include/linux/platform_data/wilco-ec.h b/include/linux/platform_data/wilco-ec.h
index 0feb4b520a54..5344975afa1a 100644
--- a/include/linux/platform_data/wilco-ec.h
+++ b/include/linux/platform_data/wilco-ec.h
@@ -34,6 +34,7 @@
  * @data_buffer: Buffer used for EC communication.  The same buffer
  *               is used to hold the request and the response.
  * @data_size: Size of the data buffer used for EC communication.
+ * @debugfs_pdev: The child platform_device used by the debugfs sub-driver.
  */
 struct wilco_ec_device {
 	struct device *dev;
@@ -43,6 +44,7 @@ struct wilco_ec_device {
 	struct resource *io_packet;
 	void *data_buffer;
 	size_t data_size;
+	struct platform_device *debugfs_pdev;
 };
 
 /**
-- 
cgit v1.2.3


From 0d2f2a3da1f2a9ebeb66bb03073dd149fccf1bdd Mon Sep 17 00:00:00 2001
From: Nick Crews <ncrews@chromium.org>
Date: Fri, 8 Feb 2019 17:37:19 -0700
Subject: platform/chrome: wilco_ec: Add RTC driver

This Embedded Controller has an internal RTC that is exposed
as a standard RTC class driver with read/write functionality.

The driver is added to the drivers/rtc/ so that the maintainer of that
directory will be able to comment on this change, as that maintainer is
the expert on this system. In addition, the driver code is called
indirectly after a corresponding device is registered from core.c,
as opposed to core.c registering the driver callbacks directly.

To test:
> hwclock --show --rtc /dev/rtc1
2007-12-31 16:01:20.460959-08:00
> hwclock --systohc --rtc /dev/rtc1
> hwclock --show --rtc /dev/rtc1
2018-11-29 17:08:00.780793-08:00

> hwclock --show --rtc /dev/rtc1
2007-12-31 16:01:20.460959-08:00
> hwclock --systohc --rtc /dev/rtc1
> hwclock --show --rtc /dev/rtc1
2018-11-29 17:08:00.780793-08:00

Signed-off-by: Duncan Laurie <dlaurie@google.com>
Signed-off-by: Nick Crews <ncrews@chromium.org>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
[Fix the sparse warning: symbol 'wilco_ec_rtc_read/write' was not declared]
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 drivers/platform/chrome/wilco_ec/core.c |  18 ++++
 drivers/rtc/Kconfig                     |  11 ++
 drivers/rtc/Makefile                    |   1 +
 drivers/rtc/rtc-wilco-ec.c              | 177 ++++++++++++++++++++++++++++++++
 include/linux/platform_data/wilco-ec.h  |   2 +
 5 files changed, 209 insertions(+)
 create mode 100644 drivers/rtc/rtc-wilco-ec.c

(limited to 'include')

diff --git a/drivers/platform/chrome/wilco_ec/core.c b/drivers/platform/chrome/wilco_ec/core.c
index af5fd288b63b..05e1e2be1c91 100644
--- a/drivers/platform/chrome/wilco_ec/core.c
+++ b/drivers/platform/chrome/wilco_ec/core.c
@@ -42,6 +42,7 @@ static int wilco_ec_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct wilco_ec_device *ec;
+	int ret;
 
 	ec = devm_kzalloc(dev, sizeof(*ec), GFP_KERNEL);
 	if (!ec)
@@ -78,13 +79,30 @@ static int wilco_ec_probe(struct platform_device *pdev)
 							 PLATFORM_DEVID_AUTO,
 							 NULL, 0);
 
+	/* Register a child device that will be found by the RTC driver. */
+	ec->rtc_pdev = platform_device_register_data(dev, "rtc-wilco-ec",
+						     PLATFORM_DEVID_AUTO,
+						     NULL, 0);
+	if (IS_ERR(ec->rtc_pdev)) {
+		dev_err(dev, "Failed to create RTC platform device\n");
+		ret = PTR_ERR(ec->rtc_pdev);
+		goto unregister_debugfs;
+	}
+
 	return 0;
+
+unregister_debugfs:
+	if (ec->debugfs_pdev)
+		platform_device_unregister(ec->debugfs_pdev);
+	cros_ec_lpc_mec_destroy();
+	return ret;
 }
 
 static int wilco_ec_remove(struct platform_device *pdev)
 {
 	struct wilco_ec_device *ec = platform_get_drvdata(pdev);
 
+	platform_device_unregister(ec->rtc_pdev);
 	if (ec->debugfs_pdev)
 		platform_device_unregister(ec->debugfs_pdev);
 
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 225b0b8516f3..d5063c791515 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -1814,4 +1814,15 @@ config RTC_DRV_GOLDFISH
 	  Goldfish is a code name for the virtual platform developed by Google
 	  for Android emulation.
 
+config RTC_DRV_WILCO_EC
+	tristate "Wilco EC RTC"
+	depends on WILCO_EC
+	default m
+	help
+	  If you say yes here, you get read/write support for the Real Time
+	  Clock on the Wilco Embedded Controller (Wilco is a kind of Chromebook)
+
+	  This can also be built as a module. If so, the module will
+	  be named "rtc_wilco_ec".
+
 endif # RTC_CLASS
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index df022d820bee..6255ea78da25 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -172,6 +172,7 @@ obj-$(CONFIG_RTC_DRV_V3020)	+= rtc-v3020.o
 obj-$(CONFIG_RTC_DRV_VR41XX)	+= rtc-vr41xx.o
 obj-$(CONFIG_RTC_DRV_VRTC)	+= rtc-mrst.o
 obj-$(CONFIG_RTC_DRV_VT8500)	+= rtc-vt8500.o
+obj-$(CONFIG_RTC_DRV_WILCO_EC)	+= rtc-wilco-ec.o
 obj-$(CONFIG_RTC_DRV_WM831X)	+= rtc-wm831x.o
 obj-$(CONFIG_RTC_DRV_WM8350)	+= rtc-wm8350.o
 obj-$(CONFIG_RTC_DRV_X1205)	+= rtc-x1205.o
diff --git a/drivers/rtc/rtc-wilco-ec.c b/drivers/rtc/rtc-wilco-ec.c
new file mode 100644
index 000000000000..e62bda0cb53e
--- /dev/null
+++ b/drivers/rtc/rtc-wilco-ec.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RTC interface for Wilco Embedded Controller with R/W abilities
+ *
+ * Copyright 2018 Google LLC
+ *
+ * The corresponding platform device is typically registered in
+ * drivers/platform/chrome/wilco_ec/core.c
+ */
+
+#include <linux/bcd.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/wilco-ec.h>
+#include <linux/rtc.h>
+#include <linux/timekeeping.h>
+
+#define EC_COMMAND_CMOS			0x7c
+#define EC_CMOS_TOD_WRITE		0x02
+#define EC_CMOS_TOD_READ		0x08
+
+/**
+ * struct ec_rtc_read - Format of RTC returned by EC.
+ * @second: Second value (0..59)
+ * @minute: Minute value (0..59)
+ * @hour: Hour value (0..23)
+ * @day: Day value (1..31)
+ * @month: Month value (1..12)
+ * @year: Year value (full year % 100)
+ * @century: Century value (full year / 100)
+ *
+ * All values are presented in binary (not BCD).
+ */
+struct ec_rtc_read {
+	u8 second;
+	u8 minute;
+	u8 hour;
+	u8 day;
+	u8 month;
+	u8 year;
+	u8 century;
+} __packed;
+
+/**
+ * struct ec_rtc_write - Format of RTC sent to the EC.
+ * @param: EC_CMOS_TOD_WRITE
+ * @century: Century value (full year / 100)
+ * @year: Year value (full year % 100)
+ * @month: Month value (1..12)
+ * @day: Day value (1..31)
+ * @hour: Hour value (0..23)
+ * @minute: Minute value (0..59)
+ * @second: Second value (0..59)
+ * @weekday: Day of the week (0=Saturday)
+ *
+ * All values are presented in BCD.
+ */
+struct ec_rtc_write {
+	u8 param;
+	u8 century;
+	u8 year;
+	u8 month;
+	u8 day;
+	u8 hour;
+	u8 minute;
+	u8 second;
+	u8 weekday;
+} __packed;
+
+static int wilco_ec_rtc_read(struct device *dev, struct rtc_time *tm)
+{
+	struct wilco_ec_device *ec = dev_get_drvdata(dev->parent);
+	u8 param = EC_CMOS_TOD_READ;
+	struct ec_rtc_read rtc;
+	struct wilco_ec_message msg = {
+		.type = WILCO_EC_MSG_LEGACY,
+		.flags = WILCO_EC_FLAG_RAW_RESPONSE,
+		.command = EC_COMMAND_CMOS,
+		.request_data = &param,
+		.request_size = sizeof(param),
+		.response_data = &rtc,
+		.response_size = sizeof(rtc),
+	};
+	int ret;
+
+	ret = wilco_ec_mailbox(ec, &msg);
+	if (ret < 0)
+		return ret;
+
+	tm->tm_sec	= rtc.second;
+	tm->tm_min	= rtc.minute;
+	tm->tm_hour	= rtc.hour;
+	tm->tm_mday	= rtc.day;
+	tm->tm_mon	= rtc.month - 1;
+	tm->tm_year	= rtc.year + (rtc.century * 100) - 1900;
+	tm->tm_yday	= rtc_year_days(tm->tm_mday, tm->tm_mon, tm->tm_year);
+
+	/* Don't compute day of week, we don't need it. */
+	tm->tm_wday = -1;
+
+	return 0;
+}
+
+static int wilco_ec_rtc_write(struct device *dev, struct rtc_time *tm)
+{
+	struct wilco_ec_device *ec = dev_get_drvdata(dev->parent);
+	struct ec_rtc_write rtc;
+	struct wilco_ec_message msg = {
+		.type = WILCO_EC_MSG_LEGACY,
+		.flags = WILCO_EC_FLAG_RAW_RESPONSE,
+		.command = EC_COMMAND_CMOS,
+		.request_data = &rtc,
+		.request_size = sizeof(rtc),
+	};
+	int year = tm->tm_year + 1900;
+	/*
+	 * Convert from 0=Sunday to 0=Saturday for the EC
+	 * We DO need to set weekday because the EC controls battery charging
+	 * schedules that depend on the day of the week.
+	 */
+	int wday = tm->tm_wday == 6 ? 0 : tm->tm_wday + 1;
+	int ret;
+
+	rtc.param	= EC_CMOS_TOD_WRITE;
+	rtc.century	= bin2bcd(year / 100);
+	rtc.year	= bin2bcd(year % 100);
+	rtc.month	= bin2bcd(tm->tm_mon + 1);
+	rtc.day		= bin2bcd(tm->tm_mday);
+	rtc.hour	= bin2bcd(tm->tm_hour);
+	rtc.minute	= bin2bcd(tm->tm_min);
+	rtc.second	= bin2bcd(tm->tm_sec);
+	rtc.weekday	= bin2bcd(wday);
+
+	ret = wilco_ec_mailbox(ec, &msg);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static const struct rtc_class_ops wilco_ec_rtc_ops = {
+	.read_time = wilco_ec_rtc_read,
+	.set_time = wilco_ec_rtc_write,
+};
+
+static int wilco_ec_rtc_probe(struct platform_device *pdev)
+{
+	struct rtc_device *rtc;
+
+	rtc = devm_rtc_allocate_device(&pdev->dev);
+	if (IS_ERR(rtc))
+		return PTR_ERR(rtc);
+
+	rtc->ops = &wilco_ec_rtc_ops;
+	/* EC only supports this century */
+	rtc->range_min = RTC_TIMESTAMP_BEGIN_2000;
+	rtc->range_max = RTC_TIMESTAMP_END_2099;
+	rtc->owner = THIS_MODULE;
+
+	return rtc_register_device(rtc);
+}
+
+static struct platform_driver wilco_ec_rtc_driver = {
+	.driver = {
+		.name = "rtc-wilco-ec",
+	},
+	.probe = wilco_ec_rtc_probe,
+};
+
+module_platform_driver(wilco_ec_rtc_driver);
+
+MODULE_ALIAS("platform:rtc-wilco-ec");
+MODULE_AUTHOR("Nick Crews <ncrews@chromium.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Wilco EC RTC driver");
diff --git a/include/linux/platform_data/wilco-ec.h b/include/linux/platform_data/wilco-ec.h
index 5344975afa1a..446473a46b88 100644
--- a/include/linux/platform_data/wilco-ec.h
+++ b/include/linux/platform_data/wilco-ec.h
@@ -35,6 +35,7 @@
  *               is used to hold the request and the response.
  * @data_size: Size of the data buffer used for EC communication.
  * @debugfs_pdev: The child platform_device used by the debugfs sub-driver.
+ * @rtc_pdev: The child platform_device used by the RTC sub-driver.
  */
 struct wilco_ec_device {
 	struct device *dev;
@@ -45,6 +46,7 @@ struct wilco_ec_device {
 	void *data_buffer;
 	size_t data_size;
 	struct platform_device *debugfs_pdev;
+	struct platform_device *rtc_pdev;
 };
 
 /**
-- 
cgit v1.2.3


From 9b15cffbf21d3ac0a60618a3274b335532f33573 Mon Sep 17 00:00:00 2001
From: Michael Grzeschik <m.grzeschik@pengutronix.de>
Date: Tue, 8 Jan 2019 17:21:15 +0100
Subject: clk: imx5: add imx5_SCC2_IPG_GATE

This adds the missing clock for the SCC2 peripheral unit.

Signed-off-by: Michael Grzeschik <m.grzeschik@pengutronix.de>
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/imx/clk-imx51-imx53.c      | 1 +
 include/dt-bindings/clock/imx5-clock.h | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/clk/imx/clk-imx51-imx53.c b/drivers/clk/imx/clk-imx51-imx53.c
index fc8e782d817b..e91c826bce70 100644
--- a/drivers/clk/imx/clk-imx51-imx53.c
+++ b/drivers/clk/imx/clk-imx51-imx53.c
@@ -428,6 +428,7 @@ static void __init mx51_clocks_init(struct device_node *np)
 	clk[IMX5_CLK_ESDHC4_PER_GATE]	= imx_clk_gate2("esdhc4_per_gate", "esdhc_d_sel", MXC_CCM_CCGR3, 14);
 	clk[IMX5_CLK_USB_PHY_GATE]	= imx_clk_gate2("usb_phy_gate", "usb_phy_sel", MXC_CCM_CCGR2, 0);
 	clk[IMX5_CLK_HSI2C_GATE]	= imx_clk_gate2("hsi2c_gate", "ipg", MXC_CCM_CCGR1, 22);
+	clk[IMX5_CLK_SCC2_IPG_GATE]	= imx_clk_gate2("scc2_gate", "ipg", MXC_CCM_CCGR1, 30);
 	clk[IMX5_CLK_MIPI_HSC1_GATE]	= imx_clk_gate2_flags("mipi_hsc1_gate", "ipg", MXC_CCM_CCGR4, 6, CLK_IS_CRITICAL);
 	clk[IMX5_CLK_MIPI_HSC2_GATE]	= imx_clk_gate2_flags("mipi_hsc2_gate", "ipg", MXC_CCM_CCGR4, 8, CLK_IS_CRITICAL);
 	clk[IMX5_CLK_MIPI_ESC_GATE]	= imx_clk_gate2_flags("mipi_esc_gate", "ipg", MXC_CCM_CCGR4, 10, CLK_IS_CRITICAL);
diff --git a/include/dt-bindings/clock/imx5-clock.h b/include/dt-bindings/clock/imx5-clock.h
index d382fc71aa83..a81be5be6700 100644
--- a/include/dt-bindings/clock/imx5-clock.h
+++ b/include/dt-bindings/clock/imx5-clock.h
@@ -214,6 +214,7 @@
 #define IMX5_CLK_IEEE1588_SEL		202
 #define IMX5_CLK_IEEE1588_PODF		203
 #define IMX5_CLK_IEEE1588_GATE		204
-#define IMX5_CLK_END			205
+#define IMX5_CLK_SCC2_IPG_GATE		205
+#define IMX5_CLK_END			206
 
 #endif /* __DT_BINDINGS_CLOCK_IMX5_H */
-- 
cgit v1.2.3


From 037a474f6144544848da72bfda80776686084a13 Mon Sep 17 00:00:00 2001
From: Bai Ping <ping.bai@nxp.com>
Date: Tue, 22 Jan 2019 09:31:46 +0000
Subject: dt-bindings: imx: Add clock binding doc for imx8mm

Add the clock binding doc for i.MX8MM.

Signed-off-by: Bai Ping <ping.bai@nxp.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../devicetree/bindings/clock/imx8mm-clock.txt     |  29 +++
 include/dt-bindings/clock/imx8mm-clock.h           | 244 +++++++++++++++++++++
 2 files changed, 273 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/imx8mm-clock.txt
 create mode 100644 include/dt-bindings/clock/imx8mm-clock.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/imx8mm-clock.txt b/Documentation/devicetree/bindings/clock/imx8mm-clock.txt
new file mode 100644
index 000000000000..8e4ab9e619a1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/imx8mm-clock.txt
@@ -0,0 +1,29 @@
+* Clock bindings for NXP i.MX8M Mini
+
+Required properties:
+- compatible: Should be "fsl,imx8mm-ccm"
+- reg: Address and length of the register set
+- #clock-cells: Should be <1>
+- clocks: list of clock specifiers, must contain an entry for each required
+          entry in clock-names
+- clock-names: should include the following entries:
+    - "osc_32k"
+    - "osc_24m"
+    - "clk_ext1"
+    - "clk_ext2"
+    - "clk_ext3"
+    - "clk_ext4"
+
+clk: clock-controller@30380000 {
+	compatible = "fsl,imx8mm-ccm";
+	reg = <0x0 0x30380000 0x0 0x10000>;
+	#clock-cells = <1>;
+	clocks = <&osc_32k>, <&osc_24m>, <&clk_ext1>, <&clk_ext2>,
+		 <&clk_ext3>, <&clk_ext4>;
+	clock-names = "osc_32k", "osc_24m", "clk_ext1", "clk_ext2",
+		      "clk_ext3", "clk_ext4";
+};
+
+The clock consumer should specify the desired clock by having the clock
+ID in its "clocks" phandle cell. See include/dt-bindings/clock/imx8mm-clock.h
+for the full list of i.MX8M Mini clock IDs.
diff --git a/include/dt-bindings/clock/imx8mm-clock.h b/include/dt-bindings/clock/imx8mm-clock.h
new file mode 100644
index 000000000000..1b4353e7b486
--- /dev/null
+++ b/include/dt-bindings/clock/imx8mm-clock.h
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2017-2018 NXP
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_IMX8MM_H
+#define __DT_BINDINGS_CLOCK_IMX8MM_H
+
+#define IMX8MM_CLK_DUMMY			0
+#define IMX8MM_CLK_32K				1
+#define IMX8MM_CLK_24M				2
+#define IMX8MM_OSC_HDMI_CLK			3
+#define IMX8MM_CLK_EXT1				4
+#define IMX8MM_CLK_EXT2				5
+#define IMX8MM_CLK_EXT3				6
+#define IMX8MM_CLK_EXT4				7
+#define IMX8MM_AUDIO_PLL1_REF_SEL		8
+#define IMX8MM_AUDIO_PLL2_REF_SEL		9
+#define IMX8MM_VIDEO_PLL1_REF_SEL		10
+#define IMX8MM_DRAM_PLL_REF_SEL			11
+#define IMX8MM_GPU_PLL_REF_SEL			12
+#define IMX8MM_VPU_PLL_REF_SEL			13
+#define IMX8MM_ARM_PLL_REF_SEL			14
+#define IMX8MM_SYS_PLL1_REF_SEL			15
+#define IMX8MM_SYS_PLL2_REF_SEL			16
+#define IMX8MM_SYS_PLL3_REF_SEL			17
+#define IMX8MM_AUDIO_PLL1			18
+#define IMX8MM_AUDIO_PLL2			19
+#define IMX8MM_VIDEO_PLL1			20
+#define IMX8MM_DRAM_PLL				21
+#define IMX8MM_GPU_PLL				22
+#define IMX8MM_VPU_PLL				23
+#define IMX8MM_ARM_PLL				24
+#define IMX8MM_SYS_PLL1				25
+#define IMX8MM_SYS_PLL2				26
+#define IMX8MM_SYS_PLL3				27
+#define IMX8MM_AUDIO_PLL1_BYPASS		28
+#define IMX8MM_AUDIO_PLL2_BYPASS		29
+#define IMX8MM_VIDEO_PLL1_BYPASS		30
+#define IMX8MM_DRAM_PLL_BYPASS			31
+#define IMX8MM_GPU_PLL_BYPASS			32
+#define IMX8MM_VPU_PLL_BYPASS			33
+#define IMX8MM_ARM_PLL_BYPASS			34
+#define IMX8MM_SYS_PLL1_BYPASS			35
+#define IMX8MM_SYS_PLL2_BYPASS			36
+#define IMX8MM_SYS_PLL3_BYPASS			37
+#define IMX8MM_AUDIO_PLL1_OUT			38
+#define IMX8MM_AUDIO_PLL2_OUT			39
+#define IMX8MM_VIDEO_PLL1_OUT			40
+#define IMX8MM_DRAM_PLL_OUT			41
+#define IMX8MM_GPU_PLL_OUT			42
+#define IMX8MM_VPU_PLL_OUT			43
+#define IMX8MM_ARM_PLL_OUT			44
+#define IMX8MM_SYS_PLL1_OUT			45
+#define IMX8MM_SYS_PLL2_OUT			46
+#define IMX8MM_SYS_PLL3_OUT			47
+#define IMX8MM_SYS_PLL1_40M			48
+#define IMX8MM_SYS_PLL1_80M			49
+#define IMX8MM_SYS_PLL1_100M			50
+#define IMX8MM_SYS_PLL1_133M			51
+#define IMX8MM_SYS_PLL1_160M			52
+#define IMX8MM_SYS_PLL1_200M			53
+#define IMX8MM_SYS_PLL1_266M			54
+#define IMX8MM_SYS_PLL1_400M			55
+#define IMX8MM_SYS_PLL1_800M			56
+#define IMX8MM_SYS_PLL2_50M			57
+#define IMX8MM_SYS_PLL2_100M			58
+#define IMX8MM_SYS_PLL2_125M			59
+#define IMX8MM_SYS_PLL2_166M			60
+#define IMX8MM_SYS_PLL2_200M			61
+#define IMX8MM_SYS_PLL2_250M			62
+#define IMX8MM_SYS_PLL2_333M			63
+#define IMX8MM_SYS_PLL2_500M			64
+#define IMX8MM_SYS_PLL2_1000M			65
+
+/* core */
+#define IMX8MM_CLK_A53_SRC			66
+#define IMX8MM_CLK_M4_SRC			67
+#define IMX8MM_CLK_VPU_SRC			68
+#define IMX8MM_CLK_GPU3D_SRC			69
+#define IMX8MM_CLK_GPU2D_SRC			70
+#define IMX8MM_CLK_A53_CG			71
+#define IMX8MM_CLK_M4_CG			72
+#define IMX8MM_CLK_VPU_CG			73
+#define IMX8MM_CLK_GPU3D_CG			74
+#define IMX8MM_CLK_GPU2D_CG			75
+#define IMX8MM_CLK_A53_DIV			76
+#define IMX8MM_CLK_M4_DIV			77
+#define IMX8MM_CLK_VPU_DIV			78
+#define IMX8MM_CLK_GPU3D_DIV			79
+#define IMX8MM_CLK_GPU2D_DIV			80
+
+/* bus */
+#define IMX8MM_CLK_MAIN_AXI			81
+#define IMX8MM_CLK_ENET_AXI			82
+#define IMX8MM_CLK_NAND_USDHC_BUS		83
+#define IMX8MM_CLK_VPU_BUS			84
+#define IMX8MM_CLK_DISP_AXI			85
+#define IMX8MM_CLK_DISP_APB			86
+#define IMX8MM_CLK_DISP_RTRM			87
+#define IMX8MM_CLK_USB_BUS			88
+#define IMX8MM_CLK_GPU_AXI			89
+#define IMX8MM_CLK_GPU_AHB			90
+#define IMX8MM_CLK_NOC				91
+#define IMX8MM_CLK_NOC_APB			92
+
+#define IMX8MM_CLK_AHB				93
+#define IMX8MM_CLK_AUDIO_AHB			94
+#define IMX8MM_CLK_IPG_ROOT			95
+#define IMX8MM_CLK_IPG_AUDIO_ROOT		96
+
+#define IMX8MM_CLK_DRAM_ALT			97
+#define IMX8MM_CLK_DRAM_APB			98
+#define IMX8MM_CLK_VPU_G1			99
+#define IMX8MM_CLK_VPU_G2			100
+#define IMX8MM_CLK_DISP_DTRC			101
+#define IMX8MM_CLK_DISP_DC8000			102
+#define IMX8MM_CLK_PCIE1_CTRL			103
+#define IMX8MM_CLK_PCIE1_PHY			104
+#define IMX8MM_CLK_PCIE1_AUX			105
+#define IMX8MM_CLK_DC_PIXEL			106
+#define IMX8MM_CLK_LCDIF_PIXEL			107
+#define IMX8MM_CLK_SAI1				108
+#define IMX8MM_CLK_SAI2				109
+#define IMX8MM_CLK_SAI3				110
+#define IMX8MM_CLK_SAI4				111
+#define IMX8MM_CLK_SAI5				112
+#define IMX8MM_CLK_SAI6				113
+#define IMX8MM_CLK_SPDIF1			114
+#define IMX8MM_CLK_SPDIF2			115
+#define IMX8MM_CLK_ENET_REF			116
+#define IMX8MM_CLK_ENET_TIMER			117
+#define IMX8MM_CLK_ENET_PHY_REF			118
+#define IMX8MM_CLK_NAND				119
+#define IMX8MM_CLK_QSPI				120
+#define IMX8MM_CLK_USDHC1			121
+#define IMX8MM_CLK_USDHC2			122
+#define IMX8MM_CLK_I2C1				123
+#define IMX8MM_CLK_I2C2				124
+#define IMX8MM_CLK_I2C3				125
+#define IMX8MM_CLK_I2C4				126
+#define IMX8MM_CLK_UART1			127
+#define IMX8MM_CLK_UART2			128
+#define IMX8MM_CLK_UART3			129
+#define IMX8MM_CLK_UART4			130
+#define IMX8MM_CLK_USB_CORE_REF			131
+#define IMX8MM_CLK_USB_PHY_REF			132
+#define IMX8MM_CLK_ECSPI1			133
+#define IMX8MM_CLK_ECSPI2			134
+#define IMX8MM_CLK_PWM1				135
+#define IMX8MM_CLK_PWM2				136
+#define IMX8MM_CLK_PWM3				137
+#define IMX8MM_CLK_PWM4				138
+#define IMX8MM_CLK_GPT1				139
+#define IMX8MM_CLK_WDOG				140
+#define IMX8MM_CLK_WRCLK			141
+#define IMX8MM_CLK_DSI_CORE			142
+#define IMX8MM_CLK_DSI_PHY_REF			143
+#define IMX8MM_CLK_DSI_DBI			144
+#define IMX8MM_CLK_USDHC3			145
+#define IMX8MM_CLK_CSI1_CORE			146
+#define IMX8MM_CLK_CSI1_PHY_REF			147
+#define IMX8MM_CLK_CSI1_ESC			148
+#define IMX8MM_CLK_CSI2_CORE			149
+#define IMX8MM_CLK_CSI2_PHY_REF			150
+#define IMX8MM_CLK_CSI2_ESC			151
+#define IMX8MM_CLK_PCIE2_CTRL			152
+#define IMX8MM_CLK_PCIE2_PHY			153
+#define IMX8MM_CLK_PCIE2_AUX			154
+#define IMX8MM_CLK_ECSPI3			155
+#define IMX8MM_CLK_PDM				156
+#define IMX8MM_CLK_VPU_H1			157
+#define IMX8MM_CLK_CLKO1			158
+
+#define IMX8MM_CLK_ECSPI1_ROOT			159
+#define IMX8MM_CLK_ECSPI2_ROOT			160
+#define IMX8MM_CLK_ECSPI3_ROOT			161
+#define IMX8MM_CLK_ENET1_ROOT			162
+#define IMX8MM_CLK_GPT1_ROOT			163
+#define IMX8MM_CLK_I2C1_ROOT			164
+#define IMX8MM_CLK_I2C2_ROOT			165
+#define IMX8MM_CLK_I2C3_ROOT			166
+#define IMX8MM_CLK_I2C4_ROOT			167
+#define IMX8MM_CLK_OCOTP_ROOT			168
+#define IMX8MM_CLK_PCIE1_ROOT			169
+#define IMX8MM_CLK_PWM1_ROOT			170
+#define IMX8MM_CLK_PWM2_ROOT			171
+#define IMX8MM_CLK_PWM3_ROOT			172
+#define IMX8MM_CLK_PWM4_ROOT			173
+#define IMX8MM_CLK_QSPI_ROOT			174
+#define IMX8MM_CLK_NAND_ROOT			175
+#define IMX8MM_CLK_SAI1_ROOT			176
+#define IMX8MM_CLK_SAI1_IPG			177
+#define IMX8MM_CLK_SAI2_ROOT			178
+#define IMX8MM_CLK_SAI2_IPG			179
+#define IMX8MM_CLK_SAI3_ROOT			180
+#define IMX8MM_CLK_SAI3_IPG			181
+#define IMX8MM_CLK_SAI4_ROOT			182
+#define IMX8MM_CLK_SAI4_IPG			183
+#define IMX8MM_CLK_SAI5_ROOT			184
+#define IMX8MM_CLK_SAI5_IPG			185
+#define IMX8MM_CLK_SAI6_ROOT			186
+#define IMX8MM_CLK_SAI6_IPG			187
+#define IMX8MM_CLK_UART1_ROOT			188
+#define IMX8MM_CLK_UART2_ROOT			189
+#define IMX8MM_CLK_UART3_ROOT			190
+#define IMX8MM_CLK_UART4_ROOT			191
+#define IMX8MM_CLK_USB1_CTRL_ROOT		192
+#define IMX8MM_CLK_GPU3D_ROOT			193
+#define IMX8MM_CLK_USDHC1_ROOT			194
+#define IMX8MM_CLK_USDHC2_ROOT			195
+#define IMX8MM_CLK_WDOG1_ROOT			196
+#define IMX8MM_CLK_WDOG2_ROOT			197
+#define IMX8MM_CLK_WDOG3_ROOT			198
+#define IMX8MM_CLK_VPU_G1_ROOT			199
+#define IMX8MM_CLK_GPU_BUS_ROOT			200
+#define IMX8MM_CLK_VPU_H1_ROOT			201
+#define IMX8MM_CLK_VPU_G2_ROOT			202
+#define IMX8MM_CLK_PDM_ROOT			203
+#define IMX8MM_CLK_DISP_ROOT			204
+#define IMX8MM_CLK_DISP_AXI_ROOT		205
+#define IMX8MM_CLK_DISP_APB_ROOT		206
+#define IMX8MM_CLK_DISP_RTRM_ROOT		207
+#define IMX8MM_CLK_USDHC3_ROOT			208
+#define IMX8MM_CLK_TMU_ROOT			209
+#define IMX8MM_CLK_VPU_DEC_ROOT			210
+#define IMX8MM_CLK_SDMA1_ROOT			211
+#define IMX8MM_CLK_SDMA2_ROOT			212
+#define IMX8MM_CLK_SDMA3_ROOT			213
+#define IMX8MM_CLK_GPT_3M			214
+#define IMX8MM_CLK_ARM				215
+#define IMX8MM_CLK_PDM_IPG			216
+#define IMX8MM_CLK_GPU2D_ROOT			217
+#define IMX8MM_CLK_MU_ROOT			218
+#define IMX8MM_CLK_CSI1_ROOT			219
+
+#define IMX8MM_CLK_DRAM_CORE			220
+#define IMX8MM_CLK_DRAM_ALT_ROOT		221
+
+#define IMX8MM_CLK_NAND_USDHC_BUS_RAWNAND_CLK	222
+
+#define IMX8MM_CLK_END				223
+
+#endif
-- 
cgit v1.2.3


From 202ce5afe51e883f94736310997ea1c73219eb36 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Wed, 30 Jan 2019 22:20:56 -0200
Subject: clk: imx8mq: Add support for the CLKO1 clock

Add the entry for the CLKO1 clock.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Reviewed-by: Abel Vesa <abel.vesa@nxp.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/imx/clk-imx8mq.c             | 3 +++
 include/dt-bindings/clock/imx8mq-clock.h | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/clk/imx/clk-imx8mq.c b/drivers/clk/imx/clk-imx8mq.c
index cd904764212b..897740f0a012 100644
--- a/drivers/clk/imx/clk-imx8mq.c
+++ b/drivers/clk/imx/clk-imx8mq.c
@@ -267,6 +267,8 @@ static const char * const imx8mq_ecspi3_sels[] = {"osc_25m", "sys2_pll_200m", "s
 					   "sys1_pll_800m", "sys3_pll2_out", "sys2_pll_250m", "audio_pll2_out", };
 static const char * const imx8mq_dram_core_sels[] = {"dram_pll_out", "dram_alt_root", };
 
+static const char * const imx8mq_clko1_sels[] = {"osc_25m", "sys1_pll_800m", "osc_27m", "sys1_pll_200m",
+					  "audio_pll2_out", "sys2_pll_500m", "vpu_pll_out", "sys1_pll_80m", };
 static const char * const imx8mq_clko2_sels[] = {"osc_25m", "sys2_pll_200m", "sys1_pll_400m", "sys2_pll_166m",
 					  "sys3_pll2_out", "audio_pll1_out", "video_pll1_out", "ckil", };
 
@@ -486,6 +488,7 @@ static int imx8mq_clocks_probe(struct platform_device *pdev)
 	clks[IMX8MQ_CLK_GPT1] = imx8m_clk_composite("gpt1", imx8mq_gpt1_sels, base + 0xb580);
 	clks[IMX8MQ_CLK_WDOG] = imx8m_clk_composite("wdog", imx8mq_wdog_sels, base + 0xb900);
 	clks[IMX8MQ_CLK_WRCLK] = imx8m_clk_composite("wrclk", imx8mq_wrclk_sels, base + 0xb980);
+	clks[IMX8MQ_CLK_CLKO1] = imx8m_clk_composite("clko1", imx8mq_clko1_sels, base + 0xba00);
 	clks[IMX8MQ_CLK_CLKO2] = imx8m_clk_composite("clko2", imx8mq_clko2_sels, base + 0xba80);
 	clks[IMX8MQ_CLK_DSI_CORE] = imx8m_clk_composite("dsi_core", imx8mq_dsi_core_sels, base + 0xbb00);
 	clks[IMX8MQ_CLK_DSI_PHY_REF] = imx8m_clk_composite("dsi_phy_ref", imx8mq_dsi_phy_sels, base + 0xbb80);
diff --git a/include/dt-bindings/clock/imx8mq-clock.h b/include/dt-bindings/clock/imx8mq-clock.h
index 04f7ac345984..74f80b26f679 100644
--- a/include/dt-bindings/clock/imx8mq-clock.h
+++ b/include/dt-bindings/clock/imx8mq-clock.h
@@ -391,5 +391,7 @@
 
 #define IMX8MQ_CLK_NAND_USDHC_BUS_RAWNAND_CLK	267
 
-#define IMX8MQ_CLK_END				268
+#define IMX8MQ_CLK_CLKO1			268
+
+#define IMX8MQ_CLK_END				269
 #endif /* __DT_BINDINGS_CLOCK_IMX8MQ_H */
-- 
cgit v1.2.3


From 45a359e80d9b81b0fc3d9612053a86fc52a5f4b9 Mon Sep 17 00:00:00 2001
From: Abel Vesa <abel.vesa@nxp.com>
Date: Fri, 15 Feb 2019 17:48:45 +0000
Subject: dt-bindings: imx8mq-clock: Add the missing ARM clock

Add the missing ARM clock which will be used by cpufreq

Signed-off-by: Abel Vesa <abel.vesa@nxp.com>
Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
[sboyd@kernel.org: Fixed numbering in dt header]
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/imx8mq-clock.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/imx8mq-clock.h b/include/dt-bindings/clock/imx8mq-clock.h
index 74f80b26f679..491e39c146be 100644
--- a/include/dt-bindings/clock/imx8mq-clock.h
+++ b/include/dt-bindings/clock/imx8mq-clock.h
@@ -392,6 +392,7 @@
 #define IMX8MQ_CLK_NAND_USDHC_BUS_RAWNAND_CLK	267
 
 #define IMX8MQ_CLK_CLKO1			268
+#define IMX8MQ_CLK_ARM				269
 
-#define IMX8MQ_CLK_END				269
+#define IMX8MQ_CLK_END				270
 #endif /* __DT_BINDINGS_CLOCK_IMX8MQ_H */
-- 
cgit v1.2.3


From ed11aff3eef35440e23a7cf9692f84ab6233e527 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 21 Jan 2019 07:31:28 +0100
Subject: dt-bindings: marvell,mmp2: Add clock id for the LCDC clock

The peripheral clock is required for access the the LCDC registers. It
is in fact separate from the "AXI clock" that is optionally used to
generate the pixel clock and as such requires a separate clock id.

Link: https://lists.freedesktop.org/archives/dri-devel/2019-January/203975.html
Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/marvell,mmp2.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/marvell,mmp2.h b/include/dt-bindings/clock/marvell,mmp2.h
index 228a5e234af0..e785c6eb3561 100644
--- a/include/dt-bindings/clock/marvell,mmp2.h
+++ b/include/dt-bindings/clock/marvell,mmp2.h
@@ -71,6 +71,7 @@
 #define MMP2_CLK_CCIC1_MIX		117
 #define MMP2_CLK_CCIC1_PHY		118
 #define MMP2_CLK_CCIC1_SPHY		119
+#define MMP2_CLK_DISP0_LCDC		120
 
 #define MMP2_NR_CLKS			200
 #endif
-- 
cgit v1.2.3


From 04053f4d23a41b9383dcea4da302fd43f673afb3 Mon Sep 17 00:00:00 2001
From: David Dai <daidavid1@codeaurora.org>
Date: Thu, 24 Jan 2019 16:47:00 -0800
Subject: clk: qcom: clk-rpmh: Add IPA clock support

The clk-rpmh driver only supports on and off RPMh clock resources. Let's
extend the driver by adding support for clocks that are managed by a
different type of RPMh resource known as Bus Clock Manager(BCM). The BCM
is a configurable shared resource aggregator that scales performance
based on a set of frequency points. The Qualcomm IP Accelerator (IPA)
clock is an example of a resource that is managed by the BCM and this a
requirement from the IPA driver in order to scale its core clock.

Signed-off-by: David Dai <daidavid1@codeaurora.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/qcom/clk-rpmh.c           | 146 ++++++++++++++++++++++++++++++++++
 include/dt-bindings/clock/qcom,rpmh.h |   1 +
 2 files changed, 147 insertions(+)

(limited to 'include')

diff --git a/drivers/clk/qcom/clk-rpmh.c b/drivers/clk/qcom/clk-rpmh.c
index 9f4fc7773fb2..c3fd632af119 100644
--- a/drivers/clk/qcom/clk-rpmh.c
+++ b/drivers/clk/qcom/clk-rpmh.c
@@ -18,6 +18,31 @@
 #define CLK_RPMH_ARC_EN_OFFSET		0
 #define CLK_RPMH_VRM_EN_OFFSET		4
 
+#define BCM_TCS_CMD_COMMIT_MASK		0x40000000
+#define BCM_TCS_CMD_VALID_SHIFT		29
+#define BCM_TCS_CMD_VOTE_MASK		0x3fff
+#define BCM_TCS_CMD_VOTE_SHIFT		0
+
+#define BCM_TCS_CMD(valid, vote)				\
+	(BCM_TCS_CMD_COMMIT_MASK |				\
+	((valid) << BCM_TCS_CMD_VALID_SHIFT) |			\
+	((vote & BCM_TCS_CMD_VOTE_MASK)				\
+	<< BCM_TCS_CMD_VOTE_SHIFT))
+
+/**
+ * struct bcm_db - Auxiliary data pertaining to each Bus Clock Manager(BCM)
+ * @unit: divisor used to convert Hz value to an RPMh msg
+ * @width: multiplier used to convert Hz value to an RPMh msg
+ * @vcd: virtual clock domain that this bcm belongs to
+ * @reserved: reserved to pad the struct
+ */
+struct bcm_db {
+	__le32 unit;
+	__le16 width;
+	u8 vcd;
+	u8 reserved;
+};
+
 /**
  * struct clk_rpmh - individual rpmh clock data structure
  * @hw:			handle between common and hardware-specific interfaces
@@ -29,6 +54,7 @@
  * @aggr_state:		rpmh clock aggregated state
  * @last_sent_aggr_state: rpmh clock last aggr state sent to RPMh
  * @valid_state_mask:	mask to determine the state of the rpmh clock
+ * @unit:		divisor to convert rate to rpmh msg in magnitudes of Khz
  * @dev:		device to which it is attached
  * @peer:		pointer to the clock rpmh sibling
  */
@@ -42,6 +68,7 @@ struct clk_rpmh {
 	u32 aggr_state;
 	u32 last_sent_aggr_state;
 	u32 valid_state_mask;
+	u32 unit;
 	struct device *dev;
 	struct clk_rpmh *peer;
 };
@@ -98,6 +125,17 @@ static DEFINE_MUTEX(rpmh_clk_lock);
 	__DEFINE_CLK_RPMH(_platform, _name, _name_active, _res_name,	\
 			  CLK_RPMH_VRM_EN_OFFSET, 1, _div)
 
+#define DEFINE_CLK_RPMH_BCM(_platform, _name, _res_name)		\
+	static struct clk_rpmh _platform##_##_name = {			\
+		.res_name = _res_name,					\
+		.valid_state_mask = BIT(RPMH_ACTIVE_ONLY_STATE),	\
+		.div = 1,						\
+		.hw.init = &(struct clk_init_data){			\
+			.ops = &clk_rpmh_bcm_ops,			\
+			.name = #_name,					\
+		},							\
+	}
+
 static inline struct clk_rpmh *to_clk_rpmh(struct clk_hw *_hw)
 {
 	return container_of(_hw, struct clk_rpmh, hw);
@@ -210,6 +248,96 @@ static const struct clk_ops clk_rpmh_ops = {
 	.recalc_rate	= clk_rpmh_recalc_rate,
 };
 
+static int clk_rpmh_bcm_send_cmd(struct clk_rpmh *c, bool enable)
+{
+	struct tcs_cmd cmd = { 0 };
+	u32 cmd_state;
+	int ret;
+
+	mutex_lock(&rpmh_clk_lock);
+
+	cmd_state = 0;
+	if (enable) {
+		cmd_state = 1;
+		if (c->aggr_state)
+			cmd_state = c->aggr_state;
+	}
+
+	if (c->last_sent_aggr_state == cmd_state) {
+		mutex_unlock(&rpmh_clk_lock);
+		return 0;
+	}
+
+	cmd.addr = c->res_addr;
+	cmd.data = BCM_TCS_CMD(enable, cmd_state);
+
+	ret = rpmh_write_async(c->dev, RPMH_ACTIVE_ONLY_STATE, &cmd, 1);
+	if (ret) {
+		dev_err(c->dev, "set active state of %s failed: (%d)\n",
+			c->res_name, ret);
+		mutex_unlock(&rpmh_clk_lock);
+		return ret;
+	}
+
+	c->last_sent_aggr_state = cmd_state;
+
+	mutex_unlock(&rpmh_clk_lock);
+
+	return 0;
+}
+
+static int clk_rpmh_bcm_prepare(struct clk_hw *hw)
+{
+	struct clk_rpmh *c = to_clk_rpmh(hw);
+
+	return clk_rpmh_bcm_send_cmd(c, true);
+};
+
+static void clk_rpmh_bcm_unprepare(struct clk_hw *hw)
+{
+	struct clk_rpmh *c = to_clk_rpmh(hw);
+
+	clk_rpmh_bcm_send_cmd(c, false);
+};
+
+static int clk_rpmh_bcm_set_rate(struct clk_hw *hw, unsigned long rate,
+				 unsigned long parent_rate)
+{
+	struct clk_rpmh *c = to_clk_rpmh(hw);
+
+	c->aggr_state = rate / c->unit;
+	/*
+	 * Since any non-zero value sent to hw would result in enabling the
+	 * clock, only send the value if the clock has already been prepared.
+	 */
+	if (clk_hw_is_prepared(hw))
+		clk_rpmh_bcm_send_cmd(c, true);
+
+	return 0;
+};
+
+static long clk_rpmh_round_rate(struct clk_hw *hw, unsigned long rate,
+				unsigned long *parent_rate)
+{
+	return rate;
+}
+
+static unsigned long clk_rpmh_bcm_recalc_rate(struct clk_hw *hw,
+					unsigned long prate)
+{
+	struct clk_rpmh *c = to_clk_rpmh(hw);
+
+	return c->aggr_state * c->unit;
+}
+
+static const struct clk_ops clk_rpmh_bcm_ops = {
+	.prepare	= clk_rpmh_bcm_prepare,
+	.unprepare	= clk_rpmh_bcm_unprepare,
+	.set_rate	= clk_rpmh_bcm_set_rate,
+	.round_rate	= clk_rpmh_round_rate,
+	.recalc_rate	= clk_rpmh_bcm_recalc_rate,
+};
+
 /* Resource name must match resource id present in cmd-db. */
 DEFINE_CLK_RPMH_ARC(sdm845, bi_tcxo, bi_tcxo_ao, "xo.lvl", 0x3, 2);
 DEFINE_CLK_RPMH_VRM(sdm845, ln_bb_clk2, ln_bb_clk2_ao, "lnbclka2", 2);
@@ -217,6 +345,7 @@ DEFINE_CLK_RPMH_VRM(sdm845, ln_bb_clk3, ln_bb_clk3_ao, "lnbclka3", 2);
 DEFINE_CLK_RPMH_VRM(sdm845, rf_clk1, rf_clk1_ao, "rfclka1", 1);
 DEFINE_CLK_RPMH_VRM(sdm845, rf_clk2, rf_clk2_ao, "rfclka2", 1);
 DEFINE_CLK_RPMH_VRM(sdm845, rf_clk3, rf_clk3_ao, "rfclka3", 1);
+DEFINE_CLK_RPMH_BCM(sdm845, ipa, "IP0");
 
 static struct clk_hw *sdm845_rpmh_clocks[] = {
 	[RPMH_CXO_CLK]		= &sdm845_bi_tcxo.hw,
@@ -231,6 +360,7 @@ static struct clk_hw *sdm845_rpmh_clocks[] = {
 	[RPMH_RF_CLK2_A]	= &sdm845_rf_clk2_ao.hw,
 	[RPMH_RF_CLK3]		= &sdm845_rf_clk3.hw,
 	[RPMH_RF_CLK3_A]	= &sdm845_rf_clk3_ao.hw,
+	[RPMH_IPA_CLK]		= &sdm845_ipa.hw,
 };
 
 static const struct clk_rpmh_desc clk_rpmh_sdm845 = {
@@ -267,6 +397,8 @@ static int clk_rpmh_probe(struct platform_device *pdev)
 
 	for (i = 0; i < desc->num_clks; i++) {
 		u32 res_addr;
+		size_t aux_data_len;
+		const struct bcm_db *data;
 
 		rpmh_clk = to_clk_rpmh(hw_clks[i]);
 		res_addr = cmd_db_read_addr(rpmh_clk->res_name);
@@ -275,6 +407,20 @@ static int clk_rpmh_probe(struct platform_device *pdev)
 				rpmh_clk->res_name);
 			return -ENODEV;
 		}
+
+		data = cmd_db_read_aux_data(rpmh_clk->res_name, &aux_data_len);
+		if (IS_ERR(data)) {
+			ret = PTR_ERR(data);
+			dev_err(&pdev->dev,
+				"error reading RPMh aux data for %s (%d)\n",
+				rpmh_clk->res_name, ret);
+			return ret;
+		}
+
+		/* Convert unit from Khz to Hz */
+		if (aux_data_len == sizeof(*data))
+			rpmh_clk->unit = le32_to_cpu(data->unit) * 1000ULL;
+
 		rpmh_clk->res_addr += res_addr;
 		rpmh_clk->dev = &pdev->dev;
 
diff --git a/include/dt-bindings/clock/qcom,rpmh.h b/include/dt-bindings/clock/qcom,rpmh.h
index f48fbd6f2095..edcab3f7b7d3 100644
--- a/include/dt-bindings/clock/qcom,rpmh.h
+++ b/include/dt-bindings/clock/qcom,rpmh.h
@@ -18,5 +18,6 @@
 #define RPMH_RF_CLK2_A				9
 #define RPMH_RF_CLK3				10
 #define RPMH_RF_CLK3_A				11
+#define RPMH_IPA_CLK				12
 
 #endif
-- 
cgit v1.2.3


From 585fc46bd47a143ee7433768e298a7f94008e5f5 Mon Sep 17 00:00:00 2001
From: Gabriel Fernandez <gabriel.fernandez@st.com>
Date: Thu, 14 Feb 2019 11:42:42 +0100
Subject: dt-bindings: clock: remove unused definition for stm32mp1

A copy of LTDC_PX and ETHCK_K (LTDC_K and ETHMAC_K) was introduced in
stm32mp1 dt-bindings file by mistake.
These bindings are not used and shouldn't be use to be conform with
convention name of the stm32mp1 clock IP.

Signed-off-by: Gabriel Fernandez <gabriel.fernandez@st.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/stm32mp1-clks.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/stm32mp1-clks.h b/include/dt-bindings/clock/stm32mp1-clks.h
index 90ec780bfc68..4cdaf135829c 100644
--- a/include/dt-bindings/clock/stm32mp1-clks.h
+++ b/include/dt-bindings/clock/stm32mp1-clks.h
@@ -248,7 +248,4 @@
 
 #define STM32MP1_LAST_CLK 232
 
-#define LTDC_K		LTDC_PX
-#define ETHMAC_K	ETHCK_K
-
 #endif /* _DT_BINDINGS_STM32MP1_CLKS_H_ */
-- 
cgit v1.2.3


From 0c8c53e033d278cad09a81ec1c46a401a6a6b1af Mon Sep 17 00:00:00 2001
From: Edgar Bernardi Righi <edgar.righi@lsitec.org.br>
Date: Tue, 15 Jan 2019 09:03:36 +0530
Subject: dt-bindings: clock: Add DT bindings for Actions Semi S500 CMU

Add devicetree bindings for Actions Semi S500 Clock Management Unit.

Signed-off-by: Edgar Bernardi Righi <edgar.righi@lsitec.org.br>
[Mani: Documented S500 CMU compatible]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
[sboyd@kernel.org: Fix SPDX comment style in header file]
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../devicetree/bindings/clock/actions,owl-cmu.txt  |  7 +-
 include/dt-bindings/clock/actions,s500-cmu.h       | 78 ++++++++++++++++++++++
 2 files changed, 82 insertions(+), 3 deletions(-)
 create mode 100644 include/dt-bindings/clock/actions,s500-cmu.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/actions,owl-cmu.txt b/Documentation/devicetree/bindings/clock/actions,owl-cmu.txt
index 2ef86ae96df8..d19885b7c73f 100644
--- a/Documentation/devicetree/bindings/clock/actions,owl-cmu.txt
+++ b/Documentation/devicetree/bindings/clock/actions,owl-cmu.txt
@@ -2,13 +2,14 @@
 
 The Actions Semi Owl Clock Management Unit generates and supplies clock
 to various controllers within the SoC. The clock binding described here is
-applicable to S900 and S700 SoC's.
+applicable to S900, S700 and S500 SoC's.
 
 Required Properties:
 
 - compatible: should be one of the following,
 	"actions,s900-cmu"
 	"actions,s700-cmu"
+	"actions,s500-cmu"
 - reg: physical base address of the controller and length of memory mapped
   region.
 - clocks: Reference to the parent clocks ("hosc", "losc")
@@ -19,8 +20,8 @@ Each clock is assigned an identifier, and client nodes can use this identifier
 to specify the clock which they consume.
 
 All available clocks are defined as preprocessor macros in corresponding
-dt-bindings/clock/actions,s900-cmu.h or actions,s700-cmu.h header and can be
-used in device tree sources.
+dt-bindings/clock/actions,s900-cmu.h or actions,s700-cmu.h or
+actions,s500-cmu.h header and can be used in device tree sources.
 
 External clocks:
 
diff --git a/include/dt-bindings/clock/actions,s500-cmu.h b/include/dt-bindings/clock/actions,s500-cmu.h
new file mode 100644
index 000000000000..030981cd2d56
--- /dev/null
+++ b/include/dt-bindings/clock/actions,s500-cmu.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Device Tree binding constants for Actions Semi S500 Clock Management Unit
+ *
+ * Copyright (c) 2014 Actions Semi Inc.
+ * Copyright (c) 2018 LSI-TEC - Caninos Loucos
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_S500_CMU_H
+#define __DT_BINDINGS_CLOCK_S500_CMU_H
+
+#define CLK_NONE		0
+
+/* fixed rate clocks */
+#define CLK_LOSC		1
+#define CLK_HOSC		2
+
+/* pll clocks */
+#define CLK_CORE_PLL		3
+#define CLK_DEV_PLL		4
+#define CLK_DDR_PLL		5
+#define CLK_NAND_PLL		6
+#define CLK_DISPLAY_PLL		7
+#define CLK_ETHERNET_PLL	8
+#define CLK_AUDIO_PLL		9
+
+/* system clock */
+#define CLK_DEV			10
+#define CLK_H			11
+#define CLK_AHBPREDIV		12
+#define CLK_AHB			13
+#define CLK_DE			14
+#define CLK_BISP		15
+#define CLK_VCE			16
+#define CLK_VDE			17
+
+/* peripheral device clock */
+#define CLK_TIMER		18
+#define CLK_I2C0		19
+#define CLK_I2C1		20
+#define CLK_I2C2		21
+#define CLK_I2C3		22
+#define CLK_PWM0		23
+#define CLK_PWM1		24
+#define CLK_PWM2		25
+#define CLK_PWM3		26
+#define CLK_PWM4		27
+#define CLK_PWM5		28
+#define CLK_SD0			29
+#define CLK_SD1			30
+#define CLK_SD2			31
+#define CLK_SENSOR0		32
+#define CLK_SENSOR1		33
+#define CLK_SPI0		34
+#define CLK_SPI1		35
+#define CLK_SPI2		36
+#define CLK_SPI3		37
+#define CLK_UART0		38
+#define CLK_UART1		39
+#define CLK_UART2		40
+#define CLK_UART3		41
+#define CLK_UART4		42
+#define CLK_UART5		43
+#define CLK_UART6		44
+#define CLK_DE1			45
+#define CLK_DE2			46
+#define CLK_I2SRX		47
+#define CLK_I2STX		48
+#define CLK_HDMI_AUDIO		49
+#define CLK_HDMI		50
+#define CLK_SPDIF		51
+#define CLK_NAND		52
+#define CLK_ECC			53
+#define CLK_RMII_REF		54
+
+#define CLK_NR_CLKS	       (CLK_RMII_REF + 1)
+
+#endif /* __DT_BINDINGS_CLOCK_S500_CMU_H */
-- 
cgit v1.2.3


From d13501a2bedfbea0983cc868d3f1dc692627f60d Mon Sep 17 00:00:00 2001
From: Katsuhiro Suzuki <katsuhiro@katsuster.net>
Date: Mon, 11 Feb 2019 00:38:06 +0900
Subject: clk: fractional-divider: check parent rate only if flag is set

Custom approximation of fractional-divider may not need parent clock
rate checking. For example Rockchip SoCs work fine using grand parent
clock rate even if target rate is greater than parent.

This patch checks parent clock rate only if CLK_SET_RATE_PARENT flag
is set.

For detailed example, clock tree of Rockchip I2S audio hardware.
  - Clock rate of CPLL is 1.2GHz, GPLL is 491.52MHz.
  - i2s1_div is integer divider can divide N (N is 1~128).
    Input clock is CPLL or GPLL. Initial divider value is N = 1.
    Ex) PLL = CPLL, N = 10, i2s1_div output rate is
      CPLL / 10 = 1.2GHz / 10 = 120MHz
  - i2s1_frac is fractional divider can divide input to x/y, x and
    y are 16bit integer.

CPLL --> | selector | ---> i2s1_div -+--> | selector | --> I2S1 MCLK
GPLL --> |          | ,--------------'    |          |
                      `--> i2s1_frac ---> |          |

Clock mux system try to choose suitable one from i2s1_div and
i2s1_frac for master clock (MCLK) of I2S1.

Bad scenario as follows:
  - Try to set MCLK to 8.192MHz (32kHz audio replay)
    Candidate setting is
    - i2s1_div: GPLL / 60 = 8.192MHz
    i2s1_div candidate is exactly same as target clock rate, so mux
    choose this clock source. i2s1_div output rate is changed
    491.52MHz -> 8.192MHz

  - After that try to set to 11.2896MHz (44.1kHz audio replay)
    Candidate settings are
    - i2s1_div : CPLL / 107 = 11.214945MHz
    - i2s1_frac: i2s1_div   = 8.192MHz
      This is because clk_fd_round_rate() thinks target rate
      (11.2896MHz) is higher than parent rate (i2s1_div = 8.192MHz)
      and returns parent clock rate.

Above is current upstreamed behavior. Clock mux system choose
i2s1_div, but this clock rate is not acceptable for I2S driver, so
users cannot replay audio.

Expected behavior is:
  - Try to set master clock to 11.2896MHz (44.1kHz audio replay)
    Candidate settings are
    - i2s1_div : CPLL / 107          = 11.214945MHz
    - i2s1_frac: i2s1_div * 147/6400 = 11.2896MHz
                 Change i2s1_div to GPLL / 1 = 491.52MHz at same
                 time.

If apply this commit, clk_fd_round_rate() calls custom approximate
function of Rockchip even if target rate is higher than parent.
Custom function changes both grand parent (i2s1_div) and parent
(i2s_frac) settings at same time. Clock mux system can choose
i2s1_frac and audio works fine.

Signed-off-by: Katsuhiro Suzuki <katsuhiro@katsuster.net>
Reviewed-by: Heiko Stuebner <heiko@sntech.de>
[sboyd@kernel.org: Make function into a macro instead]
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-fractional-divider.c | 2 +-
 include/linux/clk-provider.h         | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/clk/clk-fractional-divider.c b/drivers/clk/clk-fractional-divider.c
index 545dceec0bbf..fdfe2e423d15 100644
--- a/drivers/clk/clk-fractional-divider.c
+++ b/drivers/clk/clk-fractional-divider.c
@@ -79,7 +79,7 @@ static long clk_fd_round_rate(struct clk_hw *hw, unsigned long rate,
 	unsigned long m, n;
 	u64 ret;
 
-	if (!rate || rate >= *parent_rate)
+	if (!rate || (!clk_hw_can_set_rate_parent(hw) && rate >= *parent_rate))
 		return *parent_rate;
 
 	if (fd->approximation)
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index e443fa9fa859..b7cf80a71293 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -792,6 +792,9 @@ unsigned int __clk_get_enable_count(struct clk *clk);
 unsigned long clk_hw_get_rate(const struct clk_hw *hw);
 unsigned long __clk_get_flags(struct clk *clk);
 unsigned long clk_hw_get_flags(const struct clk_hw *hw);
+#define clk_hw_can_set_rate_parent(hw) \
+	(clk_hw_get_flags((hw)) & CLK_SET_RATE_PARENT)
+
 bool clk_hw_is_prepared(const struct clk_hw *hw);
 bool clk_hw_rate_is_protected(const struct clk_hw *hw);
 bool clk_hw_is_enabled(const struct clk_hw *hw);
-- 
cgit v1.2.3


From a9443a63283ae7eb78f735341da22bc3a69a464d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 18 Feb 2019 22:34:15 +0300
Subject: clk: x86: Move clk-lpss.h to platform_data/x86

clk-lpss.h is solely x86 related header. Move it to correct folder.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/acpi/acpi_lpss.c                   |  2 +-
 drivers/clk/x86/clk-lpt.c                  |  2 +-
 include/linux/platform_data/clk-lpss.h     | 23 -----------------------
 include/linux/platform_data/x86/clk-lpss.h | 23 +++++++++++++++++++++++
 4 files changed, 25 insertions(+), 25 deletions(-)
 delete mode 100644 include/linux/platform_data/clk-lpss.h
 create mode 100644 include/linux/platform_data/x86/clk-lpss.h

(limited to 'include')

diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index 5f94c35d165f..1e2a10a06b9d 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -18,7 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
-#include <linux/platform_data/clk-lpss.h>
+#include <linux/platform_data/x86/clk-lpss.h>
 #include <linux/platform_data/x86/pmc_atom.h>
 #include <linux/pm_domain.h>
 #include <linux/pm_runtime.h>
diff --git a/drivers/clk/x86/clk-lpt.c b/drivers/clk/x86/clk-lpt.c
index 6b40eb89ae19..68bd3abaef2c 100644
--- a/drivers/clk/x86/clk-lpt.c
+++ b/drivers/clk/x86/clk-lpt.c
@@ -13,7 +13,7 @@
 #include <linux/clk-provider.h>
 #include <linux/err.h>
 #include <linux/module.h>
-#include <linux/platform_data/clk-lpss.h>
+#include <linux/platform_data/x86/clk-lpss.h>
 #include <linux/platform_device.h>
 
 static int lpt_clk_probe(struct platform_device *pdev)
diff --git a/include/linux/platform_data/clk-lpss.h b/include/linux/platform_data/clk-lpss.h
deleted file mode 100644
index 23901992b9dd..000000000000
--- a/include/linux/platform_data/clk-lpss.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Intel Low Power Subsystem clocks.
- *
- * Copyright (C) 2013, Intel Corporation
- * Authors: Mika Westerberg <mika.westerberg@linux.intel.com>
- *          Rafael J. Wysocki <rafael.j.wysocki@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __CLK_LPSS_H
-#define __CLK_LPSS_H
-
-struct lpss_clk_data {
-	const char *name;
-	struct clk *clk;
-};
-
-extern int lpt_clk_init(void);
-
-#endif /* __CLK_LPSS_H */
diff --git a/include/linux/platform_data/x86/clk-lpss.h b/include/linux/platform_data/x86/clk-lpss.h
new file mode 100644
index 000000000000..23901992b9dd
--- /dev/null
+++ b/include/linux/platform_data/x86/clk-lpss.h
@@ -0,0 +1,23 @@
+/*
+ * Intel Low Power Subsystem clocks.
+ *
+ * Copyright (C) 2013, Intel Corporation
+ * Authors: Mika Westerberg <mika.westerberg@linux.intel.com>
+ *          Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __CLK_LPSS_H
+#define __CLK_LPSS_H
+
+struct lpss_clk_data {
+	const char *name;
+	struct clk *clk;
+};
+
+extern int lpt_clk_init(void);
+
+#endif /* __CLK_LPSS_H */
-- 
cgit v1.2.3


From e09d168f13f0d63df7fe095d52be04c16cbe1cef Mon Sep 17 00:00:00 2001
From: "Enrico Weigelt, metux IT consult" <info@metux.net>
Date: Fri, 22 Feb 2019 10:54:15 +0100
Subject: gpio: AMD G-Series PCH gpio driver

GPIO platform driver for the AMD G-series PCH (eg. on GX-412TC)

This driver doesn't registers itself automatically, as it needs to
be provided with platform specific configuration, provided by some
board driver setup code.

Didn't implement oftree probing yet, as it's rarely found on x86.

Cc: linux-gpio@vger.kernel.org
Cc: linus.walleij@linaro.org
Cc: bgolaszewski@baylibre.com
Cc: dvhart@infradead.org
Cc: platform-driver-x86@vger.kernel.org
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Enrico Weigelt, metux IT consult <info@metux.net>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 MAINTAINERS                                     |   7 +
 drivers/gpio/Kconfig                            |   9 ++
 drivers/gpio/Makefile                           |   1 +
 drivers/gpio/gpio-amd-fch.c                     | 185 ++++++++++++++++++++++++
 include/linux/platform_data/gpio/gpio-amd-fch.h |  46 ++++++
 5 files changed, 248 insertions(+)
 create mode 100644 drivers/gpio/gpio-amd-fch.c
 create mode 100644 include/linux/platform_data/gpio/gpio-amd-fch.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 9919840d54cd..5e4135c78862 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -766,6 +766,13 @@ S:	Supported
 F:	Documentation/hwmon/fam15h_power
 F:	drivers/hwmon/fam15h_power.c
 
+AMD FCH GPIO DRIVER
+M:	Enrico Weigelt, metux IT consult <info@metux.net>
+L:	linux-gpio@vger.kernel.org
+S:	Maintained
+F:	drivers/gpio/gpio-amd-fch.c
+F:	include/linux/platform_data/gpio/gpio-amd-fch.h
+
 AMD GEODE CS5536 USB DEVICE CONTROLLER DRIVER
 L:	linux-geode@lists.infradead.org (moderated for non-subscribers)
 S:	Orphan
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 486d9de2716a..3f50526a771f 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -655,6 +655,15 @@ config GPIO_LOONGSON1
 	help
 	  Say Y or M here to support GPIO on Loongson1 SoCs.
 
+config GPIO_AMD_FCH
+	tristate "GPIO support for AMD Fusion Controller Hub (G-series SOCs)"
+	help
+	  This option enables driver for GPIO on AMDs Fusion Controller Hub,
+	  as found on G-series SOCs (eg. GX-412TC)
+
+	  Note: This driver doesn't registers itself automatically, as it
+	  needs to be provided with platform specific configuration.
+	  (See eg. CONFIG_PCENGINES_APU2.)
 endmenu
 
 menu "Port-mapped I/O GPIO drivers"
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 9655927a3dcf..54d55274b93a 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_GPIO_ADP5520)	+= gpio-adp5520.o
 obj-$(CONFIG_GPIO_ADP5588)	+= gpio-adp5588.o
 obj-$(CONFIG_GPIO_ALTERA)  	+= gpio-altera.o
 obj-$(CONFIG_GPIO_ALTERA_A10SR)	+= gpio-altera-a10sr.o
+obj-$(CONFIG_GPIO_AMD_FCH)	+= gpio-amd-fch.o
 obj-$(CONFIG_GPIO_AMD8111)	+= gpio-amd8111.o
 obj-$(CONFIG_GPIO_AMDPT)	+= gpio-amdpt.o
 obj-$(CONFIG_GPIO_ARIZONA)	+= gpio-arizona.o
diff --git a/drivers/gpio/gpio-amd-fch.c b/drivers/gpio/gpio-amd-fch.c
new file mode 100644
index 000000000000..3b4fdce325c1
--- /dev/null
+++ b/drivers/gpio/gpio-amd-fch.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * GPIO driver for the AMD G series FCH (eg. GX-412TC)
+ *
+ * Copyright (C) 2018 metux IT consult
+ * Author: Enrico Weigelt, metux IT consult <info@metux.net>
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/gpio/driver.h>
+#include <linux/platform_data/gpio/gpio-amd-fch.h>
+#include <linux/spinlock.h>
+
+#define AMD_FCH_MMIO_BASE		0xFED80000
+#define AMD_FCH_GPIO_BANK0_BASE		0x1500
+#define AMD_FCH_GPIO_SIZE		0x0300
+
+#define AMD_FCH_GPIO_FLAG_DIRECTION	BIT(23)
+#define AMD_FCH_GPIO_FLAG_WRITE		BIT(22)
+#define AMD_FCH_GPIO_FLAG_READ		BIT(16)
+
+static const struct resource amd_fch_gpio_iores =
+	DEFINE_RES_MEM_NAMED(
+		AMD_FCH_MMIO_BASE + AMD_FCH_GPIO_BANK0_BASE,
+		AMD_FCH_GPIO_SIZE,
+		"amd-fch-gpio-iomem");
+
+struct amd_fch_gpio_priv {
+	struct platform_device		*pdev;
+	struct gpio_chip		gc;
+	void __iomem			*base;
+	struct amd_fch_gpio_pdata	*pdata;
+	spinlock_t			lock;
+};
+
+static void *amd_fch_gpio_addr(struct amd_fch_gpio_priv *priv,
+			       unsigned int gpio)
+{
+	return priv->base + priv->pdata->gpio_reg[gpio]*sizeof(u32);
+}
+
+static int amd_fch_gpio_direction_input(struct gpio_chip *gc,
+					unsigned int offset)
+{
+	unsigned long flags;
+	struct amd_fch_gpio_priv *priv = gpiochip_get_data(gc);
+	void *ptr = amd_fch_gpio_addr(priv, offset);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	writel_relaxed(readl_relaxed(ptr) & ~AMD_FCH_GPIO_FLAG_DIRECTION, ptr);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return 0;
+}
+
+static int amd_fch_gpio_direction_output(struct gpio_chip *gc,
+					 unsigned int gpio, int value)
+{
+	unsigned long flags;
+	struct amd_fch_gpio_priv *priv = gpiochip_get_data(gc);
+	void *ptr = amd_fch_gpio_addr(priv, gpio);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	writel_relaxed(readl_relaxed(ptr) | AMD_FCH_GPIO_FLAG_DIRECTION, ptr);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return 0;
+}
+
+static int amd_fch_gpio_get_direction(struct gpio_chip *gc, unsigned int gpio)
+{
+	int ret;
+	unsigned long flags;
+	struct amd_fch_gpio_priv *priv = gpiochip_get_data(gc);
+	void *ptr = amd_fch_gpio_addr(priv, gpio);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	ret = (readl_relaxed(ptr) & AMD_FCH_GPIO_FLAG_DIRECTION);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return ret;
+}
+
+static void amd_fch_gpio_set(struct gpio_chip *gc,
+			     unsigned int gpio, int value)
+{
+	unsigned long flags;
+	struct amd_fch_gpio_priv *priv = gpiochip_get_data(gc);
+	void *ptr = amd_fch_gpio_addr(priv, gpio);
+	u32 mask;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	mask = readl_relaxed(ptr);
+	if (value)
+		mask |= AMD_FCH_GPIO_FLAG_WRITE;
+	else
+		mask &= ~AMD_FCH_GPIO_FLAG_WRITE;
+	writel_relaxed(mask, ptr);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int amd_fch_gpio_get(struct gpio_chip *gc,
+			    unsigned int offset)
+{
+	unsigned long flags;
+	int ret;
+	struct amd_fch_gpio_priv *priv = gpiochip_get_data(gc);
+	void *ptr = amd_fch_gpio_addr(priv, offset);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	ret = (readl_relaxed(ptr) & AMD_FCH_GPIO_FLAG_READ);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return ret;
+}
+
+static int amd_fch_gpio_request(struct gpio_chip *chip,
+				unsigned int gpio_pin)
+{
+	return 0;
+}
+
+static int amd_fch_gpio_probe(struct platform_device *pdev)
+{
+	struct amd_fch_gpio_priv *priv;
+	struct amd_fch_gpio_pdata *pdata;
+
+	pdata = dev_get_platdata(&pdev->dev);
+	if (!pdata) {
+		dev_err(&pdev->dev, "no platform_data\n");
+		return -ENOENT;
+	}
+
+	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->pdata	= pdata;
+	priv->pdev	= pdev;
+
+	priv->gc.owner			= THIS_MODULE;
+	priv->gc.parent			= &pdev->dev;
+	priv->gc.label			= dev_name(&pdev->dev);
+	priv->gc.ngpio			= priv->pdata->gpio_num;
+	priv->gc.names			= priv->pdata->gpio_names;
+	priv->gc.base			= -1;
+	priv->gc.request		= amd_fch_gpio_request;
+	priv->gc.direction_input	= amd_fch_gpio_direction_input;
+	priv->gc.direction_output	= amd_fch_gpio_direction_output;
+	priv->gc.get_direction		= amd_fch_gpio_get_direction;
+	priv->gc.get			= amd_fch_gpio_get;
+	priv->gc.set			= amd_fch_gpio_set;
+
+	spin_lock_init(&priv->lock);
+
+	priv->base = devm_ioremap_resource(&pdev->dev, &amd_fch_gpio_iores);
+	if (IS_ERR(priv->base))
+		return PTR_ERR(priv->base);
+
+	platform_set_drvdata(pdev, priv);
+
+	return devm_gpiochip_add_data(&pdev->dev, &priv->gc, priv);
+}
+
+static struct platform_driver amd_fch_gpio_driver = {
+	.driver = {
+		.name = AMD_FCH_GPIO_DRIVER_NAME,
+	},
+	.probe = amd_fch_gpio_probe,
+};
+
+module_platform_driver(amd_fch_gpio_driver);
+
+MODULE_AUTHOR("Enrico Weigelt, metux IT consult <info@metux.net>");
+MODULE_DESCRIPTION("AMD G-series FCH GPIO driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:" AMD_FCH_GPIO_DRIVER_NAME);
diff --git a/include/linux/platform_data/gpio/gpio-amd-fch.h b/include/linux/platform_data/gpio/gpio-amd-fch.h
new file mode 100644
index 000000000000..a867637e172d
--- /dev/null
+++ b/include/linux/platform_data/gpio/gpio-amd-fch.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL+ */
+
+/*
+ * AMD FCH gpio driver platform-data
+ *
+ * Copyright (C) 2018 metux IT consult
+ * Author: Enrico Weigelt <info@metux.net>
+ *
+ */
+
+#ifndef __LINUX_PLATFORM_DATA_GPIO_AMD_FCH_H
+#define __LINUX_PLATFORM_DATA_GPIO_AMD_FCH_H
+
+#define AMD_FCH_GPIO_DRIVER_NAME "gpio_amd_fch"
+
+/*
+ * gpio register index definitions
+ */
+#define AMD_FCH_GPIO_REG_GPIO49		0x40
+#define AMD_FCH_GPIO_REG_GPIO50		0x41
+#define AMD_FCH_GPIO_REG_GPIO51		0x42
+#define AMD_FCH_GPIO_REG_GPIO59_DEVSLP0	0x43
+#define AMD_FCH_GPIO_REG_GPIO57		0x44
+#define AMD_FCH_GPIO_REG_GPIO58		0x45
+#define AMD_FCH_GPIO_REG_GPIO59_DEVSLP1	0x46
+#define AMD_FCH_GPIO_REG_GPIO64		0x47
+#define AMD_FCH_GPIO_REG_GPIO68		0x48
+#define AMD_FCH_GPIO_REG_GPIO66_SPKR	0x5B
+#define AMD_FCH_GPIO_REG_GPIO71		0x4D
+#define AMD_FCH_GPIO_REG_GPIO32_GE1	0x59
+#define AMD_FCH_GPIO_REG_GPIO33_GE2	0x5A
+#define AMT_FCH_GPIO_REG_GEVT22		0x09
+
+/*
+ * struct amd_fch_gpio_pdata - GPIO chip platform data
+ * @gpio_num: number of entries
+ * @gpio_reg: array of gpio registers
+ * @gpio_names: array of gpio names
+ */
+struct amd_fch_gpio_pdata {
+	int			gpio_num;
+	int			*gpio_reg;
+	const char * const	*gpio_names;
+};
+
+#endif /* __LINUX_PLATFORM_DATA_GPIO_AMD_FCH_H */
-- 
cgit v1.2.3


From a2a074ef396f8738d9ee08ceefa8811381a4fe4f Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 12 Feb 2019 20:39:16 +0200
Subject: RDMA: Handle ucontext allocations by IB/core

Following the PD conversion patch, do the same for ucontext allocations.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/device.c                |  1 +
 drivers/infiniband/core/rdma_core.c             |  9 +----
 drivers/infiniband/core/uverbs_cmd.c            | 24 +++++++-----
 drivers/infiniband/hw/bnxt_re/ib_verbs.c        | 32 +++++-----------
 drivers/infiniband/hw/bnxt_re/ib_verbs.h        |  7 ++--
 drivers/infiniband/hw/bnxt_re/main.c            |  1 +
 drivers/infiniband/hw/cxgb3/iwch_provider.c     | 17 ++++-----
 drivers/infiniband/hw/cxgb4/provider.c          | 26 +++++--------
 drivers/infiniband/hw/hns/hns_roce_main.c       | 26 +++++--------
 drivers/infiniband/hw/i40iw/i40iw_verbs.c       | 50 +++++++------------------
 drivers/infiniband/hw/mlx4/main.c               | 30 ++++++---------
 drivers/infiniband/hw/mlx5/main.c               | 35 +++++++----------
 drivers/infiniband/hw/mthca/mthca_provider.c    | 39 +++++++------------
 drivers/infiniband/hw/nes/nes_verbs.c           | 32 ++++++----------
 drivers/infiniband/hw/ocrdma/ocrdma_main.c      |  1 +
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c     | 38 +++++++------------
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h     |  5 +--
 drivers/infiniband/hw/qedr/main.c               |  1 +
 drivers/infiniband/hw/qedr/verbs.c              | 34 +++++------------
 drivers/infiniband/hw/qedr/verbs.h              |  4 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c     |  1 +
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c    | 18 +++------
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h    |  5 +--
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c  |  1 +
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 50 ++++++++-----------------
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h |  5 +--
 drivers/infiniband/sw/rdmavt/vt.c               | 22 ++++-------
 drivers/infiniband/sw/rxe/rxe_pool.c            |  1 +
 drivers/infiniband/sw/rxe/rxe_verbs.c           | 14 +++----
 drivers/infiniband/sw/rxe/rxe_verbs.h           |  2 +-
 include/rdma/ib_verbs.h                         |  7 ++--
 31 files changed, 198 insertions(+), 340 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index bf2a215d94dd..a9f29156e486 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1832,6 +1832,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, unmap_fmr);
 
 	SET_OBJ_SIZE(dev_ops, ib_pd);
+	SET_OBJ_SIZE(dev_ops, ib_ucontext);
 }
 EXPORT_SYMBOL(ib_set_device_ops);
 
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 96f919fe86e7..778375ff664e 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -844,7 +844,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
 {
 	struct ib_ucontext *ucontext = ufile->ucontext;
 	struct ib_device *ib_dev = ucontext->device;
-	int ret;
 
 	/*
 	 * If we are closing the FD then the user mmap VMAs must have
@@ -862,12 +861,8 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
 
 	rdma_restrack_del(&ucontext->res);
 
-	/*
-	 * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove
-	 * the error return.
-	 */
-	ret = ib_dev->ops.dealloc_ucontext(ucontext);
-	WARN_ON(ret);
+	ib_dev->ops.dealloc_ucontext(ucontext);
+	kfree(ucontext);
 
 	ufile->ucontext = NULL;
 }
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index e44ac718f1cd..3128821ca36e 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -224,12 +224,13 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
 	if (ret)
 		goto err;
 
-	ucontext = ib_dev->ops.alloc_ucontext(ib_dev, &attrs->driver_udata);
-	if (IS_ERR(ucontext)) {
-		ret = PTR_ERR(ucontext);
+	ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext);
+	if (!ucontext) {
+		ret = -ENOMEM;
 		goto err_alloc;
 	}
 
+	ucontext->res.type = RDMA_RESTRACK_CTX;
 	ucontext->device = ib_dev;
 	ucontext->cg_obj = cg_obj;
 	/* ufile is required when some objects are released */
@@ -240,10 +241,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
 
 	mutex_init(&ucontext->per_mm_list_lock);
 	INIT_LIST_HEAD(&ucontext->per_mm_list);
-	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
-		ucontext->invalidate_range = NULL;
-
-	resp.num_comp_vectors = file->device->num_comp_vectors;
 
 	ret = get_unused_fd_flags(O_CLOEXEC);
 	if (ret < 0)
@@ -256,15 +253,22 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
 		goto err_fd;
 	}
 
+	resp.num_comp_vectors = file->device->num_comp_vectors;
+
 	ret = uverbs_response(attrs, &resp, sizeof(resp));
 	if (ret)
 		goto err_file;
 
-	fd_install(resp.async_fd, filp);
+	ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata);
+	if (ret)
+		goto err_file;
+	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+		ucontext->invalidate_range = NULL;
 
-	ucontext->res.type = RDMA_RESTRACK_CTX;
 	rdma_restrack_uadd(&ucontext->res);
 
+	fd_install(resp.async_fd, filp);
+
 	/*
 	 * Make sure that ib_uverbs_get_ucontext() sees the pointer update
 	 * only after all writes to setup the ucontext have completed
@@ -283,7 +287,7 @@ err_fd:
 	put_unused_fd(resp.async_fd);
 
 err_free:
-	ib_dev->ops.dealloc_ucontext(ucontext);
+	kfree(ucontext);
 
 err_alloc:
 	ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index f29f29aae537..24092911c2ac 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -3671,13 +3671,14 @@ free_mr:
 	return ERR_PTR(rc);
 }
 
-struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
-					   struct ib_udata *udata)
+int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ctx->device;
+	struct bnxt_re_ucontext *uctx =
+		container_of(ctx, struct bnxt_re_ucontext, ib_uctx);
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
 	struct bnxt_re_uctx_resp resp;
-	struct bnxt_re_ucontext *uctx;
 	u32 chip_met_rev_num = 0;
 	int rc;
 
@@ -3687,13 +3688,9 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
 	if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
 		dev_dbg(rdev_to_dev(rdev), " is different from the device %d ",
 			BNXT_RE_ABI_VERSION);
-		return ERR_PTR(-EPERM);
+		return -EPERM;
 	}
 
-	uctx = kzalloc(sizeof(*uctx), GFP_KERNEL);
-	if (!uctx)
-		return ERR_PTR(-ENOMEM);
-
 	uctx->rdev = rdev;
 
 	uctx->shpg = (void *)__get_free_page(GFP_KERNEL);
@@ -3727,23 +3724,21 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
 		goto cfail;
 	}
 
-	return &uctx->ib_uctx;
+	return 0;
 cfail:
 	free_page((unsigned long)uctx->shpg);
 	uctx->shpg = NULL;
 fail:
-	kfree(uctx);
-	return ERR_PTR(rc);
+	return rc;
 }
 
-int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
+void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
 {
 	struct bnxt_re_ucontext *uctx = container_of(ib_uctx,
 						   struct bnxt_re_ucontext,
 						   ib_uctx);
 
 	struct bnxt_re_dev *rdev = uctx->rdev;
-	int rc = 0;
 
 	if (uctx->shpg)
 		free_page((unsigned long)uctx->shpg);
@@ -3752,17 +3747,10 @@ int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
 		/* Free DPI only if this is the first PD allocated by the
 		 * application and mark the context dpi as NULL
 		 */
-		rc = bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
-					    &rdev->qplib_res.dpi_tbl,
-					    &uctx->dpi);
-		if (rc)
-			dev_err(rdev_to_dev(rdev), "Deallocate HW DPI failed!");
-			/* Don't fail, continue*/
+		bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
+				       &rdev->qplib_res.dpi_tbl, &uctx->dpi);
 		uctx->dpi.dbr = NULL;
 	}
-
-	kfree(uctx);
-	return 0;
 }
 
 /* Helper function to mmap the virtual memory from user app */
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index c7cca803cfa3..e45465ed4eee 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -135,8 +135,8 @@ struct bnxt_re_mw {
 };
 
 struct bnxt_re_ucontext {
+	struct ib_ucontext      ib_uctx;
 	struct bnxt_re_dev	*rdev;
-	struct ib_ucontext	ib_uctx;
 	struct bnxt_qplib_dpi	dpi;
 	void			*shpg;
 	spinlock_t		sh_lock;	/* protect shpg */
@@ -215,9 +215,8 @@ int bnxt_re_dealloc_mw(struct ib_mw *mw);
 struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int mr_access_flags,
 				  struct ib_udata *udata);
-struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
-					   struct ib_udata *udata);
-int bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
+int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata);
+void bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
 int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
 unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 0a89ef6e5754..2bd24ac45ee4 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -638,6 +638,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
 	.reg_user_mr = bnxt_re_reg_user_mr,
 	.req_notify_cq = bnxt_re_req_notify_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx),
 };
 
 static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index b74fd90a22dc..4accf7b3dcf2 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -62,7 +62,7 @@
 #include <rdma/cxgb3-abi.h>
 #include "common.h"
 
-static int iwch_dealloc_ucontext(struct ib_ucontext *context)
+static void iwch_dealloc_ucontext(struct ib_ucontext *context)
 {
 	struct iwch_dev *rhp = to_iwch_dev(context->device);
 	struct iwch_ucontext *ucontext = to_iwch_ucontext(context);
@@ -72,24 +72,20 @@ static int iwch_dealloc_ucontext(struct ib_ucontext *context)
 	list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
 		kfree(mm);
 	cxio_release_ucontext(&rhp->rdev, &ucontext->uctx);
-	kfree(ucontext);
-	return 0;
 }
 
-static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev,
-					struct ib_udata *udata)
+static int iwch_alloc_ucontext(struct ib_ucontext *ucontext,
+			       struct ib_udata *udata)
 {
-	struct iwch_ucontext *context;
+	struct ib_device *ibdev = ucontext->device;
+	struct iwch_ucontext *context = to_iwch_ucontext(ucontext);
 	struct iwch_dev *rhp = to_iwch_dev(ibdev);
 
 	pr_debug("%s ibdev %p\n", __func__, ibdev);
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
 	cxio_init_ucontext(&rhp->rdev, &context->uctx);
 	INIT_LIST_HEAD(&context->mmaps);
 	spin_lock_init(&context->mmap_lock);
-	return &context->ibucontext;
+	return 0;
 }
 
 static int iwch_destroy_cq(struct ib_cq *ib_cq)
@@ -1342,6 +1338,7 @@ static const struct ib_device_ops iwch_dev_ops = {
 	.req_notify_cq = iwch_arm_cq,
 	.resize_cq = iwch_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext),
 };
 
 int iwch_register_device(struct iwch_dev *dev)
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 81fcffb597ab..507c54572cc9 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -58,7 +58,7 @@ static int fastreg_support = 1;
 module_param(fastreg_support, int, 0644);
 MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)");
 
-static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
+static void c4iw_dealloc_ucontext(struct ib_ucontext *context)
 {
 	struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
 	struct c4iw_dev *rhp;
@@ -70,26 +70,19 @@ static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
 	list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
 		kfree(mm);
 	c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
-	kfree(ucontext);
-	return 0;
 }
 
-static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
-					       struct ib_udata *udata)
+static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext,
+			       struct ib_udata *udata)
 {
-	struct c4iw_ucontext *context;
+	struct ib_device *ibdev = ucontext->device;
+	struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext);
 	struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
 	struct c4iw_alloc_ucontext_resp uresp;
 	int ret = 0;
 	struct c4iw_mm_entry *mm = NULL;
 
 	pr_debug("ibdev %p\n", ibdev);
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
 	c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
 	INIT_LIST_HEAD(&context->mmaps);
 	spin_lock_init(&context->mmap_lock);
@@ -101,7 +94,7 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
 		mm = kmalloc(sizeof(*mm), GFP_KERNEL);
 		if (!mm) {
 			ret = -ENOMEM;
-			goto err_free;
+			goto err;
 		}
 
 		uresp.status_page_size = PAGE_SIZE;
@@ -121,13 +114,11 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
 		mm->len = PAGE_SIZE;
 		insert_mmap(context, mm);
 	}
-	return &context->ibucontext;
+	return 0;
 err_mm:
 	kfree(mm);
-err_free:
-	kfree(context);
 err:
-	return ERR_PTR(ret);
+	return ret;
 }
 
 static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -555,6 +546,7 @@ static const struct ib_device_ops c4iw_dev_ops = {
 	.reg_user_mr = c4iw_reg_user_mr,
 	.req_notify_cq = c4iw_arm_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
 };
 
 void c4iw_register_device(struct work_struct *work)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 29fb4fbba5ba..c929125da84b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -335,23 +335,19 @@ static int hns_roce_modify_port(struct ib_device *ib_dev, u8 port_num, int mask,
 	return 0;
 }
 
-static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev,
-						   struct ib_udata *udata)
+static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
+				   struct ib_udata *udata)
 {
 	int ret = 0;
-	struct hns_roce_ucontext *context;
+	struct hns_roce_ucontext *context = to_hr_ucontext(uctx);
 	struct hns_roce_ib_alloc_ucontext_resp resp = {};
-	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
+	struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device);
 
 	if (!hr_dev->active)
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 
 	resp.qp_tab_size = hr_dev->caps.num_qps;
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	ret = hns_roce_uar_alloc(hr_dev, &context->uar);
 	if (ret)
 		goto error_fail_uar_alloc;
@@ -365,25 +361,20 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev,
 	if (ret)
 		goto error_fail_copy_to_udata;
 
-	return &context->ibucontext;
+	return 0;
 
 error_fail_copy_to_udata:
 	hns_roce_uar_free(hr_dev, &context->uar);
 
 error_fail_uar_alloc:
-	kfree(context);
-
-	return ERR_PTR(ret);
+	return ret;
 }
 
-static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext);
 
 	hns_roce_uar_free(to_hr_dev(ibcontext->device), &context->uar);
-	kfree(context);
-
-	return 0;
 }
 
 static int hns_roce_mmap(struct ib_ucontext *context,
@@ -478,6 +469,7 @@ static const struct ib_device_ops hns_roce_dev_ops = {
 	.query_port = hns_roce_query_port,
 	.reg_user_mr = hns_roce_reg_user_mr,
 	INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext),
 };
 
 static const struct ib_device_ops hns_roce_dev_mr_ops = {
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 76b4d1218696..a8352e3ca23d 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -121,78 +121,55 @@ static int i40iw_query_port(struct ib_device *ibdev,
 
 /**
  * i40iw_alloc_ucontext - Allocate the user context data structure
- * @ibdev: device pointer from stack
+ * @uctx: Uverbs context pointer from stack
  * @udata: user data
  *
  * This keeps track of all objects associated with a particular
  * user-mode client.
  */
-static struct ib_ucontext *i40iw_alloc_ucontext(struct ib_device *ibdev,
-						struct ib_udata *udata)
+static int i40iw_alloc_ucontext(struct ib_ucontext *uctx,
+				struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct i40iw_device *iwdev = to_iwdev(ibdev);
 	struct i40iw_alloc_ucontext_req req;
-	struct i40iw_alloc_ucontext_resp uresp;
-	struct i40iw_ucontext *ucontext;
+	struct i40iw_alloc_ucontext_resp uresp = {};
+	struct i40iw_ucontext *ucontext = to_ucontext(uctx);
 
 	if (ib_copy_from_udata(&req, udata, sizeof(req)))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (req.userspace_ver < 4 || req.userspace_ver > I40IW_ABI_VER) {
 		i40iw_pr_err("Unsupported provider library version %u.\n", req.userspace_ver);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	memset(&uresp, 0, sizeof(uresp));
 	uresp.max_qps = iwdev->max_qp;
 	uresp.max_pds = iwdev->max_pd;
 	uresp.wq_size = iwdev->max_qp_wr * 2;
 	uresp.kernel_ver = req.userspace_ver;
 
-	ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL);
-	if (!ucontext)
-		return ERR_PTR(-ENOMEM);
-
 	ucontext->iwdev = iwdev;
 	ucontext->abi_ver = req.userspace_ver;
 
-	if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
-		kfree(ucontext);
-		return ERR_PTR(-EFAULT);
-	}
+	if (ib_copy_to_udata(udata, &uresp, sizeof(uresp)))
+		return -EFAULT;
 
 	INIT_LIST_HEAD(&ucontext->cq_reg_mem_list);
 	spin_lock_init(&ucontext->cq_reg_mem_list_lock);
 	INIT_LIST_HEAD(&ucontext->qp_reg_mem_list);
 	spin_lock_init(&ucontext->qp_reg_mem_list_lock);
 
-	return &ucontext->ibucontext;
+	return 0;
 }
 
 /**
  * i40iw_dealloc_ucontext - deallocate the user context data structure
  * @context: user context created during alloc
  */
-static int i40iw_dealloc_ucontext(struct ib_ucontext *context)
+static void i40iw_dealloc_ucontext(struct ib_ucontext *context)
 {
-	struct i40iw_ucontext *ucontext = to_ucontext(context);
-	unsigned long flags;
-
-	spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
-	if (!list_empty(&ucontext->cq_reg_mem_list)) {
-		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
-		return -EBUSY;
-	}
-	spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
-	spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags);
-	if (!list_empty(&ucontext->qp_reg_mem_list)) {
-		spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
-		return -EBUSY;
-	}
-	spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
-
-	kfree(ucontext);
-	return 0;
+	return;
 }
 
 /**
@@ -2740,6 +2717,7 @@ static const struct ib_device_ops i40iw_dev_ops = {
 	.reg_user_mr = i40iw_reg_user_mr,
 	.req_notify_cq = i40iw_req_notify_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext),
 };
 
 /**
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index c0f6aea7ed7c..733f7bbd5901 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1076,17 +1076,18 @@ out:
 	return err;
 }
 
-static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
-						  struct ib_udata *udata)
+static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx,
+				  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
-	struct mlx4_ib_ucontext *context;
+	struct mlx4_ib_ucontext *context = to_mucontext(uctx);
 	struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
 	struct mlx4_ib_alloc_ucontext_resp resp;
 	int err;
 
 	if (!dev->ib_active)
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 
 	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
 		resp_v3.qp_tab_size      = dev->dev->caps.num_qps;
@@ -1100,15 +1101,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
 		resp.cqe_size	      = dev->dev->caps.cqe_size;
 	}
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
-	if (err) {
-		kfree(context);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
@@ -1123,21 +1118,17 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
 
 	if (err) {
 		mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
-		kfree(context);
-		return ERR_PTR(-EFAULT);
+		return -EFAULT;
 	}
 
-	return &context->ibucontext;
+	return err;
 }
 
-static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
 
 	mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
-	kfree(context);
-
-	return 0;
 }
 
 static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
@@ -2570,6 +2561,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = {
 	.rereg_user_mr = mlx4_ib_rereg_user_mr,
 	.resize_cq = mlx4_ib_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext),
 };
 
 static const struct ib_device_ops mlx4_ib_dev_wq_ops = {
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 614b2acbc621..994c19d01211 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1745,14 +1745,15 @@ static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
 	mlx5_ib_disable_lb(dev, true, false);
 }
 
-static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
-						  struct ib_udata *udata)
+static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
+				  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
 	struct mlx5_ib_alloc_ucontext_resp resp = {};
 	struct mlx5_core_dev *mdev = dev->mdev;
-	struct mlx5_ib_ucontext *context;
+	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
 	struct mlx5_bfreg_info *bfregi;
 	int ver;
 	int err;
@@ -1762,29 +1763,29 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	bool lib_uar_4k;
 
 	if (!dev->ib_active)
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 
 	if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
 		ver = 0;
 	else if (udata->inlen >= min_req_v2)
 		ver = 2;
 	else
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	req.total_num_bfregs = ALIGN(req.total_num_bfregs,
 				    MLX5_NON_FP_BFREGS_PER_UAR);
 	if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
 	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
@@ -1817,10 +1818,6 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 		/* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
 	}
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
 	bfregi = &context->bfregi;
 
@@ -1955,7 +1952,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 				   1, &dev->roce[port].tx_port_affinity));
 	}
 
-	return &context->ibucontext;
+	return 0;
 
 out_mdev:
 	mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
@@ -1973,12 +1970,10 @@ out_count:
 	kfree(bfregi->count);
 
 out_ctx:
-	kfree(context);
-
-	return ERR_PTR(err);
+	return err;
 }
 
-static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
@@ -1998,9 +1993,6 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 	deallocate_uars(dev, context);
 	kfree(bfregi->sys_pages);
 	kfree(bfregi->count);
-	kfree(context);
-
-	return 0;
 }
 
 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
@@ -5984,6 +5976,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
 	.rereg_user_mr = mlx5_ib_rereg_user_mr,
 	.resize_cq = mlx5_ib_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
 };
 
 static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 80c3af217d96..d063d7a37762 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -301,17 +301,16 @@ static int mthca_query_gid(struct ib_device *ibdev, u8 port,
 	return err;
 }
 
-static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
-						struct ib_udata *udata)
+static int mthca_alloc_ucontext(struct ib_ucontext *uctx,
+				struct ib_udata *udata)
 {
-	struct mthca_alloc_ucontext_resp uresp;
-	struct mthca_ucontext           *context;
+	struct ib_device *ibdev = uctx->device;
+	struct mthca_alloc_ucontext_resp uresp = {};
+	struct mthca_ucontext *context = to_mucontext(uctx);
 	int                              err;
 
 	if (!(to_mdev(ibdev)->active))
-		return ERR_PTR(-EAGAIN);
-
-	memset(&uresp, 0, sizeof uresp);
+		return -EAGAIN;
 
 	uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;
 	if (mthca_is_memfree(to_mdev(ibdev)))
@@ -319,44 +318,33 @@ static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
 	else
 		uresp.uarc_size = 0;
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	err = mthca_uar_alloc(to_mdev(ibdev), &context->uar);
-	if (err) {
-		kfree(context);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
 	context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev));
 	if (IS_ERR(context->db_tab)) {
 		err = PTR_ERR(context->db_tab);
 		mthca_uar_free(to_mdev(ibdev), &context->uar);
-		kfree(context);
-		return ERR_PTR(err);
+		return err;
 	}
 
-	if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+	if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
 		mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab);
 		mthca_uar_free(to_mdev(ibdev), &context->uar);
-		kfree(context);
-		return ERR_PTR(-EFAULT);
+		return -EFAULT;
 	}
 
 	context->reg_mr_warned = 0;
 
-	return &context->ibucontext;
+	return 0;
 }
 
-static int mthca_dealloc_ucontext(struct ib_ucontext *context)
+static void mthca_dealloc_ucontext(struct ib_ucontext *context)
 {
 	mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar,
 				  to_mucontext(context)->db_tab);
 	mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar);
-	kfree(to_mucontext(context));
-
-	return 0;
 }
 
 static int mthca_mmap_uar(struct ib_ucontext *context,
@@ -1213,6 +1201,7 @@ static const struct ib_device_ops mthca_dev_ops = {
 	.reg_user_mr = mthca_reg_user_mr,
 	.resize_cq = mthca_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext),
 };
 
 static const struct ib_device_ops mthca_dev_arbel_srq_ops = {
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index b23956aa45b8..828e4af3f951 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -529,27 +529,27 @@ static int nes_query_gid(struct ib_device *ibdev, u8 port,
  * nes_alloc_ucontext - Allocate the user context data structure. This keeps track
  * of all objects associated with a particular user-mode client.
  */
-static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
-		struct ib_udata *udata)
+static int nes_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
 	struct nes_device *nesdev = nesvnic->nesdev;
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
 	struct nes_alloc_ucontext_req req;
 	struct nes_alloc_ucontext_resp uresp = {};
-	struct nes_ucontext *nes_ucontext;
+	struct nes_ucontext *nes_ucontext = to_nesucontext(uctx);
 	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
 
 
 	if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) {
 		printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	if (req.userspace_ver != NES_ABI_USERSPACE_VER) {
 		printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n",
 			req.userspace_ver, NES_ABI_USERSPACE_VER);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 
@@ -559,10 +559,6 @@ static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
 	uresp.virtwq = nesadapter->virtwq;
 	uresp.kernel_ver = NES_ABI_KERNEL_VER;
 
-	nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL);
-	if (!nes_ucontext)
-		return ERR_PTR(-ENOMEM);
-
 	nes_ucontext->nesdev = nesdev;
 	nes_ucontext->mmap_wq_offset = uresp.max_pds;
 	nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset +
@@ -570,29 +566,22 @@ static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
 			PAGE_SIZE;
 
 
-	if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
-		kfree(nes_ucontext);
-		return ERR_PTR(-EFAULT);
-	}
+	if (ib_copy_to_udata(udata, &uresp, sizeof(uresp)))
+		return -EFAULT;
 
 	INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list);
 	INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list);
-	return &nes_ucontext->ibucontext;
+	return 0;
 }
 
-
 /**
  * nes_dealloc_ucontext
  */
-static int nes_dealloc_ucontext(struct ib_ucontext *context)
+static void nes_dealloc_ucontext(struct ib_ucontext *context)
 {
-	struct nes_ucontext *nes_ucontext = to_nesucontext(context);
-
-	kfree(nes_ucontext);
-	return 0;
+	return;
 }
 
-
 /**
  * nes_mmap
  */
@@ -3599,6 +3588,7 @@ static const struct ib_device_ops nes_dev_ops = {
 	.reg_user_mr = nes_reg_user_mr,
 	.req_notify_cq = nes_req_notify_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, nes_ucontext, ibucontext),
 };
 
 /**
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 0de83c92691f..b9e10d55a58e 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -180,6 +180,7 @@ static const struct ib_device_ops ocrdma_dev_ops = {
 	.req_notify_cq = ocrdma_arm_cq,
 	.resize_cq = ocrdma_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext),
 };
 
 static const struct ib_device_ops ocrdma_dev_srq_ops = {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index ed5da67b693d..b4e1777c2c97 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -440,7 +440,7 @@ err:
 	return status;
 }
 
-static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
+static void ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
 {
 	struct ocrdma_pd *pd = uctx->cntxt_pd;
 	struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
@@ -451,8 +451,7 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
 	}
 	kfree(uctx->cntxt_pd);
 	uctx->cntxt_pd = NULL;
-	(void)_ocrdma_dealloc_pd(dev, pd);
-	return 0;
+	_ocrdma_dealloc_pd(dev, pd);
 }
 
 static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx)
@@ -476,33 +475,28 @@ static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx)
 	mutex_unlock(&uctx->mm_list_lock);
 }
 
-struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev,
-					  struct ib_udata *udata)
+int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	int status;
-	struct ocrdma_ucontext *ctx;
-	struct ocrdma_alloc_ucontext_resp resp;
+	struct ocrdma_ucontext *ctx = get_ocrdma_ucontext(uctx);
+	struct ocrdma_alloc_ucontext_resp resp = {};
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
 	struct pci_dev *pdev = dev->nic_info.pdev;
 	u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE);
 
 	if (!udata)
-		return ERR_PTR(-EFAULT);
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
+		return -EFAULT;
 	INIT_LIST_HEAD(&ctx->mm_head);
 	mutex_init(&ctx->mm_list_lock);
 
 	ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len,
 					    &ctx->ah_tbl.pa, GFP_KERNEL);
-	if (!ctx->ah_tbl.va) {
-		kfree(ctx);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!ctx->ah_tbl.va)
+		return -ENOMEM;
+
 	ctx->ah_tbl.len = map_len;
 
-	memset(&resp, 0, sizeof(resp));
 	resp.ah_tbl_len = ctx->ah_tbl.len;
 	resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va);
 
@@ -524,7 +518,7 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev,
 	status = ib_copy_to_udata(udata, &resp, sizeof(resp));
 	if (status)
 		goto cpy_err;
-	return &ctx->ibucontext;
+	return 0;
 
 cpy_err:
 	ocrdma_dealloc_ucontext_pd(ctx);
@@ -533,19 +527,17 @@ pd_err:
 map_err:
 	dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va,
 			  ctx->ah_tbl.pa);
-	kfree(ctx);
-	return ERR_PTR(status);
+	return status;
 }
 
-int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
+void ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
 {
-	int status;
 	struct ocrdma_mm *mm, *tmp;
 	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device);
 	struct pci_dev *pdev = dev->nic_info.pdev;
 
-	status = ocrdma_dealloc_ucontext_pd(uctx);
+	ocrdma_dealloc_ucontext_pd(uctx);
 
 	ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len);
 	dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va,
@@ -555,8 +547,6 @@ int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
 		list_del(&mm->entry);
 		kfree(mm);
 	}
-	kfree(uctx);
-	return status;
 }
 
 int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index 1fd66721c930..4c04ab40798e 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -64,9 +64,8 @@ void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
 struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num);
 int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 
-struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
-					  struct ib_udata *);
-int ocrdma_dealloc_ucontext(struct ib_ucontext *);
+int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void ocrdma_dealloc_ucontext(struct ib_ucontext *uctx);
 
 int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 44ce4989dcef..996d9ecd93e0 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -240,6 +240,7 @@ static const struct ib_device_ops qedr_dev_ops = {
 	.req_notify_cq = qedr_arm_cq,
 	.resize_cq = qedr_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext),
 };
 
 static int qedr_register_device(struct qedr_dev *dev)
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index d51bc3ede9d1..59ad4202422c 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -316,28 +316,24 @@ static bool qedr_search_mmap(struct qedr_ucontext *uctx, u64 phy_addr,
 	return found;
 }
 
-struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev,
-					struct ib_udata *udata)
+int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	int rc;
-	struct qedr_ucontext *ctx;
-	struct qedr_alloc_ucontext_resp uresp;
+	struct qedr_ucontext *ctx = get_qedr_ucontext(uctx);
+	struct qedr_alloc_ucontext_resp uresp = {};
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_add_user_out_params oparams;
 
 	if (!udata)
-		return ERR_PTR(-EFAULT);
-
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
+		return -EFAULT;
 
 	rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams);
 	if (rc) {
 		DP_ERR(dev,
 		       "failed to allocate a DPI for a new RoCE application, rc=%d. To overcome this consider to increase the number of DPIs, increase the doorbell BAR size or just close unnecessary RoCE applications. In order to increase the number of DPIs consult the qedr readme\n",
 		       rc);
-		goto err;
+		return rc;
 	}
 
 	ctx->dpi = oparams.dpi;
@@ -347,8 +343,6 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev,
 	INIT_LIST_HEAD(&ctx->mm_head);
 	mutex_init(&ctx->mm_list_lock);
 
-	memset(&uresp, 0, sizeof(uresp));
-
 	uresp.dpm_enabled = dev->user_dpm_enabled;
 	uresp.wids_enabled = 1;
 	uresp.wid_count = oparams.wid_count;
@@ -364,28 +358,23 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev,
 
 	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 	if (rc)
-		goto err;
+		return rc;
 
 	ctx->dev = dev;
 
 	rc = qedr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size);
 	if (rc)
-		goto err;
+		return rc;
 
 	DP_DEBUG(dev, QEDR_MSG_INIT, "Allocating user context %p\n",
 		 &ctx->ibucontext);
-	return &ctx->ibucontext;
-
-err:
-	kfree(ctx);
-	return ERR_PTR(rc);
+	return 0;
 }
 
-int qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
+void qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
 {
 	struct qedr_ucontext *uctx = get_qedr_ucontext(ibctx);
 	struct qedr_mm *mm, *tmp;
-	int status = 0;
 
 	DP_DEBUG(uctx->dev, QEDR_MSG_INIT, "Deallocating user context %p\n",
 		 uctx);
@@ -398,9 +387,6 @@ int qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
 		list_del(&mm->entry);
 		kfree(mm);
 	}
-
-	kfree(uctx);
-	return status;
 }
 
 int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 97a6ff3f9afb..f0c05f4771ac 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -43,8 +43,8 @@ int qedr_iw_query_gid(struct ib_device *ibdev, u8 port,
 
 int qedr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 
-struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *);
-int qedr_dealloc_ucontext(struct ib_ucontext *);
+int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
 
 int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 int qedr_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx,
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index be6468021a9a..d88d9f8a7f9a 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -350,6 +350,7 @@ static const struct ib_device_ops usnic_dev_ops = {
 	.query_qp = usnic_ib_query_qp,
 	.reg_user_mr = usnic_ib_reg_mr,
 	INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, usnic_ib_ucontext, ibucontext),
 };
 
 /* Start of PF discovery section */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 7549ae23027e..bd4521b2cc5f 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -653,37 +653,31 @@ int usnic_ib_dereg_mr(struct ib_mr *ibmr)
 	return 0;
 }
 
-struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
-							struct ib_udata *udata)
+int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
-	struct usnic_ib_ucontext *context;
+	struct ib_device *ibdev = uctx->device;
+	struct usnic_ib_ucontext *context = to_ucontext(uctx);
 	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
 	usnic_dbg("\n");
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	INIT_LIST_HEAD(&context->qp_grp_list);
 	mutex_lock(&us_ibdev->usdev_lock);
 	list_add_tail(&context->link, &us_ibdev->ctx_list);
 	mutex_unlock(&us_ibdev->usdev_lock);
 
-	return &context->ibucontext;
+	return 0;
 }
 
-int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct usnic_ib_ucontext *context = to_uucontext(ibcontext);
 	struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device);
 	usnic_dbg("\n");
 
 	mutex_lock(&us_ibdev->usdev_lock);
-	BUG_ON(!list_empty(&context->qp_grp_list));
+	WARN_ON_ONCE(!list_empty(&context->qp_grp_list));
 	list_del(&context->link);
 	mutex_unlock(&us_ibdev->usdev_lock);
-	kfree(context);
-	return 0;
 }
 
 int usnic_ib_mmap(struct ib_ucontext *context,
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 2a87650949f6..c40e89b6246f 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -68,9 +68,8 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
 				u64 virt_addr, int access_flags,
 				struct ib_udata *udata);
 int usnic_ib_dereg_mr(struct ib_mr *ibmr);
-struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
-						struct ib_udata *udata);
-int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
+int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
 int usnic_ib_mmap(struct ib_ucontext *context,
 			struct vm_area_struct *vma);
 #endif /* !USNIC_IB_VERBS_H */
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 47e653d2495c..6d8b3e0de57a 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -196,6 +196,7 @@ static const struct ib_device_ops pvrdma_dev_ops = {
 	.reg_user_mr = pvrdma_reg_user_mr,
 	.req_notify_cq = pvrdma_req_notify_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext),
 };
 
 static const struct ib_device_ops pvrdma_dev_srq_ops = {
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
index f44220f72e05..42fe821f8d58 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
@@ -306,41 +306,32 @@ out:
 
 /**
  * pvrdma_alloc_ucontext - allocate ucontext
- * @ibdev: the IB device
+ * @uctx: the uverbs countext
  * @udata: user data
  *
- * @return: the ib_ucontext pointer on success, otherwise errno.
+ * @return:  zero on success, otherwise errno.
  */
-struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
-					  struct ib_udata *udata)
+int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct pvrdma_dev *vdev = to_vdev(ibdev);
-	struct pvrdma_ucontext *context;
-	union pvrdma_cmd_req req;
-	union pvrdma_cmd_resp rsp;
+	struct pvrdma_ucontext *context = to_vucontext(uctx);
+	union pvrdma_cmd_req req = {};
+	union pvrdma_cmd_resp rsp = {};
 	struct pvrdma_cmd_create_uc *cmd = &req.create_uc;
 	struct pvrdma_cmd_create_uc_resp *resp = &rsp.create_uc_resp;
-	struct pvrdma_alloc_ucontext_resp uresp = {0};
+	struct pvrdma_alloc_ucontext_resp uresp = {};
 	int ret;
-	void *ptr;
 
 	if (!vdev->ib_active)
-		return ERR_PTR(-EAGAIN);
-
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
+		return -EAGAIN;
 
 	context->dev = vdev;
 	ret = pvrdma_uar_alloc(vdev, &context->uar);
-	if (ret) {
-		kfree(context);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (ret)
+		return -ENOMEM;
 
 	/* get ctx_handle from host */
-	memset(cmd, 0, sizeof(*cmd));
-
 	if (vdev->dsr_version < PVRDMA_PPN64_VERSION)
 		cmd->pfn = context->uar.pfn;
 	else
@@ -351,7 +342,6 @@ struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
 	if (ret < 0) {
 		dev_warn(&vdev->pdev->dev,
 			 "could not create ucontext, error: %d\n", ret);
-		ptr = ERR_PTR(ret);
 		goto err;
 	}
 
@@ -362,33 +352,28 @@ struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
 	ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 	if (ret) {
 		pvrdma_uar_free(vdev, &context->uar);
-		context->ibucontext.device = ibdev;
 		pvrdma_dealloc_ucontext(&context->ibucontext);
-		return ERR_PTR(-EFAULT);
+		return -EFAULT;
 	}
 
-	return &context->ibucontext;
+	return 0;
 
 err:
 	pvrdma_uar_free(vdev, &context->uar);
-	kfree(context);
-	return ptr;
+	return ret;
 }
 
 /**
  * pvrdma_dealloc_ucontext - deallocate ucontext
  * @ibcontext: the ucontext
- *
- * @return: 0 on success, otherwise errno.
  */
-int pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
+void pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct pvrdma_ucontext *context = to_vucontext(ibcontext);
-	union pvrdma_cmd_req req;
+	union pvrdma_cmd_req req = {};
 	struct pvrdma_cmd_destroy_uc *cmd = &req.destroy_uc;
 	int ret;
 
-	memset(cmd, 0, sizeof(*cmd));
 	cmd->hdr.cmd = PVRDMA_CMD_DESTROY_UC;
 	cmd->ctx_handle = context->ctx_handle;
 
@@ -399,9 +384,6 @@ int pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
 
 	/* Free the UAR even if the device command failed */
 	pvrdma_uar_free(to_vdev(ibcontext->device), &context->uar);
-	kfree(context);
-
-	return ret;
 }
 
 /**
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index ed91baad1ffa..607aa131d67c 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -396,9 +396,8 @@ int pvrdma_modify_device(struct ib_device *ibdev, int mask,
 int pvrdma_modify_port(struct ib_device *ibdev, u8 port,
 		       int mask, struct ib_port_modify *props);
 int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
-					  struct ib_udata *udata);
-int pvrdma_dealloc_ucontext(struct ib_ucontext *context);
+int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void pvrdma_dealloc_ucontext(struct ib_ucontext *context);
 int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
 		    struct ib_udata *udata);
 void pvrdma_dealloc_pd(struct ib_pd *ibpd);
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index ee9c82cb3b6b..42c9d35f832d 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -292,28 +292,21 @@ static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext
 
 /**
  * rvt_alloc_ucontext - Allocate a user context
- * @ibdev: Verbs IB dev
+ * @uctx: Verbs context
  * @udata: User data allocated
  */
-static struct ib_ucontext *rvt_alloc_ucontext(struct ib_device *ibdev,
-					      struct ib_udata *udata)
+static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
-	struct rvt_ucontext *context;
-
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-	return &context->ibucontext;
+	return 0;
 }
 
 /**
- *rvt_dealloc_ucontext - Free a user context
- *@context - Free this
+ * rvt_dealloc_ucontext - Free a user context
+ * @context - Free this
  */
-static int rvt_dealloc_ucontext(struct ib_ucontext *context)
+static void rvt_dealloc_ucontext(struct ib_ucontext *context)
 {
-	kfree(to_iucontext(context));
-	return 0;
+	return;
 }
 
 static int rvt_get_port_immutable(struct ib_device *ibdev, u8 port_num,
@@ -433,6 +426,7 @@ static const struct ib_device_ops rvt_dev_ops = {
 	.resize_cq = rvt_resize_cq,
 	.unmap_fmr = rvt_unmap_fmr,
 	INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext),
 };
 
 static noinline int check_support(struct rvt_dev_info *rdi, int verb)
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index c9b8b8c6bfb5..120fa9005954 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -42,6 +42,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
 	[RXE_TYPE_UC] = {
 		.name		= "rxe-uc",
 		.size		= sizeof(struct rxe_ucontext),
+		.flags          = RXE_POOL_NO_ALLOC,
 	},
 	[RXE_TYPE_PD] = {
 		.name		= "rxe-pd",
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index bd6a379b79d3..6ecf28570ff0 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -142,22 +142,19 @@ static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
 	return rxe_link_layer(rxe, port_num);
 }
 
-static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev,
-					      struct ib_udata *udata)
+static int rxe_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
-	struct rxe_dev *rxe = to_rdev(dev);
-	struct rxe_ucontext *uc;
+	struct rxe_dev *rxe = to_rdev(uctx->device);
+	struct rxe_ucontext *uc = to_ruc(uctx);
 
-	uc = rxe_alloc(&rxe->uc_pool);
-	return uc ? &uc->ibuc : ERR_PTR(-ENOMEM);
+	return rxe_add_to_pool(&rxe->uc_pool, &uc->pelem);
 }
 
-static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
+static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
 {
 	struct rxe_ucontext *uc = to_ruc(ibuc);
 
 	rxe_drop_ref(uc);
-	return 0;
 }
 
 static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
@@ -1180,6 +1177,7 @@ static const struct ib_device_ops rxe_dev_ops = {
 	.req_notify_cq = rxe_req_notify_cq,
 	.resize_cq = rxe_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
 };
 
 int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index d02eb75ef282..157e51aeb1e1 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -61,8 +61,8 @@ static inline int psn_compare(u32 psn_a, u32 psn_b)
 }
 
 struct rxe_ucontext {
+	struct ib_ucontext ibuc;
 	struct rxe_pool_entry	pelem;
-	struct ib_ucontext	ibuc;
 };
 
 struct rxe_pd {
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 225cb76d469f..9b9e17bcc201 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2389,9 +2389,9 @@ struct ib_device_ops {
 	int (*del_gid)(const struct ib_gid_attr *attr, void **context);
 	int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index,
 			  u16 *pkey);
-	struct ib_ucontext *(*alloc_ucontext)(struct ib_device *device,
-					      struct ib_udata *udata);
-	int (*dealloc_ucontext)(struct ib_ucontext *context);
+	int (*alloc_ucontext)(struct ib_ucontext *context,
+			      struct ib_udata *udata);
+	void (*dealloc_ucontext)(struct ib_ucontext *context);
 	int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
 	void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
 	int (*alloc_pd)(struct ib_pd *pd, struct ib_ucontext *context,
@@ -2551,6 +2551,7 @@ struct ib_device_ops {
 	void (*dealloc_driver)(struct ib_device *dev);
 
 	DECLARE_RDMA_OBJ_SIZE(ib_pd);
+	DECLARE_RDMA_OBJ_SIZE(ib_ucontext);
 };
 
 struct rdma_restrack_root;
-- 
cgit v1.2.3


From 663586c0a8929db81e617c775823efb9d65f2bc2 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 7 Nov 2018 23:16:19 +0100
Subject: ubi: Expose the bitrot interface

Using UBI_IOCRPEB and UBI_IOCSPEB userspace can force
reading and scrubbing of PEBs.

In case of bitflips UBI will automatically take action
and move data to a different PEB.
This interface allows a daemon to foster your NAND.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/cdev.c      |  30 +++++++++
 drivers/mtd/ubi/ubi.h       |   1 +
 drivers/mtd/ubi/wl.c        | 144 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/mtd/ubi-user.h |   5 ++
 4 files changed, 180 insertions(+)

(limited to 'include')

diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
index 22547d7a84ea..947a8adbc799 100644
--- a/drivers/mtd/ubi/cdev.c
+++ b/drivers/mtd/ubi/cdev.c
@@ -974,6 +974,36 @@ static long ubi_cdev_ioctl(struct file *file, unsigned int cmd,
 		break;
 	}
 
+	/* Check a specific PEB for bitflips and scrub it if needed */
+	case UBI_IOCRPEB:
+	{
+		int pnum;
+
+		err = get_user(pnum, (__user int32_t *)argp);
+		if (err) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = ubi_bitflip_check(ubi, pnum, 0);
+		break;
+	}
+
+	/* Force scrubbing for a specific PEB */
+	case UBI_IOCSPEB:
+	{
+		int pnum;
+
+		err = get_user(pnum, (__user int32_t *)argp);
+		if (err) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = ubi_bitflip_check(ubi, pnum, 1);
+		break;
+	}
+
 	default:
 		err = -ENOTTY;
 		break;
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
index d47b9e436e67..a1b9e764d489 100644
--- a/drivers/mtd/ubi/ubi.h
+++ b/drivers/mtd/ubi/ubi.h
@@ -929,6 +929,7 @@ int ubi_wl_put_fm_peb(struct ubi_device *ubi, struct ubi_wl_entry *used_e,
 int ubi_is_erase_work(struct ubi_work *wrk);
 void ubi_refill_pools(struct ubi_device *ubi);
 int ubi_ensure_anchor_pebs(struct ubi_device *ubi);
+int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force_scrub);
 
 /* io.c */
 int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index ca1b31385eb5..40f838d54b0f 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1440,6 +1440,150 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum)
 	return err;
 }
 
+static bool scrub_possible(struct ubi_device *ubi, struct ubi_wl_entry *e)
+{
+	if (in_wl_tree(e, &ubi->scrub))
+		return false;
+	else if (in_wl_tree(e, &ubi->erroneous))
+		return false;
+	else if (ubi->move_from == e)
+		return false;
+	else if (ubi->move_to == e)
+		return false;
+
+	return true;
+}
+
+/**
+ * ubi_bitflip_check - Check an eraseblock for bitflips and scrub it if needed.
+ * @ubi: UBI device description object
+ * @pnum: the physical eraseblock to schedule
+ * @force: dont't read the block, assume bitflips happened and take action.
+ *
+ * This function reads the given eraseblock and checks if bitflips occured.
+ * In case of bitflips, the eraseblock is scheduled for scrubbing.
+ * If scrubbing is forced with @force, the eraseblock is not read,
+ * but scheduled for scrubbing right away.
+ *
+ * Returns:
+ * %EINVAL, PEB is out of range
+ * %ENOENT, PEB is no longer used by UBI
+ * %EBUSY, PEB cannot be checked now or a check is currently running on it
+ * %EAGAIN, bit flips happened but scrubbing is currently not possible
+ * %EUCLEAN, bit flips happened and PEB is scheduled for scrubbing
+ * %0, no bit flips detected
+ */
+int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force)
+{
+	int err;
+	struct ubi_wl_entry *e;
+
+	if (pnum < 0 || pnum >= ubi->peb_count) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Pause all parallel work, otherwise it can happen that the
+	 * erase worker frees a wl entry under us.
+	 */
+	down_write(&ubi->work_sem);
+
+	/*
+	 * Make sure that the wl entry does not change state while
+	 * inspecting it.
+	 */
+	spin_lock(&ubi->wl_lock);
+	e = ubi->lookuptbl[pnum];
+	if (!e) {
+		spin_unlock(&ubi->wl_lock);
+		err = -ENOENT;
+		goto out_resume;
+	}
+
+	/*
+	 * Does it make sense to check this PEB?
+	 */
+	if (!scrub_possible(ubi, e)) {
+		spin_unlock(&ubi->wl_lock);
+		err = -EBUSY;
+		goto out_resume;
+	}
+	spin_unlock(&ubi->wl_lock);
+
+	if (!force) {
+		mutex_lock(&ubi->buf_mutex);
+		err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size);
+		mutex_unlock(&ubi->buf_mutex);
+	}
+
+	if (err == UBI_IO_BITFLIPS || force) {
+		/*
+		 * Okay, bit flip happened, let's figure out what we can do.
+		 */
+		spin_lock(&ubi->wl_lock);
+
+		/*
+		 * Recheck. We released wl_lock, UBI might have killed the
+		 * wl entry under us.
+		 */
+		e = ubi->lookuptbl[pnum];
+		if (!e) {
+			spin_unlock(&ubi->wl_lock);
+			err = -ENOENT;
+			goto out_resume;
+		}
+
+		/*
+		 * Need to re-check state
+		 */
+		if (!scrub_possible(ubi, e)) {
+			spin_unlock(&ubi->wl_lock);
+			err = -EBUSY;
+			goto out_resume;
+		}
+
+		if (in_pq(ubi, e)) {
+			prot_queue_del(ubi, e->pnum);
+			wl_tree_add(e, &ubi->scrub);
+			spin_unlock(&ubi->wl_lock);
+
+			err = ensure_wear_leveling(ubi, 1);
+		} else if (in_wl_tree(e, &ubi->used)) {
+			rb_erase(&e->u.rb, &ubi->used);
+			wl_tree_add(e, &ubi->scrub);
+			spin_unlock(&ubi->wl_lock);
+
+			err = ensure_wear_leveling(ubi, 1);
+		} else if (in_wl_tree(e, &ubi->free)) {
+			rb_erase(&e->u.rb, &ubi->free);
+			ubi->free_count--;
+			spin_unlock(&ubi->wl_lock);
+
+			/*
+			 * This PEB is empty we can schedule it for
+			 * erasure right away. No wear leveling needed.
+			 */
+			err = schedule_erase(ubi, e, UBI_UNKNOWN, UBI_UNKNOWN,
+					     force ? 0 : 1, true);
+		} else {
+			spin_unlock(&ubi->wl_lock);
+			err = -EAGAIN;
+		}
+
+		if (!err && !force)
+			err = -EUCLEAN;
+	} else {
+		err = 0;
+	}
+
+out_resume:
+	up_write(&ubi->work_sem);
+out:
+
+	return err;
+}
+
 /**
  * tree_destroy - destroy an RB-tree.
  * @ubi: UBI device description object
diff --git a/include/uapi/mtd/ubi-user.h b/include/uapi/mtd/ubi-user.h
index aad3b6201fc0..b69e9ba6742b 100644
--- a/include/uapi/mtd/ubi-user.h
+++ b/include/uapi/mtd/ubi-user.h
@@ -171,6 +171,11 @@
 /* Re-name volumes */
 #define UBI_IOCRNVOL _IOW(UBI_IOC_MAGIC, 3, struct ubi_rnvol_req)
 
+/* Read the specified PEB and scrub it if there are bitflips */
+#define UBI_IOCRPEB _IOW(UBI_IOC_MAGIC, 4, __s32)
+/* Force scrubbing on the specified PEB */
+#define UBI_IOCSPEB _IOW(UBI_IOC_MAGIC, 5, __s32)
+
 /* ioctl commands of the UBI control character device */
 
 #define UBI_CTRL_IOC_MAGIC 'o'
-- 
cgit v1.2.3


From 93b6604c5a669d84e45fe5129294875bf82eb1ff Mon Sep 17 00:00:00 2001
From: Jim Broadus <jbroadus@gmail.com>
Date: Tue, 19 Feb 2019 11:30:27 -0800
Subject: i2c: Allow recovery of the initial IRQ by an I2C client device.

A previous change allowed I2C client devices to discover new IRQs upon
reprobe by clearing the IRQ in i2c_device_remove. However, if an IRQ was
assigned in i2c_new_device, that information is lost.

For example, the touchscreen and trackpad devices on a Dell Inspiron laptop
are I2C devices whose IRQs are defined by ACPI extended IRQ types. The
client device structures are initialized during an ACPI walk. After
removing the i2c_hid device, modprobe fails.

This change caches the initial IRQ value in i2c_new_device and then resets
the client device IRQ to the initial value in i2c_device_remove.

Fixes: 6f108dd70d30 ("i2c: Clear client->irq in i2c_device_remove")
Signed-off-by: Jim Broadus <jbroadus@gmail.com>
Reviewed-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
[wsa: this is an easy to backport fix for the regression. We will
refactor the code to handle irq assignments better in general.]
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-base.c | 9 +++++----
 include/linux/i2c.h         | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index 926ca0a7477f..cb6c5cb0df0b 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -430,7 +430,7 @@ static int i2c_device_remove(struct device *dev)
 	dev_pm_clear_wake_irq(&client->dev);
 	device_init_wakeup(&client->dev, false);
 
-	client->irq = 0;
+	client->irq = client->init_irq;
 
 	return status;
 }
@@ -741,10 +741,11 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
 	client->flags = info->flags;
 	client->addr = info->addr;
 
-	client->irq = info->irq;
-	if (!client->irq)
-		client->irq = i2c_dev_irq_from_resources(info->resources,
+	client->init_irq = info->irq;
+	if (!client->init_irq)
+		client->init_irq = i2c_dev_irq_from_resources(info->resources,
 							 info->num_resources);
+	client->irq = client->init_irq;
 
 	strlcpy(client->name, info->type, sizeof(client->name));
 
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 1f45331924d6..383510b4f083 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -333,6 +333,7 @@ struct i2c_client {
 	char name[I2C_NAME_SIZE];
 	struct i2c_adapter *adapter;	/* the adapter we sit on	*/
 	struct device dev;		/* the device structure		*/
+	int init_irq;			/* irq set at initialization	*/
 	int irq;			/* irq issued by device		*/
 	struct list_head detected;
 #if IS_ENABLED(CONFIG_I2C_SLAVE)
-- 
cgit v1.2.3


From fb7e160019f4abb4082740bfeb27a38f6389c745 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 22 Nov 2018 16:37:38 +0100
Subject: fs: add an iopoll method to struct file_operations

This new methods is used to explicitly poll for I/O completion for an
iocb.  It must be called for any iocb submitted asynchronously (that
is with a non-null ki_complete) which has the IOCB_HIPRI flag set.

The method is assisted by a new ki_cookie field in struct iocb to store
the polling cookie.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/filesystems/vfs.txt | 3 +++
 include/linux/fs.h                | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 8dc8e9c2913f..761c6fd24a53 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -857,6 +857,7 @@ struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+	int (*iopoll)(struct kiocb *kiocb, bool spin);
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
@@ -902,6 +903,8 @@ otherwise noted.
 
   write_iter: possibly asynchronous write with iov_iter as source
 
+  iopoll: called when aio wants to poll for completions on HIPRI iocbs
+
   iterate: called when the VFS needs to read the directory contents
 
   iterate_shared: called when the VFS needs to read the directory contents
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d8e2cfed0e..dedcc2e9265c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -310,6 +310,7 @@ struct kiocb {
 	int			ki_flags;
 	u16			ki_hint;
 	u16			ki_ioprio; /* See linux/ioprio.h */
+	unsigned int		ki_cookie; /* for ->iopoll */
 } __randomize_layout;
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -1787,6 +1788,7 @@ struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+	int (*iopoll)(struct kiocb *kiocb, bool spin);
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-- 
cgit v1.2.3


From 0bbb280d7b767e7c86a5adfc87c76a6f09ab0423 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 21 Dec 2018 09:10:46 -0700
Subject: block: add bio_set_polled() helper

For the upcoming async polled IO, we can't sleep allocating requests.
If we do, then we introduce a deadlock where the submitter already
has async polled IO in-flight, but can't wait for them to complete
since polled requests must be active found and reaped.

Utilize the helper in the blockdev DIRECT_IO code.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c      |  4 ++--
 include/linux/bio.h | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1fe498b08f1b..e9faa52bb489 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -248,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 		task_io_account_write(ret);
 	}
 	if (iocb->ki_flags & IOCB_HIPRI)
-		bio.bi_opf |= REQ_HIPRI;
+		bio_set_polled(&bio, iocb);
 
 	qc = submit_bio(&bio);
 	for (;;) {
@@ -419,7 +419,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			bool polled = false;
 
 			if (iocb->ki_flags & IOCB_HIPRI) {
-				bio->bi_opf |= REQ_HIPRI;
+				bio_set_polled(bio, iocb);
 				polled = true;
 			}
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index bdd11d4c2f05..bb6090aa165d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -826,5 +826,19 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
+/*
+ * Mark a bio as polled. Note that for async polled IO, the caller must
+ * expect -EWOULDBLOCK if we cannot allocate a request (or other resources).
+ * We cannot block waiting for requests on polled IO, as those completions
+ * must be found by the caller. This is different than IRQ driven IO, where
+ * it's safe to wait for IO to complete.
+ */
+static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
+{
+	bio->bi_opf |= REQ_HIPRI;
+	if (!is_sync_kiocb(kiocb))
+		bio->bi_opf |= REQ_NOWAIT;
+}
+
 #endif /* CONFIG_BLOCK */
 #endif /* __LINUX_BIO_H */
-- 
cgit v1.2.3


From 81214bab582eeda068e7904d57b6a3095e8f3855 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 4 Dec 2018 11:12:08 -0700
Subject: iomap: wire up the iopoll method

Store the request queue the last bio was submitted to in the iocb
private data in addition to the cookie so that we find the right block
device.  Also refactor the common direct I/O bio submission code into a
nice little helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Modified to use bio_set_polled().

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/gfs2/file.c        |  2 ++
 fs/iomap.c            | 43 ++++++++++++++++++++++++++++---------------
 fs/xfs/xfs_file.c     |  1 +
 include/linux/iomap.h |  1 +
 4 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a2dea5bc0427..58a768e59712 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = {
 	.llseek		= gfs2_llseek,
 	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
 	.open		= gfs2_open,
@@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = {
 	.llseek		= gfs2_llseek,
 	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
 	.open		= gfs2_open,
diff --git a/fs/iomap.c b/fs/iomap.c
index 6982d3d2bcc6..97cb9d486a7d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1464,6 +1464,28 @@ struct iomap_dio {
 	};
 };
 
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+	struct request_queue *q = READ_ONCE(kiocb->private);
+
+	if (!q)
+		return 0;
+	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+		struct bio *bio)
+{
+	atomic_inc(&dio->ref);
+
+	if (dio->iocb->ki_flags & IOCB_HIPRI)
+		bio_set_polled(bio, dio->iocb);
+
+	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+	dio->submit.cookie = submit_bio(bio);
+}
+
 static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
 	struct kiocb *iocb = dio->iocb;
@@ -1577,7 +1599,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	}
 }
 
-static blk_qc_t
+static void
 iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 		unsigned len)
 {
@@ -1591,15 +1613,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
-	if (dio->iocb->ki_flags & IOCB_HIPRI)
-		flags |= REQ_HIPRI;
-
 	get_page(page);
 	__bio_add_page(bio, page, len, 0);
 	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
-
-	atomic_inc(&dio->ref);
-	return submit_bio(bio);
+	iomap_dio_submit_bio(dio, iomap, bio);
 }
 
 static loff_t
@@ -1702,9 +1719,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 				bio_set_pages_dirty(bio);
 		}
 
-		if (dio->iocb->ki_flags & IOCB_HIPRI)
-			bio->bi_opf |= REQ_HIPRI;
-
 		iov_iter_advance(dio->submit.iter, n);
 
 		dio->size += n;
@@ -1712,11 +1726,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		copied += n;
 
 		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-
-		atomic_inc(&dio->ref);
-
-		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
-		dio->submit.cookie = submit_bio(bio);
+		iomap_dio_submit_bio(dio, iomap, bio);
 	} while (nr_pages);
 
 	/*
@@ -1927,6 +1937,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (dio->flags & IOMAP_DIO_WRITE_FUA)
 		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
 
+	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+	WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
 	/*
 	 * We are about to drop our additional submission reference, which
 	 * might be the last reference to the dio.  There are three three
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e47425071e65..60c2da41f0fc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1203,6 +1203,7 @@ const struct file_operations xfs_file_operations = {
 	.write_iter	= xfs_file_write_iter,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl	= xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= xfs_file_compat_ioctl,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 9a4258154b25..0fefb5455bda 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -162,6 +162,7 @@ typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret,
 		unsigned flags);
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, iomap_dio_end_io_t end_io);
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
 
 #ifdef CONFIG_SWAP
 struct file;
-- 
cgit v1.2.3


From de7b7dca8735f720793dae8ad818091309979c39 Mon Sep 17 00:00:00 2001
From: "Angus Ainslie (Purism)" <angus@akkea.ca>
Date: Mon, 28 Jan 2019 09:03:22 -0700
Subject: dmaengine: imx-sdma: add a test for imx8mq multi sdma devices

On i.mx8mq, there are two sdma instances, and the common dma framework
will get a channel dynamically from any available sdma instance whether
it's the first sdma device or the second sdma device. Some IPs like
SAI only work with sdma2 not sdma1. To make sure the sdma channel is from
the correct sdma device, use the node pointer to match.

Signed-off-by: Angus Ainslie (Purism) <angus@akkea.ca>
Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
Tested-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c                | 6 ++++++
 include/linux/platform_data/dma-imx.h | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index fc8bc80617d8..8fb0cd293b54 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -1913,11 +1913,16 @@ disable_clk_ipg:
 static bool sdma_filter_fn(struct dma_chan *chan, void *fn_param)
 {
 	struct sdma_channel *sdmac = to_sdma_chan(chan);
+	struct sdma_engine *sdma = sdmac->sdma;
 	struct imx_dma_data *data = fn_param;
 
 	if (!imx_dma_is_general_purpose(chan))
 		return false;
 
+	/* return false if it's not the right device */
+	if (sdma->dev->of_node != data->of_node)
+		return false;
+
 	sdmac->data = *data;
 	chan->private = &sdmac->data;
 
@@ -1945,6 +1950,7 @@ static struct dma_chan *sdma_xlate(struct of_phandle_args *dma_spec,
 	 * be set to sdmac->event_id1.
 	 */
 	data.dma_request2 = 0;
+	data.of_node = ofdma->of_node;
 
 	return dma_request_channel(mask, sdma_filter_fn, &data);
 }
diff --git a/include/linux/platform_data/dma-imx.h b/include/linux/platform_data/dma-imx.h
index 7d964e787299..9daea8d42a10 100644
--- a/include/linux/platform_data/dma-imx.h
+++ b/include/linux/platform_data/dma-imx.h
@@ -55,6 +55,7 @@ struct imx_dma_data {
 	int dma_request2; /* secondary DMA request line */
 	enum sdma_peripheral_type peripheral_type;
 	int priority;
+	struct device_node *of_node;
 };
 
 static inline int imx_dma_is_ipu(struct dma_chan *chan)
-- 
cgit v1.2.3


From ad5ea5b9d513107869acd460f0180d8fb94856b9 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 25 Feb 2019 21:20:45 +0100
Subject: rtc: remove rtc_class_ops.read_callback

Since commit 416f0e8056f7 ("RTC: sa1100: Update the sa1100 RTC driver."),
the last user of .read_callback is gone. It has been 8 years and now new
user appeared. Simply remove it.

Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/rtc/dev.c   | 5 -----
 include/linux/rtc.h | 3 +--
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/rtc/dev.c b/drivers/rtc/dev.c
index 43d962a9c210..1d006ef4bb57 100644
--- a/drivers/rtc/dev.c
+++ b/drivers/rtc/dev.c
@@ -178,11 +178,6 @@ rtc_dev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	remove_wait_queue(&rtc->irq_queue, &wait);
 
 	if (ret == 0) {
-		/* Check for any data updates */
-		if (rtc->ops->read_callback)
-			data = rtc->ops->read_callback(rtc->dev.parent,
-						       data);
-
 		if (sizeof(int) != sizeof(long) &&
 		    count == sizeof(unsigned int))
 			ret = put_user(data, (unsigned int __user *)buf) ?:
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index c1089fe5344a..f89bfbb54902 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -67,7 +67,7 @@ extern struct class *rtc_class;
  *
  * The (current) exceptions are mostly filesystem hooks:
  *   - the proc() hook for procfs
- *   - non-ioctl() chardev hooks:  open(), release(), read_callback()
+ *   - non-ioctl() chardev hooks:  open(), release()
  *
  * REVISIT those periodic irq calls *do* have ops_lock when they're
  * issued through ioctl() ...
@@ -81,7 +81,6 @@ struct rtc_class_ops {
 	int (*proc)(struct device *, struct seq_file *);
 	int (*set_mmss64)(struct device *, time64_t secs);
 	int (*set_mmss)(struct device *, unsigned long secs);
-	int (*read_callback)(struct device *, int data);
 	int (*alarm_irq_enable)(struct device *, unsigned int enabled);
 	int (*read_offset)(struct device *, long *offset);
 	int (*set_offset)(struct device *, long offset);
-- 
cgit v1.2.3


From db04d4a3d72f0c5ee34609559f535d11ab47303c Mon Sep 17 00:00:00 2001
From: Tom Murphy <murphyt7@tcd.ie>
Date: Mon, 11 Feb 2019 15:50:33 +0000
Subject: iommu: Fix flush_tlb_all typo

Fix typo, flush_tlb_all should be flush_iotlb_all.

Signed-off-by: Tom Murphy <murphyt7@tcd.ie>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e90da6b6f3d1..2b402dcbcf81 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -167,7 +167,7 @@ struct iommu_resv_region {
  * @detach_dev: detach device from an iommu domain
  * @map: map a physically contiguous memory region to an iommu domain
  * @unmap: unmap a physically contiguous memory region from an iommu domain
- * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain
+ * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
  * @iotlb_range_add: Add a given iova range to the flush queue for this domain
  * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
  *            queue
-- 
cgit v1.2.3


From e5567f5f67621877726f99be040af9fbedda37dc Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Tue, 19 Feb 2019 11:04:51 -0800
Subject: PCI/ATS: Add pci_prg_resp_pasid_required() interface.

Return the PRG Response PASID Required bit in the Page Request
Status Register.

As per PCIe spec r4.0, sec 10.5.2.3, if this bit is Set, the device
expects a PASID TLP Prefix on PRG Response Messages when the
corresponding Page Requests had a PASID TLP Prefix. If Clear, the device
does not expect PASID TLP Prefixes on any PRG Response Message, and the
device behavior is undefined if the device receives a PRG Response Message
with a PASID TLP Prefix. Also the device behavior is undefined if this
bit is Set and the device receives a PRG Response Message with no PASID TLP
Prefix when the corresponding Page Requests had a PASID TLP Prefix.

This function will be used by drivers like IOMMU, if it is required to
check the status of the PRG Response PASID Required bit before enabling
the PASID support of the device.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Keith Busch <keith.busch@intel.com>
Suggested-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/pci/ats.c             | 30 ++++++++++++++++++++++++++++++
 include/linux/pci-ats.h       |  5 +++++
 include/uapi/linux/pci_regs.h |  1 +
 3 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 5b78f3b1b918..420cd0a578d0 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -368,6 +368,36 @@ int pci_pasid_features(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL_GPL(pci_pasid_features);
 
+/**
+ * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit
+ *				 status.
+ * @pdev: PCI device structure
+ *
+ * Returns 1 if PASID is required in PRG Response Message, 0 otherwise.
+ *
+ * Even though the PRG response PASID status is read from PRI Status
+ * Register, since this API will mainly be used by PASID users, this
+ * function is defined within #ifdef CONFIG_PCI_PASID instead of
+ * CONFIG_PCI_PRI.
+ */
+int pci_prg_resp_pasid_required(struct pci_dev *pdev)
+{
+	u16 status;
+	int pos;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+	if (!pos)
+		return 0;
+
+	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
+
+	if (status & PCI_PRI_STATUS_PASID)
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required);
+
 #define PASID_NUMBER_SHIFT	8
 #define PASID_NUMBER_MASK	(0x1f << PASID_NUMBER_SHIFT)
 /**
diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index 7c4b8e27268c..facfd6a18fe1 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -40,6 +40,7 @@ void pci_disable_pasid(struct pci_dev *pdev);
 void pci_restore_pasid_state(struct pci_dev *pdev);
 int pci_pasid_features(struct pci_dev *pdev);
 int pci_max_pasids(struct pci_dev *pdev);
+int pci_prg_resp_pasid_required(struct pci_dev *pdev);
 
 #else  /* CONFIG_PCI_PASID */
 
@@ -66,6 +67,10 @@ static inline int pci_max_pasids(struct pci_dev *pdev)
 	return -EINVAL;
 }
 
+static int pci_prg_resp_pasid_required(struct pci_dev *pdev)
+{
+	return 0;
+}
 #endif /* CONFIG_PCI_PASID */
 
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index e1e9888c85e6..898be572b010 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -880,6 +880,7 @@
 #define  PCI_PRI_STATUS_RF	0x001	/* Response Failure */
 #define  PCI_PRI_STATUS_UPRGI	0x002	/* Unexpected PRG index */
 #define  PCI_PRI_STATUS_STOPPED	0x100	/* PRI Stopped */
+#define  PCI_PRI_STATUS_PASID	0x8000	/* PRG Response PASID Required */
 #define PCI_PRI_MAX_REQ		0x08	/* PRI max reqs supported */
 #define PCI_PRI_ALLOC_REQ	0x0c	/* PRI max reqs allowed */
 #define PCI_EXT_CAP_PRI_SIZEOF	16
-- 
cgit v1.2.3


From 8c938ddc6df3bbe72809db1be6c9f3af83f5d7a9 Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Tue, 19 Feb 2019 11:06:09 -0800
Subject: PCI/ATS: Add pci_ats_page_aligned() interface

Return the Page Aligned Request bit in the ATS Capability Register.

As per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit is
set, it indicates the Untranslated Addresses generated by the device are
always aligned to a 4096 byte boundary.

An IOMMU that can only translate page-aligned addresses can only be used
with devices that always produce aligned Untranslated Addresses. This
interface will be used by drivers for such IOMMUs to determine whether
devices can use the ATS service.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Keith Busch <keith.busch@intel.com>
Suggested-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/pci/ats.c             | 27 +++++++++++++++++++++++++++
 include/linux/pci.h           |  2 ++
 include/uapi/linux/pci_regs.h |  1 +
 3 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 420cd0a578d0..97c08146534a 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -142,6 +142,33 @@ int pci_ats_queue_depth(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_ats_queue_depth);
 
+/**
+ * pci_ats_page_aligned - Return Page Aligned Request bit status.
+ * @pdev: the PCI device
+ *
+ * Returns 1, if the Untranslated Addresses generated by the device
+ * are always aligned or 0 otherwise.
+ *
+ * Per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit
+ * is set, it indicates the Untranslated Addresses generated by the
+ * device are always aligned to a 4096 byte boundary.
+ */
+int pci_ats_page_aligned(struct pci_dev *pdev)
+{
+	u16 cap;
+
+	if (!pdev->ats_cap)
+		return 0;
+
+	pci_read_config_word(pdev, pdev->ats_cap + PCI_ATS_CAP, &cap);
+
+	if (cap & PCI_ATS_CAP_PAGE_ALIGNED)
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_ats_page_aligned);
+
 #ifdef CONFIG_PCI_PRI
 /**
  * pci_enable_pri - Enable PRI capability
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 65f1d8c2f082..9724a8c0496b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1524,11 +1524,13 @@ void pci_ats_init(struct pci_dev *dev);
 int pci_enable_ats(struct pci_dev *dev, int ps);
 void pci_disable_ats(struct pci_dev *dev);
 int pci_ats_queue_depth(struct pci_dev *dev);
+int pci_ats_page_aligned(struct pci_dev *dev);
 #else
 static inline void pci_ats_init(struct pci_dev *d) { }
 static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; }
 static inline void pci_disable_ats(struct pci_dev *d) { }
 static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; }
+static inline int pci_ats_page_aligned(struct pci_dev *dev) { return 0; }
 #endif
 
 #ifdef CONFIG_PCIE_PTM
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 898be572b010..5c98133f2c94 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -866,6 +866,7 @@
 #define PCI_ATS_CAP		0x04	/* ATS Capability Register */
 #define  PCI_ATS_CAP_QDEP(x)	((x) & 0x1f)	/* Invalidate Queue Depth */
 #define  PCI_ATS_MAX_QDEP	32	/* Max Invalidate Queue Depth */
+#define  PCI_ATS_CAP_PAGE_ALIGNED	0x0020 /* Page Aligned Request */
 #define PCI_ATS_CTRL		0x06	/* ATS Control Register */
 #define  PCI_ATS_CTRL_ENABLE	0x8000	/* ATS Enable */
 #define  PCI_ATS_CTRL_STU(x)	((x) & 0x1f)	/* Smallest Translation Unit */
-- 
cgit v1.2.3


From 2405bc162583e1d7c40b13bf078e87428d2dfe4e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 20 Feb 2019 14:00:52 +0100
Subject: iommu: Document iommu_ops.iotlb_sync_map()

Add missing kerneldoc for iommu_ops.iotlb_sync_map().

Fixes: 1d7ae53b152dbc5b ("iommu: Introduce iotlb_sync_map callback")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2b402dcbcf81..28ad97801032 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -169,6 +169,7 @@ struct iommu_resv_region {
  * @unmap: unmap a physically contiguous memory region from an iommu domain
  * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
  * @iotlb_range_add: Add a given iova range to the flush queue for this domain
+ * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
  * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
  *            queue
  * @iova_to_phys: translate iova to physical address
-- 
cgit v1.2.3


From a7055d572c51338bed8673331ead6759cae6b70b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 20 Feb 2019 14:00:53 +0100
Subject: iommu: Document iommu_ops.is_attach_deferred()

Add missing kerneldoc for iommu_ops.is_attach_deferred().

Fixes: e01d1913b0d08171 ("iommu: Add is_attach_deferred call-back to iommu-ops")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 28ad97801032..41fa7958592d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -184,6 +184,8 @@ struct iommu_resv_region {
  * @domain_window_enable: Configure and enable a particular window for a domain
  * @domain_window_disable: Disable a particular window for a domain
  * @of_xlate: add OF master IDs to iommu grouping
+ * @is_attach_deferred: Check if domain attach should be deferred from iommu
+ *                      driver init to device driver init (default no)
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  */
 struct iommu_ops {
-- 
cgit v1.2.3


From e85fa28ebcb598bbb439402609fdde5d0f80622d Mon Sep 17 00:00:00 2001
From: Mike Leach <mike.leach@linaro.org>
Date: Wed, 13 Feb 2019 14:41:49 +0100
Subject: ARM: 8838/1: drivers: amba: Updates to component identification for
 driver matching.

The CoreSight specification (ARM IHI 0029E), updates the ID register
requirements for components on an AMBA bus, to cover both traditional
ARM Primecell type devices, and newer CoreSight and other components.

The Peripheral ID (PID) / Component ID (CID) pair is extended in certain
cases to uniquely identify components. CoreSight components related to
a single function can share Peripheral ID values, and must be further
identified using a Unique Component Identifier (UCI). e.g. the ETM, CTI,
PMU and Debug hardware of the A35 all share the same PID.

Bits 15:12 of the CID are defined to be the device class.
Class 0xF remains for PrimeCell and legacy components.
Class 0x9 defines the component as CoreSight (CORESIGHT_CID above)
Class 0x0, 0x1, 0xB, 0xE define components that do not have driver support
at present.
Class 0x2-0x8,0xA and 0xD-0xD are presently reserved.

The specification futher defines which classes of device use the standard
CID/PID pair, and when additional ID registers are required.

This patch introduces the amba_cs_uci_id structure which will be used in
all coresight drivers for indentification via the private data pointer in
the amba_id structure.

Existing drivers that currently use the amba_id->data pointer for private
data are updated to use the amba_cs_uci_id->data pointer. Macros and
inline functions are added to simplify this code.

Signed-off-by: Mike Leach <mike.leach@linaro.org>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Tested-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/hwtracing/coresight/coresight-etm3x.c | 44 ++++++++-------------------
 drivers/hwtracing/coresight/coresight-priv.h  | 32 +++++++++++++++++++
 drivers/hwtracing/coresight/coresight-stm.c   | 14 ++-------
 drivers/hwtracing/coresight/coresight-tmc.c   | 30 ++++++------------
 include/linux/amba/bus.h                      | 33 ++++++++++++++++++++
 5 files changed, 90 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c
index 9a63e87ea5f3..be302ec5f66b 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x.c
@@ -871,7 +871,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
 	}
 
 	pm_runtime_put(&adev->dev);
-	dev_info(dev, "%s initialized\n", (char *)id->data);
+	dev_info(dev, "%s initialized\n", (char *)coresight_get_uci_data(id));
 	if (boot_enable) {
 		coresight_enable(drvdata->csdev);
 		drvdata->boot_enable = true;
@@ -915,36 +915,18 @@ static const struct dev_pm_ops etm_dev_pm_ops = {
 };
 
 static const struct amba_id etm_ids[] = {
-	{	/* ETM 3.3 */
-		.id	= 0x000bb921,
-		.mask	= 0x000fffff,
-		.data	= "ETM 3.3",
-	},
-	{	/* ETM 3.5 - Cortex-A5 */
-		.id	= 0x000bb955,
-		.mask	= 0x000fffff,
-		.data	= "ETM 3.5",
-	},
-	{	/* ETM 3.5 */
-		.id	= 0x000bb956,
-		.mask	= 0x000fffff,
-		.data	= "ETM 3.5",
-	},
-	{	/* PTM 1.0 */
-		.id	= 0x000bb950,
-		.mask	= 0x000fffff,
-		.data	= "PTM 1.0",
-	},
-	{	/* PTM 1.1 */
-		.id	= 0x000bb95f,
-		.mask	= 0x000fffff,
-		.data	= "PTM 1.1",
-	},
-	{	/* PTM 1.1 Qualcomm */
-		.id	= 0x000b006f,
-		.mask	= 0x000fffff,
-		.data	= "PTM 1.1",
-	},
+	/* ETM 3.3 */
+	CS_AMBA_ID_DATA(0x000bb921, "ETM 3.3"),
+	/* ETM 3.5 - Cortex-A5 */
+	CS_AMBA_ID_DATA(0x000bb955, "ETM 3.5"),
+	/* ETM 3.5 */
+	CS_AMBA_ID_DATA(0x000bb956, "ETM 3.5"),
+	/* PTM 1.0 */
+	CS_AMBA_ID_DATA(0x000bb950, "PTM 1.0"),
+	/* PTM 1.1 */
+	CS_AMBA_ID_DATA(0x000bb95f, "PTM 1.1"),
+	/* PTM 1.1 Qualcomm */
+	CS_AMBA_ID_DATA(0x000b006f, "PTM 1.1"),
 	{ 0, 0},
 };
 
diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h
index 579f34943bf1..02a1f5204f9d 100644
--- a/drivers/hwtracing/coresight/coresight-priv.h
+++ b/drivers/hwtracing/coresight/coresight-priv.h
@@ -6,6 +6,7 @@
 #ifndef _CORESIGHT_PRIV_H
 #define _CORESIGHT_PRIV_H
 
+#include <linux/amba/bus.h>
 #include <linux/bitops.h>
 #include <linux/io.h>
 #include <linux/coresight.h>
@@ -159,4 +160,35 @@ static inline int etm_readl_cp14(u32 off, unsigned int *val) { return 0; }
 static inline int etm_writel_cp14(u32 off, u32 val) { return 0; }
 #endif
 
+/*
+ * Macros and inline functions to handle CoreSight UCI data and driver
+ * private data in AMBA ID table entries, and extract data values.
+ */
+
+/* coresight AMBA ID, no UCI, no driver data: id table entry */
+#define CS_AMBA_ID(pid)			\
+	{				\
+		.id	= pid,		\
+		.mask	= 0x000fffff,	\
+	}
+
+/* coresight AMBA ID, UCI with driver data only: id table entry. */
+#define CS_AMBA_ID_DATA(pid, dval)				\
+	{							\
+		.id	= pid,					\
+		.mask	= 0x000fffff,				\
+		.data	=  (void *)&(struct amba_cs_uci_id)	\
+			{				\
+				.data = (void *)dval,	\
+			}				\
+	}
+
+/* extract the data value from a UCI structure given amba_id pointer. */
+static inline void *coresight_get_uci_data(const struct amba_id *id)
+{
+	if (id->data)
+		return ((struct amba_cs_uci_id *)(id->data))->data;
+	return 0;
+}
+
 #endif
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index ef339ff22090..2a70cdd68a7b 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -874,7 +874,7 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id)
 
 	pm_runtime_put(&adev->dev);
 
-	dev_info(dev, "%s initialized\n", (char *)id->data);
+	dev_info(dev, "%s initialized\n", (char *)coresight_get_uci_data(id));
 	return 0;
 
 stm_unregister:
@@ -909,16 +909,8 @@ static const struct dev_pm_ops stm_dev_pm_ops = {
 };
 
 static const struct amba_id stm_ids[] = {
-	{
-		.id     = 0x000bb962,
-		.mask   = 0x000fffff,
-		.data	= "STM32",
-	},
-	{
-		.id	= 0x000bb963,
-		.mask	= 0x000fffff,
-		.data	= "STM500",
-	},
+	CS_AMBA_ID_DATA(0x000bb962, "STM32"),
+	CS_AMBA_ID_DATA(0x000bb963, "STM500"),
 	{ 0, 0},
 };
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc.c b/drivers/hwtracing/coresight/coresight-tmc.c
index ea249f0bcd73..2a02da3d630f 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.c
+++ b/drivers/hwtracing/coresight/coresight-tmc.c
@@ -443,7 +443,8 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 		desc.type = CORESIGHT_DEV_TYPE_SINK;
 		desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_BUFFER;
 		desc.ops = &tmc_etr_cs_ops;
-		ret = tmc_etr_setup_caps(drvdata, devid, id->data);
+		ret = tmc_etr_setup_caps(drvdata, devid,
+					 coresight_get_uci_data(id));
 		if (ret)
 			goto out;
 		break;
@@ -475,26 +476,13 @@ out:
 }
 
 static const struct amba_id tmc_ids[] = {
-	{
-		.id     = 0x000bb961,
-		.mask   = 0x000fffff,
-	},
-	{
-		/* Coresight SoC 600 TMC-ETR/ETS */
-		.id	= 0x000bb9e8,
-		.mask	= 0x000fffff,
-		.data	= (void *)(unsigned long)CORESIGHT_SOC_600_ETR_CAPS,
-	},
-	{
-		/* Coresight SoC 600 TMC-ETB */
-		.id	= 0x000bb9e9,
-		.mask	= 0x000fffff,
-	},
-	{
-		/* Coresight SoC 600 TMC-ETF */
-		.id	= 0x000bb9ea,
-		.mask	= 0x000fffff,
-	},
+	CS_AMBA_ID(0x000bb961),
+	/* Coresight SoC 600 TMC-ETR/ETS */
+	CS_AMBA_ID_DATA(0x000bb9e8, (unsigned long)CORESIGHT_SOC_600_ETR_CAPS),
+	/* Coresight SoC 600 TMC-ETB */
+	CS_AMBA_ID(0x000bb9e9),
+	/* Coresight SoC 600 TMC-ETF */
+	CS_AMBA_ID(0x000bb9ea),
 	{ 0, 0},
 };
 
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index d143c13bed26..e3c36223e40b 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -25,6 +25,39 @@
 #define AMBA_CID	0xb105f00d
 #define CORESIGHT_CID	0xb105900d
 
+/*
+ * CoreSight Architecture specification updates the ID specification
+ * for components on the AMBA bus. (ARM IHI 0029E)
+ *
+ * Bits 15:12 of the CID are the device class.
+ *
+ * Class 0xF remains for PrimeCell and legacy components. (AMBA_CID above)
+ * Class 0x9 defines the component as CoreSight (CORESIGHT_CID above)
+ * Class 0x0, 0x1, 0xB, 0xE define components that do not have driver support
+ * at present.
+ * Class 0x2-0x8,0xA and 0xD-0xD are presently reserved.
+ *
+ * Remaining CID bits stay as 0xb105-00d
+ */
+
+/**
+ * Class 0x9 components use additional values to form a Unique Component
+ * Identifier (UCI), where peripheral ID values are identical for different
+ * components. Passed to the amba bus code from the component driver via
+ * the amba_id->data pointer.
+ * @devarch	: coresight devarch register value
+ * @devarch_mask: mask bits used for matching. 0 indicates UCI not used.
+ * @devtype	: coresight device type value
+ * @data	: additional driver data. As we have usurped the original
+ *		pointer some devices may still need additional data
+ */
+struct amba_cs_uci_id {
+	unsigned int devarch;
+	unsigned int devarch_mask;
+	unsigned int devtype;
+	void *data;
+};
+
 struct clk;
 
 struct amba_device {
-- 
cgit v1.2.3


From 4a2910fa80d75dbe18d822482a48ae50a218029c Mon Sep 17 00:00:00 2001
From: Mike Leach <mike.leach@linaro.org>
Date: Wed, 13 Feb 2019 14:41:50 +0100
Subject: ARM: 8836/1: drivers: amba: Update component matching to use the
 CoreSight UCI values.

The patches provide an update of amba_device and matching code to handle
the additional registers required for the Class 0x9 (CoreSight) UCI.

The *data pointer in the amba_id is used by the driver to provide extended
ID register values for matching.

CoreSight components where PID/CID pair is currently sufficient for
unique identification need not provide this additional information.

Signed-off-by: Mike Leach <mike.leach@linaro.org>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Tested-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/amba/bus.c       | 45 +++++++++++++++++++++++++++++++++++++--------
 include/linux/amba/bus.h |  6 ++++++
 2 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index 41b706403ef7..b4dae624b9af 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -26,19 +26,36 @@
 
 #define to_amba_driver(d)	container_of(d, struct amba_driver, drv)
 
-static const struct amba_id *
-amba_lookup(const struct amba_id *table, struct amba_device *dev)
+/* called on periphid match and class 0x9 coresight device. */
+static int
+amba_cs_uci_id_match(const struct amba_id *table, struct amba_device *dev)
 {
 	int ret = 0;
+	struct amba_cs_uci_id *uci;
+
+	uci = table->data;
 
+	/* no table data or zero mask - return match on periphid */
+	if (!uci || (uci->devarch_mask == 0))
+		return 1;
+
+	/* test against read devtype and masked devarch value */
+	ret = (dev->uci.devtype == uci->devtype) &&
+		((dev->uci.devarch & uci->devarch_mask) == uci->devarch);
+	return ret;
+}
+
+static const struct amba_id *
+amba_lookup(const struct amba_id *table, struct amba_device *dev)
+{
 	while (table->mask) {
-		ret = (dev->periphid & table->mask) == table->id;
-		if (ret)
-			break;
+		if (((dev->periphid & table->mask) == table->id) &&
+			((dev->cid != CORESIGHT_CID) ||
+			 (amba_cs_uci_id_match(table, dev))))
+			return table;
 		table++;
 	}
-
-	return ret ? table : NULL;
+	return NULL;
 }
 
 static int amba_match(struct device *dev, struct device_driver *drv)
@@ -399,10 +416,22 @@ static int amba_device_try_add(struct amba_device *dev, struct resource *parent)
 			cid |= (readl(tmp + size - 0x10 + 4 * i) & 255) <<
 				(i * 8);
 
+		if (cid == CORESIGHT_CID) {
+			/* set the base to the start of the last 4k block */
+			void __iomem *csbase = tmp + size - 4096;
+
+			dev->uci.devarch =
+				readl(csbase + UCI_REG_DEVARCH_OFFSET);
+			dev->uci.devtype =
+				readl(csbase + UCI_REG_DEVTYPE_OFFSET) & 0xff;
+		}
+
 		amba_put_disable_pclk(dev);
 
-		if (cid == AMBA_CID || cid == CORESIGHT_CID)
+		if (cid == AMBA_CID || cid == CORESIGHT_CID) {
 			dev->periphid = pid;
+			dev->cid = cid;
+		}
 
 		if (!dev->periphid)
 			ret = -ENODEV;
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index e3c36223e40b..f99b74a6e4ca 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -58,6 +58,10 @@ struct amba_cs_uci_id {
 	void *data;
 };
 
+/* define offsets for registers used by UCI */
+#define UCI_REG_DEVTYPE_OFFSET	0xFCC
+#define UCI_REG_DEVARCH_OFFSET	0xFBC
+
 struct clk;
 
 struct amba_device {
@@ -65,6 +69,8 @@ struct amba_device {
 	struct resource		res;
 	struct clk		*pclk;
 	unsigned int		periphid;
+	unsigned int		cid;
+	struct amba_cs_uci_id	uci;
 	unsigned int		irq[AMBA_NR_IRQS];
 	char			*driver_override;
 };
-- 
cgit v1.2.3


From a73881c96d73ee72b7dbbd38a6eeef66182a8ef7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 26 Feb 2019 06:33:02 -0500
Subject: SUNRPC: Fix an Oops in udp_poll()

udp_poll() checks the struct file for the O_NONBLOCK flag, so we must not
call it with a NULL file pointer.

Fixes: 0ffe86f48026 ("SUNRPC: Use poll() to fix up the socket requeue races")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtsock.h |  1 +
 net/sunrpc/xprtsock.c           | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 458bfe0137f5..b81d0b3e0799 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -26,6 +26,7 @@ struct sock_xprt {
 	 */
 	struct socket *		sock;
 	struct sock *		inet;
+	struct file *		file;
 
 	/*
 	 * State of TCP reply receive
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 53de72d2dded..e829036ed81f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -670,7 +670,8 @@ out_err:
 
 static __poll_t xs_poll_socket(struct sock_xprt *transport)
 {
-	return transport->sock->ops->poll(NULL, transport->sock, NULL);
+	return transport->sock->ops->poll(transport->file, transport->sock,
+			NULL);
 }
 
 static bool xs_poll_socket_readable(struct sock_xprt *transport)
@@ -1253,6 +1254,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
 	struct socket *sock = transport->sock;
 	struct sock *sk = transport->inet;
 	struct rpc_xprt *xprt = &transport->xprt;
+	struct file *filp = transport->file;
 
 	if (sk == NULL)
 		return;
@@ -1266,6 +1268,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
 	write_lock_bh(&sk->sk_callback_lock);
 	transport->inet = NULL;
 	transport->sock = NULL;
+	transport->file = NULL;
 
 	sk->sk_user_data = NULL;
 
@@ -1278,7 +1281,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
 	mutex_unlock(&transport->recv_mutex);
 
 	trace_rpc_socket_close(xprt, sock);
-	sock_release(sock);
+	fput(filp);
 
 	xprt_disconnect_done(xprt);
 }
@@ -1873,6 +1876,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt,
 		struct sock_xprt *transport, int family, int type,
 		int protocol, bool reuseport)
 {
+	struct file *filp;
 	struct socket *sock;
 	int err;
 
@@ -1893,6 +1897,11 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt,
 		goto out;
 	}
 
+	filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+	if (IS_ERR(filp))
+		return ERR_CAST(filp);
+	transport->file = filp;
+
 	return sock;
 out:
 	return ERR_PTR(err);
@@ -1938,6 +1947,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
 static int xs_local_setup_socket(struct sock_xprt *transport)
 {
 	struct rpc_xprt *xprt = &transport->xprt;
+	struct file *filp;
 	struct socket *sock;
 	int status = -EIO;
 
@@ -1950,6 +1960,13 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
 	}
 	xs_reclassify_socket(AF_LOCAL, sock);
 
+	filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+	if (IS_ERR(filp)) {
+		status = PTR_ERR(filp);
+		goto out;
+	}
+	transport->file = filp;
+
 	dprintk("RPC:       worker connecting xprt %p via AF_LOCAL to %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
 
-- 
cgit v1.2.3


From 82c10ac79b79481f3246cb6d011da7a15776e762 Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Sun, 17 Feb 2019 08:30:27 +0200
Subject: scsi: clean obsolete return values of eh_timed_out

Those are no longer in use since commit 242f9dcb8ba6
("block: unify request timeout handling").

Signed-off-by: Avri Altman <avri.altman@wdc.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/scsi_host.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 4047d68d1b08..dbbdfbdb1a04 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -300,11 +300,7 @@ struct scsi_host_template {
 	/*
 	 * This is an optional routine that allows the transport to become
 	 * involved when a scsi io timer fires. The return value tells the
-	 * timer routine how to finish the io timeout handling:
-	 * EH_HANDLED:		I fixed the error, please complete the command
-	 * EH_RESET_TIMER:	I need more time, reset the timer and
-	 *			begin counting again
-	 * EH_DONE:		Begin normal error recovery
+	 * timer routine how to finish the io timeout handling.
 	 *
 	 * Status: OPTIONAL
 	 */
-- 
cgit v1.2.3


From 64f4466c887e2f16cb01467c8064ff1106c980a3 Mon Sep 17 00:00:00 2001
From: Seiya Wang <seiya.wang@mediatek.com>
Date: Mon, 25 Feb 2019 14:51:12 +0800
Subject: clk: mediatek: correct cpu clock name for MT8173 SoC

Correct cpu clock name from ca57 to ca72 since MT8173 does use cortex-a72.

Signed-off-by: Seiya Wang <seiya.wang@mediatek.com>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/mediatek/clk-mt8173.c      | 4 ++--
 include/dt-bindings/clock/mt8173-clk.h | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/mediatek/clk-mt8173.c b/drivers/clk/mediatek/clk-mt8173.c
index 96c292c3e440..deedeb3ea33b 100644
--- a/drivers/clk/mediatek/clk-mt8173.c
+++ b/drivers/clk/mediatek/clk-mt8173.c
@@ -533,7 +533,7 @@ static const char * const ca53_parents[] __initconst = {
 	"univpll"
 };
 
-static const char * const ca57_parents[] __initconst = {
+static const char * const ca72_parents[] __initconst = {
 	"clk26m",
 	"armca15pll",
 	"mainpll",
@@ -542,7 +542,7 @@ static const char * const ca57_parents[] __initconst = {
 
 static const struct mtk_composite cpu_muxes[] __initconst = {
 	MUX(CLK_INFRA_CA53SEL, "infra_ca53_sel", ca53_parents, 0x0000, 0, 2),
-	MUX(CLK_INFRA_CA57SEL, "infra_ca57_sel", ca57_parents, 0x0000, 2, 2),
+	MUX(CLK_INFRA_CA72SEL, "infra_ca72_sel", ca72_parents, 0x0000, 2, 2),
 };
 
 static const struct mtk_composite top_muxes[] __initconst = {
diff --git a/include/dt-bindings/clock/mt8173-clk.h b/include/dt-bindings/clock/mt8173-clk.h
index 8aea623dd518..76e4e5b65353 100644
--- a/include/dt-bindings/clock/mt8173-clk.h
+++ b/include/dt-bindings/clock/mt8173-clk.h
@@ -194,7 +194,8 @@
 #define CLK_INFRA_PMICWRAP		11
 #define CLK_INFRA_CLK_13M		12
 #define CLK_INFRA_CA53SEL               13
-#define CLK_INFRA_CA57SEL               14
+#define CLK_INFRA_CA57SEL               14 /* Deprecated. Don't use it. */
+#define CLK_INFRA_CA72SEL               14
 #define CLK_INFRA_NR_CLK                15
 
 /* PERI_SYS */
-- 
cgit v1.2.3


From 4d633062c1c0794a6b3836b7b55afba4599736e8 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 27 Feb 2019 20:40:10 +0800
Subject: block: introduce bvec_nth_page()

Single-page bvec can often be seen in small BS workloads, so
introduce bvec_nth_page() for avoiding to call nth_page() unnecessarily,
which looks not cheap.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c    |  2 +-
 include/linux/bvec.h | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 066b66430523..c7e8a8273460 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -483,7 +483,7 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
 
 		offset = (total + bvec->bv_offset) % PAGE_SIZE;
 		idx = (total + bvec->bv_offset) / PAGE_SIZE;
-		pg = nth_page(bvec->bv_page, idx);
+		pg = bvec_nth_page(bvec->bv_page, idx);
 
 		sg_set_page(*sg, pg, seg_size, offset);
 
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 30a57b68d017..4376f683c08a 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -51,6 +51,11 @@ struct bvec_iter_all {
 	unsigned	done;
 };
 
+static inline struct page *bvec_nth_page(struct page *page, int idx)
+{
+	return idx == 0 ? page : nth_page(page, idx);
+}
+
 /*
  * various member access, note that bio_data should of course not be used
  * on highmem page vectors
@@ -87,8 +92,8 @@ struct bvec_iter_all {
 	      PAGE_SIZE - bvec_iter_offset((bvec), (iter)))
 
 #define bvec_iter_page(bvec, iter)				\
-	nth_page(mp_bvec_iter_page((bvec), (iter)),		\
-		 mp_bvec_iter_page_idx((bvec), (iter)))
+	bvec_nth_page(mp_bvec_iter_page((bvec), (iter)),		\
+		      mp_bvec_iter_page_idx((bvec), (iter)))
 
 #define bvec_iter_bvec(bvec, iter)				\
 ((struct bio_vec) {						\
@@ -171,7 +176,7 @@ static inline void mp_bvec_last_segment(const struct bio_vec *bvec,
 	unsigned total = bvec->bv_offset + bvec->bv_len;
 	unsigned last_page = (total - 1) / PAGE_SIZE;
 
-	seg->bv_page = nth_page(bvec->bv_page, last_page);
+	seg->bv_page = bvec_nth_page(bvec->bv_page, last_page);
 
 	/* the whole segment is inside the last page */
 	if (bvec->bv_offset >= last_page * PAGE_SIZE) {
-- 
cgit v1.2.3


From b9cef50921eb018917e3c60b0c7f4ae1e113ad03 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.com>
Date: Tue, 26 Feb 2019 15:56:42 +0100
Subject: scsi: kill command serial number

No users left, kill it.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi.c      | 16 ----------------
 drivers/scsi/scsi_lib.c  |  1 -
 include/scsi/scsi_cmnd.h | 10 ----------
 include/scsi/scsi_host.h |  8 --------
 4 files changed, 35 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 7675ff0ca2ea..99a7b9f520ae 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -174,22 +174,6 @@ void scsi_log_completion(struct scsi_cmnd *cmd, int disposition)
 }
 #endif
 
-/**
- * scsi_cmd_get_serial - Assign a serial number to a command
- * @host: the scsi host
- * @cmd: command to assign serial number to
- *
- * Description: a serial number identifies a request for error recovery
- * and debugging purposes.  Protected by the Host_Lock of host.
- */
-void scsi_cmd_get_serial(struct Scsi_Host *host, struct scsi_cmnd *cmd)
-{
-	cmd->serial_number = host->cmd_serial_number++;
-	if (cmd->serial_number == 0) 
-		cmd->serial_number = host->cmd_serial_number++;
-}
-EXPORT_SYMBOL(scsi_cmd_get_serial);
-
 /**
  * scsi_finish_command - cleanup and pass command back to upper layer
  * @cmd: the command
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c77cb0d31dbc..95466a5bf6d6 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -316,7 +316,6 @@ EXPORT_SYMBOL(__scsi_execute);
  */
 static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
 {
-	cmd->serial_number = 0;
 	scsi_set_resid(cmd, 0);
 	memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
 	if (cmd->cmd_len == 0)
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 6c7793c2ad01..76ed5e4acd38 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -74,16 +74,6 @@ struct scsi_cmnd {
 
 	int eh_eflags;		/* Used by error handlr */
 
-	/*
-	 * A SCSI Command is assigned a nonzero serial_number before passed
-	 * to the driver's queue command function.  The serial_number is
-	 * cleared when scsi_done is entered indicating that the command
-	 * has been completed.  It is a bug for LLDDs to use this number
-	 * for purposes other than printk (and even that is only useful
-	 * for debugging).
-	 */
-	unsigned long serial_number;
-
 	/*
 	 * This is set to jiffies as it was when the command was first
 	 * allocated.  It is used to time how long the command has
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index dbbdfbdb1a04..2b539a1b3f62 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -486,7 +486,6 @@ struct scsi_host_template {
 		unsigned long irq_flags;				\
 		int rc;							\
 		spin_lock_irqsave(shost->host_lock, irq_flags);		\
-		scsi_cmd_get_serial(shost, cmd);			\
 		rc = func_name##_lck (cmd, cmd->scsi_done);			\
 		spin_unlock_irqrestore(shost->host_lock, irq_flags);	\
 		return rc;						\
@@ -596,12 +595,6 @@ struct Scsi_Host {
 	 * is nr_hw_queues * can_queue.
 	 */
 	unsigned nr_hw_queues;
-	/* 
-	 * Used to assign serial numbers to the cmds.
-	 * Protected by the host lock.
-	 */
-	unsigned long cmd_serial_number;
-	
 	unsigned active_mode:2;
 	unsigned unchecked_isa_dma:1;
 
@@ -738,7 +731,6 @@ extern int scsi_host_busy(struct Scsi_Host *shost);
 extern void scsi_host_put(struct Scsi_Host *t);
 extern struct Scsi_Host *scsi_host_lookup(unsigned short);
 extern const char *scsi_host_state_name(enum scsi_host_state);
-extern void scsi_cmd_get_serial(struct Scsi_Host *, struct scsi_cmnd *);
 
 static inline int __must_check scsi_add_host(struct Scsi_Host *host,
 					     struct device *dev)
-- 
cgit v1.2.3


From 31d921c7fb9691722ba9503b64153cdc322a7fa8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:24 +0000
Subject: vfs: Add configuration parser helpers

Because the new API passes in key,value parameters, match_token() cannot be
used with it.  Instead, provide three new helpers to aid with parsing:

 (1) fs_parse().  This takes a parameter and a simple static description of
     all the parameters and maps the key name to an ID.  It returns 1 on a
     match, 0 on no match if unknowns should be ignored and some other
     negative error code on a parse error.

     The parameter description includes a list of key names to IDs, desired
     parameter types and a list of enumeration name -> ID mappings.

     [!] Note that for the moment I've required that the key->ID mapping
     array is expected to be sorted and unterminated.  The size of the
     array is noted in the fsconfig_parser struct.  This allows me to use
     bsearch(), but I'm not sure any performance gain is worth the hassle
     of requiring people to keep the array sorted.

     The parameter type array is sized according to the number of parameter
     IDs and is indexed directly.  The optional enum mapping array is an
     unterminated, unsorted list and the size goes into the fsconfig_parser
     struct.

     The function can do some additional things:

	(a) If it's not ambiguous and no value is given, the prefix "no" on
	    a key name is permitted to indicate that the parameter should
	    be considered negatory.

	(b) If the desired type is a single simple integer, it will perform
	    an appropriate conversion and store the result in a union in
	    the parse result.

	(c) If the desired type is an enumeration, {key ID, name} will be
	    looked up in the enumeration list and the matching value will
	    be stored in the parse result union.

	(d) Optionally generate an error if the key is unrecognised.

     This is called something like:

	enum rdt_param {
		Opt_cdp,
		Opt_cdpl2,
		Opt_mba_mpbs,
		nr__rdt_params
	};

	const struct fs_parameter_spec rdt_param_specs[nr__rdt_params] = {
		[Opt_cdp]	= { fs_param_is_bool },
		[Opt_cdpl2]	= { fs_param_is_bool },
		[Opt_mba_mpbs]	= { fs_param_is_bool },
	};

	const const char *const rdt_param_keys[nr__rdt_params] = {
		[Opt_cdp]	= "cdp",
		[Opt_cdpl2]	= "cdpl2",
		[Opt_mba_mpbs]	= "mba_mbps",
	};

	const struct fs_parameter_description rdt_parser = {
		.name		= "rdt",
		.nr_params	= nr__rdt_params,
		.keys		= rdt_param_keys,
		.specs		= rdt_param_specs,
		.no_source	= true,
	};

	int rdt_parse_param(struct fs_context *fc,
			    struct fs_parameter *param)
	{
		struct fs_parse_result parse;
		struct rdt_fs_context *ctx = rdt_fc2context(fc);
		int ret;

		ret = fs_parse(fc, &rdt_parser, param, &parse);
		if (ret < 0)
			return ret;

		switch (parse.key) {
		case Opt_cdp:
			ctx->enable_cdpl3 = true;
			return 0;
		case Opt_cdpl2:
			ctx->enable_cdpl2 = true;
			return 0;
		case Opt_mba_mpbs:
			ctx->enable_mba_mbps = true;
			return 0;
		}

		return -EINVAL;
	}

 (2) fs_lookup_param().  This takes a { dirfd, path, LOOKUP_EMPTY? } or
     string value and performs an appropriate path lookup to convert it
     into a path object, which it will then return.

     If the desired type was a blockdev, the type of the looked up inode
     will be checked to make sure it is one.

     This can be used like:

	enum foo_param {
		Opt_source,
		nr__foo_params
	};

	const struct fs_parameter_spec foo_param_specs[nr__foo_params] = {
		[Opt_source]	= { fs_param_is_blockdev },
	};

	const char *char foo_param_keys[nr__foo_params] = {
		[Opt_source]	= "source",
	};

	const struct constant_table foo_param_alt_keys[] = {
		{ "device",	Opt_source },
	};

	const struct fs_parameter_description foo_parser = {
		.name		= "foo",
		.nr_params	= nr__foo_params,
		.nr_alt_keys	= ARRAY_SIZE(foo_param_alt_keys),
		.keys		= foo_param_keys,
		.alt_keys	= foo_param_alt_keys,
		.specs		= foo_param_specs,
	};

	int foo_parse_param(struct fs_context *fc,
			    struct fs_parameter *param)
	{
		struct fs_parse_result parse;
		struct foo_fs_context *ctx = foo_fc2context(fc);
		int ret;

		ret = fs_parse(fc, &foo_parser, param, &parse);
		if (ret < 0)
			return ret;

		switch (parse.key) {
		case Opt_source:
			return fs_lookup_param(fc, &foo_parser, param,
					       &parse, &ctx->source);
		default:
			return -EINVAL;
		}
	}

 (3) lookup_constant().  This takes a table of named constants and looks up
     the given name within it.  The table is expected to be sorted such
     that bsearch() be used upon it.

     Possibly I should require the table be terminated and just use a
     for-loop to scan it instead of using bsearch() to reduce hassle.

     Tables look something like:

	static const struct constant_table bool_names[] = {
		{ "0",		false },
		{ "1",		true },
		{ "false",	false },
		{ "no",		false },
		{ "true",	true },
		{ "yes",	true },
	};

     and a lookup is done with something like:

	b = lookup_constant(bool_names, param->string, -1);

Additionally, optional validation routines for the parameter description
are provided that can be enabled at compile time.  A later patch will
invoke these when a filesystem is registered.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Kconfig                 |   7 +
 fs/Makefile                |   2 +-
 fs/fs_parser.c             | 447 +++++++++++++++++++++++++++++++++++++++++++++
 fs/internal.h              |   2 +
 fs/namei.c                 |   4 +-
 include/linux/errno.h      |   1 +
 include/linux/fs_context.h |  29 +++
 include/linux/fs_parser.h  | 151 +++++++++++++++
 8 files changed, 640 insertions(+), 3 deletions(-)
 create mode 100644 fs/fs_parser.c
 create mode 100644 include/linux/fs_parser.h

(limited to 'include')

diff --git a/fs/Kconfig b/fs/Kconfig
index ac474a61be37..25700b152c75 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -8,6 +8,13 @@ menu "File systems"
 config DCACHE_WORD_ACCESS
        bool
 
+config VALIDATE_FS_PARSER
+	bool "Validate filesystem parameter description"
+	default y
+	help
+	  Enable this to perform validation of the parameter description for a
+	  filesystem when it is registered.
+
 if BLOCK
 
 config FS_IOMAP
diff --git a/fs/Makefile b/fs/Makefile
index 5563cf34f7c2..9a0b8003f069 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
-		fs_context.o
+		fs_context.o fs_parser.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
new file mode 100644
index 000000000000..842e8f749db6
--- /dev/null
+++ b/fs/fs_parser.c
@@ -0,0 +1,447 @@
+/* Filesystem parameter parser.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+static const struct constant_table bool_names[] = {
+	{ "0",		false },
+	{ "1",		true },
+	{ "false",	false },
+	{ "no",		false },
+	{ "true",	true },
+	{ "yes",	true },
+};
+
+/**
+ * lookup_constant - Look up a constant by name in an ordered table
+ * @tbl: The table of constants to search.
+ * @tbl_size: The size of the table.
+ * @name: The name to look up.
+ * @not_found: The value to return if the name is not found.
+ */
+int __lookup_constant(const struct constant_table *tbl, size_t tbl_size,
+		      const char *name, int not_found)
+{
+	unsigned int i;
+
+	for (i = 0; i < tbl_size; i++)
+		if (strcmp(name, tbl[i].name) == 0)
+			return tbl[i].value;
+
+	return not_found;
+}
+EXPORT_SYMBOL(__lookup_constant);
+
+static const struct fs_parameter_spec *fs_lookup_key(
+	const struct fs_parameter_description *desc,
+	const char *name)
+{
+	const struct fs_parameter_spec *p;
+
+	if (!desc->specs)
+		return NULL;
+
+	for (p = desc->specs; p->name; p++)
+		if (strcmp(p->name, name) == 0)
+			return p;
+
+	return NULL;
+}
+
+/*
+ * fs_parse - Parse a filesystem configuration parameter
+ * @fc: The filesystem context to log errors through.
+ * @desc: The parameter description to use.
+ * @param: The parameter.
+ * @result: Where to place the result of the parse
+ *
+ * Parse a filesystem configuration parameter and attempt a conversion for a
+ * simple parameter for which this is requested.  If successful, the determined
+ * parameter ID is placed into @result->key, the desired type is indicated in
+ * @result->t and any converted value is placed into an appropriate member of
+ * the union in @result.
+ *
+ * The function returns the parameter number if the parameter was matched,
+ * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that
+ * unknown parameters are okay and -EINVAL if there was a conversion issue or
+ * the parameter wasn't recognised and unknowns aren't okay.
+ */
+int fs_parse(struct fs_context *fc,
+	     const struct fs_parameter_description *desc,
+	     struct fs_parameter *param,
+	     struct fs_parse_result *result)
+{
+	const struct fs_parameter_spec *p;
+	const struct fs_parameter_enum *e;
+	int ret = -ENOPARAM, b;
+
+	result->has_value = !!param->string;
+	result->negated = false;
+	result->uint_64 = 0;
+
+	p = fs_lookup_key(desc, param->key);
+	if (!p) {
+		/* If we didn't find something that looks like "noxxx", see if
+		 * "xxx" takes the "no"-form negative - but only if there
+		 * wasn't an value.
+		 */
+		if (result->has_value)
+			goto unknown_parameter;
+		if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2])
+			goto unknown_parameter;
+
+		p = fs_lookup_key(desc, param->key + 2);
+		if (!p)
+			goto unknown_parameter;
+		if (!(p->flags & fs_param_neg_with_no))
+			goto unknown_parameter;
+		result->boolean = false;
+		result->negated = true;
+	}
+
+	if (p->flags & fs_param_deprecated)
+		warnf(fc, "%s: Deprecated parameter '%s'",
+		      desc->name, param->key);
+
+	if (result->negated)
+		goto okay;
+
+	/* Certain parameter types only take a string and convert it. */
+	switch (p->type) {
+	case __fs_param_wasnt_defined:
+		return -EINVAL;
+	case fs_param_is_u32:
+	case fs_param_is_u32_octal:
+	case fs_param_is_u32_hex:
+	case fs_param_is_s32:
+	case fs_param_is_u64:
+	case fs_param_is_enum:
+	case fs_param_is_string:
+		if (param->type != fs_value_is_string)
+			goto bad_value;
+		if (!result->has_value) {
+			if (p->flags & fs_param_v_optional)
+				goto okay;
+			goto bad_value;
+		}
+		/* Fall through */
+	default:
+		break;
+	}
+
+	/* Try to turn the type we were given into the type desired by the
+	 * parameter and give an error if we can't.
+	 */
+	switch (p->type) {
+	case fs_param_is_flag:
+		if (param->type != fs_value_is_flag &&
+		    (param->type != fs_value_is_string || result->has_value))
+			return invalf(fc, "%s: Unexpected value for '%s'",
+				      desc->name, param->key);
+		result->boolean = true;
+		goto okay;
+
+	case fs_param_is_bool:
+		switch (param->type) {
+		case fs_value_is_flag:
+			result->boolean = true;
+			goto okay;
+		case fs_value_is_string:
+			if (param->size == 0) {
+				result->boolean = true;
+				goto okay;
+			}
+			b = lookup_constant(bool_names, param->string, -1);
+			if (b == -1)
+				goto bad_value;
+			result->boolean = b;
+			goto okay;
+		default:
+			goto bad_value;
+		}
+
+	case fs_param_is_u32:
+		ret = kstrtouint(param->string, 0, &result->uint_32);
+		goto maybe_okay;
+	case fs_param_is_u32_octal:
+		ret = kstrtouint(param->string, 8, &result->uint_32);
+		goto maybe_okay;
+	case fs_param_is_u32_hex:
+		ret = kstrtouint(param->string, 16, &result->uint_32);
+		goto maybe_okay;
+	case fs_param_is_s32:
+		ret = kstrtoint(param->string, 0, &result->int_32);
+		goto maybe_okay;
+	case fs_param_is_u64:
+		ret = kstrtoull(param->string, 0, &result->uint_64);
+		goto maybe_okay;
+
+	case fs_param_is_enum:
+		for (e = desc->enums; e->name[0]; e++) {
+			if (e->opt == p->opt &&
+			    strcmp(e->name, param->string) == 0) {
+				result->uint_32 = e->value;
+				goto okay;
+			}
+		}
+		goto bad_value;
+
+	case fs_param_is_string:
+		goto okay;
+	case fs_param_is_blob:
+		if (param->type != fs_value_is_blob)
+			goto bad_value;
+		goto okay;
+
+	case fs_param_is_fd: {
+		if (param->type != fs_value_is_file)
+			goto bad_value;
+		goto okay;
+	}
+
+	case fs_param_is_blockdev:
+	case fs_param_is_path:
+		goto okay;
+	default:
+		BUG();
+	}
+
+maybe_okay:
+	if (ret < 0)
+		goto bad_value;
+okay:
+	return p->opt;
+
+bad_value:
+	return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key);
+unknown_parameter:
+	return -ENOPARAM;
+}
+EXPORT_SYMBOL(fs_parse);
+
+/**
+ * fs_lookup_param - Look up a path referred to by a parameter
+ * @fc: The filesystem context to log errors through.
+ * @param: The parameter.
+ * @want_bdev: T if want a blockdev
+ * @_path: The result of the lookup
+ */
+int fs_lookup_param(struct fs_context *fc,
+		    struct fs_parameter *param,
+		    bool want_bdev,
+		    struct path *_path)
+{
+	struct filename *f;
+	unsigned int flags = 0;
+	bool put_f;
+	int ret;
+
+	switch (param->type) {
+	case fs_value_is_string:
+		f = getname_kernel(param->string);
+		if (IS_ERR(f))
+			return PTR_ERR(f);
+		put_f = true;
+		break;
+	case fs_value_is_filename_empty:
+		flags = LOOKUP_EMPTY;
+		/* Fall through */
+	case fs_value_is_filename:
+		f = param->name;
+		put_f = false;
+		break;
+	default:
+		return invalf(fc, "%s: not usable as path", param->key);
+	}
+
+	ret = filename_lookup(param->dirfd, f, flags, _path, NULL);
+	if (ret < 0) {
+		errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name);
+		goto out;
+	}
+
+	if (want_bdev &&
+	    !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) {
+		path_put(_path);
+		_path->dentry = NULL;
+		_path->mnt = NULL;
+		errorf(fc, "%s: Non-blockdev passed as '%s'",
+		       param->key, f->name);
+		ret = -ENOTBLK;
+	}
+
+out:
+	if (put_f)
+		putname(f);
+	return ret;
+}
+EXPORT_SYMBOL(fs_lookup_param);
+
+#ifdef CONFIG_VALIDATE_FS_PARSER
+/**
+ * validate_constant_table - Validate a constant table
+ * @name: Name to use in reporting
+ * @tbl: The constant table to validate.
+ * @tbl_size: The size of the table.
+ * @low: The lowest permissible value.
+ * @high: The highest permissible value.
+ * @special: One special permissible value outside of the range.
+ */
+bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
+			     int low, int high, int special)
+{
+	size_t i;
+	bool good = true;
+
+	if (tbl_size == 0) {
+		pr_warn("VALIDATE C-TBL: Empty\n");
+		return true;
+	}
+
+	for (i = 0; i < tbl_size; i++) {
+		if (!tbl[i].name) {
+			pr_err("VALIDATE C-TBL[%zu]: Null\n", i);
+			good = false;
+		} else if (i > 0 && tbl[i - 1].name) {
+			int c = strcmp(tbl[i-1].name, tbl[i].name);
+
+			if (c == 0) {
+				pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
+				       i, tbl[i].name);
+				good = false;
+			}
+			if (c > 0) {
+				pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
+				       i, tbl[i-1].name, tbl[i].name);
+				good = false;
+			}
+		}
+
+		if (tbl[i].value != special &&
+		    (tbl[i].value < low || tbl[i].value > high)) {
+			pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
+			       i, tbl[i].name, tbl[i].value, low, high);
+			good = false;
+		}
+	}
+
+	return good;
+}
+
+/**
+ * fs_validate_description - Validate a parameter description
+ * @desc: The parameter description to validate.
+ */
+bool fs_validate_description(const struct fs_parameter_description *desc)
+{
+	const struct fs_parameter_spec *param, *p2;
+	const struct fs_parameter_enum *e;
+	const char *name = desc->name;
+	unsigned int nr_params = 0;
+	bool good = true, enums = false;
+
+	pr_notice("*** VALIDATE %s ***\n", name);
+
+	if (!name[0]) {
+		pr_err("VALIDATE Parser: No name\n");
+		name = "Unknown";
+		good = false;
+	}
+
+	if (desc->specs) {
+		for (param = desc->specs; param->name; param++) {
+			enum fs_parameter_type t = param->type;
+
+			/* Check that the type is in range */
+			if (t == __fs_param_wasnt_defined ||
+			    t >= nr__fs_parameter_type) {
+				pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n",
+				       name, param->name, t);
+				good = false;
+			} else if (t == fs_param_is_enum) {
+				enums = true;
+			}
+
+			/* Check for duplicate parameter names */
+			for (p2 = desc->specs; p2 < param; p2++) {
+				if (strcmp(param->name, p2->name) == 0) {
+					pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n",
+					       name, param->name);
+					good = false;
+				}
+			}
+		}
+
+		nr_params = param - desc->specs;
+	}
+
+	if (desc->enums) {
+		if (!nr_params) {
+			pr_err("VALIDATE %s: Enum table but no parameters\n",
+			       name);
+			good = false;
+			goto no_enums;
+		}
+		if (!enums) {
+			pr_err("VALIDATE %s: Enum table but no enum-type values\n",
+			       name);
+			good = false;
+			goto no_enums;
+		}
+
+		for (e = desc->enums; e->name[0]; e++) {
+			/* Check that all entries in the enum table have at
+			 * least one parameter that uses them.
+			 */
+			for (param = desc->specs; param->name; param++) {
+				if (param->opt == e->opt &&
+				    param->type != fs_param_is_enum) {
+					pr_err("VALIDATE %s: e[%lu] enum val for %s\n",
+					       name, e - desc->enums, param->name);
+					good = false;
+				}
+			}
+		}
+
+		/* Check that all enum-type parameters have at least one enum
+		 * value in the enum table.
+		 */
+		for (param = desc->specs; param->name; param++) {
+			if (param->type != fs_param_is_enum)
+				continue;
+			for (e = desc->enums; e->name[0]; e++)
+				if (e->opt == param->opt)
+					break;
+			if (!e->name[0]) {
+				pr_err("VALIDATE %s: PARAM[%s] enum with no values\n",
+				       name, param->name);
+				good = false;
+			}
+		}
+	} else {
+		if (enums) {
+			pr_err("VALIDATE %s: enum-type values, but no enum table\n",
+			       name);
+			good = false;
+			goto no_enums;
+		}
+	}
+
+no_enums:
+	return good;
+}
+#endif /* CONFIG_VALIDATE_FS_PARSER */
diff --git a/fs/internal.h b/fs/internal.h
index 8f8d07cc433f..6a8b71643af4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -61,6 +61,8 @@ extern void fc_drop_locked(struct fs_context *);
 /*
  * namei.c
  */
+extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
+			   struct path *path, struct path *root);
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
diff --git a/fs/namei.c b/fs/namei.c
index 914178cdbe94..a85deb55d0c9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2333,8 +2333,8 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
 	return err;
 }
 
-static int filename_lookup(int dfd, struct filename *name, unsigned flags,
-			   struct path *path, struct path *root)
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
+		    struct path *path, struct path *root)
 {
 	int retval;
 	struct nameidata nd;
diff --git a/include/linux/errno.h b/include/linux/errno.h
index 3cba627577d6..d73f597a2484 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -18,6 +18,7 @@
 #define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
 #define EPROBE_DEFER	517	/* Driver requests probe retry */
 #define EOPENSTALE	518	/* open found a stale dentry */
+#define ENOPARAM	519	/* Parameter not supported */
 
 /* Defined for the NFSv3 protocol */
 #define EBADHANDLE	521	/* Illegal NFS file handle */
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index d208cc40b868..899027c94788 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -34,6 +34,35 @@ enum fs_context_purpose {
 	FS_CONTEXT_FOR_RECONFIGURE,	/* Superblock reconfiguration (remount) */
 };
 
+/*
+ * Type of parameter value.
+ */
+enum fs_value_type {
+	fs_value_is_undefined,
+	fs_value_is_flag,		/* Value not given a value */
+	fs_value_is_string,		/* Value is a string */
+	fs_value_is_blob,		/* Value is a binary blob */
+	fs_value_is_filename,		/* Value is a filename* + dirfd */
+	fs_value_is_filename_empty,	/* Value is a filename* + dirfd + AT_EMPTY_PATH */
+	fs_value_is_file,		/* Value is a file* */
+};
+
+/*
+ * Configuration parameter.
+ */
+struct fs_parameter {
+	const char		*key;		/* Parameter name */
+	enum fs_value_type	type:8;		/* The type of value here */
+	union {
+		char		*string;
+		void		*blob;
+		struct filename	*name;
+		struct file	*file;
+	};
+	size_t	size;
+	int	dirfd;
+};
+
 /*
  * Filesystem context for holding the parameters used in the creation or
  * reconfiguration of a superblock.
diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
new file mode 100644
index 000000000000..d966f96ffe62
--- /dev/null
+++ b/include/linux/fs_parser.h
@@ -0,0 +1,151 @@
+/* Filesystem parameter description and parser
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_FS_PARSER_H
+#define _LINUX_FS_PARSER_H
+
+#include <linux/fs_context.h>
+
+struct path;
+
+struct constant_table {
+	const char	*name;
+	int		value;
+};
+
+/*
+ * The type of parameter expected.
+ */
+enum fs_parameter_type {
+	__fs_param_wasnt_defined,
+	fs_param_is_flag,
+	fs_param_is_bool,
+	fs_param_is_u32,
+	fs_param_is_u32_octal,
+	fs_param_is_u32_hex,
+	fs_param_is_s32,
+	fs_param_is_u64,
+	fs_param_is_enum,
+	fs_param_is_string,
+	fs_param_is_blob,
+	fs_param_is_blockdev,
+	fs_param_is_path,
+	fs_param_is_fd,
+	nr__fs_parameter_type,
+};
+
+/*
+ * Specification of the type of value a parameter wants.
+ *
+ * Note that the fsparam_flag(), fsparam_string(), fsparam_u32(), ... macros
+ * should be used to generate elements of this type.
+ */
+struct fs_parameter_spec {
+	const char		*name;
+	u8			opt;	/* Option number (returned by fs_parse()) */
+	enum fs_parameter_type	type:8;	/* The desired parameter type */
+	unsigned short		flags;
+#define fs_param_v_optional	0x0001	/* The value is optional */
+#define fs_param_neg_with_no	0x0002	/* "noxxx" is negative param */
+#define fs_param_neg_with_empty	0x0004	/* "xxx=" is negative param */
+#define fs_param_deprecated	0x0008	/* The param is deprecated */
+};
+
+struct fs_parameter_enum {
+	u8		opt;		/* Option number (as fs_parameter_spec::opt) */
+	char		name[14];
+	u8		value;
+};
+
+struct fs_parameter_description {
+	const char	name[16];		/* Name for logging purposes */
+	const struct fs_parameter_spec *specs;	/* List of param specifications */
+	const struct fs_parameter_enum *enums;	/* Enum values */
+};
+
+/*
+ * Result of parse.
+ */
+struct fs_parse_result {
+	bool			negated;	/* T if param was "noxxx" */
+	bool			has_value;	/* T if value supplied to param */
+	union {
+		bool		boolean;	/* For spec_bool */
+		int		int_32;		/* For spec_s32/spec_enum */
+		unsigned int	uint_32;	/* For spec_u32{,_octal,_hex}/spec_enum */
+		u64		uint_64;	/* For spec_u64 */
+	};
+};
+
+extern int fs_parse(struct fs_context *fc,
+		    const struct fs_parameter_description *desc,
+		    struct fs_parameter *value,
+		    struct fs_parse_result *result);
+extern int fs_lookup_param(struct fs_context *fc,
+			   struct fs_parameter *param,
+			   bool want_bdev,
+			   struct path *_path);
+
+extern int __lookup_constant(const struct constant_table tbl[], size_t tbl_size,
+			     const char *name, int not_found);
+#define lookup_constant(t, n, nf) __lookup_constant(t, ARRAY_SIZE(t), (n), (nf))
+
+#ifdef CONFIG_VALIDATE_FS_PARSER
+extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
+				    int low, int high, int special);
+extern bool fs_validate_description(const struct fs_parameter_description *desc);
+#else
+static inline bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
+					   int low, int high, int special)
+{ return true; }
+static inline bool fs_validate_description(const struct fs_parameter_description *desc)
+{ return true; }
+#endif
+
+/*
+ * Parameter type, name, index and flags element constructors.  Use as:
+ *
+ *  fsparam_xxxx("foo", Opt_foo)
+ *
+ * If existing helpers are not enough, direct use of __fsparam() would
+ * work, but any such case is probably a sign that new helper is needed.
+ * Helpers will remain stable; low-level implementation may change.
+ */
+#define __fsparam(TYPE, NAME, OPT, FLAGS) \
+	{ \
+		.name = NAME, \
+		.opt = OPT, \
+		.type = TYPE, \
+		.flags = FLAGS \
+	}
+
+#define fsparam_flag(NAME, OPT)	__fsparam(fs_param_is_flag, NAME, OPT, 0)
+#define fsparam_flag_no(NAME, OPT) \
+				__fsparam(fs_param_is_flag, NAME, OPT, \
+					    fs_param_neg_with_no)
+#define fsparam_bool(NAME, OPT)	__fsparam(fs_param_is_bool, NAME, OPT, 0)
+#define fsparam_u32(NAME, OPT)	__fsparam(fs_param_is_u32, NAME, OPT, 0)
+#define fsparam_u32oct(NAME, OPT) \
+				__fsparam(fs_param_is_u32_octal, NAME, OPT, 0)
+#define fsparam_u32hex(NAME, OPT) \
+				__fsparam(fs_param_is_u32_hex, NAME, OPT, 0)
+#define fsparam_s32(NAME, OPT)	__fsparam(fs_param_is_s32, NAME, OPT, 0)
+#define fsparam_u64(NAME, OPT)	__fsparam(fs_param_is_u64, NAME, OPT, 0)
+#define fsparam_enum(NAME, OPT)	__fsparam(fs_param_is_enum, NAME, OPT, 0)
+#define fsparam_string(NAME, OPT) \
+				__fsparam(fs_param_is_string, NAME, OPT, 0)
+#define fsparam_blob(NAME, OPT)	__fsparam(fs_param_is_blob, NAME, OPT, 0)
+#define fsparam_bdev(NAME, OPT)	__fsparam(fs_param_is_blockdev, NAME, OPT, 0)
+#define fsparam_path(NAME, OPT)	__fsparam(fs_param_is_path, NAME, OPT, 0)
+#define fsparam_fd(NAME, OPT)	__fsparam(fs_param_is_fd, NAME, OPT, 0)
+
+
+#endif /* _LINUX_FS_PARSER_H */
-- 
cgit v1.2.3


From da2441fdffbf7602da702aea5bd95ca4dc3d63fc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:24 +0000
Subject: vfs: Add LSM hooks for the new mount API

Add LSM hooks for use by the new mount API and filesystem context code.
This includes:

 (1) Hooks to handle allocation, duplication and freeing of the security
     record attached to a filesystem context.

 (2) A hook to snoop source specifications.  There may be multiple of these
     if the filesystem supports it.  They will to be local files/devices if
     fs_context::source_is_dev is true and will be something else, possibly
     remote server specifications, if false.

 (3) A hook to snoop superblock configuration options in key[=val] form.
     If the LSM decides it wants to handle it, it can suppress the option
     being passed to the filesystem.  Note that 'val' may include commas
     and binary data with the fsopen patch.

 (4) A hook to perform validation and allocation after the configuration
     has been done but before the superblock is allocated and set up.

 (5) A hook to transfer the security from the context to a newly created
     superblock.

 (6) A hook to rule on whether a path point can be used as a mountpoint.

These are intended to replace:

	security_sb_copy_data
	security_sb_kern_mount
	security_sb_mount
	security_sb_set_mnt_opts
	security_sb_clone_mnt_opts
	security_sb_parse_opts_str

[AV -- some of the methods being replaced are already gone, some of the
methods are not added for the lack of need]

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-security-module@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/lsm_hooks.h | 14 ++++++++++++++
 include/linux/security.h  | 10 ++++++++++
 security/security.c       |  5 +++++
 3 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 9a0bdf91e646..47ba4db4d8fb 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -76,6 +76,17 @@
  *	changes on the process such as clearing out non-inheritable signal
  *	state.  This is called immediately after commit_creds().
  *
+ * Security hooks for mount using fs_context.
+ *	[See also Documentation/filesystems/mounting.txt]
+ *
+ * @fs_context_parse_param:
+ *	Userspace provided a parameter to configure a superblock.  The LSM may
+ *	reject it with an error and may use it for itself, in which case it
+ *	should return 0; otherwise it should return -ENOPARAM to pass it on to
+ *	the filesystem.
+ *	@fc indicates the filesystem context.
+ *	@param The parameter
+ *
  * Security hooks for filesystem operations.
  *
  * @sb_alloc_security:
@@ -1459,6 +1470,8 @@ union security_list_options {
 	void (*bprm_committing_creds)(struct linux_binprm *bprm);
 	void (*bprm_committed_creds)(struct linux_binprm *bprm);
 
+	int (*fs_context_parse_param)(struct fs_context *fc, struct fs_parameter *param);
+
 	int (*sb_alloc_security)(struct super_block *sb);
 	void (*sb_free_security)(struct super_block *sb);
 	void (*sb_free_mnt_opts)(void *mnt_opts);
@@ -1800,6 +1813,7 @@ struct security_hook_heads {
 	struct hlist_head bprm_check_security;
 	struct hlist_head bprm_committing_creds;
 	struct hlist_head bprm_committed_creds;
+	struct hlist_head fs_context_parse_param;
 	struct hlist_head sb_alloc_security;
 	struct hlist_head sb_free_security;
 	struct hlist_head sb_free_mnt_opts;
diff --git a/include/linux/security.h b/include/linux/security.h
index dbfb5a66babb..1cc4d7a3d6fa 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,6 +53,9 @@ struct msg_msg;
 struct xattr;
 struct xfrm_sec_ctx;
 struct mm_struct;
+struct fs_context;
+struct fs_parameter;
+enum fs_value_type;
 
 /* If capable should audit the security request */
 #define SECURITY_CAP_NOAUDIT 0
@@ -220,6 +223,7 @@ int security_bprm_set_creds(struct linux_binprm *bprm);
 int security_bprm_check(struct linux_binprm *bprm);
 void security_bprm_committing_creds(struct linux_binprm *bprm);
 void security_bprm_committed_creds(struct linux_binprm *bprm);
+int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
 int security_sb_alloc(struct super_block *sb);
 void security_sb_free(struct super_block *sb);
 void security_free_mnt_opts(void **mnt_opts);
@@ -517,6 +521,12 @@ static inline void security_bprm_committed_creds(struct linux_binprm *bprm)
 {
 }
 
+static inline int security_fs_context_parse_param(struct fs_context *fc,
+						  struct fs_parameter *param)
+{
+	return -ENOPARAM;
+}
+
 static inline int security_sb_alloc(struct super_block *sb)
 {
 	return 0;
diff --git a/security/security.c b/security/security.c
index f1b8d2587639..e5519488327d 100644
--- a/security/security.c
+++ b/security/security.c
@@ -374,6 +374,11 @@ void security_bprm_committed_creds(struct linux_binprm *bprm)
 	call_void_hook(bprm_committed_creds, bprm);
 }
 
+int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	return call_int_hook(fs_context_parse_param, -ENOPARAM, fc, param);
+}
+
 int security_sb_alloc(struct super_block *sb)
 {
 	return call_int_hook(sb_alloc_security, 0, sb);
-- 
cgit v1.2.3


From 846e56621897a63966b7f03a70be29060394c363 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:24 +0000
Subject: vfs: Put security flags into the fs_context struct

Put security flags, such as SECURITY_LSM_NATIVE_LABELS, into the filesystem
context so that the filesystem can communicate them to the LSM more easily.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs_context.h | 1 +
 include/linux/security.h   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 899027c94788..d5ff3b0bc28d 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -85,6 +85,7 @@ struct fs_context {
 	void			*security;	/* Linux S&M options */
 	unsigned int		sb_flags;	/* Proposed superblock flags (SB_*) */
 	unsigned int		sb_flags_mask;	/* Superblock flags that were changed */
+	unsigned int		lsm_flags;	/* Information flags from the fs to the LSM */
 	enum fs_context_purpose	purpose:8;
 	bool			need_free:1;	/* Need to call ops->free() */
 };
diff --git a/include/linux/security.h b/include/linux/security.h
index 1cc4d7a3d6fa..2da9336a987e 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -61,7 +61,7 @@ enum fs_value_type;
 #define SECURITY_CAP_NOAUDIT 0
 #define SECURITY_CAP_AUDIT 1
 
-/* LSM Agnostic defines for sb_set_mnt_opts */
+/* LSM Agnostic defines for fs_context::lsm_flags */
 #define SECURITY_LSM_NATIVE_LABELS	1
 
 struct ctl_table;
-- 
cgit v1.2.3


From 3e1aeb00e6d132efc151dacc062b38269bc9eccc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:25 +0000
Subject: vfs: Implement a filesystem superblock creation/configuration context

[AV - unfuck kern_mount_data(); we want non-NULL ->mnt_ns on long-living
mounts]
[AV - reordering fs/namespace.c is badly overdue, but let's keep it
separate from that series]
[AV - drop simple_pin_fs() change]
[AV - clean vfs_kern_mount() failure exits up]

Implement a filesystem context concept to be used during superblock
creation for mount and superblock reconfiguration for remount.

The mounting procedure then becomes:

 (1) Allocate new fs_context context.

 (2) Configure the context.

 (3) Create superblock.

 (4) Query the superblock.

 (5) Create a mount for the superblock.

 (6) Destroy the context.

Rather than calling fs_type->mount(), an fs_context struct is created and
fs_type->init_fs_context() is called to set it up.  Pointers exist for the
filesystem and LSM to hang their private data off.

A set of operations has to be set by ->init_fs_context() to provide
freeing, duplication, option parsing, binary data parsing, validation,
mounting and superblock filling.

Legacy filesystems are supported by the provision of a set of legacy
fs_context operations that build up a list of mount options and then invoke
fs_type->mount() from within the fs_context ->get_tree() operation.  This
allows all filesystems to be accessed using fs_context.

It should be noted that, whilst this patch adds a lot of lines of code,
there is quite a bit of duplication with existing code that can be
eliminated should all filesystems be converted over.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/filesystems.c           |   4 +
 fs/fs_context.c            | 300 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/namespace.c             |  25 ++--
 include/linux/fs.h         |   2 +
 include/linux/fs_context.h |   5 +
 5 files changed, 319 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/fs/filesystems.c b/fs/filesystems.c
index b03f57b1105b..9135646e41ac 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/fs_parser.h>
 
 /*
  * Handling of filesystem drivers list.
@@ -73,6 +74,9 @@ int register_filesystem(struct file_system_type * fs)
 	int res = 0;
 	struct file_system_type ** p;
 
+	if (fs->parameters && !fs_validate_description(fs->parameters))
+		return -EINVAL;
+
 	BUG_ON(strchr(fs->name, '.'));
 	if (fs->next)
 		return -EBUSY;
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 825d1b2c8807..aa7e0ffb591a 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -12,6 +12,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/nsproxy.h>
@@ -25,13 +26,217 @@
 #include "mount.h"
 #include "internal.h"
 
+enum legacy_fs_param {
+	LEGACY_FS_UNSET_PARAMS,
+	LEGACY_FS_MONOLITHIC_PARAMS,
+	LEGACY_FS_INDIVIDUAL_PARAMS,
+};
+
 struct legacy_fs_context {
 	char			*legacy_data;	/* Data page for legacy filesystems */
 	size_t			data_size;
+	enum legacy_fs_param	param_type;
 };
 
 static int legacy_init_fs_context(struct fs_context *fc);
 
+static const struct constant_table common_set_sb_flag[] = {
+	{ "dirsync",	SB_DIRSYNC },
+	{ "lazytime",	SB_LAZYTIME },
+	{ "mand",	SB_MANDLOCK },
+	{ "posixacl",	SB_POSIXACL },
+	{ "ro",		SB_RDONLY },
+	{ "sync",	SB_SYNCHRONOUS },
+};
+
+static const struct constant_table common_clear_sb_flag[] = {
+	{ "async",	SB_SYNCHRONOUS },
+	{ "nolazytime",	SB_LAZYTIME },
+	{ "nomand",	SB_MANDLOCK },
+	{ "rw",		SB_RDONLY },
+	{ "silent",	SB_SILENT },
+};
+
+static const char *const forbidden_sb_flag[] = {
+	"bind",
+	"dev",
+	"exec",
+	"move",
+	"noatime",
+	"nodev",
+	"nodiratime",
+	"noexec",
+	"norelatime",
+	"nostrictatime",
+	"nosuid",
+	"private",
+	"rec",
+	"relatime",
+	"remount",
+	"shared",
+	"slave",
+	"strictatime",
+	"suid",
+	"unbindable",
+};
+
+/*
+ * Check for a common mount option that manipulates s_flags.
+ */
+static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
+{
+	unsigned int token;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++)
+		if (strcmp(key, forbidden_sb_flag[i]) == 0)
+			return -EINVAL;
+
+	token = lookup_constant(common_set_sb_flag, key, 0);
+	if (token) {
+		fc->sb_flags |= token;
+		fc->sb_flags_mask |= token;
+		return 0;
+	}
+
+	token = lookup_constant(common_clear_sb_flag, key, 0);
+	if (token) {
+		fc->sb_flags &= ~token;
+		fc->sb_flags_mask |= token;
+		return 0;
+	}
+
+	return -ENOPARAM;
+}
+
+/**
+ * vfs_parse_fs_param - Add a single parameter to a superblock config
+ * @fc: The filesystem context to modify
+ * @param: The parameter
+ *
+ * A single mount option in string form is applied to the filesystem context
+ * being set up.  Certain standard options (for example "ro") are translated
+ * into flag bits without going to the filesystem.  The active security module
+ * is allowed to observe and poach options.  Any other options are passed over
+ * to the filesystem to parse.
+ *
+ * This may be called multiple times for a context.
+ *
+ * Returns 0 on success and a negative error code on failure.  In the event of
+ * failure, supplementary error information may have been set.
+ */
+int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	int ret;
+
+	if (!param->key)
+		return invalf(fc, "Unnamed parameter\n");
+
+	ret = vfs_parse_sb_flag(fc, param->key);
+	if (ret != -ENOPARAM)
+		return ret;
+
+	ret = security_fs_context_parse_param(fc, param);
+	if (ret != -ENOPARAM)
+		/* Param belongs to the LSM or is disallowed by the LSM; so
+		 * don't pass to the FS.
+		 */
+		return ret;
+
+	if (fc->ops->parse_param) {
+		ret = fc->ops->parse_param(fc, param);
+		if (ret != -ENOPARAM)
+			return ret;
+	}
+
+	/* If the filesystem doesn't take any arguments, give it the
+	 * default handling of source.
+	 */
+	if (strcmp(param->key, "source") == 0) {
+		if (param->type != fs_value_is_string)
+			return invalf(fc, "VFS: Non-string source");
+		if (fc->source)
+			return invalf(fc, "VFS: Multiple sources");
+		fc->source = param->string;
+		param->string = NULL;
+		return 0;
+	}
+
+	return invalf(fc, "%s: Unknown parameter '%s'",
+		      fc->fs_type->name, param->key);
+}
+EXPORT_SYMBOL(vfs_parse_fs_param);
+
+/**
+ * vfs_parse_fs_string - Convenience function to just parse a string.
+ */
+int vfs_parse_fs_string(struct fs_context *fc, const char *key,
+			const char *value, size_t v_size)
+{
+	int ret;
+
+	struct fs_parameter param = {
+		.key	= key,
+		.type	= fs_value_is_string,
+		.size	= v_size,
+	};
+
+	if (v_size > 0) {
+		param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
+		if (!param.string)
+			return -ENOMEM;
+	}
+
+	ret = vfs_parse_fs_param(fc, &param);
+	kfree(param.string);
+	return ret;
+}
+EXPORT_SYMBOL(vfs_parse_fs_string);
+
+/**
+ * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * @ctx: The superblock configuration to fill in.
+ * @data: The data to parse
+ *
+ * Parse a blob of data that's in key[=val][,key[=val]]* form.  This can be
+ * called from the ->monolithic_mount_data() fs_context operation.
+ *
+ * Returns 0 on success or the error returned by the ->parse_option() fs_context
+ * operation on failure.
+ */
+int generic_parse_monolithic(struct fs_context *fc, void *data)
+{
+	char *options = data, *key;
+	int ret = 0;
+
+	if (!options)
+		return 0;
+
+	ret = security_sb_eat_lsm_opts(options, &fc->security);
+	if (ret)
+		return ret;
+
+	while ((key = strsep(&options, ",")) != NULL) {
+		if (*key) {
+			size_t v_len = 0;
+			char *value = strchr(key, '=');
+
+			if (value) {
+				if (value == key)
+					continue;
+				*value++ = 0;
+				v_len = strlen(value);
+			}
+			ret = vfs_parse_fs_string(fc, key, value, v_len);
+			if (ret < 0)
+				break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(generic_parse_monolithic);
+
 /**
  * alloc_fs_context - Create a filesystem context.
  * @fs_type: The filesystem type.
@@ -166,7 +371,87 @@ EXPORT_SYMBOL(put_fs_context);
  */
 static void legacy_fs_context_free(struct fs_context *fc)
 {
-	kfree(fc->fs_private);
+	struct legacy_fs_context *ctx = fc->fs_private;
+
+	if (ctx) {
+		if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
+			kfree(ctx->legacy_data);
+		kfree(ctx);
+	}
+}
+
+/*
+ * Add a parameter to a legacy config.  We build up a comma-separated list of
+ * options.
+ */
+static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct legacy_fs_context *ctx = fc->fs_private;
+	unsigned int size = ctx->data_size;
+	size_t len = 0;
+
+	if (strcmp(param->key, "source") == 0) {
+		if (param->type != fs_value_is_string)
+			return invalf(fc, "VFS: Legacy: Non-string source");
+		if (fc->source)
+			return invalf(fc, "VFS: Legacy: Multiple sources");
+		fc->source = param->string;
+		param->string = NULL;
+		return 0;
+	}
+
+	if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) &&
+	    strcmp(param->key, "subtype") == 0) {
+		if (param->type != fs_value_is_string)
+			return invalf(fc, "VFS: Legacy: Non-string subtype");
+		if (fc->subtype)
+			return invalf(fc, "VFS: Legacy: Multiple subtype");
+		fc->subtype = param->string;
+		param->string = NULL;
+		return 0;
+	}
+
+	if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
+		return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
+
+	switch (param->type) {
+	case fs_value_is_string:
+		len = 1 + param->size;
+		/* Fall through */
+	case fs_value_is_flag:
+		len += strlen(param->key);
+		break;
+	default:
+		return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
+			      param->key);
+	}
+
+	if (len > PAGE_SIZE - 2 - size)
+		return invalf(fc, "VFS: Legacy: Cumulative options too large");
+	if (strchr(param->key, ',') ||
+	    (param->type == fs_value_is_string &&
+	     memchr(param->string, ',', param->size)))
+		return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
+			      param->key);
+	if (!ctx->legacy_data) {
+		ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!ctx->legacy_data)
+			return -ENOMEM;
+	}
+
+	ctx->legacy_data[size++] = ',';
+	len = strlen(param->key);
+	memcpy(ctx->legacy_data + size, param->key, len);
+	size += len;
+	if (param->type == fs_value_is_string) {
+		ctx->legacy_data[size++] = '=';
+		memcpy(ctx->legacy_data + size, param->string, param->size);
+		size += param->size;
+	}
+	ctx->legacy_data[size] = '\0';
+	ctx->data_size = size;
+	ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
+	return 0;
 }
 
 /*
@@ -175,9 +460,17 @@ static void legacy_fs_context_free(struct fs_context *fc)
 static int legacy_parse_monolithic(struct fs_context *fc, void *data)
 {
 	struct legacy_fs_context *ctx = fc->fs_private;
+
+	if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
+		pr_warn("VFS: Can't mix monolithic and individual options\n");
+		return -EINVAL;
+	}
+
 	ctx->legacy_data = data;
+	ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
 	if (!ctx->legacy_data)
 		return 0;
+
 	if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
 		return 0;
 	return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
@@ -221,6 +514,7 @@ static int legacy_reconfigure(struct fs_context *fc)
 
 const struct fs_context_operations legacy_fs_context_ops = {
 	.free			= legacy_fs_context_free,
+	.parse_param		= legacy_parse_param,
 	.parse_monolithic	= legacy_parse_monolithic,
 	.get_tree		= legacy_get_tree,
 	.reconfigure		= legacy_reconfigure,
@@ -242,6 +536,10 @@ static int legacy_init_fs_context(struct fs_context *fc)
 int parse_monolithic_mount_data(struct fs_context *fc, void *data)
 {
 	int (*monolithic_mount_data)(struct fs_context *, void *);
+
 	monolithic_mount_data = fc->ops->parse_monolithic;
+	if (!monolithic_mount_data)
+		monolithic_mount_data = generic_parse_monolithic;
+
 	return monolithic_mount_data(fc, data);
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 931228d8518a..1a1ed2528f47 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -997,17 +997,15 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 	int ret = 0;
 
 	if (!type)
-		return ERR_PTR(-ENODEV);
+		return ERR_PTR(-EINVAL);
 
 	fc = fs_context_for_mount(type, flags);
 	if (IS_ERR(fc))
 		return ERR_CAST(fc);
 
-	if (name) {
-		fc->source = kstrdup(name, GFP_KERNEL);
-		if (!fc->source)
-			ret = -ENOMEM;
-	}
+	if (name)
+		ret = vfs_parse_fs_string(fc, "source",
+					  name, strlen(name));
 	if (!ret)
 		ret = parse_monolithic_mount_data(fc, data);
 	if (!ret)
@@ -2611,16 +2609,11 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 	if (IS_ERR(fc))
 		return PTR_ERR(fc);
 
-	if (subtype) {
-		fc->subtype = kstrdup(subtype, GFP_KERNEL);
-		if (!fc->subtype)
-			err = -ENOMEM;
-	}
-	if (!err && name) {
-		fc->source = kstrdup(name, GFP_KERNEL);
-		if (!fc->source)
-			err = -ENOMEM;
-	}
+	if (subtype)
+		err = vfs_parse_fs_string(fc, "subtype",
+					  subtype, strlen(subtype));
+	if (!err && name)
+		err = vfs_parse_fs_string(fc, "source", name, strlen(name));
 	if (!err)
 		err = parse_monolithic_mount_data(fc, data);
 	if (!err)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8d578a9e1e8c..cf6e9ea161eb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -62,6 +62,7 @@ struct iov_iter;
 struct fscrypt_info;
 struct fscrypt_operations;
 struct fs_context;
+struct fs_parameter_description;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2175,6 +2176,7 @@ struct file_system_type {
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
+	const struct fs_parameter_description *parameters;
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index d5ff3b0bc28d..d794b04e9fbb 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -92,6 +92,7 @@ struct fs_context {
 
 struct fs_context_operations {
 	void (*free)(struct fs_context *fc);
+	int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
 	int (*parse_monolithic)(struct fs_context *fc, void *data);
 	int (*get_tree)(struct fs_context *fc);
 	int (*reconfigure)(struct fs_context *fc);
@@ -108,6 +109,10 @@ extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
 extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_type,
 						struct dentry *reference);
 
+extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param);
+extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
+			       const char *value, size_t v_size);
+extern int generic_parse_monolithic(struct fs_context *fc, void *data);
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
 
-- 
cgit v1.2.3


From cb50b348c71ffa90d7d1b2a494b553b5099bc090 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Dec 2018 17:25:47 -0500
Subject: convenience helpers: vfs_get_super() and sget_fc()

the former is an analogue of mount_{single,nodev} for use in
->get_tree() instances, the latter - analogue of sget() for the
same.

These are fairly similar to the originals, but the callback signature
for sget_fc() is different from sget() ones, so getting bits and
pieces shared would be too convoluted; we might get around to that
later, but for now let's just remember to keep them in sync.  They
do live next to each other, and changes in either won't be hard
to spot.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c                 | 171 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h         |   4 ++
 include/linux/fs_context.h |  15 ++++
 3 files changed, 190 insertions(+)

(limited to 'include')

diff --git a/fs/super.c b/fs/super.c
index 76b3181c782d..0ebb5c11fa56 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -476,6 +476,94 @@ void generic_shutdown_super(struct super_block *sb)
 
 EXPORT_SYMBOL(generic_shutdown_super);
 
+/**
+ * sget_fc - Find or create a superblock
+ * @fc:	Filesystem context.
+ * @test: Comparison callback
+ * @set: Setup callback
+ *
+ * Find or create a superblock using the parameters stored in the filesystem
+ * context and the two callback functions.
+ *
+ * If an extant superblock is matched, then that will be returned with an
+ * elevated reference count that the caller must transfer or discard.
+ *
+ * If no match is made, a new superblock will be allocated and basic
+ * initialisation will be performed (s_type, s_fs_info and s_id will be set and
+ * the set() callback will be invoked), the superblock will be published and it
+ * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
+ * as yet unset.
+ */
+struct super_block *sget_fc(struct fs_context *fc,
+			    int (*test)(struct super_block *, struct fs_context *),
+			    int (*set)(struct super_block *, struct fs_context *))
+{
+	struct super_block *s = NULL;
+	struct super_block *old;
+	struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
+	int err;
+
+	if (!(fc->sb_flags & SB_KERNMOUNT) &&
+	    fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) {
+		/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+		 * over the namespace.
+		 */
+		if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) {
+			if (!capable(CAP_SYS_ADMIN))
+				return ERR_PTR(-EPERM);
+		} else {
+			if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN))
+				return ERR_PTR(-EPERM);
+		}
+	}
+
+retry:
+	spin_lock(&sb_lock);
+	if (test) {
+		hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
+			if (test(old, fc))
+				goto share_extant_sb;
+		}
+	}
+	if (!s) {
+		spin_unlock(&sb_lock);
+		s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
+		if (!s)
+			return ERR_PTR(-ENOMEM);
+		goto retry;
+	}
+
+	s->s_fs_info = fc->s_fs_info;
+	err = set(s, fc);
+	if (err) {
+		s->s_fs_info = NULL;
+		spin_unlock(&sb_lock);
+		destroy_unused_super(s);
+		return ERR_PTR(err);
+	}
+	fc->s_fs_info = NULL;
+	s->s_type = fc->fs_type;
+	strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+	list_add_tail(&s->s_list, &super_blocks);
+	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
+	spin_unlock(&sb_lock);
+	get_filesystem(s->s_type);
+	register_shrinker_prepared(&s->s_shrink);
+	return s;
+
+share_extant_sb:
+	if (user_ns != old->s_user_ns) {
+		spin_unlock(&sb_lock);
+		destroy_unused_super(s);
+		return ERR_PTR(-EBUSY);
+	}
+	if (!grab_super(old))
+		goto retry;
+	destroy_unused_super(s);
+	return old;
+}
+EXPORT_SYMBOL(sget_fc);
+
 /**
  *	sget_userns -	find or create a superblock
  *	@type:	filesystem type superblock should belong to
@@ -1103,6 +1191,89 @@ struct dentry *mount_ns(struct file_system_type *fs_type,
 
 EXPORT_SYMBOL(mount_ns);
 
+int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
+{
+	return set_anon_super(sb, NULL);
+}
+EXPORT_SYMBOL(set_anon_super_fc);
+
+static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
+{
+	return sb->s_fs_info == fc->s_fs_info;
+}
+
+static int test_single_super(struct super_block *s, struct fs_context *fc)
+{
+	return 1;
+}
+
+/**
+ * vfs_get_super - Get a superblock with a search key set in s_fs_info.
+ * @fc: The filesystem context holding the parameters
+ * @keying: How to distinguish superblocks
+ * @fill_super: Helper to initialise a new superblock
+ *
+ * Search for a superblock and create a new one if not found.  The search
+ * criterion is controlled by @keying.  If the search fails, a new superblock
+ * is created and @fill_super() is called to initialise it.
+ *
+ * @keying can take one of a number of values:
+ *
+ * (1) vfs_get_single_super - Only one superblock of this type may exist on the
+ *     system.  This is typically used for special system filesystems.
+ *
+ * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
+ *     distinct keys (where the key is in s_fs_info).  Searching for the same
+ *     key again will turn up the superblock for that key.
+ *
+ * (3) vfs_get_independent_super - Multiple superblocks may exist and are
+ *     unkeyed.  Each call will get a new superblock.
+ *
+ * A permissions check is made by sget_fc() unless we're getting a superblock
+ * for a kernel-internal mount or a submount.
+ */
+int vfs_get_super(struct fs_context *fc,
+		  enum vfs_get_super_keying keying,
+		  int (*fill_super)(struct super_block *sb,
+				    struct fs_context *fc))
+{
+	int (*test)(struct super_block *, struct fs_context *);
+	struct super_block *sb;
+
+	switch (keying) {
+	case vfs_get_single_super:
+		test = test_single_super;
+		break;
+	case vfs_get_keyed_super:
+		test = test_keyed_super;
+		break;
+	case vfs_get_independent_super:
+		test = NULL;
+		break;
+	default:
+		BUG();
+	}
+
+	sb = sget_fc(fc, test, set_anon_super_fc);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (!sb->s_root) {
+		int err = fill_super(sb, fc);
+		if (err) {
+			deactivate_locked_super(sb);
+			return err;
+		}
+
+		sb->s_flags |= SB_ACTIVE;
+	}
+
+	BUG_ON(fc->root);
+	fc->root = dget(sb->s_root);
+	return 0;
+}
+EXPORT_SYMBOL(vfs_get_super);
+
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cf6e9ea161eb..9d05c128ccf6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2232,8 +2232,12 @@ void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
 void deactivate_locked_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
+int set_anon_super_fc(struct super_block *s, struct fs_context *fc);
 int get_anon_bdev(dev_t *);
 void free_anon_bdev(dev_t);
+struct super_block *sget_fc(struct fs_context *fc,
+			    int (*test)(struct super_block *, struct fs_context *),
+			    int (*set)(struct super_block *, struct fs_context *));
 struct super_block *sget_userns(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index d794b04e9fbb..b1a95db7a111 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -83,11 +83,13 @@ struct fs_context {
 	const char		*source;	/* The source name (eg. dev path) */
 	const char		*subtype;	/* The subtype to set on the superblock */
 	void			*security;	/* Linux S&M options */
+	void			*s_fs_info;	/* Proposed s_fs_info */
 	unsigned int		sb_flags;	/* Proposed superblock flags (SB_*) */
 	unsigned int		sb_flags_mask;	/* Superblock flags that were changed */
 	unsigned int		lsm_flags;	/* Information flags from the fs to the LSM */
 	enum fs_context_purpose	purpose:8;
 	bool			need_free:1;	/* Need to call ops->free() */
+	bool			global:1;	/* Goes into &init_user_ns */
 };
 
 struct fs_context_operations {
@@ -116,6 +118,19 @@ extern int generic_parse_monolithic(struct fs_context *fc, void *data);
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
 
+/*
+ * sget() wrapper to be called from the ->get_tree() op.
+ */
+enum vfs_get_super_keying {
+	vfs_get_single_super,	/* Only one such superblock may exist */
+	vfs_get_keyed_super,	/* Superblocks with different s_fs_info keys may exist */
+	vfs_get_independent_super, /* Multiple independent superblocks may exist */
+};
+extern int vfs_get_super(struct fs_context *fc,
+			 enum vfs_get_super_keying keying,
+			 int (*fill_super)(struct super_block *sb,
+					   struct fs_context *fc));
+
 #define logfc(FC, FMT, ...) pr_notice(FMT, ## __VA_ARGS__)
 
 /**
-- 
cgit v1.2.3


From 0b52075ee62301dd150c9f2c3ddd0035ed894cde Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Dec 2018 16:02:47 -0500
Subject: introduce cloning of fs_context

new primitive: vfs_dup_fs_context().  Comes with fs_context
method (->dup()) for copying the filesystem-specific parts
of fs_context, along with LSM one (->fs_context_dup()) for
doing the same to LSM parts.

[needs better commit message, and change of Author:, anyway]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_context.c            | 67 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs_context.h |  2 ++
 include/linux/lsm_hooks.h  |  7 +++++
 include/linux/security.h   |  6 +++++
 security/security.c        |  5 ++++
 security/selinux/hooks.c   | 39 +++++++++++++++++++++++++++
 security/smack/smack_lsm.c | 49 +++++++++++++++++++++++++++++++++
 7 files changed, 175 insertions(+)

(limited to 'include')

diff --git a/fs/fs_context.c b/fs/fs_context.c
index aa7e0ffb591a..57f61833ac83 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -337,6 +337,47 @@ void fc_drop_locked(struct fs_context *fc)
 
 static void legacy_fs_context_free(struct fs_context *fc);
 
+/**
+ * vfs_dup_fc_config: Duplicate a filesystem context.
+ * @src_fc: The context to copy.
+ */
+struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
+{
+	struct fs_context *fc;
+	int ret;
+
+	if (!src_fc->ops->dup)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL);
+	if (!fc)
+		return ERR_PTR(-ENOMEM);
+
+	fc->fs_private	= NULL;
+	fc->s_fs_info	= NULL;
+	fc->source	= NULL;
+	fc->security	= NULL;
+	get_filesystem(fc->fs_type);
+	get_net(fc->net_ns);
+	get_user_ns(fc->user_ns);
+	get_cred(fc->cred);
+
+	/* Can't call put until we've called ->dup */
+	ret = fc->ops->dup(fc, src_fc);
+	if (ret < 0)
+		goto err_fc;
+
+	ret = security_fs_context_dup(fc, src_fc);
+	if (ret < 0)
+		goto err_fc;
+	return fc;
+
+err_fc:
+	put_fs_context(fc);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(vfs_dup_fs_context);
+
 /**
  * put_fs_context - Dispose of a superblock configuration context.
  * @fc: The context to dispose of.
@@ -380,6 +421,31 @@ static void legacy_fs_context_free(struct fs_context *fc)
 	}
 }
 
+/*
+ * Duplicate a legacy config.
+ */
+static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+	struct legacy_fs_context *ctx;
+	struct legacy_fs_context *src_ctx = src_fc->fs_private;
+
+	ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
+		ctx->legacy_data = kmemdup(src_ctx->legacy_data,
+					   src_ctx->data_size, GFP_KERNEL);
+		if (!ctx->legacy_data) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+	}
+
+	fc->fs_private = ctx;
+	return 0;
+}
+
 /*
  * Add a parameter to a legacy config.  We build up a comma-separated list of
  * options.
@@ -514,6 +580,7 @@ static int legacy_reconfigure(struct fs_context *fc)
 
 const struct fs_context_operations legacy_fs_context_ops = {
 	.free			= legacy_fs_context_free,
+	.dup			= legacy_fs_context_dup,
 	.parse_param		= legacy_parse_param,
 	.parse_monolithic	= legacy_parse_monolithic,
 	.get_tree		= legacy_get_tree,
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index b1a95db7a111..0db0b645c7b8 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -94,6 +94,7 @@ struct fs_context {
 
 struct fs_context_operations {
 	void (*free)(struct fs_context *fc);
+	int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
 	int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
 	int (*parse_monolithic)(struct fs_context *fc, void *data);
 	int (*get_tree)(struct fs_context *fc);
@@ -111,6 +112,7 @@ extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
 extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_type,
 						struct dentry *reference);
 
+extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc);
 extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param);
 extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
 			       const char *value, size_t v_size);
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 47ba4db4d8fb..356e78fe90a8 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -79,6 +79,11 @@
  * Security hooks for mount using fs_context.
  *	[See also Documentation/filesystems/mounting.txt]
  *
+ * @fs_context_dup:
+ *	Allocate and attach a security structure to sc->security.  This pointer
+ *	is initialised to NULL by the caller.
+ *	@fc indicates the new filesystem context.
+ *	@src_fc indicates the original filesystem context.
  * @fs_context_parse_param:
  *	Userspace provided a parameter to configure a superblock.  The LSM may
  *	reject it with an error and may use it for itself, in which case it
@@ -1470,6 +1475,7 @@ union security_list_options {
 	void (*bprm_committing_creds)(struct linux_binprm *bprm);
 	void (*bprm_committed_creds)(struct linux_binprm *bprm);
 
+	int (*fs_context_dup)(struct fs_context *fc, struct fs_context *src_sc);
 	int (*fs_context_parse_param)(struct fs_context *fc, struct fs_parameter *param);
 
 	int (*sb_alloc_security)(struct super_block *sb);
@@ -1813,6 +1819,7 @@ struct security_hook_heads {
 	struct hlist_head bprm_check_security;
 	struct hlist_head bprm_committing_creds;
 	struct hlist_head bprm_committed_creds;
+	struct hlist_head fs_context_dup;
 	struct hlist_head fs_context_parse_param;
 	struct hlist_head sb_alloc_security;
 	struct hlist_head sb_free_security;
diff --git a/include/linux/security.h b/include/linux/security.h
index 2da9336a987e..f28a1ebfd78e 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -223,6 +223,7 @@ int security_bprm_set_creds(struct linux_binprm *bprm);
 int security_bprm_check(struct linux_binprm *bprm);
 void security_bprm_committing_creds(struct linux_binprm *bprm);
 void security_bprm_committed_creds(struct linux_binprm *bprm);
+int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
 int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
 int security_sb_alloc(struct super_block *sb);
 void security_sb_free(struct super_block *sb);
@@ -521,6 +522,11 @@ static inline void security_bprm_committed_creds(struct linux_binprm *bprm)
 {
 }
 
+static inline int security_fs_context_dup(struct fs_context *fc,
+					  struct fs_context *src_fc)
+{
+	return 0;
+}
 static inline int security_fs_context_parse_param(struct fs_context *fc,
 						  struct fs_parameter *param)
 {
diff --git a/security/security.c b/security/security.c
index e5519488327d..5759339319dc 100644
--- a/security/security.c
+++ b/security/security.c
@@ -374,6 +374,11 @@ void security_bprm_committed_creds(struct linux_binprm *bprm)
 	call_void_hook(bprm_committed_creds, bprm);
 }
 
+int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+	return call_int_hook(fs_context_dup, 0, fc, src_fc);
+}
+
 int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	return call_int_hook(fs_context_parse_param, -ENOPARAM, fc, param);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index f99381e97d73..4ba83de5fa80 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2764,6 +2764,44 @@ static int selinux_umount(struct vfsmount *mnt, int flags)
 				   FILESYSTEM__UNMOUNT, NULL);
 }
 
+static int selinux_fs_context_dup(struct fs_context *fc,
+				  struct fs_context *src_fc)
+{
+	const struct selinux_mnt_opts *src = src_fc->security;
+	struct selinux_mnt_opts *opts;
+
+	if (!src)
+		return 0;
+
+	fc->security = kzalloc(sizeof(struct selinux_mnt_opts), GFP_KERNEL);
+	if (!fc->security)
+		return -ENOMEM;
+
+	opts = fc->security;
+
+	if (src->fscontext) {
+		opts->fscontext = kstrdup(src->fscontext, GFP_KERNEL);
+		if (!opts->fscontext)
+			return -ENOMEM;
+	}
+	if (src->context) {
+		opts->context = kstrdup(src->context, GFP_KERNEL);
+		if (!opts->context)
+			return -ENOMEM;
+	}
+	if (src->rootcontext) {
+		opts->rootcontext = kstrdup(src->rootcontext, GFP_KERNEL);
+		if (!opts->rootcontext)
+			return -ENOMEM;
+	}
+	if (src->defcontext) {
+		opts->defcontext = kstrdup(src->defcontext, GFP_KERNEL);
+		if (!opts->defcontext)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
 static const struct fs_parameter_spec selinux_param_specs[] = {
 	fsparam_string(CONTEXT_STR,	Opt_context),
 	fsparam_string(DEFCONTEXT_STR,	Opt_defcontext),
@@ -6745,6 +6783,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
 	LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),
 
+	LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup),
 	LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param),
 
 	LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 5f93c4f84384..03176f600a87 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -647,6 +647,54 @@ out_opt_err:
 	return -EINVAL;
 }
 
+/**
+ * smack_fs_context_dup - Duplicate the security data on fs_context duplication
+ * @fc: The new filesystem context.
+ * @src_fc: The source filesystem context being duplicated.
+ *
+ * Returns 0 on success or -ENOMEM on error.
+ */
+static int smack_fs_context_dup(struct fs_context *fc,
+				struct fs_context *src_fc)
+{
+	struct smack_mnt_opts *dst, *src = src_fc->security;
+
+	if (!src)
+		return 0;
+
+	fc->security = kzalloc(sizeof(struct smack_mnt_opts), GFP_KERNEL);
+	if (!fc->security)
+		return -ENOMEM;
+	dst = fc->security;
+
+	if (src->fsdefault) {
+		dst->fsdefault = kstrdup(src->fsdefault, GFP_KERNEL);
+		if (!dst->fsdefault)
+			return -ENOMEM;
+	}
+	if (src->fsfloor) {
+		dst->fsfloor = kstrdup(src->fsfloor, GFP_KERNEL);
+		if (!dst->fsfloor)
+			return -ENOMEM;
+	}
+	if (src->fshat) {
+		dst->fshat = kstrdup(src->fshat, GFP_KERNEL);
+		if (!dst->fshat)
+			return -ENOMEM;
+	}
+	if (src->fsroot) {
+		dst->fsroot = kstrdup(src->fsroot, GFP_KERNEL);
+		if (!dst->fsroot)
+			return -ENOMEM;
+	}
+	if (src->fstransmute) {
+		dst->fstransmute = kstrdup(src->fstransmute, GFP_KERNEL);
+		if (!dst->fstransmute)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
 static const struct fs_parameter_spec smack_param_specs[] = {
 	fsparam_string("fsdefault",	Opt_fsdefault),
 	fsparam_string("fsfloor",	Opt_fsfloor),
@@ -4626,6 +4674,7 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
 	LSM_HOOK_INIT(syslog, smack_syslog),
 
+	LSM_HOOK_INIT(fs_context_dup, smack_fs_context_dup),
 	LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param),
 
 	LSM_HOOK_INIT(sb_alloc_security, smack_sb_alloc_security),
-- 
cgit v1.2.3


From 23bf1b6be9c291a7130118dcc7384f72ac04d813 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:26 +0000
Subject: kernfs, sysfs, cgroup, intel_rdt: Support fs_context

Make kernfs support superblock creation/mount/remount with fs_context.

This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.

Notes:

 (1) A kernfs_fs_context struct is created to wrap fs_context and the
     kernfs mount parameters are moved in here (or are in fs_context).

 (2) kernfs_mount{,_ns}() are made into kernfs_get_tree().  The extra
     namespace tag parameter is passed in the context if desired

 (3) kernfs_free_fs_context() is provided as a destructor for the
     kernfs_fs_context struct, but for the moment it does nothing except
     get called in the right places.

 (4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
     pass, but possibly this should be done anyway in case someone wants to
     add a parameter in future.

 (5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
     the cgroup v1 and v2 mount parameters are all moved there.

 (6) cgroup1 parameter parsing error messages are now handled by invalf(),
     which allows userspace to collect them directly.

 (7) cgroup1 parameter cleanup is now done in the context destructor rather
     than in the mount/get_tree and remount functions.

Weirdies:

 (*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
     but then uses the resulting pointer after dropping the locks.  I'm
     told this is okay and needs commenting.

 (*) The cgroup refcount web.  This really needs documenting.

 (*) cgroup2 only has one root?

Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.

[folded a leak fix from Andrey Vagin]

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/kernel/cpu/resctrl/internal.h |  16 +++
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 185 +++++++++++++++++++++------------
 fs/kernfs/kernfs-internal.h            |   1 +
 fs/kernfs/mount.c                      |  89 +++++++---------
 fs/sysfs/mount.c                       |  73 +++++++++----
 include/linux/kernfs.h                 |  38 +++----
 kernel/cgroup/cgroup-internal.h        |   5 +-
 kernel/cgroup/cgroup.c                 |  31 +++---
 8 files changed, 262 insertions(+), 176 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 822b7db634ee..e49b77283924 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -4,6 +4,7 @@
 
 #include <linux/sched.h>
 #include <linux/kernfs.h>
+#include <linux/fs_context.h>
 #include <linux/jump_label.h>
 
 #define MSR_IA32_L3_QOS_CFG		0xc81
@@ -40,6 +41,21 @@
 #define RMID_VAL_ERROR			BIT_ULL(63)
 #define RMID_VAL_UNAVAIL		BIT_ULL(62)
 
+
+struct rdt_fs_context {
+	struct kernfs_fs_context	kfc;
+	bool				enable_cdpl2;
+	bool				enable_cdpl3;
+	bool				enable_mba_mbps;
+};
+
+static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
+{
+	struct kernfs_fs_context *kfc = fc->fs_private;
+
+	return container_of(kfc, struct rdt_fs_context, kfc);
+}
+
 DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
 
 /**
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 8388adf241b2..399601eda8e4 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -24,6 +24,7 @@
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
 #include <linux/fs.h>
+#include <linux/fs_parser.h>
 #include <linux/sysfs.h>
 #include <linux/kernfs.h>
 #include <linux/seq_buf.h>
@@ -32,6 +33,7 @@
 #include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/task_work.h>
+#include <linux/user_namespace.h>
 
 #include <uapi/linux/magic.h>
 
@@ -1858,46 +1860,6 @@ static void cdp_disable_all(void)
 		cdpl2_disable();
 }
 
-static int parse_rdtgroupfs_options(char *data)
-{
-	char *token, *o = data;
-	int ret = 0;
-
-	while ((token = strsep(&o, ",")) != NULL) {
-		if (!*token) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		if (!strcmp(token, "cdp")) {
-			ret = cdpl3_enable();
-			if (ret)
-				goto out;
-		} else if (!strcmp(token, "cdpl2")) {
-			ret = cdpl2_enable();
-			if (ret)
-				goto out;
-		} else if (!strcmp(token, "mba_MBps")) {
-			if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-				ret = set_mba_sc(true);
-			else
-				ret = -EINVAL;
-			if (ret)
-				goto out;
-		} else {
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	return 0;
-
-out:
-	pr_err("Invalid mount option \"%s\"\n", token);
-
-	return ret;
-}
-
 /*
  * We don't allow rdtgroup directories to be created anywhere
  * except the root directory. Thus when looking for the rdtgroup
@@ -1969,13 +1931,27 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
 			     struct rdtgroup *prgrp,
 			     struct kernfs_node **mon_data_kn);
 
-static struct dentry *rdt_mount(struct file_system_type *fs_type,
-				int flags, const char *unused_dev_name,
-				void *data)
+static int rdt_enable_ctx(struct rdt_fs_context *ctx)
+{
+	int ret = 0;
+
+	if (ctx->enable_cdpl2)
+		ret = cdpl2_enable();
+
+	if (!ret && ctx->enable_cdpl3)
+		ret = cdpl3_enable();
+
+	if (!ret && ctx->enable_mba_mbps)
+		ret = set_mba_sc(true);
+
+	return ret;
+}
+
+static int rdt_get_tree(struct fs_context *fc)
 {
+	struct rdt_fs_context *ctx = rdt_fc2context(fc);
 	struct rdt_domain *dom;
 	struct rdt_resource *r;
-	struct dentry *dentry;
 	int ret;
 
 	cpus_read_lock();
@@ -1984,53 +1960,42 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
 	 * resctrl file system can only be mounted once.
 	 */
 	if (static_branch_unlikely(&rdt_enable_key)) {
-		dentry = ERR_PTR(-EBUSY);
+		ret = -EBUSY;
 		goto out;
 	}
 
-	ret = parse_rdtgroupfs_options(data);
-	if (ret) {
-		dentry = ERR_PTR(ret);
+	ret = rdt_enable_ctx(ctx);
+	if (ret < 0)
 		goto out_cdp;
-	}
 
 	closid_init();
 
 	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
-	if (ret) {
-		dentry = ERR_PTR(ret);
-		goto out_cdp;
-	}
+	if (ret < 0)
+		goto out_mba;
 
 	if (rdt_mon_capable) {
 		ret = mongroup_create_dir(rdtgroup_default.kn,
 					  NULL, "mon_groups",
 					  &kn_mongrp);
-		if (ret) {
-			dentry = ERR_PTR(ret);
+		if (ret < 0)
 			goto out_info;
-		}
 		kernfs_get(kn_mongrp);
 
 		ret = mkdir_mondata_all(rdtgroup_default.kn,
 					&rdtgroup_default, &kn_mondata);
-		if (ret) {
-			dentry = ERR_PTR(ret);
+		if (ret < 0)
 			goto out_mongrp;
-		}
 		kernfs_get(kn_mondata);
 		rdtgroup_default.mon.mon_data_kn = kn_mondata;
 	}
 
 	ret = rdt_pseudo_lock_init();
-	if (ret) {
-		dentry = ERR_PTR(ret);
+	if (ret)
 		goto out_mondata;
-	}
 
-	dentry = kernfs_mount(fs_type, flags, rdt_root,
-			      RDTGROUP_SUPER_MAGIC, NULL);
-	if (IS_ERR(dentry))
+	ret = kernfs_get_tree(fc);
+	if (ret < 0)
 		goto out_psl;
 
 	if (rdt_alloc_capable)
@@ -2059,14 +2024,95 @@ out_mongrp:
 		kernfs_remove(kn_mongrp);
 out_info:
 	kernfs_remove(kn_info);
+out_mba:
+	if (ctx->enable_mba_mbps)
+		set_mba_sc(false);
 out_cdp:
 	cdp_disable_all();
 out:
 	rdt_last_cmd_clear();
 	mutex_unlock(&rdtgroup_mutex);
 	cpus_read_unlock();
+	return ret;
+}
+
+enum rdt_param {
+	Opt_cdp,
+	Opt_cdpl2,
+	Opt_mba_mpbs,
+	nr__rdt_params
+};
+
+static const struct fs_parameter_spec rdt_param_specs[] = {
+	fsparam_flag("cdp",		Opt_cdp),
+	fsparam_flag("cdpl2",		Opt_cdpl2),
+	fsparam_flag("mba_mpbs",	Opt_mba_mpbs),
+	{}
+};
+
+static const struct fs_parameter_description rdt_fs_parameters = {
+	.name		= "rdt",
+	.specs		= rdt_param_specs,
+};
+
+static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct rdt_fs_context *ctx = rdt_fc2context(fc);
+	struct fs_parse_result result;
+	int opt;
+
+	opt = fs_parse(fc, &rdt_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
 
-	return dentry;
+	switch (opt) {
+	case Opt_cdp:
+		ctx->enable_cdpl3 = true;
+		return 0;
+	case Opt_cdpl2:
+		ctx->enable_cdpl2 = true;
+		return 0;
+	case Opt_mba_mpbs:
+		if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+			return -EINVAL;
+		ctx->enable_mba_mbps = true;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static void rdt_fs_context_free(struct fs_context *fc)
+{
+	struct rdt_fs_context *ctx = rdt_fc2context(fc);
+
+	kernfs_free_fs_context(fc);
+	kfree(ctx);
+}
+
+static const struct fs_context_operations rdt_fs_context_ops = {
+	.free		= rdt_fs_context_free,
+	.parse_param	= rdt_parse_param,
+	.get_tree	= rdt_get_tree,
+};
+
+static int rdt_init_fs_context(struct fs_context *fc)
+{
+	struct rdt_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->kfc.root = rdt_root;
+	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
+	fc->fs_private = &ctx->kfc;
+	fc->ops = &rdt_fs_context_ops;
+	if (fc->user_ns)
+		put_user_ns(fc->user_ns);
+	fc->user_ns = get_user_ns(&init_user_ns);
+	fc->global = true;
+	return 0;
 }
 
 static int reset_all_ctrls(struct rdt_resource *r)
@@ -2239,9 +2285,10 @@ static void rdt_kill_sb(struct super_block *sb)
 }
 
 static struct file_system_type rdt_fs_type = {
-	.name    = "resctrl",
-	.mount   = rdt_mount,
-	.kill_sb = rdt_kill_sb,
+	.name			= "resctrl",
+	.init_fs_context	= rdt_init_fs_context,
+	.parameters		= &rdt_fs_parameters,
+	.kill_sb		= rdt_kill_sb,
 };
 
 static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3d83b114bb08..379e3a9eb1ec 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -17,6 +17,7 @@
 #include <linux/xattr.h>
 
 #include <linux/kernfs.h>
+#include <linux/fs_context.h>
 
 struct kernfs_iattrs {
 	struct iattr		ia_iattr;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 4d303047a4f8..36376cc5c9c2 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -22,16 +22,6 @@
 
 struct kmem_cache *kernfs_node_cache;
 
-static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-	struct kernfs_root *root = kernfs_info(sb)->root;
-	struct kernfs_syscall_ops *scops = root->syscall_ops;
-
-	if (scops && scops->remount_fs)
-		return scops->remount_fs(root, flags, data);
-	return 0;
-}
-
 static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 {
 	struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry));
@@ -60,7 +50,6 @@ const struct super_operations kernfs_sops = {
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= kernfs_evict_inode,
 
-	.remount_fs	= kernfs_sop_remount_fs,
 	.show_options	= kernfs_sop_show_options,
 	.show_path	= kernfs_sop_show_path,
 };
@@ -222,7 +211,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
 	} while (true);
 }
 
-static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
+static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
 {
 	struct kernfs_super_info *info = kernfs_info(sb);
 	struct inode *inode;
@@ -233,7 +222,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
-	sb->s_magic = magic;
+	sb->s_magic = kfc->magic;
 	sb->s_op = &kernfs_sops;
 	sb->s_xattr = kernfs_xattr_handlers;
 	if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP)
@@ -263,21 +252,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	return 0;
 }
 
-static int kernfs_test_super(struct super_block *sb, void *data)
+static int kernfs_test_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct kernfs_super_info *sb_info = kernfs_info(sb);
-	struct kernfs_super_info *info = data;
+	struct kernfs_super_info *info = fc->s_fs_info;
 
 	return sb_info->root == info->root && sb_info->ns == info->ns;
 }
 
-static int kernfs_set_super(struct super_block *sb, void *data)
+static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
 {
-	int error;
-	error = set_anon_super(sb, data);
-	if (!error)
-		sb->s_fs_info = data;
-	return error;
+	struct kernfs_fs_context *kfc = fc->fs_private;
+
+	kfc->ns_tag = NULL;
+	return set_anon_super_fc(sb, fc);
 }
 
 /**
@@ -294,63 +282,60 @@ const void *kernfs_super_ns(struct super_block *sb)
 }
 
 /**
- * kernfs_mount_ns - kernfs mount helper
- * @fs_type: file_system_type of the fs being mounted
- * @flags: mount flags specified for the mount
- * @root: kernfs_root of the hierarchy being mounted
- * @magic: file system specific magic number
- * @new_sb_created: tell the caller if we allocated a new superblock
- * @ns: optional namespace tag of the mount
+ * kernfs_get_tree - kernfs filesystem access/retrieval helper
+ * @fc: The filesystem context.
  *
- * This is to be called from each kernfs user's file_system_type->mount()
- * implementation, which should pass through the specified @fs_type and
- * @flags, and specify the hierarchy and namespace tag to mount via @root
- * and @ns, respectively.
- *
- * The return value can be passed to the vfs layer verbatim.
+ * This is to be called from each kernfs user's fs_context->ops->get_tree()
+ * implementation, which should set the specified ->@fs_type and ->@flags, and
+ * specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
+ * respectively.
  */
-struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
-				struct kernfs_root *root, unsigned long magic,
-				bool *new_sb_created, const void *ns)
+int kernfs_get_tree(struct fs_context *fc)
 {
+	struct kernfs_fs_context *kfc = fc->fs_private;
 	struct super_block *sb;
 	struct kernfs_super_info *info;
 	int error;
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
-	info->root = root;
-	info->ns = ns;
+	info->root = kfc->root;
+	info->ns = kfc->ns_tag;
 	INIT_LIST_HEAD(&info->node);
 
-	sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
-			 &init_user_ns, info);
-	if (IS_ERR(sb) || sb->s_fs_info != info)
-		kfree(info);
+	fc->s_fs_info = info;
+	sb = sget_fc(fc, kernfs_test_super, kernfs_set_super);
 	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-
-	if (new_sb_created)
-		*new_sb_created = !sb->s_root;
+		return PTR_ERR(sb);
 
 	if (!sb->s_root) {
 		struct kernfs_super_info *info = kernfs_info(sb);
 
-		error = kernfs_fill_super(sb, magic);
+		kfc->new_sb_created = true;
+
+		error = kernfs_fill_super(sb, kfc);
 		if (error) {
 			deactivate_locked_super(sb);
-			return ERR_PTR(error);
+			return error;
 		}
 		sb->s_flags |= SB_ACTIVE;
 
 		mutex_lock(&kernfs_mutex);
-		list_add(&info->node, &root->supers);
+		list_add(&info->node, &info->root->supers);
 		mutex_unlock(&kernfs_mutex);
 	}
 
-	return dget(sb->s_root);
+	fc->root = dget(sb->s_root);
+	return 0;
+}
+
+void kernfs_free_fs_context(struct fs_context *fc)
+{
+	/* Note that we don't deal with kfc->ns_tag here. */
+	kfree(fc->s_fs_info);
+	fc->s_fs_info = NULL;
 }
 
 /**
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 92682fcc41f6..4cb21b558a85 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,34 +13,69 @@
 #include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/fs_context.h>
+#include <net/net_namespace.h>
 
 #include "sysfs.h"
 
 static struct kernfs_root *sysfs_root;
 struct kernfs_node *sysfs_root_kn;
 
-static struct dentry *sysfs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysfs_get_tree(struct fs_context *fc)
 {
-	struct dentry *root;
-	void *ns;
-	bool new_sb = false;
+	struct kernfs_fs_context *kfc = fc->fs_private;
+	int ret;
 
-	if (!(flags & SB_KERNMOUNT)) {
+	ret = kernfs_get_tree(fc);
+	if (ret)
+		return ret;
+
+	if (kfc->new_sb_created)
+		fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+	return 0;
+}
+
+static void sysfs_fs_context_free(struct fs_context *fc)
+{
+	struct kernfs_fs_context *kfc = fc->fs_private;
+
+	if (kfc->ns_tag)
+		kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag);
+	kernfs_free_fs_context(fc);
+	kfree(kfc);
+}
+
+static const struct fs_context_operations sysfs_fs_context_ops = {
+	.free		= sysfs_fs_context_free,
+	.get_tree	= sysfs_get_tree,
+};
+
+static int sysfs_init_fs_context(struct fs_context *fc)
+{
+	struct kernfs_fs_context *kfc;
+	struct net *netns;
+
+	if (!(fc->sb_flags & SB_KERNMOUNT)) {
 		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
-			return ERR_PTR(-EPERM);
+			return -EPERM;
 	}
 
-	ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
-	root = kernfs_mount_ns(fs_type, flags, sysfs_root,
-				SYSFS_MAGIC, &new_sb, ns);
-	if (!new_sb)
-		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
-	else if (!IS_ERR(root))
-		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+	kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL);
+	if (!kfc)
+		return -ENOMEM;
 
-	return root;
+	kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+	kfc->root = sysfs_root;
+	kfc->magic = SYSFS_MAGIC;
+	fc->fs_private = kfc;
+	fc->ops = &sysfs_fs_context_ops;
+	if (fc->user_ns)
+		put_user_ns(fc->user_ns);
+	fc->user_ns = get_user_ns(netns->user_ns);
+	fc->global = true;
+	return 0;
 }
 
 static void sysfs_kill_sb(struct super_block *sb)
@@ -52,10 +87,10 @@ static void sysfs_kill_sb(struct super_block *sb)
 }
 
 static struct file_system_type sysfs_fs_type = {
-	.name		= "sysfs",
-	.mount		= sysfs_mount,
-	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.name			= "sysfs",
+	.init_fs_context	= sysfs_init_fs_context,
+	.kill_sb		= sysfs_kill_sb,
+	.fs_flags		= FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 44acb4c3659c..822a64e65b41 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -25,7 +25,9 @@ struct seq_file;
 struct vm_area_struct;
 struct super_block;
 struct file_system_type;
+struct fs_context;
 
+struct kernfs_fs_context;
 struct kernfs_open_node;
 struct kernfs_iattrs;
 
@@ -167,7 +169,6 @@ struct kernfs_node {
  * kernfs_node parameter.
  */
 struct kernfs_syscall_ops {
-	int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
 	int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
 
 	int (*mkdir)(struct kernfs_node *parent, const char *name,
@@ -268,6 +269,18 @@ struct kernfs_ops {
 #endif
 };
 
+/*
+ * The kernfs superblock creation/mount parameter context.
+ */
+struct kernfs_fs_context {
+	struct kernfs_root	*root;		/* Root of the hierarchy being mounted */
+	void			*ns_tag;	/* Namespace tag of the mount (or NULL) */
+	unsigned long		magic;		/* File system specific magic number */
+
+	/* The following are set/used by kernfs_mount() */
+	bool			new_sb_created;	/* Set to T if we allocated a new sb */
+};
+
 #ifdef CONFIG_KERNFS
 
 static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
@@ -353,9 +366,8 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
 void kernfs_notify(struct kernfs_node *kn);
 
 const void *kernfs_super_ns(struct super_block *sb);
-struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
-			       struct kernfs_root *root, unsigned long magic,
-			       bool *new_sb_created, const void *ns);
+int kernfs_get_tree(struct fs_context *fc);
+void kernfs_free_fs_context(struct fs_context *fc);
 void kernfs_kill_sb(struct super_block *sb);
 
 void kernfs_init(void);
@@ -458,11 +470,10 @@ static inline void kernfs_notify(struct kernfs_node *kn) { }
 static inline const void *kernfs_super_ns(struct super_block *sb)
 { return NULL; }
 
-static inline struct dentry *
-kernfs_mount_ns(struct file_system_type *fs_type, int flags,
-		struct kernfs_root *root, unsigned long magic,
-		bool *new_sb_created, const void *ns)
-{ return ERR_PTR(-ENOSYS); }
+static inline int kernfs_get_tree(struct fs_context *fc)
+{ return -ENOSYS; }
+
+static inline void kernfs_free_fs_context(struct fs_context *fc) { }
 
 static inline void kernfs_kill_sb(struct super_block *sb) { }
 
@@ -545,13 +556,4 @@ static inline int kernfs_rename(struct kernfs_node *kn,
 	return kernfs_rename_ns(kn, new_parent, new_name, NULL);
 }
 
-static inline struct dentry *
-kernfs_mount(struct file_system_type *fs_type, int flags,
-		struct kernfs_root *root, unsigned long magic,
-		bool *new_sb_created)
-{
-	return kernfs_mount_ns(fs_type, flags, root,
-				magic, new_sb_created, NULL);
-}
-
 #endif	/* __LINUX_KERNFS_H */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 37cf709b7a0e..30e39f3932ad 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -41,6 +41,7 @@ extern void __init enable_debug_cgroup(void);
  * The cgroup filesystem superblock creation/mount context.
  */
 struct cgroup_fs_context {
+	struct kernfs_fs_context kfc;
 	struct cgroup_root	*root;
 	struct cgroup_namespace	*ns;
 	unsigned int	flags;			/* CGRP_ROOT_* flags */
@@ -56,7 +57,9 @@ struct cgroup_fs_context {
 
 static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
 {
-	return fc->fs_private;
+	struct kernfs_fs_context *kfc = fc->fs_private;
+
+	return container_of(kfc, struct cgroup_fs_context, kfc);
 }
 
 /*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0c6bef234a7c..747e5b17f9da 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2039,18 +2039,14 @@ out:
 int cgroup_do_get_tree(struct fs_context *fc)
 {
 	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
-	bool new_sb = false;
-	unsigned long magic;
-	int ret = 0;
+	int ret;
 
+	ctx->kfc.root = ctx->root->kf_root;
 	if (fc->fs_type == &cgroup2_fs_type)
-		magic = CGROUP2_SUPER_MAGIC;
+		ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
 	else
-		magic = CGROUP_SUPER_MAGIC;
-	fc->root = kernfs_mount(fc->fs_type, fc->sb_flags, ctx->root->kf_root,
-				magic, &new_sb);
-	if (IS_ERR(fc->root))
-		ret = PTR_ERR(fc->root);
+		ctx->kfc.magic = CGROUP_SUPER_MAGIC;
+	ret = kernfs_get_tree(fc);
 
 	/*
 	 * In non-init cgroup namespace, instead of root cgroup's dentry,
@@ -2078,7 +2074,7 @@ int cgroup_do_get_tree(struct fs_context *fc)
 		}
 	}
 
-	if (!new_sb)
+	if (!ctx->kfc.new_sb_created)
 		cgroup_put(&ctx->root->cgrp);
 
 	return ret;
@@ -2094,19 +2090,15 @@ static void cgroup_fs_context_free(struct fs_context *fc)
 	kfree(ctx->name);
 	kfree(ctx->release_agent);
 	put_cgroup_ns(ctx->ns);
+	kernfs_free_fs_context(fc);
 	kfree(ctx);
 }
 
 static int cgroup_get_tree(struct fs_context *fc)
 {
-	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
 	int ret;
 
-	/* Check if the caller has permission to mount. */
-	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
-
 	cgrp_dfl_visible = true;
 	cgroup_get_live(&cgrp_dfl_root.cgrp);
 	ctx->root = &cgrp_dfl_root;
@@ -2132,7 +2124,8 @@ static const struct fs_context_operations cgroup1_fs_context_ops = {
 };
 
 /*
- * Initialise the cgroup filesystem creation/reconfiguration context.
+ * Initialise the cgroup filesystem creation/reconfiguration context.  Notably,
+ * we select the namespace we're going to use.
  */
 static int cgroup_init_fs_context(struct fs_context *fc)
 {
@@ -2151,11 +2144,15 @@ static int cgroup_init_fs_context(struct fs_context *fc)
 
 	ctx->ns = current->nsproxy->cgroup_ns;
 	get_cgroup_ns(ctx->ns);
-	fc->fs_private = ctx;
+	fc->fs_private = &ctx->kfc;
 	if (fc->fs_type == &cgroup2_fs_type)
 		fc->ops = &cgroup_fs_context_ops;
 	else
 		fc->ops = &cgroup1_fs_context_ops;
+	if (fc->user_ns)
+		put_user_ns(fc->user_ns);
+	fc->user_ns = get_user_ns(ctx->ns->user_ns);
+	fc->global = true;
 	return 0;
 }
 
-- 
cgit v1.2.3


From d911b4585eb3501f752160e8e0f1bb00c3c7c4e5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:26 +0000
Subject: vfs: Remove kern_mount_data()

The kern_mount_data() isn't used any more so remove it.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c     | 6 +++---
 include/linux/fs.h | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1a1ed2528f47..bb9b7db1c66c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3390,10 +3390,10 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	free_mnt_ns(ns);
 }
 
-struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+struct vfsmount *kern_mount(struct file_system_type *type)
 {
 	struct vfsmount *mnt;
-	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
+	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
 	if (!IS_ERR(mnt)) {
 		/*
 		 * it is a longterm mount, don't release mnt until
@@ -3403,7 +3403,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
 	}
 	return mnt;
 }
-EXPORT_SYMBOL_GPL(kern_mount_data);
+EXPORT_SYMBOL_GPL(kern_mount);
 
 void kern_unmount(struct vfsmount *mnt)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9d05c128ccf6..3e85cb8e8c20 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2280,8 +2280,7 @@ mount_pseudo(struct file_system_type *fs_type, char *name,
 
 extern int register_filesystem(struct file_system_type *);
 extern int unregister_filesystem(struct file_system_type *);
-extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
-#define kern_mount(type) kern_mount_data(type, NULL)
+extern struct vfsmount *kern_mount(struct file_system_type *);
 extern void kern_unmount(struct vfsmount *mnt);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
-- 
cgit v1.2.3


From e7582e16a170db4c85995c1c03d194ea1ea621fc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:26 +0000
Subject: vfs: Implement logging through fs_context

Implement the ability for filesystems to log error, warning and
informational messages through the fs_context.  In the future, these will
be extractable by userspace by reading from an fd created by the fsopen()
syscall.

Error messages are prefixed with "e ", warnings with "w " and informational
messages with "i ".

In the future, inside the kernel, formatted messages will be malloc'd but
unformatted messages will not copied if they're either in the core .rodata
section or in the .rodata section of the filesystem module pinned by
fs_context::fs_type.  The messages will only be good till the fs_type is
released.

Note that the logging object will be shared between duplicated fs_context
structures.  This is so that such as NFS which do a mount within a mount
can get at least some of the errors from the inner mount.

Five logging functions are provided for this:

 (1) void logfc(struct fs_context *fc, const char *fmt, ...);

     This logs a message into the context.  If the buffer is full, the
     earliest message is discarded.

 (2) void errorf(fc, fmt, ...);

     This wraps logfc() to log an error.

 (3) void invalf(fc, fmt, ...);

     This wraps errorf() and returns -EINVAL for convenience.

 (4) void warnf(fc, fmt, ...);

     This wraps logfc() to log a warning.

 (5) void infof(fc, fmt, ...);

     This wraps logfc() to log an informational message.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs_context.c            | 30 ++++++++++++++++++++++++++++++
 include/linux/fs_context.h | 18 ++++++++++++++----
 2 files changed, 44 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/fs_context.c b/fs/fs_context.c
index 57f61833ac83..87e3546b9a52 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -378,6 +378,36 @@ err_fc:
 }
 EXPORT_SYMBOL(vfs_dup_fs_context);
 
+#ifdef CONFIG_PRINTK
+/**
+ * logfc - Log a message to a filesystem context
+ * @fc: The filesystem context to log to.
+ * @fmt: The format of the buffer.
+ */
+void logfc(struct fs_context *fc, const char *fmt, ...)
+{
+	va_list va;
+
+	va_start(va, fmt);
+
+	switch (fmt[0]) {
+	case 'w':
+		vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va);
+		break;
+	case 'e':
+		vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va);
+		break;
+	default:
+		vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va);
+		break;
+	}
+
+	pr_cont("\n");
+	va_end(va);
+}
+EXPORT_SYMBOL(logfc);
+#endif
+
 /**
  * put_fs_context - Dispose of a superblock configuration context.
  * @fc: The context to dispose of.
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 0db0b645c7b8..eaca452088fa 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -133,7 +133,17 @@ extern int vfs_get_super(struct fs_context *fc,
 			 int (*fill_super)(struct super_block *sb,
 					   struct fs_context *fc));
 
-#define logfc(FC, FMT, ...) pr_notice(FMT, ## __VA_ARGS__)
+extern const struct file_operations fscontext_fops;
+
+#ifdef CONFIG_PRINTK
+extern __attribute__((format(printf, 2, 3)))
+void logfc(struct fs_context *fc, const char *fmt, ...);
+#else
+static inline __attribute__((format(printf, 2, 3)))
+void logfc(struct fs_context *fc, const char *fmt, ...)
+{
+}
+#endif
 
 /**
  * infof - Store supplementary informational message
@@ -143,7 +153,7 @@ extern int vfs_get_super(struct fs_context *fc,
  * Store the supplementary informational message for the process if the process
  * has enabled the facility.
  */
-#define infof(fc, fmt, ...) ({ logfc(fc, fmt, ## __VA_ARGS__); })
+#define infof(fc, fmt, ...) ({ logfc(fc, "i "fmt, ## __VA_ARGS__); })
 
 /**
  * warnf - Store supplementary warning message
@@ -153,7 +163,7 @@ extern int vfs_get_super(struct fs_context *fc,
  * Store the supplementary warning message for the process if the process has
  * enabled the facility.
  */
-#define warnf(fc, fmt, ...) ({ logfc(fc, fmt, ## __VA_ARGS__); })
+#define warnf(fc, fmt, ...) ({ logfc(fc, "w "fmt, ## __VA_ARGS__); })
 
 /**
  * errorf - Store supplementary error message
@@ -163,7 +173,7 @@ extern int vfs_get_super(struct fs_context *fc,
  * Store the supplementary error message for the process if the process has
  * enabled the facility.
  */
-#define errorf(fc, fmt, ...) ({ logfc(fc, fmt, ## __VA_ARGS__); })
+#define errorf(fc, fmt, ...) ({ logfc(fc, "e "fmt, ## __VA_ARGS__); })
 
 /**
  * invalf - Store supplementary invalid argument error message
-- 
cgit v1.2.3


From fff42928ade591969836ff49888d063b829ac888 Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Wed, 27 Feb 2019 11:26:46 -0800
Subject: PCI/ATS: Add inline to pci_prg_resp_pasid_required()

Fix unused function warning when compiled with CONFIG_PCI_PASID
disabled.

Fixes: e5567f5f6762 ("PCI/ATS: Add pci_prg_resp_pasid_required() interface.")
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/pci-ats.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index facfd6a18fe1..1ebb88e7c184 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -67,7 +67,7 @@ static inline int pci_max_pasids(struct pci_dev *pdev)
 	return -EINVAL;
 }
 
-static int pci_prg_resp_pasid_required(struct pci_dev *pdev)
+static inline int pci_prg_resp_pasid_required(struct pci_dev *pdev)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 594b9a89af8e7629e95a4cd844d188361be32790 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 27 Feb 2019 20:40:13 +0800
Subject: block: introduce mp_bvec_for_each_page() for iterating over page

mp_bvec_for_each_segment() is a bit big for the iteration, so introduce
a light-weight helper for iterating over pages, then 32bytes stack
space can be saved.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bvec.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 4376f683c08a..87e82e503a52 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -188,4 +188,9 @@ static inline void mp_bvec_last_segment(const struct bio_vec *bvec,
 	}
 }
 
+#define mp_bvec_for_each_page(pg, bv, i)				\
+	for (i = (bv)->bv_offset / PAGE_SIZE;				\
+		(i <= (((bv)->bv_offset + (bv)->bv_len - 1) / PAGE_SIZE)) && \
+		(pg = bvec_nth_page((bv)->bv_page, i)); i += 1)
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
cgit v1.2.3


From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 7 Jan 2019 10:46:33 -0700
Subject: Add io_uring IO interface

The submission queue (SQ) and completion queue (CQ) rings are shared
between the application and the kernel. This eliminates the need to
copy data back and forth to submit and complete IO.

IO submissions use the io_uring_sqe data structure, and completions
are generated in the form of io_uring_cqe data structures. The SQ
ring is an index into the io_uring_sqe array, which makes it possible
to submit a batch of IOs without them being contiguous in the ring.
The CQ ring is always contiguous, as completion events are inherently
unordered, and hence any io_uring_cqe entry can point back to an
arbitrary submission.

Two new system calls are added for this:

io_uring_setup(entries, params)
	Sets up an io_uring instance for doing async IO. On success,
	returns a file descriptor that the application can mmap to
	gain access to the SQ ring, CQ ring, and io_uring_sqes.

io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize)
	Initiates IO against the rings mapped to this fd, or waits for
	them to complete, or both. The behavior is controlled by the
	parameters passed in. If 'to_submit' is non-zero, then we'll
	try and submit new IO. If IORING_ENTER_GETEVENTS is set, the
	kernel will wait for 'min_complete' events, if they aren't
	already available. It's valid to set IORING_ENTER_GETEVENTS
	and 'min_complete' == 0 at the same time, this allows the
	kernel to return already completed events without waiting
	for them. This is useful only for polling, as for IRQ
	driven IO, the application can just check the CQ ring
	without entering the kernel.

With this setup, it's possible to do async IO with a single system
call. Future developments will enable polled IO with this interface,
and polled submission as well. The latter will enable an application
to do IO without doing ANY system calls at all.

For IRQ driven IO, an application only needs to enter the kernel for
completions if it wants to wait for them to occur.

Each io_uring is backed by a workqueue, to support buffered async IO
as well. We will only punt to an async context if the command would
need to wait for IO on the device side. Any data that can be accessed
directly in the page cache is done inline. This avoids the slowness
issue of usual threadpools, since cached data is accessed as quickly
as a sync interface.

Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 arch/x86/entry/syscalls/syscall_32.tbl |    2 +
 arch/x86/entry/syscalls/syscall_64.tbl |    2 +
 fs/Makefile                            |    1 +
 fs/io_uring.c                          | 1255 ++++++++++++++++++++++++++++++++
 include/linux/fs.h                     |    9 +
 include/linux/sched/user.h             |    2 +-
 include/linux/syscalls.h               |    6 +
 include/uapi/asm-generic/unistd.h      |    6 +-
 include/uapi/linux/io_uring.h          |   95 +++
 init/Kconfig                           |    9 +
 kernel/sys_ni.c                        |    2 +
 net/unix/garbage.c                     |    3 +
 12 files changed, 1390 insertions(+), 2 deletions(-)
 create mode 100644 fs/io_uring.c
 create mode 100644 include/uapi/linux/io_uring.h

(limited to 'include')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..481c126259e9 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,5 @@
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
+425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
+426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..6a32a430c8e0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,8 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+425	common	io_uring_setup		__x64_sys_io_uring_setup
+426	common	io_uring_enter		__x64_sys_io_uring_enter
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..8e15d6fc4340 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)               += aio.o
+obj-$(CONFIG_IO_URING)		+= io_uring.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
diff --git a/fs/io_uring.c b/fs/io_uring.c
new file mode 100644
index 000000000000..f68052290426
--- /dev/null
+++ b/fs/io_uring.c
@@ -0,0 +1,1255 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared application/kernel submission and completion ring pairs, for
+ * supporting fast/efficient IO.
+ *
+ * A note on the read/write ordering memory barriers that are matched between
+ * the application and kernel side. When the application reads the CQ ring
+ * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
+ * the kernel uses after writing the tail. Failure to do so could cause a
+ * delay in when the application notices that completion events available.
+ * This isn't a fatal condition. Likewise, the application must use an
+ * appropriate smp_wmb() both before writing the SQ tail, and after writing
+ * the SQ tail. The first one orders the sqe writes with the tail write, and
+ * the latter is paired with the smp_rmb() the kernel will issue before
+ * reading the SQ tail on submission.
+ *
+ * Also see the examples in the liburing library:
+ *
+ *	git://git.kernel.dk/liburing
+ *
+ * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
+ * from data shared between the kernel and application. This is done both
+ * for ordering purposes, but also to ensure that once a value is loaded from
+ * data that the application could potentially modify, it remains stable.
+ *
+ * Copyright (C) 2018-2019 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/refcount.h>
+#include <linux/uio.h>
+
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_context.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/blkdev.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/mm.h>
+#include <linux/uaccess.h>
+#include <linux/nospec.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "internal.h"
+
+#define IORING_MAX_ENTRIES	4096
+
+struct io_uring {
+	u32 head ____cacheline_aligned_in_smp;
+	u32 tail ____cacheline_aligned_in_smp;
+};
+
+struct io_sq_ring {
+	struct io_uring		r;
+	u32			ring_mask;
+	u32			ring_entries;
+	u32			dropped;
+	u32			flags;
+	u32			array[];
+};
+
+struct io_cq_ring {
+	struct io_uring		r;
+	u32			ring_mask;
+	u32			ring_entries;
+	u32			overflow;
+	struct io_uring_cqe	cqes[];
+};
+
+struct io_ring_ctx {
+	struct {
+		struct percpu_ref	refs;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		unsigned int		flags;
+		bool			compat;
+		bool			account_mem;
+
+		/* SQ ring */
+		struct io_sq_ring	*sq_ring;
+		unsigned		cached_sq_head;
+		unsigned		sq_entries;
+		unsigned		sq_mask;
+		struct io_uring_sqe	*sq_sqes;
+	} ____cacheline_aligned_in_smp;
+
+	/* IO offload */
+	struct workqueue_struct	*sqo_wq;
+	struct mm_struct	*sqo_mm;
+
+	struct {
+		/* CQ ring */
+		struct io_cq_ring	*cq_ring;
+		unsigned		cached_cq_tail;
+		unsigned		cq_entries;
+		unsigned		cq_mask;
+		struct wait_queue_head	cq_wait;
+		struct fasync_struct	*cq_fasync;
+	} ____cacheline_aligned_in_smp;
+
+	struct user_struct	*user;
+
+	struct completion	ctx_done;
+
+	struct {
+		struct mutex		uring_lock;
+		wait_queue_head_t	wait;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		spinlock_t		completion_lock;
+	} ____cacheline_aligned_in_smp;
+
+#if defined(CONFIG_UNIX)
+	struct socket		*ring_sock;
+#endif
+};
+
+struct sqe_submit {
+	const struct io_uring_sqe	*sqe;
+	unsigned short			index;
+	bool				has_user;
+};
+
+struct io_kiocb {
+	struct kiocb		rw;
+
+	struct sqe_submit	submit;
+
+	struct io_ring_ctx	*ctx;
+	struct list_head	list;
+	unsigned int		flags;
+#define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
+	u64			user_data;
+
+	struct work_struct	work;
+};
+
+#define IO_PLUG_THRESHOLD		2
+
+static struct kmem_cache *req_cachep;
+
+static const struct file_operations io_uring_fops;
+
+struct sock *io_uring_get_socket(struct file *file)
+{
+#if defined(CONFIG_UNIX)
+	if (file->f_op == &io_uring_fops) {
+		struct io_ring_ctx *ctx = file->private_data;
+
+		return ctx->ring_sock->sk;
+	}
+#endif
+	return NULL;
+}
+EXPORT_SYMBOL(io_uring_get_socket);
+
+static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+{
+	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
+
+	complete(&ctx->ctx_done);
+}
+
+static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+{
+	struct io_ring_ctx *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
+		kfree(ctx);
+		return NULL;
+	}
+
+	ctx->flags = p->flags;
+	init_waitqueue_head(&ctx->cq_wait);
+	init_completion(&ctx->ctx_done);
+	mutex_init(&ctx->uring_lock);
+	init_waitqueue_head(&ctx->wait);
+	spin_lock_init(&ctx->completion_lock);
+	return ctx;
+}
+
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_cq_ring *ring = ctx->cq_ring;
+
+	if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
+		/* order cqe stores with ring update */
+		smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
+
+		/*
+		 * Write sider barrier of tail update, app has read side. See
+		 * comment at the top of this file.
+		 */
+		smp_wmb();
+
+		if (wq_has_sleeper(&ctx->cq_wait)) {
+			wake_up_interruptible(&ctx->cq_wait);
+			kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
+		}
+	}
+}
+
+static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_cq_ring *ring = ctx->cq_ring;
+	unsigned tail;
+
+	tail = ctx->cached_cq_tail;
+	/* See comment at the top of the file */
+	smp_rmb();
+	if (tail + 1 == READ_ONCE(ring->r.head))
+		return NULL;
+
+	ctx->cached_cq_tail++;
+	return &ring->cqes[tail & ctx->cq_mask];
+}
+
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+				 long res, unsigned ev_flags)
+{
+	struct io_uring_cqe *cqe;
+
+	/*
+	 * If we can't get a cq entry, userspace overflowed the
+	 * submission (by quite a lot). Increment the overflow count in
+	 * the ring.
+	 */
+	cqe = io_get_cqring(ctx);
+	if (cqe) {
+		WRITE_ONCE(cqe->user_data, ki_user_data);
+		WRITE_ONCE(cqe->res, res);
+		WRITE_ONCE(cqe->flags, ev_flags);
+	} else {
+		unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
+
+		WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
+	}
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+				long res, unsigned ev_flags)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+}
+
+static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
+{
+	percpu_ref_put_many(&ctx->refs, refs);
+
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+}
+
+static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	if (!percpu_ref_tryget(&ctx->refs))
+		return NULL;
+
+	req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+	if (req) {
+		req->ctx = ctx;
+		req->flags = 0;
+		return req;
+	}
+
+	io_ring_drop_ctx_refs(ctx, 1);
+	return NULL;
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+	io_ring_drop_ctx_refs(req->ctx, 1);
+	kmem_cache_free(req_cachep, req);
+}
+
+static void kiocb_end_write(struct kiocb *kiocb)
+{
+	if (kiocb->ki_flags & IOCB_WRITE) {
+		struct inode *inode = file_inode(kiocb->ki_filp);
+
+		/*
+		 * Tell lockdep we inherited freeze protection from submission
+		 * thread.
+		 */
+		if (S_ISREG(inode->i_mode))
+			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+		file_end_write(kiocb->ki_filp);
+	}
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+	kiocb_end_write(kiocb);
+
+	fput(kiocb->ki_filp);
+	io_cqring_add_event(req->ctx, req->user_data, res, 0);
+	io_free_req(req);
+}
+
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static bool io_file_supports_async(struct file *file)
+{
+	umode_t mode = file_inode(file)->i_mode;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode))
+		return true;
+	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+		return true;
+
+	return false;
+}
+
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+	struct kiocb *kiocb = &req->rw;
+	unsigned ioprio;
+	int fd, ret;
+
+	/* For -EAGAIN retry, everything is already prepped */
+	if (kiocb->ki_filp)
+		return 0;
+
+	fd = READ_ONCE(sqe->fd);
+	kiocb->ki_filp = fget(fd);
+	if (unlikely(!kiocb->ki_filp))
+		return -EBADF;
+	if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
+		force_nonblock = false;
+	kiocb->ki_pos = READ_ONCE(sqe->off);
+	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+
+	ioprio = READ_ONCE(sqe->ioprio);
+	if (ioprio) {
+		ret = ioprio_check_cap(ioprio);
+		if (ret)
+			goto out_fput;
+
+		kiocb->ki_ioprio = ioprio;
+	} else
+		kiocb->ki_ioprio = get_current_ioprio();
+
+	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+	if (unlikely(ret))
+		goto out_fput;
+	if (force_nonblock) {
+		kiocb->ki_flags |= IOCB_NOWAIT;
+		req->flags |= REQ_F_FORCE_NONBLOCK;
+	}
+	if (kiocb->ki_flags & IOCB_HIPRI) {
+		ret = -EINVAL;
+		goto out_fput;
+	}
+
+	kiocb->ki_complete = io_complete_rw;
+	return 0;
+out_fput:
+	fput(kiocb->ki_filp);
+	return ret;
+}
+
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+	switch (ret) {
+	case -EIOCBQUEUED:
+		break;
+	case -ERESTARTSYS:
+	case -ERESTARTNOINTR:
+	case -ERESTARTNOHAND:
+	case -ERESTART_RESTARTBLOCK:
+		/*
+		 * We can't just restart the syscall, since previously
+		 * submitted sqes may already be in progress. Just fail this
+		 * IO with EINTR.
+		 */
+		ret = -EINTR;
+		/* fall through */
+	default:
+		kiocb->ki_complete(kiocb, ret, 0);
+	}
+}
+
+static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
+			   const struct sqe_submit *s, struct iovec **iovec,
+			   struct iov_iter *iter)
+{
+	const struct io_uring_sqe *sqe = s->sqe;
+	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	size_t sqe_len = READ_ONCE(sqe->len);
+
+	if (!s->has_user)
+		return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+	if (ctx->compat)
+		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
+						iovec, iter);
+#endif
+
+	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+}
+
+static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
+		       bool force_nonblock)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct kiocb *kiocb = &req->rw;
+	struct iov_iter iter;
+	struct file *file;
+	ssize_t ret;
+
+	ret = io_prep_rw(req, s->sqe, force_nonblock);
+	if (ret)
+		return ret;
+	file = kiocb->ki_filp;
+
+	ret = -EBADF;
+	if (unlikely(!(file->f_mode & FMODE_READ)))
+		goto out_fput;
+	ret = -EINVAL;
+	if (unlikely(!file->f_op->read_iter))
+		goto out_fput;
+
+	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+	if (ret)
+		goto out_fput;
+
+	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
+	if (!ret) {
+		ssize_t ret2;
+
+		/* Catch -EAGAIN return for forced non-blocking submission */
+		ret2 = call_read_iter(file, kiocb, &iter);
+		if (!force_nonblock || ret2 != -EAGAIN)
+			io_rw_done(kiocb, ret2);
+		else
+			ret = -EAGAIN;
+	}
+	kfree(iovec);
+out_fput:
+	/* Hold on to the file for -EAGAIN */
+	if (unlikely(ret && ret != -EAGAIN))
+		fput(file);
+	return ret;
+}
+
+static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
+			bool force_nonblock)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct kiocb *kiocb = &req->rw;
+	struct iov_iter iter;
+	struct file *file;
+	ssize_t ret;
+
+	ret = io_prep_rw(req, s->sqe, force_nonblock);
+	if (ret)
+		return ret;
+	/* Hold on to the file for -EAGAIN */
+	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
+		return -EAGAIN;
+
+	ret = -EBADF;
+	file = kiocb->ki_filp;
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		goto out_fput;
+	ret = -EINVAL;
+	if (unlikely(!file->f_op->write_iter))
+		goto out_fput;
+
+	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+	if (ret)
+		goto out_fput;
+
+	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
+				iov_iter_count(&iter));
+	if (!ret) {
+		/*
+		 * Open-code file_start_write here to grab freeze protection,
+		 * which will be released by another thread in
+		 * io_complete_rw().  Fool lockdep by telling it the lock got
+		 * released so that it doesn't complain about the held lock when
+		 * we return to userspace.
+		 */
+		if (S_ISREG(file_inode(file)->i_mode)) {
+			__sb_start_write(file_inode(file)->i_sb,
+						SB_FREEZE_WRITE, true);
+			__sb_writers_release(file_inode(file)->i_sb,
+						SB_FREEZE_WRITE);
+		}
+		kiocb->ki_flags |= IOCB_WRITE;
+		io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
+	}
+	kfree(iovec);
+out_fput:
+	if (unlikely(ret))
+		fput(file);
+	return ret;
+}
+
+/*
+ * IORING_OP_NOP just posts a completion event, nothing else.
+ */
+static int io_nop(struct io_kiocb *req, u64 user_data)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	long err = 0;
+
+	/*
+	 * Twilight zone - it's possible that someone issued an opcode that
+	 * has a file attached, then got -EAGAIN on submission, and changed
+	 * the sqe before we retried it from async context. Avoid dropping
+	 * a file reference for this malicious case, and flag the error.
+	 */
+	if (req->rw.ki_filp) {
+		err = -EBADF;
+		fput(req->rw.ki_filp);
+	}
+	io_cqring_add_event(ctx, user_data, err, 0);
+	io_free_req(req);
+	return 0;
+}
+
+static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			   const struct sqe_submit *s, bool force_nonblock)
+{
+	ssize_t ret;
+	int opcode;
+
+	if (unlikely(s->index >= ctx->sq_entries))
+		return -EINVAL;
+	req->user_data = READ_ONCE(s->sqe->user_data);
+
+	opcode = READ_ONCE(s->sqe->opcode);
+	switch (opcode) {
+	case IORING_OP_NOP:
+		ret = io_nop(req, req->user_data);
+		break;
+	case IORING_OP_READV:
+		ret = io_read(req, s, force_nonblock);
+		break;
+	case IORING_OP_WRITEV:
+		ret = io_write(req, s, force_nonblock);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static void io_sq_wq_submit_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct sqe_submit *s = &req->submit;
+	const struct io_uring_sqe *sqe = s->sqe;
+	struct io_ring_ctx *ctx = req->ctx;
+	mm_segment_t old_fs = get_fs();
+	int ret;
+
+	 /* Ensure we clear previously set forced non-block flag */
+	req->flags &= ~REQ_F_FORCE_NONBLOCK;
+	req->rw.ki_flags &= ~IOCB_NOWAIT;
+
+	if (!mmget_not_zero(ctx->sqo_mm)) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	use_mm(ctx->sqo_mm);
+	set_fs(USER_DS);
+	s->has_user = true;
+
+	ret = __io_submit_sqe(ctx, req, s, false);
+
+	set_fs(old_fs);
+	unuse_mm(ctx->sqo_mm);
+	mmput(ctx->sqo_mm);
+err:
+	if (ret) {
+		io_cqring_add_event(ctx, sqe->user_data, ret, 0);
+		io_free_req(req);
+	}
+
+	/* async context always use a copy of the sqe */
+	kfree(sqe);
+}
+
+static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s)
+{
+	struct io_kiocb *req;
+	ssize_t ret;
+
+	/* enforce forwards compatibility on users */
+	if (unlikely(s->sqe->flags))
+		return -EINVAL;
+
+	req = io_get_req(ctx);
+	if (unlikely(!req))
+		return -EAGAIN;
+
+	req->rw.ki_filp = NULL;
+
+	ret = __io_submit_sqe(ctx, req, s, true);
+	if (ret == -EAGAIN) {
+		struct io_uring_sqe *sqe_copy;
+
+		sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+		if (sqe_copy) {
+			memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
+			s->sqe = sqe_copy;
+
+			memcpy(&req->submit, s, sizeof(*s));
+			INIT_WORK(&req->work, io_sq_wq_submit_work);
+			queue_work(ctx->sqo_wq, &req->work);
+			ret = 0;
+		}
+	}
+	if (ret)
+		io_free_req(req);
+
+	return ret;
+}
+
+static void io_commit_sqring(struct io_ring_ctx *ctx)
+{
+	struct io_sq_ring *ring = ctx->sq_ring;
+
+	if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
+		/*
+		 * Ensure any loads from the SQEs are done at this point,
+		 * since once we write the new head, the application could
+		 * write new data to them.
+		 */
+		smp_store_release(&ring->r.head, ctx->cached_sq_head);
+
+		/*
+		 * write side barrier of head update, app has read side. See
+		 * comment at the top of this file
+		 */
+		smp_wmb();
+	}
+}
+
+/*
+ * Undo last io_get_sqring()
+ */
+static void io_drop_sqring(struct io_ring_ctx *ctx)
+{
+	ctx->cached_sq_head--;
+}
+
+/*
+ * Fetch an sqe, if one is available. Note that s->sqe will point to memory
+ * that is mapped by userspace. This means that care needs to be taken to
+ * ensure that reads are stable, as we cannot rely on userspace always
+ * being a good citizen. If members of the sqe are validated and then later
+ * used, it's important that those reads are done through READ_ONCE() to
+ * prevent a re-load down the line.
+ */
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+{
+	struct io_sq_ring *ring = ctx->sq_ring;
+	unsigned head;
+
+	/*
+	 * The cached sq head (or cq tail) serves two purposes:
+	 *
+	 * 1) allows us to batch the cost of updating the user visible
+	 *    head updates.
+	 * 2) allows the kernel side to track the head on its own, even
+	 *    though the application is the one updating it.
+	 */
+	head = ctx->cached_sq_head;
+	/* See comment at the top of this file */
+	smp_rmb();
+	if (head == READ_ONCE(ring->r.tail))
+		return false;
+
+	head = READ_ONCE(ring->array[head & ctx->sq_mask]);
+	if (head < ctx->sq_entries) {
+		s->index = head;
+		s->sqe = &ctx->sq_sqes[head];
+		ctx->cached_sq_head++;
+		return true;
+	}
+
+	/* drop invalid entries */
+	ctx->cached_sq_head++;
+	ring->dropped++;
+	/* See comment at the top of this file */
+	smp_wmb();
+	return false;
+}
+
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
+{
+	int i, ret = 0, submit = 0;
+	struct blk_plug plug;
+
+	if (to_submit > IO_PLUG_THRESHOLD)
+		blk_start_plug(&plug);
+
+	for (i = 0; i < to_submit; i++) {
+		struct sqe_submit s;
+
+		if (!io_get_sqring(ctx, &s))
+			break;
+
+		s.has_user = true;
+		ret = io_submit_sqe(ctx, &s);
+		if (ret) {
+			io_drop_sqring(ctx);
+			break;
+		}
+
+		submit++;
+	}
+	io_commit_sqring(ctx);
+
+	if (to_submit > IO_PLUG_THRESHOLD)
+		blk_finish_plug(&plug);
+
+	return submit ? submit : ret;
+}
+
+static unsigned io_cqring_events(struct io_cq_ring *ring)
+{
+	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+}
+
+/*
+ * Wait until events become available, if we don't already have some. The
+ * application must reap them itself, as they reside on the shared cq ring.
+ */
+static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
+			  const sigset_t __user *sig, size_t sigsz)
+{
+	struct io_cq_ring *ring = ctx->cq_ring;
+	sigset_t ksigmask, sigsaved;
+	DEFINE_WAIT(wait);
+	int ret;
+
+	/* See comment at the top of this file */
+	smp_rmb();
+	if (io_cqring_events(ring) >= min_events)
+		return 0;
+
+	if (sig) {
+		ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
+		if (ret)
+			return ret;
+	}
+
+	do {
+		prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
+
+		ret = 0;
+		/* See comment at the top of this file */
+		smp_rmb();
+		if (io_cqring_events(ring) >= min_events)
+			break;
+
+		schedule();
+
+		ret = -EINTR;
+		if (signal_pending(current))
+			break;
+	} while (1);
+
+	finish_wait(&ctx->wait, &wait);
+
+	if (sig)
+		restore_user_sigmask(sig, &sigsaved);
+
+	return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
+}
+
+static int io_sq_offload_start(struct io_ring_ctx *ctx)
+{
+	int ret;
+
+	mmgrab(current->mm);
+	ctx->sqo_mm = current->mm;
+
+	/* Do QD, or 2 * CPUS, whatever is smallest */
+	ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
+			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
+	if (!ctx->sqo_wq) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	return 0;
+err:
+	mmdrop(ctx->sqo_mm);
+	ctx->sqo_mm = NULL;
+	return ret;
+}
+
+static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+{
+	atomic_long_sub(nr_pages, &user->locked_vm);
+}
+
+static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+{
+	unsigned long page_limit, cur_pages, new_pages;
+
+	/* Don't allow more pages than we can safely lock */
+	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	do {
+		cur_pages = atomic_long_read(&user->locked_vm);
+		new_pages = cur_pages + nr_pages;
+		if (new_pages > page_limit)
+			return -ENOMEM;
+	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+					new_pages) != cur_pages);
+
+	return 0;
+}
+
+static void io_mem_free(void *ptr)
+{
+	struct page *page = virt_to_head_page(ptr);
+
+	if (put_page_testzero(page))
+		free_compound_page(page);
+}
+
+static void *io_mem_alloc(size_t size)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
+				__GFP_NORETRY;
+
+	return (void *) __get_free_pages(gfp_flags, get_order(size));
+}
+
+static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
+{
+	struct io_sq_ring *sq_ring;
+	struct io_cq_ring *cq_ring;
+	size_t bytes;
+
+	bytes = struct_size(sq_ring, array, sq_entries);
+	bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
+	bytes += struct_size(cq_ring, cqes, cq_entries);
+
+	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+}
+
+static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+{
+	if (ctx->sqo_wq)
+		destroy_workqueue(ctx->sqo_wq);
+	if (ctx->sqo_mm)
+		mmdrop(ctx->sqo_mm);
+#if defined(CONFIG_UNIX)
+	if (ctx->ring_sock)
+		sock_release(ctx->ring_sock);
+#endif
+
+	io_mem_free(ctx->sq_ring);
+	io_mem_free(ctx->sq_sqes);
+	io_mem_free(ctx->cq_ring);
+
+	percpu_ref_exit(&ctx->refs);
+	if (ctx->account_mem)
+		io_unaccount_mem(ctx->user,
+				ring_pages(ctx->sq_entries, ctx->cq_entries));
+	free_uid(ctx->user);
+	kfree(ctx);
+}
+
+static __poll_t io_uring_poll(struct file *file, poll_table *wait)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+	__poll_t mask = 0;
+
+	poll_wait(file, &ctx->cq_wait, wait);
+	/* See comment at the top of this file */
+	smp_rmb();
+	if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
+		mask |= EPOLLOUT | EPOLLWRNORM;
+	if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
+		mask |= EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+static int io_uring_fasync(int fd, struct file *file, int on)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+
+	return fasync_helper(fd, file, on, &ctx->cq_fasync);
+}
+
+static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+{
+	mutex_lock(&ctx->uring_lock);
+	percpu_ref_kill(&ctx->refs);
+	mutex_unlock(&ctx->uring_lock);
+
+	wait_for_completion(&ctx->ctx_done);
+	io_ring_ctx_free(ctx);
+}
+
+static int io_uring_release(struct inode *inode, struct file *file)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+
+	file->private_data = NULL;
+	io_ring_ctx_wait_and_kill(ctx);
+	return 0;
+}
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long sz = vma->vm_end - vma->vm_start;
+	struct io_ring_ctx *ctx = file->private_data;
+	unsigned long pfn;
+	struct page *page;
+	void *ptr;
+
+	switch (offset) {
+	case IORING_OFF_SQ_RING:
+		ptr = ctx->sq_ring;
+		break;
+	case IORING_OFF_SQES:
+		ptr = ctx->sq_sqes;
+		break;
+	case IORING_OFF_CQ_RING:
+		ptr = ctx->cq_ring;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	page = virt_to_head_page(ptr);
+	if (sz > (PAGE_SIZE << compound_order(page)))
+		return -EINVAL;
+
+	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+
+SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
+		u32, min_complete, u32, flags, const sigset_t __user *, sig,
+		size_t, sigsz)
+{
+	struct io_ring_ctx *ctx;
+	long ret = -EBADF;
+	int submitted = 0;
+	struct fd f;
+
+	if (flags & ~IORING_ENTER_GETEVENTS)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EOPNOTSUPP;
+	if (f.file->f_op != &io_uring_fops)
+		goto out_fput;
+
+	ret = -ENXIO;
+	ctx = f.file->private_data;
+	if (!percpu_ref_tryget(&ctx->refs))
+		goto out_fput;
+
+	ret = 0;
+	if (to_submit) {
+		to_submit = min(to_submit, ctx->sq_entries);
+
+		mutex_lock(&ctx->uring_lock);
+		submitted = io_ring_submit(ctx, to_submit);
+		mutex_unlock(&ctx->uring_lock);
+
+		if (submitted < 0)
+			goto out_ctx;
+	}
+	if (flags & IORING_ENTER_GETEVENTS) {
+		min_complete = min(min_complete, ctx->cq_entries);
+
+		/*
+		 * The application could have included the 'to_submit' count
+		 * in how many events it wanted to wait for. If we failed to
+		 * submit the desired count, we may need to adjust the number
+		 * of events to poll/wait for.
+		 */
+		if (submitted < to_submit)
+			min_complete = min_t(unsigned, submitted, min_complete);
+
+		ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+	}
+
+out_ctx:
+	io_ring_drop_ctx_refs(ctx, 1);
+out_fput:
+	fdput(f);
+	return submitted ? submitted : ret;
+}
+
+static const struct file_operations io_uring_fops = {
+	.release	= io_uring_release,
+	.mmap		= io_uring_mmap,
+	.poll		= io_uring_poll,
+	.fasync		= io_uring_fasync,
+};
+
+static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+				  struct io_uring_params *p)
+{
+	struct io_sq_ring *sq_ring;
+	struct io_cq_ring *cq_ring;
+	size_t size;
+
+	sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
+	if (!sq_ring)
+		return -ENOMEM;
+
+	ctx->sq_ring = sq_ring;
+	sq_ring->ring_mask = p->sq_entries - 1;
+	sq_ring->ring_entries = p->sq_entries;
+	ctx->sq_mask = sq_ring->ring_mask;
+	ctx->sq_entries = sq_ring->ring_entries;
+
+	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	ctx->sq_sqes = io_mem_alloc(size);
+	if (!ctx->sq_sqes) {
+		io_mem_free(ctx->sq_ring);
+		return -ENOMEM;
+	}
+
+	cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
+	if (!cq_ring) {
+		io_mem_free(ctx->sq_ring);
+		io_mem_free(ctx->sq_sqes);
+		return -ENOMEM;
+	}
+
+	ctx->cq_ring = cq_ring;
+	cq_ring->ring_mask = p->cq_entries - 1;
+	cq_ring->ring_entries = p->cq_entries;
+	ctx->cq_mask = cq_ring->ring_mask;
+	ctx->cq_entries = cq_ring->ring_entries;
+	return 0;
+}
+
+/*
+ * Allocate an anonymous fd, this is what constitutes the application
+ * visible backing of an io_uring instance. The application mmaps this
+ * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
+ * we have to tie this fd to a socket for file garbage collection purposes.
+ */
+static int io_uring_get_fd(struct io_ring_ctx *ctx)
+{
+	struct file *file;
+	int ret;
+
+#if defined(CONFIG_UNIX)
+	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
+				&ctx->ring_sock);
+	if (ret)
+		return ret;
+#endif
+
+	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (ret < 0)
+		goto err;
+
+	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
+					O_RDWR | O_CLOEXEC);
+	if (IS_ERR(file)) {
+		put_unused_fd(ret);
+		ret = PTR_ERR(file);
+		goto err;
+	}
+
+#if defined(CONFIG_UNIX)
+	ctx->ring_sock->file = file;
+#endif
+	fd_install(ret, file);
+	return ret;
+err:
+#if defined(CONFIG_UNIX)
+	sock_release(ctx->ring_sock);
+	ctx->ring_sock = NULL;
+#endif
+	return ret;
+}
+
+static int io_uring_create(unsigned entries, struct io_uring_params *p)
+{
+	struct user_struct *user = NULL;
+	struct io_ring_ctx *ctx;
+	bool account_mem;
+	int ret;
+
+	if (!entries || entries > IORING_MAX_ENTRIES)
+		return -EINVAL;
+
+	/*
+	 * Use twice as many entries for the CQ ring. It's possible for the
+	 * application to drive a higher depth than the size of the SQ ring,
+	 * since the sqes are only used at submission time. This allows for
+	 * some flexibility in overcommitting a bit.
+	 */
+	p->sq_entries = roundup_pow_of_two(entries);
+	p->cq_entries = 2 * p->sq_entries;
+
+	user = get_uid(current_user());
+	account_mem = !capable(CAP_IPC_LOCK);
+
+	if (account_mem) {
+		ret = io_account_mem(user,
+				ring_pages(p->sq_entries, p->cq_entries));
+		if (ret) {
+			free_uid(user);
+			return ret;
+		}
+	}
+
+	ctx = io_ring_ctx_alloc(p);
+	if (!ctx) {
+		if (account_mem)
+			io_unaccount_mem(user, ring_pages(p->sq_entries,
+								p->cq_entries));
+		free_uid(user);
+		return -ENOMEM;
+	}
+	ctx->compat = in_compat_syscall();
+	ctx->account_mem = account_mem;
+	ctx->user = user;
+
+	ret = io_allocate_scq_urings(ctx, p);
+	if (ret)
+		goto err;
+
+	ret = io_sq_offload_start(ctx);
+	if (ret)
+		goto err;
+
+	ret = io_uring_get_fd(ctx);
+	if (ret < 0)
+		goto err;
+
+	memset(&p->sq_off, 0, sizeof(p->sq_off));
+	p->sq_off.head = offsetof(struct io_sq_ring, r.head);
+	p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
+	p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
+	p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
+	p->sq_off.flags = offsetof(struct io_sq_ring, flags);
+	p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
+	p->sq_off.array = offsetof(struct io_sq_ring, array);
+
+	memset(&p->cq_off, 0, sizeof(p->cq_off));
+	p->cq_off.head = offsetof(struct io_cq_ring, r.head);
+	p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
+	p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
+	p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
+	p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
+	p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
+	return ret;
+err:
+	io_ring_ctx_wait_and_kill(ctx);
+	return ret;
+}
+
+/*
+ * Sets up an aio uring context, and returns the fd. Applications asks for a
+ * ring size, we return the actual sq/cq ring sizes (among other things) in the
+ * params structure passed in.
+ */
+static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
+{
+	struct io_uring_params p;
+	long ret;
+	int i;
+
+	if (copy_from_user(&p, params, sizeof(p)))
+		return -EFAULT;
+	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
+		if (p.resv[i])
+			return -EINVAL;
+	}
+
+	if (p.flags)
+		return -EINVAL;
+
+	ret = io_uring_create(entries, &p);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(params, &p, sizeof(p)))
+		return -EFAULT;
+
+	return ret;
+}
+
+SYSCALL_DEFINE2(io_uring_setup, u32, entries,
+		struct io_uring_params __user *, params)
+{
+	return io_uring_setup(entries, params);
+}
+
+static int __init io_uring_init(void)
+{
+	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+	return 0;
+};
+__initcall(io_uring_init);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dedcc2e9265c..61aa210f0c2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode);
 extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 		       int advice);
 
+#if defined(CONFIG_IO_URING)
+extern struct sock *io_uring_get_socket(struct file *file);
+#else
+static inline struct sock *io_uring_get_socket(struct file *file)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 39ad98c09c58..c7b5f86b91a1 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -40,7 +40,7 @@ struct user_struct {
 	kuid_t uid;
 
 #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
-    defined(CONFIG_NET)
+    defined(CONFIG_NET) || defined(CONFIG_IO_URING)
 	atomic_long_t locked_vm;
 #endif
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 257cccba3062..3072dbaa7869 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -69,6 +69,7 @@ struct file_handle;
 struct sigaltstack;
 struct rseq;
 union bpf_attr;
+struct io_uring_params;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id,
 				struct io_event __user *events,
 				struct old_timespec32 __user *timeout,
 				const struct __aio_sigset *sig);
+asmlinkage long sys_io_uring_setup(u32 entries,
+				struct io_uring_params __user *p);
+asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
+				u32 min_complete, u32 flags,
+				const sigset_t __user *sig, size_t sigsz);
 
 /* fs/xattr.c */
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d90127298f12..87871e7b7ea7 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
+#define __NR_io_uring_setup 425
+__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
+#define __NR_io_uring_enter 426
+__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
 
 #undef __NR_syscalls
-#define __NR_syscalls 295
+#define __NR_syscalls 427
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
new file mode 100644
index 000000000000..ac692823d6f4
--- /dev/null
+++ b/include/uapi/linux/io_uring.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Header file for the io_uring interface.
+ *
+ * Copyright (C) 2019 Jens Axboe
+ * Copyright (C) 2019 Christoph Hellwig
+ */
+#ifndef LINUX_IO_URING_H
+#define LINUX_IO_URING_H
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+/*
+ * IO submission data structure (Submission Queue Entry)
+ */
+struct io_uring_sqe {
+	__u8	opcode;		/* type of operation for this sqe */
+	__u8	flags;		/* as of now unused */
+	__u16	ioprio;		/* ioprio for the request */
+	__s32	fd;		/* file descriptor to do IO on */
+	__u64	off;		/* offset into file */
+	__u64	addr;		/* pointer to buffer or iovecs */
+	__u32	len;		/* buffer size or number of iovecs */
+	union {
+		__kernel_rwf_t	rw_flags;
+		__u32		__resv;
+	};
+	__u64	user_data;	/* data to be passed back at completion time */
+	__u64	__pad2[3];
+};
+
+#define IORING_OP_NOP		0
+#define IORING_OP_READV		1
+#define IORING_OP_WRITEV	2
+
+/*
+ * IO completion data structure (Completion Queue Entry)
+ */
+struct io_uring_cqe {
+	__u64	user_data;	/* sqe->data submission passed back */
+	__s32	res;		/* result code for this event */
+	__u32	flags;
+};
+
+/*
+ * Magic offsets for the application to mmap the data it needs
+ */
+#define IORING_OFF_SQ_RING		0ULL
+#define IORING_OFF_CQ_RING		0x8000000ULL
+#define IORING_OFF_SQES			0x10000000ULL
+
+/*
+ * Filled with the offset for mmap(2)
+ */
+struct io_sqring_offsets {
+	__u32 head;
+	__u32 tail;
+	__u32 ring_mask;
+	__u32 ring_entries;
+	__u32 flags;
+	__u32 dropped;
+	__u32 array;
+	__u32 resv1;
+	__u64 resv2;
+};
+
+struct io_cqring_offsets {
+	__u32 head;
+	__u32 tail;
+	__u32 ring_mask;
+	__u32 ring_entries;
+	__u32 overflow;
+	__u32 cqes;
+	__u64 resv[2];
+};
+
+/*
+ * io_uring_enter(2) flags
+ */
+#define IORING_ENTER_GETEVENTS	(1U << 0)
+
+/*
+ * Passed in for io_uring_setup(2). Copied back with updated info on success
+ */
+struct io_uring_params {
+	__u32 sq_entries;
+	__u32 cq_entries;
+	__u32 flags;
+	__u32 resv[7];
+	struct io_sqring_offsets sq_off;
+	struct io_cqring_offsets cq_off;
+};
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index c9386a365eea..53b54214a36e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1414,6 +1414,15 @@ config AIO
 	  by some high performance threaded applications. Disabling
 	  this option saves about 7k.
 
+config IO_URING
+	bool "Enable IO uring support" if EXPERT
+	select ANON_INODES
+	default y
+	help
+	  This option enables support for the io_uring interface, enabling
+	  applications to submit and complete IO through submission and
+	  completion rings that are shared between the kernel and application.
+
 config ADVISE_SYSCALLS
 	bool "Enable madvise/fadvise syscalls" if EXPERT
 	default y
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ab9d0e3c6d50..ee5e523564bb 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,6 +46,8 @@ COND_SYSCALL(io_getevents);
 COND_SYSCALL(io_pgetevents);
 COND_SYSCALL_COMPAT(io_getevents);
 COND_SYSCALL_COMPAT(io_pgetevents);
+COND_SYSCALL(io_uring_setup);
+COND_SYSCALL(io_uring_enter);
 
 /* fs/xattr.c */
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index c36757e72844..f81854d74c7d 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -108,6 +108,9 @@ struct sock *unix_get_socket(struct file *filp)
 		/* PF_UNIX ? */
 		if (s && sock->ops && sock->ops->family == PF_UNIX)
 			u_sock = s;
+	} else {
+		/* Could be an io_uring instance */
+		u_sock = io_uring_get_socket(filp);
 	}
 	return u_sock;
 }
-- 
cgit v1.2.3


From c992fe2925d776be066d9f6cc13f9ea11d78b657 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 11 Jan 2019 09:43:02 -0700
Subject: io_uring: add fsync support

Add a new fsync opcode, which either syncs a range if one is passed,
or the whole file if the offset and length fields are both cleared
to zero.  A flag is provided to use fdatasync semantics, that is only
force out metadata which is required to retrieve the file data, but
not others like metadata.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 54 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  8 ++++++-
 2 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f68052290426..745413d92712 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -24,6 +24,7 @@
  * data that the application could potentially modify, it remains stable.
  *
  * Copyright (C) 2018-2019 Jens Axboe
+ * Copyright (c) 2018-2019 Christoph Hellwig
  */
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -557,6 +558,56 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	return 0;
 }
 
+static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	int fd;
+
+	/* Prep already done */
+	if (req->rw.ki_filp)
+		return 0;
+
+	if (unlikely(sqe->addr || sqe->ioprio))
+		return -EINVAL;
+
+	fd = READ_ONCE(sqe->fd);
+	req->rw.ki_filp = fget(fd);
+	if (unlikely(!req->rw.ki_filp))
+		return -EBADF;
+
+	return 0;
+}
+
+static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		    bool force_nonblock)
+{
+	loff_t sqe_off = READ_ONCE(sqe->off);
+	loff_t sqe_len = READ_ONCE(sqe->len);
+	loff_t end = sqe_off + sqe_len;
+	unsigned fsync_flags;
+	int ret;
+
+	fsync_flags = READ_ONCE(sqe->fsync_flags);
+	if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
+		return -EINVAL;
+
+	ret = io_prep_fsync(req, sqe);
+	if (ret)
+		return ret;
+
+	/* fsync always requires a blocking context */
+	if (force_nonblock)
+		return -EAGAIN;
+
+	ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
+				end > 0 ? end : LLONG_MAX,
+				fsync_flags & IORING_FSYNC_DATASYNC);
+
+	fput(req->rw.ki_filp);
+	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_free_req(req);
+	return 0;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock)
 {
@@ -578,6 +629,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_WRITEV:
 		ret = io_write(req, s, force_nonblock);
 		break;
+	case IORING_OP_FSYNC:
+		ret = io_fsync(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ac692823d6f4..4589d56d0b68 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -24,7 +24,7 @@ struct io_uring_sqe {
 	__u32	len;		/* buffer size or number of iovecs */
 	union {
 		__kernel_rwf_t	rw_flags;
-		__u32		__resv;
+		__u32		fsync_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	__u64	__pad2[3];
@@ -33,6 +33,12 @@ struct io_uring_sqe {
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1
 #define IORING_OP_WRITEV	2
+#define IORING_OP_FSYNC		3
+
+/*
+ * sqe->fsync_flags
+ */
+#define IORING_FSYNC_DATASYNC	(1U << 0)
 
 /*
  * IO completion data structure (Completion Queue Entry)
-- 
cgit v1.2.3


From def596e9557c91d9846fc4d84d26f2c564644416 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 9 Jan 2019 08:59:42 -0700
Subject: io_uring: support for IO polling

Add support for a polled io_uring instance. When a read or write is
submitted to a polled io_uring, the application must poll for
completions on the CQ ring through io_uring_enter(2). Polled IO may not
generate IRQ completions, hence they need to be actively found by the
application itself.

To use polling, io_uring_setup() must be used with the
IORING_SETUP_IOPOLL flag being set. It is illegal to mix and match
polled and non-polled IO on an io_uring.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 275 ++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/io_uring.h |   5 +
 2 files changed, 271 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 745413d92712..578797a4f318 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -124,6 +124,14 @@ struct io_ring_ctx {
 
 	struct {
 		spinlock_t		completion_lock;
+		bool			poll_multi_file;
+		/*
+		 * ->poll_list is protected by the ctx->uring_lock for
+		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
+		 * For SQPOLL, only the single threaded io_sq_thread() will
+		 * manipulate the list, hence no extra locking is needed there.
+		 */
+		struct list_head	poll_list;
 	} ____cacheline_aligned_in_smp;
 
 #if defined(CONFIG_UNIX)
@@ -135,6 +143,7 @@ struct sqe_submit {
 	const struct io_uring_sqe	*sqe;
 	unsigned short			index;
 	bool				has_user;
+	bool				needs_lock;
 };
 
 struct io_kiocb {
@@ -146,12 +155,15 @@ struct io_kiocb {
 	struct list_head	list;
 	unsigned int		flags;
 #define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
+#define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 	u64			user_data;
+	u64			error;
 
 	struct work_struct	work;
 };
 
 #define IO_PLUG_THRESHOLD		2
+#define IO_IOPOLL_BATCH			8
 
 static struct kmem_cache *req_cachep;
 
@@ -196,6 +208,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
+	INIT_LIST_HEAD(&ctx->poll_list);
 	return ctx;
 }
 
@@ -297,12 +310,153 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
+static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
+{
+	if (*nr) {
+		kmem_cache_free_bulk(req_cachep, *nr, reqs);
+		io_ring_drop_ctx_refs(ctx, *nr);
+		*nr = 0;
+	}
+}
+
 static void io_free_req(struct io_kiocb *req)
 {
 	io_ring_drop_ctx_refs(req->ctx, 1);
 	kmem_cache_free(req_cachep, req);
 }
 
+/*
+ * Find and free completed poll iocbs
+ */
+static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
+			       struct list_head *done)
+{
+	void *reqs[IO_IOPOLL_BATCH];
+	struct io_kiocb *req;
+	int to_free = 0;
+
+	while (!list_empty(done)) {
+		req = list_first_entry(done, struct io_kiocb, list);
+		list_del(&req->list);
+
+		io_cqring_fill_event(ctx, req->user_data, req->error, 0);
+
+		reqs[to_free++] = req;
+		(*nr_events)++;
+
+		fput(req->rw.ki_filp);
+		if (to_free == ARRAY_SIZE(reqs))
+			io_free_req_many(ctx, reqs, &to_free);
+	}
+	io_commit_cqring(ctx);
+
+	io_free_req_many(ctx, reqs, &to_free);
+}
+
+static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
+			long min)
+{
+	struct io_kiocb *req, *tmp;
+	LIST_HEAD(done);
+	bool spin;
+	int ret;
+
+	/*
+	 * Only spin for completions if we don't have multiple devices hanging
+	 * off our complete list, and we're under the requested amount.
+	 */
+	spin = !ctx->poll_multi_file && *nr_events < min;
+
+	ret = 0;
+	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+		struct kiocb *kiocb = &req->rw;
+
+		/*
+		 * Move completed entries to our local list. If we find a
+		 * request that requires polling, break out and complete
+		 * the done list first, if we have entries there.
+		 */
+		if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+			list_move_tail(&req->list, &done);
+			continue;
+		}
+		if (!list_empty(&done))
+			break;
+
+		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+		if (ret < 0)
+			break;
+
+		if (ret && spin)
+			spin = false;
+		ret = 0;
+	}
+
+	if (!list_empty(&done))
+		io_iopoll_complete(ctx, nr_events, &done);
+
+	return ret;
+}
+
+/*
+ * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
+ * non-spinning poll check - we'll still enter the driver poll loop, but only
+ * as a non-spinning completion check.
+ */
+static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
+				long min)
+{
+	while (!list_empty(&ctx->poll_list)) {
+		int ret;
+
+		ret = io_do_iopoll(ctx, nr_events, min);
+		if (ret < 0)
+			return ret;
+		if (!min || *nr_events >= min)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * We can't just wait for polled events to come to us, we have to actively
+ * find and complete them.
+ */
+static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+{
+	if (!(ctx->flags & IORING_SETUP_IOPOLL))
+		return;
+
+	mutex_lock(&ctx->uring_lock);
+	while (!list_empty(&ctx->poll_list)) {
+		unsigned int nr_events = 0;
+
+		io_iopoll_getevents(ctx, &nr_events, 1);
+	}
+	mutex_unlock(&ctx->uring_lock);
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+			   long min)
+{
+	int ret = 0;
+
+	do {
+		int tmin = 0;
+
+		if (*nr_events < min)
+			tmin = min - *nr_events;
+
+		ret = io_iopoll_getevents(ctx, nr_events, tmin);
+		if (ret <= 0)
+			break;
+		ret = 0;
+	} while (min && !*nr_events && !need_resched());
+
+	return ret;
+}
+
 static void kiocb_end_write(struct kiocb *kiocb)
 {
 	if (kiocb->ki_flags & IOCB_WRITE) {
@@ -329,6 +483,53 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 	io_free_req(req);
 }
 
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+{
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+	kiocb_end_write(kiocb);
+
+	req->error = res;
+	if (res != -EAGAIN)
+		req->flags |= REQ_F_IOPOLL_COMPLETED;
+}
+
+/*
+ * After the iocb has been issued, it's safe to be found on the poll list.
+ * Adding the kiocb to the list AFTER submission ensures that we don't
+ * find it from a io_iopoll_getevents() thread before the issuer is done
+ * accessing the kiocb cookie.
+ */
+static void io_iopoll_req_issued(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	/*
+	 * Track whether we have multiple files in our lists. This will impact
+	 * how we do polling eventually, not spinning if we're on potentially
+	 * different devices.
+	 */
+	if (list_empty(&ctx->poll_list)) {
+		ctx->poll_multi_file = false;
+	} else if (!ctx->poll_multi_file) {
+		struct io_kiocb *list_req;
+
+		list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
+						list);
+		if (list_req->rw.ki_filp != req->rw.ki_filp)
+			ctx->poll_multi_file = true;
+	}
+
+	/*
+	 * For fast devices, IO may have already completed. If it has, add
+	 * it to the front so we find it first.
+	 */
+	if (req->flags & REQ_F_IOPOLL_COMPLETED)
+		list_add(&req->list, &ctx->poll_list);
+	else
+		list_add_tail(&req->list, &ctx->poll_list);
+}
+
 /*
  * If we tracked the file through the SCM inflight mechanism, we could support
  * any file. For now, just ensure that anything potentially problematic is done
@@ -349,6 +550,7 @@ static bool io_file_supports_async(struct file *file)
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      bool force_nonblock)
 {
+	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
 	unsigned ioprio;
 	int fd, ret;
@@ -384,12 +586,22 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		kiocb->ki_flags |= IOCB_NOWAIT;
 		req->flags |= REQ_F_FORCE_NONBLOCK;
 	}
-	if (kiocb->ki_flags & IOCB_HIPRI) {
-		ret = -EINVAL;
-		goto out_fput;
-	}
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		ret = -EOPNOTSUPP;
+		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
+		    !kiocb->ki_filp->f_op->iopoll)
+			goto out_fput;
 
-	kiocb->ki_complete = io_complete_rw;
+		req->error = 0;
+		kiocb->ki_flags |= IOCB_HIPRI;
+		kiocb->ki_complete = io_complete_rw_iopoll;
+	} else {
+		if (kiocb->ki_flags & IOCB_HIPRI) {
+			ret = -EINVAL;
+			goto out_fput;
+		}
+		kiocb->ki_complete = io_complete_rw;
+	}
 	return 0;
 out_fput:
 	fput(kiocb->ki_filp);
@@ -543,6 +755,9 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	struct io_ring_ctx *ctx = req->ctx;
 	long err = 0;
 
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
 	/*
 	 * Twilight zone - it's possible that someone issued an opcode that
 	 * has a file attached, then got -EAGAIN on submission, and changed
@@ -566,6 +781,8 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->rw.ki_filp)
 		return 0;
 
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio))
 		return -EINVAL;
 
@@ -637,7 +854,22 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		break;
 	}
 
-	return ret;
+	if (ret)
+		return ret;
+
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		if (req->error == -EAGAIN)
+			return -EAGAIN;
+
+		/* workqueue context doesn't hold uring_lock, grab it now */
+		if (s->needs_lock)
+			mutex_lock(&ctx->uring_lock);
+		io_iopoll_req_issued(req);
+		if (s->needs_lock)
+			mutex_unlock(&ctx->uring_lock);
+	}
+
+	return 0;
 }
 
 static void io_sq_wq_submit_work(struct work_struct *work)
@@ -661,8 +893,19 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 	use_mm(ctx->sqo_mm);
 	set_fs(USER_DS);
 	s->has_user = true;
+	s->needs_lock = true;
 
-	ret = __io_submit_sqe(ctx, req, s, false);
+	do {
+		ret = __io_submit_sqe(ctx, req, s, false);
+		/*
+		 * We can get EAGAIN for polled IO even though we're forcing
+		 * a sync submission from here, since we can't wait for
+		 * request slots on the block side.
+		 */
+		if (ret != -EAGAIN)
+			break;
+		cond_resched();
+	} while (1);
 
 	set_fs(old_fs);
 	unuse_mm(ctx->sqo_mm);
@@ -799,6 +1042,8 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 			break;
 
 		s.has_user = true;
+		s.needs_lock = false;
+
 		ret = io_submit_sqe(ctx, &s);
 		if (ret) {
 			io_drop_sqring(ctx);
@@ -947,6 +1192,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		destroy_workqueue(ctx->sqo_wq);
 	if (ctx->sqo_mm)
 		mmdrop(ctx->sqo_mm);
+
+	io_iopoll_reap_events(ctx);
+
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock)
 		sock_release(ctx->ring_sock);
@@ -993,6 +1241,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);
 
+	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);
 }
@@ -1074,6 +1323,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			goto out_ctx;
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {
+		unsigned nr_events = 0;
+
 		min_complete = min(min_complete, ctx->cq_entries);
 
 		/*
@@ -1085,7 +1336,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		if (submitted < to_submit)
 			min_complete = min_t(unsigned, submitted, min_complete);
 
-		ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+		if (ctx->flags & IORING_SETUP_IOPOLL) {
+			mutex_lock(&ctx->uring_lock);
+			ret = io_iopoll_check(ctx, &nr_events, min_complete);
+			mutex_unlock(&ctx->uring_lock);
+		} else {
+			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+		}
 	}
 
 out_ctx:
@@ -1282,7 +1539,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			return -EINVAL;
 	}
 
-	if (p.flags)
+	if (p.flags & ~IORING_SETUP_IOPOLL)
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4589d56d0b68..5c457ea396e6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -30,6 +30,11 @@ struct io_uring_sqe {
 	__u64	__pad2[3];
 };
 
+/*
+ * io_uring_setup() flags
+ */
+#define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
+
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1
 #define IORING_OP_WRITEV	2
-- 
cgit v1.2.3


From 091141a42e15fe47ada737f3996b317072afcefb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 21 Nov 2018 10:32:39 -0700
Subject: fs: add fget_many() and fput_many()

Some uses cases repeatedly get and put references to the same file, but
the only exposed interface is doing these one at the time. As each of
these entail an atomic inc or dec on a shared structure, that cost can
add up.

Add fget_many(), which works just like fget(), except it takes an
argument for how many references to get on the file. Ditto fput_many(),
which can drop an arbitrary number of references to a file.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/file.c            | 15 ++++++++++-----
 fs/file_table.c      |  9 +++++++--
 include/linux/file.h |  2 ++
 include/linux/fs.h   |  4 +++-
 4 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/file.c b/fs/file.c
index 3209ee271c41..97df385d6ab0 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -705,7 +705,7 @@ void do_close_on_exec(struct files_struct *files)
 	spin_unlock(&files->file_lock);
 }
 
-static struct file *__fget(unsigned int fd, fmode_t mask)
+static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
 {
 	struct files_struct *files = current->files;
 	struct file *file;
@@ -720,7 +720,7 @@ loop:
 		 */
 		if (file->f_mode & mask)
 			file = NULL;
-		else if (!get_file_rcu(file))
+		else if (!get_file_rcu_many(file, refs))
 			goto loop;
 	}
 	rcu_read_unlock();
@@ -728,15 +728,20 @@ loop:
 	return file;
 }
 
+struct file *fget_many(unsigned int fd, unsigned int refs)
+{
+	return __fget(fd, FMODE_PATH, refs);
+}
+
 struct file *fget(unsigned int fd)
 {
-	return __fget(fd, FMODE_PATH);
+	return __fget(fd, FMODE_PATH, 1);
 }
 EXPORT_SYMBOL(fget);
 
 struct file *fget_raw(unsigned int fd)
 {
-	return __fget(fd, 0);
+	return __fget(fd, 0, 1);
 }
 EXPORT_SYMBOL(fget_raw);
 
@@ -767,7 +772,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
 			return 0;
 		return (unsigned long)file;
 	} else {
-		file = __fget(fd, mask);
+		file = __fget(fd, mask, 1);
 		if (!file)
 			return 0;
 		return FDPUT_FPUT | (unsigned long)file;
diff --git a/fs/file_table.c b/fs/file_table.c
index 5679e7fcb6b0..155d7514a094 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -326,9 +326,9 @@ void flush_delayed_fput(void)
 
 static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
-void fput(struct file *file)
+void fput_many(struct file *file, unsigned int refs)
 {
-	if (atomic_long_dec_and_test(&file->f_count)) {
+	if (atomic_long_sub_and_test(refs, &file->f_count)) {
 		struct task_struct *task = current;
 
 		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
@@ -347,6 +347,11 @@ void fput(struct file *file)
 	}
 }
 
+void fput(struct file *file)
+{
+	fput_many(file, 1);
+}
+
 /*
  * synchronous analog of fput(); for kernel threads that might be needed
  * in some umount() (and thus can't use flush_delayed_fput() without
diff --git a/include/linux/file.h b/include/linux/file.h
index 6b2fb032416c..3fcddff56bc4 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -13,6 +13,7 @@
 struct file;
 
 extern void fput(struct file *);
+extern void fput_many(struct file *, unsigned int);
 
 struct file_operations;
 struct vfsmount;
@@ -44,6 +45,7 @@ static inline void fdput(struct fd fd)
 }
 
 extern struct file *fget(unsigned int fd);
+extern struct file *fget_many(unsigned int fd, unsigned int refs);
 extern struct file *fget_raw(unsigned int fd);
 extern unsigned long __fdget(unsigned int fd);
 extern unsigned long __fdget_raw(unsigned int fd);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 61aa210f0c2b..80e1b199a4b1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -952,7 +952,9 @@ static inline struct file *get_file(struct file *f)
 	atomic_long_inc(&f->f_count);
 	return f;
 }
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+#define get_file_rcu_many(x, cnt)	\
+	atomic_long_add_unless(&(x)->f_count, (cnt), 0)
+#define get_file_rcu(x) get_file_rcu_many((x), 1)
 #define fput_atomic(x)	atomic_long_add_unless(&(x)->f_count, -1, 1)
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
-- 
cgit v1.2.3


From edafccee56ff31678a091ddb7219aba9b28bc3cb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 9 Jan 2019 09:16:05 -0700
Subject: io_uring: add support for pre-mapped user IO buffers

If we have fixed user buffers, we can map them into the kernel when we
setup the io_uring. That avoids the need to do get_user_pages() for
each and every IO.

To utilize this feature, the application must call io_uring_register()
after having setup an io_uring instance, passing in
IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to
an iovec array, and the nr_args should contain how many iovecs the
application wishes to map.

If successful, these buffers are now mapped into the kernel, eligible
for IO. To use these fixed buffers, the application must use the
IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then
set sqe->index to the desired buffer index. sqe->addr..sqe->addr+seq->len
must point to somewhere inside the indexed buffer.

The application may register buffers throughout the lifetime of the
io_uring instance. It can call io_uring_register() with
IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of
buffers, and then register a new set. The application need not
unregister buffers explicitly before shutting down the io_uring
instance.

It's perfectly valid to setup a larger buffer, and then sometimes only
use parts of it for an IO. As long as the range is within the originally
mapped region, it will work just fine.

For now, buffers must not be file backed. If file backed buffers are
passed in, the registration will fail with -1/EOPNOTSUPP. This
restriction may be relaxed in the future.

RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat
arbitrary 1G per buffer size is also imposed.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/io_uring.c                          | 374 +++++++++++++++++++++++++++++++--
 include/linux/syscalls.h               |   2 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 include/uapi/linux/io_uring.h          |  13 +-
 kernel/sys_ni.c                        |   1 +
 7 files changed, 381 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 481c126259e9..2eefd2a7c1ce 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -400,3 +400,4 @@
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
 425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
 426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
+427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 6a32a430c8e0..65c026185e61 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
 334	common	rseq			__x64_sys_rseq
 425	common	io_uring_setup		__x64_sys_io_uring_setup
 426	common	io_uring_enter		__x64_sys_io_uring_enter
+427	common	io_uring_register	__x64_sys_io_uring_register
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 31f43ed894ba..c0c0f68568b5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/blkdev.h>
+#include <linux/bvec.h>
 #include <linux/net.h>
 #include <net/sock.h>
 #include <net/af_unix.h>
@@ -52,6 +53,8 @@
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
 #include <linux/nospec.h>
+#include <linux/sizes.h>
+#include <linux/hugetlb.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -81,6 +84,13 @@ struct io_cq_ring {
 	struct io_uring_cqe	cqes[];
 };
 
+struct io_mapped_ubuf {
+	u64		ubuf;
+	size_t		len;
+	struct		bio_vec *bvec;
+	unsigned int	nr_bvecs;
+};
+
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref	refs;
@@ -113,6 +123,10 @@ struct io_ring_ctx {
 		struct fasync_struct	*cq_fasync;
 	} ____cacheline_aligned_in_smp;
 
+	/* if used, fixed mapped user buffers */
+	unsigned		nr_user_bufs;
+	struct io_mapped_ubuf	*user_bufs;
+
 	struct user_struct	*user;
 
 	struct completion	ctx_done;
@@ -732,6 +746,46 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 	}
 }
 
+static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
+			   const struct io_uring_sqe *sqe,
+			   struct iov_iter *iter)
+{
+	size_t len = READ_ONCE(sqe->len);
+	struct io_mapped_ubuf *imu;
+	unsigned index, buf_index;
+	size_t offset;
+	u64 buf_addr;
+
+	/* attempt to use fixed buffers without having provided iovecs */
+	if (unlikely(!ctx->user_bufs))
+		return -EFAULT;
+
+	buf_index = READ_ONCE(sqe->buf_index);
+	if (unlikely(buf_index >= ctx->nr_user_bufs))
+		return -EFAULT;
+
+	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+	imu = &ctx->user_bufs[index];
+	buf_addr = READ_ONCE(sqe->addr);
+
+	/* overflow */
+	if (buf_addr + len < buf_addr)
+		return -EFAULT;
+	/* not inside the mapped region */
+	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+		return -EFAULT;
+
+	/*
+	 * May not be a start of buffer, set size appropriately
+	 * and advance us to the beginning.
+	 */
+	offset = buf_addr - imu->ubuf;
+	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+	if (offset)
+		iov_iter_advance(iter, offset);
+	return 0;
+}
+
 static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 			   const struct sqe_submit *s, struct iovec **iovec,
 			   struct iov_iter *iter)
@@ -739,6 +793,23 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	const struct io_uring_sqe *sqe = s->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	size_t sqe_len = READ_ONCE(sqe->len);
+	u8 opcode;
+
+	/*
+	 * We're reading ->opcode for the second time, but the first read
+	 * doesn't care whether it's _FIXED or not, so it doesn't matter
+	 * whether ->opcode changes concurrently. The first read does care
+	 * about whether it is a READ or a WRITE, so we don't trust this read
+	 * for that purpose and instead let the caller pass in the read/write
+	 * flag.
+	 */
+	opcode = READ_ONCE(sqe->opcode);
+	if (opcode == IORING_OP_READ_FIXED ||
+	    opcode == IORING_OP_WRITE_FIXED) {
+		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		*iovec = NULL;
+		return ret;
+	}
 
 	if (!s->has_user)
 		return -EFAULT;
@@ -886,7 +957,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	if (unlikely(sqe->addr || sqe->ioprio))
+	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
 	fd = READ_ONCE(sqe->fd);
@@ -945,9 +1016,19 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		ret = io_nop(req, req->user_data);
 		break;
 	case IORING_OP_READV:
+		if (unlikely(s->sqe->buf_index))
+			return -EINVAL;
 		ret = io_read(req, s, force_nonblock, state);
 		break;
 	case IORING_OP_WRITEV:
+		if (unlikely(s->sqe->buf_index))
+			return -EINVAL;
+		ret = io_write(req, s, force_nonblock, state);
+		break;
+	case IORING_OP_READ_FIXED:
+		ret = io_read(req, s, force_nonblock, state);
+		break;
+	case IORING_OP_WRITE_FIXED:
 		ret = io_write(req, s, force_nonblock, state);
 		break;
 	case IORING_OP_FSYNC:
@@ -976,28 +1057,46 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	return 0;
 }
 
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+{
+	u8 opcode = READ_ONCE(sqe->opcode);
+
+	return !(opcode == IORING_OP_READ_FIXED ||
+		 opcode == IORING_OP_WRITE_FIXED);
+}
+
 static void io_sq_wq_submit_work(struct work_struct *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct sqe_submit *s = &req->submit;
 	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
-	mm_segment_t old_fs = get_fs();
+	mm_segment_t old_fs;
+	bool needs_user;
 	int ret;
 
 	 /* Ensure we clear previously set forced non-block flag */
 	req->flags &= ~REQ_F_FORCE_NONBLOCK;
 	req->rw.ki_flags &= ~IOCB_NOWAIT;
 
-	if (!mmget_not_zero(ctx->sqo_mm)) {
-		ret = -EFAULT;
-		goto err;
-	}
-
-	use_mm(ctx->sqo_mm);
-	set_fs(USER_DS);
-	s->has_user = true;
 	s->needs_lock = true;
+	s->has_user = false;
+
+	/*
+	 * If we're doing IO to fixed buffers, we don't need to get/set
+	 * user context
+	 */
+	needs_user = io_sqe_needs_user(s->sqe);
+	if (needs_user) {
+		if (!mmget_not_zero(ctx->sqo_mm)) {
+			ret = -EFAULT;
+			goto err;
+		}
+		use_mm(ctx->sqo_mm);
+		old_fs = get_fs();
+		set_fs(USER_DS);
+		s->has_user = true;
+	}
 
 	do {
 		ret = __io_submit_sqe(ctx, req, s, false, NULL);
@@ -1011,9 +1110,11 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 		cond_resched();
 	} while (1);
 
-	set_fs(old_fs);
-	unuse_mm(ctx->sqo_mm);
-	mmput(ctx->sqo_mm);
+	if (needs_user) {
+		set_fs(old_fs);
+		unuse_mm(ctx->sqo_mm);
+		mmput(ctx->sqo_mm);
+	}
 err:
 	if (ret) {
 		io_cqring_add_event(ctx, sqe->user_data, ret, 0);
@@ -1317,6 +1418,198 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
 	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
 }
 
+static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+{
+	int i, j;
+
+	if (!ctx->user_bufs)
+		return -ENXIO;
+
+	for (i = 0; i < ctx->nr_user_bufs; i++) {
+		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+		for (j = 0; j < imu->nr_bvecs; j++)
+			put_page(imu->bvec[j].bv_page);
+
+		if (ctx->account_mem)
+			io_unaccount_mem(ctx->user, imu->nr_bvecs);
+		kfree(imu->bvec);
+		imu->nr_bvecs = 0;
+	}
+
+	kfree(ctx->user_bufs);
+	ctx->user_bufs = NULL;
+	ctx->nr_user_bufs = 0;
+	return 0;
+}
+
+static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
+		       void __user *arg, unsigned index)
+{
+	struct iovec __user *src;
+
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		struct compat_iovec __user *ciovs;
+		struct compat_iovec ciov;
+
+		ciovs = (struct compat_iovec __user *) arg;
+		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
+			return -EFAULT;
+
+		dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+		dst->iov_len = ciov.iov_len;
+		return 0;
+	}
+#endif
+	src = (struct iovec __user *) arg;
+	if (copy_from_user(dst, &src[index], sizeof(*dst)))
+		return -EFAULT;
+	return 0;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
+				  unsigned nr_args)
+{
+	struct vm_area_struct **vmas = NULL;
+	struct page **pages = NULL;
+	int i, j, got_pages = 0;
+	int ret = -EINVAL;
+
+	if (ctx->user_bufs)
+		return -EBUSY;
+	if (!nr_args || nr_args > UIO_MAXIOV)
+		return -EINVAL;
+
+	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
+					GFP_KERNEL);
+	if (!ctx->user_bufs)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_args; i++) {
+		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+		unsigned long off, start, end, ubuf;
+		int pret, nr_pages;
+		struct iovec iov;
+		size_t size;
+
+		ret = io_copy_iov(ctx, &iov, arg, i);
+		if (ret)
+			break;
+
+		/*
+		 * Don't impose further limits on the size and buffer
+		 * constraints here, we'll -EINVAL later when IO is
+		 * submitted if they are wrong.
+		 */
+		ret = -EFAULT;
+		if (!iov.iov_base || !iov.iov_len)
+			goto err;
+
+		/* arbitrary limit, but we need something */
+		if (iov.iov_len > SZ_1G)
+			goto err;
+
+		ubuf = (unsigned long) iov.iov_base;
+		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		start = ubuf >> PAGE_SHIFT;
+		nr_pages = end - start;
+
+		if (ctx->account_mem) {
+			ret = io_account_mem(ctx->user, nr_pages);
+			if (ret)
+				goto err;
+		}
+
+		ret = 0;
+		if (!pages || nr_pages > got_pages) {
+			kfree(vmas);
+			kfree(pages);
+			pages = kmalloc_array(nr_pages, sizeof(struct page *),
+						GFP_KERNEL);
+			vmas = kmalloc_array(nr_pages,
+					sizeof(struct vm_area_struct *),
+					GFP_KERNEL);
+			if (!pages || !vmas) {
+				ret = -ENOMEM;
+				if (ctx->account_mem)
+					io_unaccount_mem(ctx->user, nr_pages);
+				goto err;
+			}
+			got_pages = nr_pages;
+		}
+
+		imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+						GFP_KERNEL);
+		ret = -ENOMEM;
+		if (!imu->bvec) {
+			if (ctx->account_mem)
+				io_unaccount_mem(ctx->user, nr_pages);
+			goto err;
+		}
+
+		ret = 0;
+		down_read(&current->mm->mmap_sem);
+		pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
+						pages, vmas);
+		if (pret == nr_pages) {
+			/* don't support file backed memory */
+			for (j = 0; j < nr_pages; j++) {
+				struct vm_area_struct *vma = vmas[j];
+
+				if (vma->vm_file &&
+				    !is_file_hugepages(vma->vm_file)) {
+					ret = -EOPNOTSUPP;
+					break;
+				}
+			}
+		} else {
+			ret = pret < 0 ? pret : -EFAULT;
+		}
+		up_read(&current->mm->mmap_sem);
+		if (ret) {
+			/*
+			 * if we did partial map, or found file backed vmas,
+			 * release any pages we did get
+			 */
+			if (pret > 0) {
+				for (j = 0; j < pret; j++)
+					put_page(pages[j]);
+			}
+			if (ctx->account_mem)
+				io_unaccount_mem(ctx->user, nr_pages);
+			goto err;
+		}
+
+		off = ubuf & ~PAGE_MASK;
+		size = iov.iov_len;
+		for (j = 0; j < nr_pages; j++) {
+			size_t vec_len;
+
+			vec_len = min_t(size_t, size, PAGE_SIZE - off);
+			imu->bvec[j].bv_page = pages[j];
+			imu->bvec[j].bv_len = vec_len;
+			imu->bvec[j].bv_offset = off;
+			off = 0;
+			size -= vec_len;
+		}
+		/* store original address for later verification */
+		imu->ubuf = ubuf;
+		imu->len = iov.iov_len;
+		imu->nr_bvecs = nr_pages;
+
+		ctx->nr_user_bufs++;
+	}
+	kfree(pages);
+	kfree(vmas);
+	return 0;
+err:
+	kfree(pages);
+	kfree(vmas);
+	io_sqe_buffer_unregister(ctx);
+	return ret;
+}
+
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_wq)
@@ -1325,6 +1618,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		mmdrop(ctx->sqo_mm);
 
 	io_iopoll_reap_events(ctx);
+	io_sqe_buffer_unregister(ctx);
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock)
@@ -1689,6 +1983,60 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
 	return io_uring_setup(entries, params);
 }
 
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+			       void __user *arg, unsigned nr_args)
+{
+	int ret;
+
+	percpu_ref_kill(&ctx->refs);
+	wait_for_completion(&ctx->ctx_done);
+
+	switch (opcode) {
+	case IORING_REGISTER_BUFFERS:
+		ret = io_sqe_buffer_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_BUFFERS:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_sqe_buffer_unregister(ctx);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	/* bring the ctx back to life */
+	reinit_completion(&ctx->ctx_done);
+	percpu_ref_reinit(&ctx->refs);
+	return ret;
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+		void __user *, arg, unsigned int, nr_args)
+{
+	struct io_ring_ctx *ctx;
+	long ret = -EBADF;
+	struct fd f;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EOPNOTSUPP;
+	if (f.file->f_op != &io_uring_fops)
+		goto out_fput;
+
+	ctx = f.file->private_data;
+
+	mutex_lock(&ctx->uring_lock);
+	ret = __io_uring_register(ctx, opcode, arg, nr_args);
+	mutex_unlock(&ctx->uring_lock);
+out_fput:
+	fdput(f);
+	return ret;
+}
+
 static int __init io_uring_init(void)
 {
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3072dbaa7869..3681c05ac538 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -315,6 +315,8 @@ asmlinkage long sys_io_uring_setup(u32 entries,
 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
 				u32 min_complete, u32 flags,
 				const sigset_t __user *sig, size_t sigsz);
+asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
+				void __user *arg, unsigned int nr_args);
 
 /* fs/xattr.c */
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 87871e7b7ea7..d346229a1eb0 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -744,9 +744,11 @@ __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
 __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
 #define __NR_io_uring_enter 426
 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
+#define __NR_io_uring_register 427
+__SYSCALL(__NR_io_uring_register, sys_io_uring_register)
 
 #undef __NR_syscalls
-#define __NR_syscalls 427
+#define __NR_syscalls 428
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5c457ea396e6..cf28f7a11f12 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -27,7 +27,10 @@ struct io_uring_sqe {
 		__u32		fsync_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
-	__u64	__pad2[3];
+	union {
+		__u16	buf_index;	/* index into fixed buffers, if used */
+		__u64	__pad2[3];
+	};
 };
 
 /*
@@ -39,6 +42,8 @@ struct io_uring_sqe {
 #define IORING_OP_READV		1
 #define IORING_OP_WRITEV	2
 #define IORING_OP_FSYNC		3
+#define IORING_OP_READ_FIXED	4
+#define IORING_OP_WRITE_FIXED	5
 
 /*
  * sqe->fsync_flags
@@ -103,4 +108,10 @@ struct io_uring_params {
 	struct io_cqring_offsets cq_off;
 };
 
+/*
+ * io_uring_register(2) opcodes and arguments
+ */
+#define IORING_REGISTER_BUFFERS		0
+#define IORING_UNREGISTER_BUFFERS	1
+
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ee5e523564bb..1bb6604dc19f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,6 +48,7 @@ COND_SYSCALL_COMPAT(io_getevents);
 COND_SYSCALL_COMPAT(io_pgetevents);
 COND_SYSCALL(io_uring_setup);
 COND_SYSCALL(io_uring_enter);
+COND_SYSCALL(io_uring_register);
 
 /* fs/xattr.c */
 
-- 
cgit v1.2.3


From f4e65870e5cede5ca1ec0006b6c9803994e5f7b8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 8 Feb 2019 09:01:44 -0700
Subject: net: split out functions related to registering inflight socket files

We need this functionality for the io_uring file registration, but
we cannot rely on it since CONFIG_UNIX can be modular. Move the helpers
to a separate file, that's always builtin to the kernel if CONFIG_UNIX is
m/y.

No functional changes in this patch, just moving code around.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/net/af_unix.h |   1 +
 net/Makefile          |   2 +-
 net/unix/Kconfig      |   5 ++
 net/unix/Makefile     |   2 +
 net/unix/af_unix.c    |  63 +--------------------
 net/unix/garbage.c    |  71 +-----------------------
 net/unix/scm.c        | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/unix/scm.h        |  10 ++++
 8 files changed, 174 insertions(+), 131 deletions(-)
 create mode 100644 net/unix/scm.c
 create mode 100644 net/unix/scm.h

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index ddbba838d048..3426d6dacc45 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -10,6 +10,7 @@
 
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
+void unix_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
 void wait_for_unix_gc(void);
 struct sock *unix_get_socket(struct file *filp);
diff --git a/net/Makefile b/net/Makefile
index bdaf53925acd..449fc0b221f8 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_NETFILTER)		+= netfilter/
 obj-$(CONFIG_INET)		+= ipv4/
 obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
-obj-$(CONFIG_UNIX)		+= unix/
+obj-$(CONFIG_UNIX_SCM)		+= unix/
 obj-$(CONFIG_NET)		+= ipv6/
 obj-$(CONFIG_BPFILTER)		+= bpfilter/
 obj-$(CONFIG_PACKET)		+= packet/
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
index 8b31ab85d050..3b9e450656a4 100644
--- a/net/unix/Kconfig
+++ b/net/unix/Kconfig
@@ -19,6 +19,11 @@ config UNIX
 
 	  Say Y unless you know what you are doing.
 
+config UNIX_SCM
+	bool
+	depends on UNIX
+	default y
+
 config UNIX_DIAG
 	tristate "UNIX: socket monitoring interface"
 	depends on UNIX
diff --git a/net/unix/Makefile b/net/unix/Makefile
index ffd0a275c3a7..54e58cc4f945 100644
--- a/net/unix/Makefile
+++ b/net/unix/Makefile
@@ -10,3 +10,5 @@ unix-$(CONFIG_SYSCTL)	+= sysctl_net_unix.o
 
 obj-$(CONFIG_UNIX_DIAG)	+= unix_diag.o
 unix_diag-y		:= diag.o
+
+obj-$(CONFIG_UNIX_SCM)	+= scm.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 74d1eed7cbd4..2ce32dbb2feb 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -119,6 +119,8 @@
 #include <linux/freezer.h>
 #include <linux/file.h>
 
+#include "scm.h"
+
 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_socket_table);
 DEFINE_SPINLOCK(unix_table_lock);
@@ -1486,67 +1488,6 @@ out:
 	return err;
 }
 
-static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-{
-	int i;
-
-	scm->fp = UNIXCB(skb).fp;
-	UNIXCB(skb).fp = NULL;
-
-	for (i = scm->fp->count-1; i >= 0; i--)
-		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
-}
-
-static void unix_destruct_scm(struct sk_buff *skb)
-{
-	struct scm_cookie scm;
-	memset(&scm, 0, sizeof(scm));
-	scm.pid  = UNIXCB(skb).pid;
-	if (UNIXCB(skb).fp)
-		unix_detach_fds(&scm, skb);
-
-	/* Alas, it calls VFS */
-	/* So fscking what? fput() had been SMP-safe since the last Summer */
-	scm_destroy(&scm);
-	sock_wfree(skb);
-}
-
-/*
- * The "user->unix_inflight" variable is protected by the garbage
- * collection lock, and we just read it locklessly here. If you go
- * over the limit, there might be a tiny race in actually noticing
- * it across threads. Tough.
- */
-static inline bool too_many_unix_fds(struct task_struct *p)
-{
-	struct user_struct *user = current_user();
-
-	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
-		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
-	return false;
-}
-
-static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-{
-	int i;
-
-	if (too_many_unix_fds(current))
-		return -ETOOMANYREFS;
-
-	/*
-	 * Need to duplicate file references for the sake of garbage
-	 * collection.  Otherwise a socket in the fps might become a
-	 * candidate for GC while the skb is not yet queued.
-	 */
-	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
-	if (!UNIXCB(skb).fp)
-		return -ENOMEM;
-
-	for (i = scm->fp->count - 1; i >= 0; i--)
-		unix_inflight(scm->fp->user, scm->fp->fp[i]);
-	return 0;
-}
-
 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
 {
 	int err = 0;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index f81854d74c7d..8bbe1b8e4ff7 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -86,80 +86,13 @@
 #include <net/scm.h>
 #include <net/tcp_states.h>
 
+#include "scm.h"
+
 /* Internal data structures and random procedures: */
 
-static LIST_HEAD(gc_inflight_list);
 static LIST_HEAD(gc_candidates);
-static DEFINE_SPINLOCK(unix_gc_lock);
 static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
 
-unsigned int unix_tot_inflight;
-
-struct sock *unix_get_socket(struct file *filp)
-{
-	struct sock *u_sock = NULL;
-	struct inode *inode = file_inode(filp);
-
-	/* Socket ? */
-	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
-		struct socket *sock = SOCKET_I(inode);
-		struct sock *s = sock->sk;
-
-		/* PF_UNIX ? */
-		if (s && sock->ops && sock->ops->family == PF_UNIX)
-			u_sock = s;
-	} else {
-		/* Could be an io_uring instance */
-		u_sock = io_uring_get_socket(filp);
-	}
-	return u_sock;
-}
-
-/* Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
- */
-
-void unix_inflight(struct user_struct *user, struct file *fp)
-{
-	struct sock *s = unix_get_socket(fp);
-
-	spin_lock(&unix_gc_lock);
-
-	if (s) {
-		struct unix_sock *u = unix_sk(s);
-
-		if (atomic_long_inc_return(&u->inflight) == 1) {
-			BUG_ON(!list_empty(&u->link));
-			list_add_tail(&u->link, &gc_inflight_list);
-		} else {
-			BUG_ON(list_empty(&u->link));
-		}
-		unix_tot_inflight++;
-	}
-	user->unix_inflight++;
-	spin_unlock(&unix_gc_lock);
-}
-
-void unix_notinflight(struct user_struct *user, struct file *fp)
-{
-	struct sock *s = unix_get_socket(fp);
-
-	spin_lock(&unix_gc_lock);
-
-	if (s) {
-		struct unix_sock *u = unix_sk(s);
-
-		BUG_ON(!atomic_long_read(&u->inflight));
-		BUG_ON(list_empty(&u->link));
-
-		if (atomic_long_dec_and_test(&u->inflight))
-			list_del_init(&u->link);
-		unix_tot_inflight--;
-	}
-	user->unix_inflight--;
-	spin_unlock(&unix_gc_lock);
-}
-
 static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 			  struct sk_buff_head *hitlist)
 {
diff --git a/net/unix/scm.c b/net/unix/scm.c
new file mode 100644
index 000000000000..8c40f2b32392
--- /dev/null
+++ b/net/unix/scm.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/fs.h>
+#include <net/af_unix.h>
+#include <net/scm.h>
+#include <linux/init.h>
+
+#include "scm.h"
+
+unsigned int unix_tot_inflight;
+EXPORT_SYMBOL(unix_tot_inflight);
+
+LIST_HEAD(gc_inflight_list);
+EXPORT_SYMBOL(gc_inflight_list);
+
+DEFINE_SPINLOCK(unix_gc_lock);
+EXPORT_SYMBOL(unix_gc_lock);
+
+struct sock *unix_get_socket(struct file *filp)
+{
+	struct sock *u_sock = NULL;
+	struct inode *inode = file_inode(filp);
+
+	/* Socket ? */
+	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
+		struct socket *sock = SOCKET_I(inode);
+		struct sock *s = sock->sk;
+
+		/* PF_UNIX ? */
+		if (s && sock->ops && sock->ops->family == PF_UNIX)
+			u_sock = s;
+	} else {
+		/* Could be an io_uring instance */
+		u_sock = io_uring_get_socket(filp);
+	}
+	return u_sock;
+}
+EXPORT_SYMBOL(unix_get_socket);
+
+/* Keep the number of times in flight count for the file
+ * descriptor if it is for an AF_UNIX socket.
+ */
+void unix_inflight(struct user_struct *user, struct file *fp)
+{
+	struct sock *s = unix_get_socket(fp);
+
+	spin_lock(&unix_gc_lock);
+
+	if (s) {
+		struct unix_sock *u = unix_sk(s);
+
+		if (atomic_long_inc_return(&u->inflight) == 1) {
+			BUG_ON(!list_empty(&u->link));
+			list_add_tail(&u->link, &gc_inflight_list);
+		} else {
+			BUG_ON(list_empty(&u->link));
+		}
+		unix_tot_inflight++;
+	}
+	user->unix_inflight++;
+	spin_unlock(&unix_gc_lock);
+}
+
+void unix_notinflight(struct user_struct *user, struct file *fp)
+{
+	struct sock *s = unix_get_socket(fp);
+
+	spin_lock(&unix_gc_lock);
+
+	if (s) {
+		struct unix_sock *u = unix_sk(s);
+
+		BUG_ON(!atomic_long_read(&u->inflight));
+		BUG_ON(list_empty(&u->link));
+
+		if (atomic_long_dec_and_test(&u->inflight))
+			list_del_init(&u->link);
+		unix_tot_inflight--;
+	}
+	user->unix_inflight--;
+	spin_unlock(&unix_gc_lock);
+}
+
+/*
+ * The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+	struct user_struct *user = current_user();
+
+	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+	return false;
+}
+
+int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	int i;
+
+	if (too_many_unix_fds(current))
+		return -ETOOMANYREFS;
+
+	/*
+	 * Need to duplicate file references for the sake of garbage
+	 * collection.  Otherwise a socket in the fps might become a
+	 * candidate for GC while the skb is not yet queued.
+	 */
+	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+	if (!UNIXCB(skb).fp)
+		return -ENOMEM;
+
+	for (i = scm->fp->count - 1; i >= 0; i--)
+		unix_inflight(scm->fp->user, scm->fp->fp[i]);
+	return 0;
+}
+EXPORT_SYMBOL(unix_attach_fds);
+
+void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	int i;
+
+	scm->fp = UNIXCB(skb).fp;
+	UNIXCB(skb).fp = NULL;
+
+	for (i = scm->fp->count-1; i >= 0; i--)
+		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+}
+EXPORT_SYMBOL(unix_detach_fds);
+
+void unix_destruct_scm(struct sk_buff *skb)
+{
+	struct scm_cookie scm;
+
+	memset(&scm, 0, sizeof(scm));
+	scm.pid  = UNIXCB(skb).pid;
+	if (UNIXCB(skb).fp)
+		unix_detach_fds(&scm, skb);
+
+	/* Alas, it calls VFS */
+	/* So fscking what? fput() had been SMP-safe since the last Summer */
+	scm_destroy(&scm);
+	sock_wfree(skb);
+}
+EXPORT_SYMBOL(unix_destruct_scm);
diff --git a/net/unix/scm.h b/net/unix/scm.h
new file mode 100644
index 000000000000..5a255a477f16
--- /dev/null
+++ b/net/unix/scm.h
@@ -0,0 +1,10 @@
+#ifndef NET_UNIX_SCM_H
+#define NET_UNIX_SCM_H
+
+extern struct list_head gc_inflight_list;
+extern spinlock_t unix_gc_lock;
+
+int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
+void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);
+
+#endif
-- 
cgit v1.2.3


From 6b06314c47e141031be043539900d80d2c7ba10f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 10 Jan 2019 22:13:58 -0700
Subject: io_uring: add file set registration

We normally have to fget/fput for each IO we do on a file. Even with
the batching we do, the cost of the atomic inc/dec of the file usage
count adds up.

This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes
for the io_uring_register(2) system call. The arguments passed in must
be an array of __s32 holding file descriptors, and nr_args should hold
the number of file descriptors the application wishes to pin for the
duration of the io_uring instance (or until IORING_UNREGISTER_FILES is
called).

When used, the application must set IOSQE_FIXED_FILE in the sqe->flags
member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd
to the index in the array passed in to IORING_REGISTER_FILES.

Files are automatically unregistered when the io_uring instance is torn
down. An application need only unregister if it wishes to register a new
set of fds.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 311 +++++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/io_uring.h |   9 +-
 2 files changed, 288 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c0c0f68568b5..5eaa1c4a3f7c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -49,6 +49,7 @@
 #include <linux/net.h>
 #include <net/sock.h>
 #include <net/af_unix.h>
+#include <net/scm.h>
 #include <linux/anon_inodes.h>
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
@@ -61,6 +62,7 @@
 #include "internal.h"
 
 #define IORING_MAX_ENTRIES	4096
+#define IORING_MAX_FIXED_FILES	1024
 
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
@@ -123,6 +125,14 @@ struct io_ring_ctx {
 		struct fasync_struct	*cq_fasync;
 	} ____cacheline_aligned_in_smp;
 
+	/*
+	 * If used, fixed file set. Writers must ensure that ->refs is dead,
+	 * readers must ensure that ->refs is alive as long as the file* is
+	 * used. Only updated through io_uring_register(2).
+	 */
+	struct file		**user_files;
+	unsigned		nr_user_files;
+
 	/* if used, fixed mapped user buffers */
 	unsigned		nr_user_bufs;
 	struct io_mapped_ubuf	*user_bufs;
@@ -170,6 +180,7 @@ struct io_kiocb {
 	unsigned int		flags;
 #define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
+#define REQ_F_FIXED_FILE	4	/* ctx owns file */
 	u64			user_data;
 	u64			error;
 
@@ -404,15 +415,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		 * Batched puts of the same file, to avoid dirtying the
 		 * file usage count multiple times, if avoidable.
 		 */
-		if (!file) {
-			file = req->rw.ki_filp;
-			file_count = 1;
-		} else if (file == req->rw.ki_filp) {
-			file_count++;
-		} else {
-			fput_many(file, file_count);
-			file = req->rw.ki_filp;
-			file_count = 1;
+		if (!(req->flags & REQ_F_FIXED_FILE)) {
+			if (!file) {
+				file = req->rw.ki_filp;
+				file_count = 1;
+			} else if (file == req->rw.ki_filp) {
+				file_count++;
+			} else {
+				fput_many(file, file_count);
+				file = req->rw.ki_filp;
+				file_count = 1;
+			}
 		}
 
 		if (to_free == ARRAY_SIZE(reqs))
@@ -544,13 +557,19 @@ static void kiocb_end_write(struct kiocb *kiocb)
 	}
 }
 
+static void io_fput(struct io_kiocb *req)
+{
+	if (!(req->flags & REQ_F_FIXED_FILE))
+		fput(req->rw.ki_filp);
+}
+
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
 	kiocb_end_write(kiocb);
 
-	fput(kiocb->ki_filp);
+	io_fput(req);
 	io_cqring_add_event(req->ctx, req->user_data, res, 0);
 	io_free_req(req);
 }
@@ -666,19 +685,29 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
-	unsigned ioprio;
+	unsigned ioprio, flags;
 	int fd, ret;
 
 	/* For -EAGAIN retry, everything is already prepped */
 	if (kiocb->ki_filp)
 		return 0;
 
+	flags = READ_ONCE(sqe->flags);
 	fd = READ_ONCE(sqe->fd);
-	kiocb->ki_filp = io_file_get(state, fd);
-	if (unlikely(!kiocb->ki_filp))
-		return -EBADF;
-	if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
-		force_nonblock = false;
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files ||
+		    (unsigned) fd >= ctx->nr_user_files))
+			return -EBADF;
+		kiocb->ki_filp = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		kiocb->ki_filp = io_file_get(state, fd);
+		if (unlikely(!kiocb->ki_filp))
+			return -EBADF;
+		if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
+			force_nonblock = false;
+	}
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -718,10 +747,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	return 0;
 out_fput:
-	/* in case of error, we didn't use this file reference. drop it. */
-	if (state)
-		state->used_refs--;
-	io_file_put(state, kiocb->ki_filp);
+	if (!(flags & IOSQE_FIXED_FILE)) {
+		/*
+		 * in case of error, we didn't use this file reference. drop it.
+		 */
+		if (state)
+			state->used_refs--;
+		io_file_put(state, kiocb->ki_filp);
+	}
 	return ret;
 }
 
@@ -863,7 +896,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 out_fput:
 	/* Hold on to the file for -EAGAIN */
 	if (unlikely(ret && ret != -EAGAIN))
-		fput(file);
+		io_fput(req);
 	return ret;
 }
 
@@ -917,7 +950,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	kfree(iovec);
 out_fput:
 	if (unlikely(ret))
-		fput(file);
+		io_fput(req);
 	return ret;
 }
 
@@ -940,7 +973,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	 */
 	if (req->rw.ki_filp) {
 		err = -EBADF;
-		fput(req->rw.ki_filp);
+		io_fput(req);
 	}
 	io_cqring_add_event(ctx, user_data, err, 0);
 	io_free_req(req);
@@ -949,21 +982,32 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 
 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
+	struct io_ring_ctx *ctx = req->ctx;
+	unsigned flags;
 	int fd;
 
 	/* Prep already done */
 	if (req->rw.ki_filp)
 		return 0;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
 	fd = READ_ONCE(sqe->fd);
-	req->rw.ki_filp = fget(fd);
-	if (unlikely(!req->rw.ki_filp))
-		return -EBADF;
+	flags = READ_ONCE(sqe->flags);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+			return -EBADF;
+		req->rw.ki_filp = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		req->rw.ki_filp = fget(fd);
+		if (unlikely(!req->rw.ki_filp))
+			return -EBADF;
+	}
 
 	return 0;
 }
@@ -993,7 +1037,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
-	fput(req->rw.ki_filp);
+	io_fput(req);
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
 	io_free_req(req);
 	return 0;
@@ -1132,7 +1176,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags))
+	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
 		return -EINVAL;
 
 	req = io_get_req(ctx, state);
@@ -1344,6 +1388,201 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
 }
 
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+#if defined(CONFIG_UNIX)
+	if (ctx->ring_sock) {
+		struct sock *sock = ctx->ring_sock->sk;
+		struct sk_buff *skb;
+
+		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+			kfree_skb(skb);
+	}
+#else
+	int i;
+
+	for (i = 0; i < ctx->nr_user_files; i++)
+		fput(ctx->user_files[i]);
+#endif
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+	if (!ctx->user_files)
+		return -ENXIO;
+
+	__io_sqe_files_unregister(ctx);
+	kfree(ctx->user_files);
+	ctx->user_files = NULL;
+	ctx->nr_user_files = 0;
+	return 0;
+}
+
+static void io_finish_async(struct io_ring_ctx *ctx)
+{
+	if (ctx->sqo_wq) {
+		destroy_workqueue(ctx->sqo_wq);
+		ctx->sqo_wq = NULL;
+	}
+}
+
+#if defined(CONFIG_UNIX)
+static void io_destruct_skb(struct sk_buff *skb)
+{
+	struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+
+	io_finish_async(ctx);
+	unix_destruct_scm(skb);
+}
+
+/*
+ * Ensure the UNIX gc is aware of our file set, so we are certain that
+ * the io_uring can be safely unregistered on process exit, even if we have
+ * loops in the file referencing.
+ */
+static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
+{
+	struct sock *sk = ctx->ring_sock->sk;
+	struct scm_fp_list *fpl;
+	struct sk_buff *skb;
+	int i;
+
+	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+		unsigned long inflight = ctx->user->unix_inflight + nr;
+
+		if (inflight > task_rlimit(current, RLIMIT_NOFILE))
+			return -EMFILE;
+	}
+
+	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+	if (!fpl)
+		return -ENOMEM;
+
+	skb = alloc_skb(0, GFP_KERNEL);
+	if (!skb) {
+		kfree(fpl);
+		return -ENOMEM;
+	}
+
+	skb->sk = sk;
+	skb->destructor = io_destruct_skb;
+
+	fpl->user = get_uid(ctx->user);
+	for (i = 0; i < nr; i++) {
+		fpl->fp[i] = get_file(ctx->user_files[i + offset]);
+		unix_inflight(fpl->user, fpl->fp[i]);
+	}
+
+	fpl->max = fpl->count = nr;
+	UNIXCB(skb).fp = fpl;
+	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+	skb_queue_head(&sk->sk_receive_queue, skb);
+
+	for (i = 0; i < nr; i++)
+		fput(fpl->fp[i]);
+
+	return 0;
+}
+
+/*
+ * If UNIX sockets are enabled, fd passing can cause a reference cycle which
+ * causes regular reference counting to break down. We rely on the UNIX
+ * garbage collection to take care of this problem for us.
+ */
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+	unsigned left, total;
+	int ret = 0;
+
+	total = 0;
+	left = ctx->nr_user_files;
+	while (left) {
+		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
+		int ret;
+
+		ret = __io_sqe_files_scm(ctx, this_files, total);
+		if (ret)
+			break;
+		left -= this_files;
+		total += this_files;
+	}
+
+	if (!ret)
+		return 0;
+
+	while (total < ctx->nr_user_files) {
+		fput(ctx->user_files[total]);
+		total++;
+	}
+
+	return ret;
+}
+#else
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+	return 0;
+}
+#endif
+
+static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+				 unsigned nr_args)
+{
+	__s32 __user *fds = (__s32 __user *) arg;
+	int fd, ret = 0;
+	unsigned i;
+
+	if (ctx->user_files)
+		return -EBUSY;
+	if (!nr_args)
+		return -EINVAL;
+	if (nr_args > IORING_MAX_FIXED_FILES)
+		return -EMFILE;
+
+	ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
+	if (!ctx->user_files)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_args; i++) {
+		ret = -EFAULT;
+		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
+			break;
+
+		ctx->user_files[i] = fget(fd);
+
+		ret = -EBADF;
+		if (!ctx->user_files[i])
+			break;
+		/*
+		 * Don't allow io_uring instances to be registered. If UNIX
+		 * isn't enabled, then this causes a reference cycle and this
+		 * instance can never get freed. If UNIX is enabled we'll
+		 * handle it just fine, but there's still no point in allowing
+		 * a ring fd as it doesn't support regular read/write anyway.
+		 */
+		if (ctx->user_files[i]->f_op == &io_uring_fops) {
+			fput(ctx->user_files[i]);
+			break;
+		}
+		ctx->nr_user_files++;
+		ret = 0;
+	}
+
+	if (ret) {
+		for (i = 0; i < ctx->nr_user_files; i++)
+			fput(ctx->user_files[i]);
+
+		kfree(ctx->user_files);
+		ctx->nr_user_files = 0;
+		return ret;
+	}
+
+	ret = io_sqe_files_scm(ctx);
+	if (ret)
+		io_sqe_files_unregister(ctx);
+
+	return ret;
+}
+
 static int io_sq_offload_start(struct io_ring_ctx *ctx)
 {
 	int ret;
@@ -1612,13 +1851,13 @@ err:
 
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
-	if (ctx->sqo_wq)
-		destroy_workqueue(ctx->sqo_wq);
+	io_finish_async(ctx);
 	if (ctx->sqo_mm)
 		mmdrop(ctx->sqo_mm);
 
 	io_iopoll_reap_events(ctx);
 	io_sqe_buffer_unregister(ctx);
+	io_sqe_files_unregister(ctx);
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock)
@@ -1858,6 +2097,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
 
 #if defined(CONFIG_UNIX)
 	ctx->ring_sock->file = file;
+	ctx->ring_sock->sk->sk_user_data = ctx;
 #endif
 	fd_install(ret, file);
 	return ret;
@@ -2001,6 +2241,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_sqe_buffer_unregister(ctx);
 		break;
+	case IORING_REGISTER_FILES:
+		ret = io_sqe_files_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_FILES:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_sqe_files_unregister(ctx);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cf28f7a11f12..6257478d55e9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -16,7 +16,7 @@
  */
 struct io_uring_sqe {
 	__u8	opcode;		/* type of operation for this sqe */
-	__u8	flags;		/* as of now unused */
+	__u8	flags;		/* IOSQE_ flags */
 	__u16	ioprio;		/* ioprio for the request */
 	__s32	fd;		/* file descriptor to do IO on */
 	__u64	off;		/* offset into file */
@@ -33,6 +33,11 @@ struct io_uring_sqe {
 	};
 };
 
+/*
+ * sqe->flags
+ */
+#define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
+
 /*
  * io_uring_setup() flags
  */
@@ -113,5 +118,7 @@ struct io_uring_params {
  */
 #define IORING_REGISTER_BUFFERS		0
 #define IORING_UNREGISTER_BUFFERS	1
+#define IORING_REGISTER_FILES		2
+#define IORING_UNREGISTER_FILES		3
 
 #endif
-- 
cgit v1.2.3


From 6c271ce2f1d572f7fa225700a13cfe7ced492434 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 10 Jan 2019 11:22:30 -0700
Subject: io_uring: add submission polling

This enables an application to do IO, without ever entering the kernel.
By using the SQ ring to fill in new sqes and watching for completions
on the CQ ring, we can submit and reap IOs without doing a single system
call. The kernel side thread will poll for new submissions, and in case
of HIPRI/polled IO, it'll also poll for completions.

By default, we allow 1 second of active spinning. This can by changed
by passing in a different grace period at io_uring_register(2) time.
If the thread exceeds this idle time without having any work to do, it
will set:

sq_ring->flags |= IORING_SQ_NEED_WAKEUP.

The application will have to call io_uring_enter() to start things back
up again. If IO is kept busy, that will never be needed. Basically an
application that has this feature enabled will guard it's
io_uring_enter(2) call with:

read_barrier();
if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
	io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);

instead of calling it unconditionally.

It's mandatory to use fixed files with this feature. Failure to do so
will result in the application getting an -EBADF CQ entry when
submitting IO.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 249 ++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/io_uring.h |  12 +-
 2 files changed, 253 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5eaa1c4a3f7c..0a4caedf82c1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -44,6 +44,7 @@
 #include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/blkdev.h>
 #include <linux/bvec.h>
 #include <linux/net.h>
@@ -108,12 +109,16 @@ struct io_ring_ctx {
 		unsigned		cached_sq_head;
 		unsigned		sq_entries;
 		unsigned		sq_mask;
+		unsigned		sq_thread_idle;
 		struct io_uring_sqe	*sq_sqes;
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
 	struct workqueue_struct	*sqo_wq;
+	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
+	wait_queue_head_t	sqo_wait;
+	unsigned		sqo_stop;
 
 	struct {
 		/* CQ ring */
@@ -168,6 +173,7 @@ struct sqe_submit {
 	unsigned short			index;
 	bool				has_user;
 	bool				needs_lock;
+	bool				needs_fixed_file;
 };
 
 struct io_kiocb {
@@ -327,6 +333,8 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -680,9 +688,10 @@ static bool io_file_supports_async(struct file *file)
 	return false;
 }
 
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 		      bool force_nonblock, struct io_submit_state *state)
 {
+	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
 	unsigned ioprio, flags;
@@ -702,6 +711,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		kiocb->ki_filp = ctx->user_files[fd];
 		req->flags |= REQ_F_FIXED_FILE;
 	} else {
+		if (s->needs_fixed_file)
+			return -EBADF;
 		kiocb->ki_filp = io_file_get(state, fd);
 		if (unlikely(!kiocb->ki_filp))
 			return -EBADF;
@@ -865,7 +876,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	struct file *file;
 	ssize_t ret;
 
-	ret = io_prep_rw(req, s->sqe, force_nonblock, state);
+	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 	file = kiocb->ki_filp;
@@ -909,7 +920,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	struct file *file;
 	ssize_t ret;
 
-	ret = io_prep_rw(req, s->sqe, force_nonblock, state);
+	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 	/* Hold on to the file for -EAGAIN */
@@ -1301,6 +1312,169 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 	return false;
 }
 
+static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
+			  unsigned int nr, bool has_user, bool mm_fault)
+{
+	struct io_submit_state state, *statep = NULL;
+	int ret, i, submitted = 0;
+
+	if (nr > IO_PLUG_THRESHOLD) {
+		io_submit_state_start(&state, ctx, nr);
+		statep = &state;
+	}
+
+	for (i = 0; i < nr; i++) {
+		if (unlikely(mm_fault)) {
+			ret = -EFAULT;
+		} else {
+			sqes[i].has_user = has_user;
+			sqes[i].needs_lock = true;
+			sqes[i].needs_fixed_file = true;
+			ret = io_submit_sqe(ctx, &sqes[i], statep);
+		}
+		if (!ret) {
+			submitted++;
+			continue;
+		}
+
+		io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
+	}
+
+	if (statep)
+		io_submit_state_end(&state);
+
+	return submitted;
+}
+
+static int io_sq_thread(void *data)
+{
+	struct sqe_submit sqes[IO_IOPOLL_BATCH];
+	struct io_ring_ctx *ctx = data;
+	struct mm_struct *cur_mm = NULL;
+	mm_segment_t old_fs;
+	DEFINE_WAIT(wait);
+	unsigned inflight;
+	unsigned long timeout;
+
+	old_fs = get_fs();
+	set_fs(USER_DS);
+
+	timeout = inflight = 0;
+	while (!kthread_should_stop() && !ctx->sqo_stop) {
+		bool all_fixed, mm_fault = false;
+		int i;
+
+		if (inflight) {
+			unsigned nr_events = 0;
+
+			if (ctx->flags & IORING_SETUP_IOPOLL) {
+				/*
+				 * We disallow the app entering submit/complete
+				 * with polling, but we still need to lock the
+				 * ring to prevent racing with polled issue
+				 * that got punted to a workqueue.
+				 */
+				mutex_lock(&ctx->uring_lock);
+				io_iopoll_check(ctx, &nr_events, 0);
+				mutex_unlock(&ctx->uring_lock);
+			} else {
+				/*
+				 * Normal IO, just pretend everything completed.
+				 * We don't have to poll completions for that.
+				 */
+				nr_events = inflight;
+			}
+
+			inflight -= nr_events;
+			if (!inflight)
+				timeout = jiffies + ctx->sq_thread_idle;
+		}
+
+		if (!io_get_sqring(ctx, &sqes[0])) {
+			/*
+			 * We're polling. If we're within the defined idle
+			 * period, then let us spin without work before going
+			 * to sleep.
+			 */
+			if (inflight || !time_after(jiffies, timeout)) {
+				cpu_relax();
+				continue;
+			}
+
+			/*
+			 * Drop cur_mm before scheduling, we can't hold it for
+			 * long periods (or over schedule()). Do this before
+			 * adding ourselves to the waitqueue, as the unuse/drop
+			 * may sleep.
+			 */
+			if (cur_mm) {
+				unuse_mm(cur_mm);
+				mmput(cur_mm);
+				cur_mm = NULL;
+			}
+
+			prepare_to_wait(&ctx->sqo_wait, &wait,
+						TASK_INTERRUPTIBLE);
+
+			/* Tell userspace we may need a wakeup call */
+			ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
+			smp_wmb();
+
+			if (!io_get_sqring(ctx, &sqes[0])) {
+				if (kthread_should_stop()) {
+					finish_wait(&ctx->sqo_wait, &wait);
+					break;
+				}
+				if (signal_pending(current))
+					flush_signals(current);
+				schedule();
+				finish_wait(&ctx->sqo_wait, &wait);
+
+				ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+				smp_wmb();
+				continue;
+			}
+			finish_wait(&ctx->sqo_wait, &wait);
+
+			ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+			smp_wmb();
+		}
+
+		i = 0;
+		all_fixed = true;
+		do {
+			if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
+				all_fixed = false;
+
+			i++;
+			if (i == ARRAY_SIZE(sqes))
+				break;
+		} while (io_get_sqring(ctx, &sqes[i]));
+
+		/* Unless all new commands are FIXED regions, grab mm */
+		if (!all_fixed && !cur_mm) {
+			mm_fault = !mmget_not_zero(ctx->sqo_mm);
+			if (!mm_fault) {
+				use_mm(ctx->sqo_mm);
+				cur_mm = ctx->sqo_mm;
+			}
+		}
+
+		inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
+						mm_fault);
+
+		/* Commit SQ ring head once we've consumed all SQEs */
+		io_commit_sqring(ctx);
+	}
+
+	set_fs(old_fs);
+	if (cur_mm) {
+		unuse_mm(cur_mm);
+		mmput(cur_mm);
+	}
+	return 0;
+}
+
 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 {
 	struct io_submit_state state, *statep = NULL;
@@ -1319,6 +1493,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 		s.has_user = true;
 		s.needs_lock = false;
+		s.needs_fixed_file = false;
 
 		ret = io_submit_sqe(ctx, &s, statep);
 		if (ret) {
@@ -1418,8 +1593,20 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	return 0;
 }
 
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
+	if (ctx->sqo_thread) {
+		ctx->sqo_stop = 1;
+		mb();
+		kthread_stop(ctx->sqo_thread);
+		ctx->sqo_thread = NULL;
+	}
+}
+
 static void io_finish_async(struct io_ring_ctx *ctx)
 {
+	io_sq_thread_stop(ctx);
+
 	if (ctx->sqo_wq) {
 		destroy_workqueue(ctx->sqo_wq);
 		ctx->sqo_wq = NULL;
@@ -1583,13 +1770,47 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
-static int io_sq_offload_start(struct io_ring_ctx *ctx)
+static int io_sq_offload_start(struct io_ring_ctx *ctx,
+			       struct io_uring_params *p)
 {
 	int ret;
 
+	init_waitqueue_head(&ctx->sqo_wait);
 	mmgrab(current->mm);
 	ctx->sqo_mm = current->mm;
 
+	ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+	if (!ctx->sq_thread_idle)
+		ctx->sq_thread_idle = HZ;
+
+	ret = -EINVAL;
+	if (!cpu_possible(p->sq_thread_cpu))
+		goto err;
+
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		if (p->flags & IORING_SETUP_SQ_AFF) {
+			int cpu;
+
+			cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
+			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
+							ctx, cpu,
+							"io_uring-sq");
+		} else {
+			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+							"io_uring-sq");
+		}
+		if (IS_ERR(ctx->sqo_thread)) {
+			ret = PTR_ERR(ctx->sqo_thread);
+			ctx->sqo_thread = NULL;
+			goto err;
+		}
+		wake_up_process(ctx->sqo_thread);
+	} else if (p->flags & IORING_SETUP_SQ_AFF) {
+		/* Can't have SQ_AFF without SQPOLL */
+		ret = -EINVAL;
+		goto err;
+	}
+
 	/* Do QD, or 2 * CPUS, whatever is smallest */
 	ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
 			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
@@ -1600,6 +1821,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx)
 
 	return 0;
 err:
+	io_sq_thread_stop(ctx);
 	mmdrop(ctx->sqo_mm);
 	ctx->sqo_mm = NULL;
 	return ret;
@@ -1959,7 +2181,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	int submitted = 0;
 	struct fd f;
 
-	if (flags & ~IORING_ENTER_GETEVENTS)
+	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
 		return -EINVAL;
 
 	f = fdget(fd);
@@ -1975,6 +2197,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	if (!percpu_ref_tryget(&ctx->refs))
 		goto out_fput;
 
+	/*
+	 * For SQ polling, the thread will do all submissions and completions.
+	 * Just return the requested submit count, and wake the thread if
+	 * we were asked to.
+	 */
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		if (flags & IORING_ENTER_SQ_WAKEUP)
+			wake_up(&ctx->sqo_wait);
+		submitted = to_submit;
+		goto out_ctx;
+	}
+
 	ret = 0;
 	if (to_submit) {
 		to_submit = min(to_submit, ctx->sq_entries);
@@ -2156,7 +2390,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	if (ret)
 		goto err;
 
-	ret = io_sq_offload_start(ctx);
+	ret = io_sq_offload_start(ctx, p);
 	if (ret)
 		goto err;
 
@@ -2204,7 +2438,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			return -EINVAL;
 	}
 
-	if (p.flags & ~IORING_SETUP_IOPOLL)
+	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
+			IORING_SETUP_SQ_AFF))
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6257478d55e9..0ec74bab8dbe 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -42,6 +42,8 @@ struct io_uring_sqe {
  * io_uring_setup() flags
  */
 #define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
+#define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
+#define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
 
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1
@@ -86,6 +88,11 @@ struct io_sqring_offsets {
 	__u64 resv2;
 };
 
+/*
+ * sq_ring->flags
+ */
+#define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
+
 struct io_cqring_offsets {
 	__u32 head;
 	__u32 tail;
@@ -100,6 +107,7 @@ struct io_cqring_offsets {
  * io_uring_enter(2) flags
  */
 #define IORING_ENTER_GETEVENTS	(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -108,7 +116,9 @@ struct io_uring_params {
 	__u32 sq_entries;
 	__u32 cq_entries;
 	__u32 flags;
-	__u32 resv[7];
+	__u32 sq_thread_cpu;
+	__u32 sq_thread_idle;
+	__u32 resv[5];
 	struct io_sqring_offsets sq_off;
 	struct io_cqring_offsets cq_off;
 };
-- 
cgit v1.2.3


From abbbbc83a210e95e2ae4ee71e50930a8cec6f081 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 22 Feb 2019 18:04:52 +0000
Subject: asm-generic/io: Pass result of I/O accessor to __io_[p]ar()

The inX() and readX() I/O accessors must enforce ordering against
subsequent calls to the delay() routines, so that a read-back from a
device can be used to postpone a subsequent write to the same device.

On some architectures, including arm64, this ordering can only be
achieved by creating a dependency on the value returned by the I/O
accessor operation, so we need to pass the value we read to the
__io_par() and __io_ar() macros in these cases.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Reported-by: Andrew Murray <andrew.murray@arm.com>
Reviewed-by: Palmer Dabbelt <palmer@sifive.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/asm-generic/io.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index d356f802945a..303871651f8a 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -32,9 +32,9 @@
 /* prevent prefetching of coherent DMA data ahead of a dma-complete */
 #ifndef __io_ar
 #ifdef rmb
-#define __io_ar()      rmb()
+#define __io_ar(v)      rmb()
 #else
-#define __io_ar()      barrier()
+#define __io_ar(v)      barrier()
 #endif
 #endif
 
@@ -65,7 +65,7 @@
 #endif
 
 #ifndef __io_par
-#define __io_par()     __io_ar()
+#define __io_par(v)     __io_ar(v)
 #endif
 
 
@@ -158,7 +158,7 @@ static inline u8 readb(const volatile void __iomem *addr)
 
 	__io_br();
 	val = __raw_readb(addr);
-	__io_ar();
+	__io_ar(val);
 	return val;
 }
 #endif
@@ -171,7 +171,7 @@ static inline u16 readw(const volatile void __iomem *addr)
 
 	__io_br();
 	val = __le16_to_cpu(__raw_readw(addr));
-	__io_ar();
+	__io_ar(val);
 	return val;
 }
 #endif
@@ -184,7 +184,7 @@ static inline u32 readl(const volatile void __iomem *addr)
 
 	__io_br();
 	val = __le32_to_cpu(__raw_readl(addr));
-	__io_ar();
+	__io_ar(val);
 	return val;
 }
 #endif
@@ -198,7 +198,7 @@ static inline u64 readq(const volatile void __iomem *addr)
 
 	__io_br();
 	val = __le64_to_cpu(__raw_readq(addr));
-	__io_ar();
+	__io_ar(val);
 	return val;
 }
 #endif
@@ -471,7 +471,7 @@ static inline u8 inb(unsigned long addr)
 
 	__io_pbr();
 	val = __raw_readb(PCI_IOBASE + addr);
-	__io_par();
+	__io_par(val);
 	return val;
 }
 #endif
@@ -484,7 +484,7 @@ static inline u16 inw(unsigned long addr)
 
 	__io_pbr();
 	val = __le16_to_cpu(__raw_readw(PCI_IOBASE + addr));
-	__io_par();
+	__io_par(val);
 	return val;
 }
 #endif
@@ -497,7 +497,7 @@ static inline u32 inl(unsigned long addr)
 
 	__io_pbr();
 	val = __le32_to_cpu(__raw_readl(PCI_IOBASE + addr));
-	__io_par();
+	__io_par(val);
 	return val;
 }
 #endif
-- 
cgit v1.2.3


From 221e1e0b016529f33b0d1bbf7d07c54463b55ca6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Feb 2019 14:35:45 +0100
Subject: of: mark early_init_dt_alloc_reserved_memory_arch static

This function is only used in of_reserved_mem.c, and never overridden
despite the __weak marker.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/of_reserved_mem.c    | 2 +-
 include/linux/of_reserved_mem.h | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 9e02a5d80225..e773063c6de9 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -26,7 +26,7 @@
 static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS];
 static int reserved_mem_count;
 
-int __init __weak early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
+static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
 	phys_addr_t align, phys_addr_t start, phys_addr_t end, bool nomap,
 	phys_addr_t *res_base)
 {
diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h
index 67ab8d271df3..60f541912ccf 100644
--- a/include/linux/of_reserved_mem.h
+++ b/include/linux/of_reserved_mem.h
@@ -35,13 +35,6 @@ int of_reserved_mem_device_init_by_idx(struct device *dev,
 				       struct device_node *np, int idx);
 void of_reserved_mem_device_release(struct device *dev);
 
-int early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
-					     phys_addr_t align,
-					     phys_addr_t start,
-					     phys_addr_t end,
-					     bool nomap,
-					     phys_addr_t *res_base);
-
 void fdt_init_reserved_mem(void);
 void fdt_reserved_mem_save_node(unsigned long node, const char *uname,
 			       phys_addr_t base, phys_addr_t size);
-- 
cgit v1.2.3


From 0c91c11c7d62de9fea6f5520edb143a7199f3307 Mon Sep 17 00:00:00 2001
From: Anson Huang <anson.huang@nxp.com>
Date: Wed, 27 Feb 2019 01:28:26 +0000
Subject: clk: imx8mq: add GPIO clocks to clock tree

i.MX8MQ has clock gate for each GPIO bank, add them
into clock tree for GPIO driver to manage.

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Reviewed-by: Fabio Estevam <festevam@gmail.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/imx/clk-imx8mq.c             | 5 +++++
 include/dt-bindings/clock/imx8mq-clock.h | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/clk/imx/clk-imx8mq.c b/drivers/clk/imx/clk-imx8mq.c
index 081dc2d7c99c..a9b3888aef0c 100644
--- a/drivers/clk/imx/clk-imx8mq.c
+++ b/drivers/clk/imx/clk-imx8mq.c
@@ -473,6 +473,11 @@ static int imx8mq_clocks_probe(struct platform_device *pdev)
 	clks[IMX8MQ_CLK_ECSPI2_ROOT] = imx_clk_gate4("ecspi2_root_clk", "ecspi2", base + 0x4080, 0);
 	clks[IMX8MQ_CLK_ECSPI3_ROOT] = imx_clk_gate4("ecspi3_root_clk", "ecspi3", base + 0x4090, 0);
 	clks[IMX8MQ_CLK_ENET1_ROOT] = imx_clk_gate4("enet1_root_clk", "enet_axi", base + 0x40a0, 0);
+	clks[IMX8MQ_CLK_GPIO1_ROOT] = imx_clk_gate4("gpio1_root_clk", "ipg_root", base + 0x40b0, 0);
+	clks[IMX8MQ_CLK_GPIO2_ROOT] = imx_clk_gate4("gpio2_root_clk", "ipg_root", base + 0x40c0, 0);
+	clks[IMX8MQ_CLK_GPIO3_ROOT] = imx_clk_gate4("gpio3_root_clk", "ipg_root", base + 0x40d0, 0);
+	clks[IMX8MQ_CLK_GPIO4_ROOT] = imx_clk_gate4("gpio4_root_clk", "ipg_root", base + 0x40e0, 0);
+	clks[IMX8MQ_CLK_GPIO5_ROOT] = imx_clk_gate4("gpio5_root_clk", "ipg_root", base + 0x40f0, 0);
 	clks[IMX8MQ_CLK_GPT1_ROOT] = imx_clk_gate4("gpt1_root_clk", "gpt1", base + 0x4100, 0);
 	clks[IMX8MQ_CLK_I2C1_ROOT] = imx_clk_gate4("i2c1_root_clk", "i2c1", base + 0x4170, 0);
 	clks[IMX8MQ_CLK_I2C2_ROOT] = imx_clk_gate4("i2c2_root_clk", "i2c2", base + 0x4180, 0);
diff --git a/include/dt-bindings/clock/imx8mq-clock.h b/include/dt-bindings/clock/imx8mq-clock.h
index 491e39c146be..b58cc643c9c9 100644
--- a/include/dt-bindings/clock/imx8mq-clock.h
+++ b/include/dt-bindings/clock/imx8mq-clock.h
@@ -394,5 +394,11 @@
 #define IMX8MQ_CLK_CLKO1			268
 #define IMX8MQ_CLK_ARM				269
 
-#define IMX8MQ_CLK_END				270
+#define IMX8MQ_CLK_GPIO1_ROOT			270
+#define IMX8MQ_CLK_GPIO2_ROOT			271
+#define IMX8MQ_CLK_GPIO3_ROOT			272
+#define IMX8MQ_CLK_GPIO4_ROOT			273
+#define IMX8MQ_CLK_GPIO5_ROOT			274
+
+#define IMX8MQ_CLK_END				275
 #endif /* __DT_BINDINGS_CLOCK_IMX8MQ_H */
-- 
cgit v1.2.3


From 5b88a17cfdeba75e0092bab2c79aaf7d9e7db482 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Feb 2019 11:00:18 -0500
Subject: block: optimize bvec iteration in bvec_iter_advance

There is no need to only iterate in chunks of PAGE_SIZE or less in
bvec_iter_advance, given that the callers pass in the chunk length that
they are operating on - either that already is less than PAGE_SIZE
because they do classic page-based iteration, or it is larger because
the caller operates on multi-page bvecs.

This should help shaving off a few cycles of the I/O hot path.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bvec.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 87e82e503a52..f6275c4da13a 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -112,14 +112,15 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
 	}
 
 	while (bytes) {
-		unsigned iter_len = bvec_iter_len(bv, *iter);
-		unsigned len = min(bytes, iter_len);
+		const struct bio_vec *cur = bv + iter->bi_idx;
+		unsigned len = min3(bytes, iter->bi_size,
+				    cur->bv_len - iter->bi_bvec_done);
 
 		bytes -= len;
 		iter->bi_size -= len;
 		iter->bi_bvec_done += len;
 
-		if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) {
+		if (iter->bi_bvec_done == cur->bv_len) {
 			iter->bi_bvec_done = 0;
 			iter->bi_idx++;
 		}
-- 
cgit v1.2.3


From e907bf3c9820c8480b1d83aca42a5668c5364be9 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Mon, 18 Feb 2019 14:29:06 -0500
Subject: media: include: fix several typos

Use codespell to fix lots of typos over frontends.

Manually verified to avoid false-positives.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Reviewed-by: Lad, Prabhakar <prabhakar.csengg@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/linux/platform_data/media/si4713.h | 4 ++--
 include/media/davinci/dm355_ccdc.h         | 4 ++--
 include/media/davinci/dm644x_ccdc.h        | 2 +-
 include/media/drv-intf/exynos-fimc.h       | 2 +-
 include/media/drv-intf/saa7146.h           | 2 +-
 include/media/drv-intf/saa7146_vv.h        | 4 ++--
 include/media/dvb_frontend.h               | 8 ++++----
 include/media/rc-map.h                     | 4 ++--
 include/media/v4l2-ctrls.h                 | 2 +-
 include/media/v4l2-fwnode.h                | 4 ++--
 include/media/v4l2-subdev.h                | 2 +-
 include/media/videobuf-core.h              | 2 +-
 include/media/videobuf2-core.h             | 2 +-
 13 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/media/si4713.h b/include/linux/platform_data/media/si4713.h
index 932668ad54f7..13b3eb7a9059 100644
--- a/include/linux/platform_data/media/si4713.h
+++ b/include/linux/platform_data/media/si4713.h
@@ -31,7 +31,7 @@ struct si4713_platform_data {
  */
 struct si4713_rnl {
 	__u32 index;		/* modulator index */
-	__u32 frequency;	/* frequency to peform rnl measurement */
+	__u32 frequency;	/* frequency to perform rnl measurement */
 	__s32 rnl;		/* result of measurement in dBuV */
 	__u32 reserved[4];	/* drivers and apps must init this to 0 */
 };
@@ -40,7 +40,7 @@ struct si4713_rnl {
  * This is the ioctl number to query for rnl. Users must pass a
  * struct si4713_rnl pointer specifying desired frequency in 'frequency' field
  * following driver capabilities (i.e V4L2_TUNER_CAP_LOW).
- * Driver must return measured value in the same struture, filling 'rnl' field.
+ * Driver must return measured value in the same structure, filling 'rnl' field.
  */
 #define SI4713_IOC_MEASURE_RNL	_IOWR('V', BASE_VIDIOC_PRIVATE + 0, \
 						struct si4713_rnl)
diff --git a/include/media/davinci/dm355_ccdc.h b/include/media/davinci/dm355_ccdc.h
index e6bc72f6b60f..1cba42d805fa 100644
--- a/include/media/davinci/dm355_ccdc.h
+++ b/include/media/davinci/dm355_ccdc.h
@@ -228,7 +228,7 @@ struct ccdc_config_params_raw {
 	/* Threshold of median filter */
 	int med_filt_thres;
 	/*
-	 * horz and vertical data offset. Appliable for defect correction
+	 * horz and vertical data offset. Applicable for defect correction
 	 * and lsc
 	 */
 	struct ccdc_data_offset data_offset;
@@ -238,7 +238,7 @@ struct ccdc_config_params_raw {
 	struct ccdc_black_clamp blk_clamp;
 	/* Structure for Black Compensation */
 	struct ccdc_black_compensation blk_comp;
-	/* struture for vertical Defect Correction Module Configuration */
+	/* structure for vertical Defect Correction Module Configuration */
 	struct ccdc_vertical_dft vertical_dft;
 	/* structure for color space converter Module Configuration */
 	struct ccdc_csc csc;
diff --git a/include/media/davinci/dm644x_ccdc.h b/include/media/davinci/dm644x_ccdc.h
index 6ea2ce241851..694fc8f6081f 100644
--- a/include/media/davinci/dm644x_ccdc.h
+++ b/include/media/davinci/dm644x_ccdc.h
@@ -152,7 +152,7 @@ struct ccdc_params_raw {
 	 * order in memory(bottom to top)
 	 */
 	unsigned char image_invert_enable;
-	/* configurable paramaters */
+	/* configurable parameters */
 	struct ccdc_config_params_raw config_params;
 };
 
diff --git a/include/media/drv-intf/exynos-fimc.h b/include/media/drv-intf/exynos-fimc.h
index f9c64338841f..54c214737142 100644
--- a/include/media/drv-intf/exynos-fimc.h
+++ b/include/media/drv-intf/exynos-fimc.h
@@ -81,7 +81,7 @@ struct fimc_source_info {
  * v4l2_device notification id. This is only for internal use in the kernel.
  * Sensor subdevs should issue S5P_FIMC_TX_END_NOTIFY notification in single
  * frame capture mode when there is only one VSYNC pulse issued by the sensor
- * at begining of the frame transmission.
+ * at beginning of the frame transmission.
  */
 #define S5P_FIMC_TX_END_NOTIFY _IO('e', 0)
 
diff --git a/include/media/drv-intf/saa7146.h b/include/media/drv-intf/saa7146.h
index a7bf2c4a2e4d..71ce63c99cb4 100644
--- a/include/media/drv-intf/saa7146.h
+++ b/include/media/drv-intf/saa7146.h
@@ -139,7 +139,7 @@ struct saa7146_dev
 	void				*ext_priv;	/* pointer for extension private use (most likely some private data) */
 	struct saa7146_ext_vv		*ext_vv_data;
 
-	/* per device video/vbi informations (if available) */
+	/* per device video/vbi information (if available) */
 	struct saa7146_vv	*vv_data;
 	void (*vv_callback)(struct saa7146_dev *dev, unsigned long status);
 
diff --git a/include/media/drv-intf/saa7146_vv.h b/include/media/drv-intf/saa7146_vv.h
index 6f80fb7f31a5..b34d86bb0664 100644
--- a/include/media/drv-intf/saa7146_vv.h
+++ b/include/media/drv-intf/saa7146_vv.h
@@ -151,7 +151,7 @@ struct saa7146_vv
 
 struct saa7146_ext_vv
 {
-	/* informations about the video capabilities of the device */
+	/* information about the video capabilities of the device */
 	int	inputs;
 	int	audios;
 	u32	capabilities;
@@ -241,7 +241,7 @@ void saa7146_res_free(struct saa7146_fh *fh, unsigned int bits);
 #define SAA7146_CLIPPING_MASK		0x6
 #define SAA7146_CLIPPING_MASK_INVERTED	0x7
 
-/* output formats: each entry holds four informations */
+/* output formats: each entry holds four information */
 #define RGB08_COMPOSED	0x0217 /* composed is used in the sense of "not-planar" */
 /* this means: planar?=0, yuv2rgb-conversation-mode=2, dither=yes(=1), format-mode = 7 */
 #define RGB15_COMPOSED	0x0213
diff --git a/include/media/dvb_frontend.h b/include/media/dvb_frontend.h
index 6f7a85ab3541..f05cd7b94a2c 100644
--- a/include/media/dvb_frontend.h
+++ b/include/media/dvb_frontend.h
@@ -160,7 +160,7 @@ enum dvbfe_algo {
  *	The frontend search for a signal failed
  *
  * @DVBFE_ALGO_SEARCH_INVALID:
- *	The frontend search algorith was probably supplied with invalid
+ *	The frontend search algorithm was probably supplied with invalid
  *	parameters and the search is an invalid one
  *
  * @DVBFE_ALGO_SEARCH_ERROR:
@@ -204,7 +204,7 @@ enum dvbfe_search {
  * @set_config:		callback function used to send some tuner-specific
  *			parameters.
  * @get_frequency:	get the actual tuned frequency
- * @get_bandwidth:	get the bandwitdh used by the low pass filters
+ * @get_bandwidth:	get the bandwidth used by the low pass filters
  * @get_if_frequency:	get the Intermediate Frequency, in Hz. For baseband,
  *			should return 0.
  * @get_status:		returns the frontend lock status
@@ -232,7 +232,7 @@ struct dvb_tuner_ops {
 	int (*suspend)(struct dvb_frontend *fe);
 	int (*resume)(struct dvb_frontend *fe);
 
-	/* This is the recomended way to set the tuner */
+	/* This is the recommended way to set the tuner */
 	int (*set_params)(struct dvb_frontend *fe);
 	int (*set_analog_params)(struct dvb_frontend *fe, struct analog_parameters *p);
 
@@ -358,7 +358,7 @@ struct dvb_frontend_internal_info {
  * @release:		callback function called when frontend is ready to be
  *			freed.
  *			drivers should free any allocated memory.
- * @release_sec:	callback function requesting that the Satelite Equipment
+ * @release_sec:	callback function requesting that the Satellite Equipment
  *			Control (SEC) driver to release and free any memory
  *			allocated by the driver.
  * @init:		callback function used to initialize the tuner device.
diff --git a/include/media/rc-map.h b/include/media/rc-map.h
index e5e86d595645..5e684bb0d64c 100644
--- a/include/media/rc-map.h
+++ b/include/media/rc-map.h
@@ -144,14 +144,14 @@ struct rc_map_list {
 /* Routines from rc-map.c */
 
 /**
- * rc_map_register() - Registers a Remote Controler scancode map
+ * rc_map_register() - Registers a Remote Controller scancode map
  *
  * @map:	pointer to struct rc_map_list
  */
 int rc_map_register(struct rc_map_list *map);
 
 /**
- * rc_map_unregister() - Unregisters a Remote Controler scancode map
+ * rc_map_unregister() - Unregisters a Remote Controller scancode map
  *
  * @map:	pointer to struct rc_map_list
  */
diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h
index d63cf227b0ab..e5cae37ced2d 100644
--- a/include/media/v4l2-ctrls.h
+++ b/include/media/v4l2-ctrls.h
@@ -648,7 +648,7 @@ struct v4l2_ctrl *v4l2_ctrl_new_std_menu_items(struct v4l2_ctrl_handler *hdl,
  * @def:	The control's default value.
  * @qmenu_int:	The control's menu entries.
  *
- * Same as v4l2_ctrl_new_std_menu(), but @mask is set to 0 and it additionaly
+ * Same as v4l2_ctrl_new_std_menu(), but @mask is set to 0 and it additionally
  * takes as an argument an array of integers determining the menu items.
  *
  * If @id refers to a non-integer-menu control, then this function will
diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h
index 6d9d9f1839ac..6c07825e18b9 100644
--- a/include/media/v4l2-fwnode.h
+++ b/include/media/v4l2-fwnode.h
@@ -143,7 +143,7 @@ struct v4l2_fwnode_link {
  * @vep.bus_type to V4L2_MBUS_UNKNOWN. The caller may not provide a default
  * configuration in this case as the defaults are specific to a given bus type.
  * This functionality is deprecated and should not be used in new drivers and it
- * is only supported for CSI-2 D-PHY, parallel and Bt.656 busses.
+ * is only supported for CSI-2 D-PHY, parallel and Bt.656 buses.
  *
  * The function does not change the V4L2 fwnode endpoint state if it fails.
  *
@@ -186,7 +186,7 @@ void v4l2_fwnode_endpoint_free(struct v4l2_fwnode_endpoint *vep);
  * @vep.bus_type to V4L2_MBUS_UNKNOWN. The caller may not provide a default
  * configuration in this case as the defaults are specific to a given bus type.
  * This functionality is deprecated and should not be used in new drivers and it
- * is only supported for CSI-2 D-PHY, parallel and Bt.656 busses.
+ * is only supported for CSI-2 D-PHY, parallel and Bt.656 buses.
  *
  * The function does not change the V4L2 fwnode endpoint state if it fails.
  *
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index 34da094a3f40..349e1c18cf48 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -70,7 +70,7 @@ struct v4l2_decode_vbi_line {
  * device. These devices are usually audio/video muxers/encoders/decoders or
  * sensors and webcam controllers.
  *
- * Usually these devices are controlled through an i2c bus, but other busses
+ * Usually these devices are controlled through an i2c bus, but other buses
  * may also be used.
  *
  * The v4l2_subdev struct provides a way of accessing these devices in a
diff --git a/include/media/videobuf-core.h b/include/media/videobuf-core.h
index 5684dc6f0d0d..2c4db97cd96f 100644
--- a/include/media/videobuf-core.h
+++ b/include/media/videobuf-core.h
@@ -43,7 +43,7 @@ struct videobuf_queue;
  * (which v4l2 uses).
  *
  * If there is a valid mapping for a buffer, buffer->baddr/bsize holds
- * userspace address + size which can be feeded into the
+ * userspace address + size which can be fed into the
  * videobuf_dma_init_user function listed above.
  *
  */
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index a844abcae71e..910f3d469005 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -399,7 +399,7 @@ struct vb2_buffer {
  * @buf_queue:		passes buffer vb to the driver; driver may start
  *			hardware operation on this buffer; driver should give
  *			the buffer back by calling vb2_buffer_done() function;
- *			it is allways called after calling VIDIOC_STREAMON()
+ *			it is always called after calling VIDIOC_STREAMON()
  *			ioctl; might be called before @start_streaming callback
  *			if user pre-queued buffers before calling
  *			VIDIOC_STREAMON().
-- 
cgit v1.2.3


From a79f194aa4879e9baad118c3f8bb2ca24dbef765 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 27 Feb 2019 15:37:36 -0500
Subject: NFSv4/flexfiles: Abort I/O early if the layout segment was
 invalidated

If a layout segment gets invalidated while a pNFS I/O operation
is queued for transmission, then we ideally want to abort
immediately. This is particularly the case when there is a large
number of I/O related RPCs queued in the RPC layer, and the layout
segment gets invalidated due to an ENOSPC error, or an EACCES (because
the client was fenced). We may end up forced to spam the MDS with a
lot of otherwise unnecessary LAYOUTERRORs after that I/O fails.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 17 +++++++++++++++++
 include/linux/sunrpc/sched.h           |  1 +
 net/sunrpc/xprt.c                      |  7 +++++++
 3 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 244a03c22b31..a8e9bdd978e7 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1071,6 +1071,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
 		break;
 	case -NFS4ERR_RETRY_UNCACHED_REP:
 		break;
+	case -EAGAIN:
+		return -NFS4ERR_RESET_TO_PNFS;
 	/* Invalidate Layout errors */
 	case -NFS4ERR_PNFS_NO_LAYOUT:
 	case -ESTALE:           /* mapped NFS4ERR_STALE */
@@ -1131,6 +1133,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
 	case -EBADHANDLE:
 	case -ELOOP:
 	case -ENOSPC:
+	case -EAGAIN:
 		break;
 	case -EJUKEBOX:
 		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
@@ -1369,6 +1372,16 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
 	ff_layout_read_prepare_common(task, hdr);
 }
 
+static void
+ff_layout_io_prepare_transmit(struct rpc_task *task,
+		void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (!pnfs_is_valid_lseg(hdr->lseg))
+		rpc_exit(task, -EAGAIN);
+}
+
 static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
@@ -1657,6 +1670,7 @@ static void ff_layout_commit_release(void *data)
 
 static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
 	.rpc_call_prepare = ff_layout_read_prepare_v3,
+	.rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
 	.rpc_call_done = ff_layout_read_call_done,
 	.rpc_count_stats = ff_layout_read_count_stats,
 	.rpc_release = ff_layout_read_release,
@@ -1664,6 +1678,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
 
 static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
 	.rpc_call_prepare = ff_layout_read_prepare_v4,
+	.rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
 	.rpc_call_done = ff_layout_read_call_done,
 	.rpc_count_stats = ff_layout_read_count_stats,
 	.rpc_release = ff_layout_read_release,
@@ -1671,6 +1686,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
 
 static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
 	.rpc_call_prepare = ff_layout_write_prepare_v3,
+	.rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
 	.rpc_call_done = ff_layout_write_call_done,
 	.rpc_count_stats = ff_layout_write_count_stats,
 	.rpc_release = ff_layout_write_release,
@@ -1678,6 +1694,7 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
 
 static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
 	.rpc_call_prepare = ff_layout_write_prepare_v4,
+	.rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
 	.rpc_call_done = ff_layout_write_call_done,
 	.rpc_count_stats = ff_layout_write_count_stats,
 	.rpc_release = ff_layout_write_release,
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 219aa3910a0c..52d41d0c1ae1 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -97,6 +97,7 @@ typedef void			(*rpc_action)(struct rpc_task *);
 
 struct rpc_call_ops {
 	void (*rpc_call_prepare)(struct rpc_task *, void *);
+	void (*rpc_call_prepare_transmit)(struct rpc_task *, void *);
 	void (*rpc_call_done)(struct rpc_task *, void *);
 	void (*rpc_count_stats)(struct rpc_task *, void *);
 	void (*rpc_release)(void *);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1cf4e379be7b..e096c5a725df 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1330,6 +1330,13 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
 			status = -EBADMSG;
 			goto out_dequeue;
 		}
+		if (task->tk_ops->rpc_call_prepare_transmit) {
+			task->tk_ops->rpc_call_prepare_transmit(task,
+					task->tk_calldata);
+			status = task->tk_status;
+			if (status < 0)
+				goto out_dequeue;
+		}
 	}
 
 	/*
-- 
cgit v1.2.3


From 3eb86093ea400c58f444eac0debcf6c50d617418 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 8 Feb 2019 10:31:05 -0500
Subject: NFSv4.2: Add client support for the generic 'layouterror' RPC call

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42.h            |   3 +
 fs/nfs/nfs42proc.c        | 164 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs42xdr.c         |  99 ++++++++++++++++++++++++++++
 fs/nfs/nfs4proc.c         |   3 +-
 fs/nfs/nfs4xdr.c          |   1 +
 include/linux/nfs4.h      |   1 +
 include/linux/nfs_fs_sb.h |   1 +
 include/linux/nfs_xdr.h   |  35 ++++++++++
 8 files changed, 306 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 19ec38f85ce0..901cca7542f9 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -20,5 +20,8 @@ loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
 				   struct nfs42_layoutstat_data *);
 int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+			   const struct nfs42_layout_error *errors,
+			   size_t n);
 
 #endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index fed06fd9998d..ff6f85fb676b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -672,6 +672,170 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
 	return 0;
 }
 
+static struct nfs42_layouterror_data *
+nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags)
+{
+	struct nfs42_layouterror_data *data;
+	struct inode *inode = lseg->pls_layout->plh_inode;
+
+	data = kzalloc(sizeof(*data), gfp_flags);
+	if (data) {
+		data->args.inode = data->inode = nfs_igrab_and_active(inode);
+		if (data->inode) {
+			data->lseg = pnfs_get_lseg(lseg);
+			if (data->lseg)
+				return data;
+			nfs_iput_and_deactive(data->inode);
+		}
+		kfree(data);
+	}
+	return NULL;
+}
+
+static void
+nfs42_free_layouterror_data(struct nfs42_layouterror_data *data)
+{
+	pnfs_put_lseg(data->lseg);
+	nfs_iput_and_deactive(data->inode);
+	kfree(data);
+}
+
+static void
+nfs42_layouterror_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layouterror_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+	unsigned i;
+
+	spin_lock(&inode->i_lock);
+	if (!pnfs_layout_is_valid(lo)) {
+		spin_unlock(&inode->i_lock);
+		rpc_exit(task, 0);
+		return;
+	}
+	for (i = 0; i < data->args.num_errors; i++)
+		nfs4_stateid_copy(&data->args.errors[i].stateid,
+				&lo->plh_stateid);
+	spin_unlock(&inode->i_lock);
+	nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+			    &data->res.seq_res, task);
+}
+
+static void
+nfs42_layouterror_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layouterror_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
+
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_BADHANDLE:
+	case -ESTALE:
+		pnfs_destroy_layout(NFS_I(inode));
+		break;
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_BAD_STATEID:
+		spin_lock(&inode->i_lock);
+		if (pnfs_layout_is_valid(lo) &&
+		    nfs4_stateid_match(&data->args.errors[0].stateid,
+					     &lo->plh_stateid)) {
+			LIST_HEAD(head);
+
+			/*
+			 * Mark the bad layout state as invalid, then retry
+			 * with the current stateid.
+			 */
+			pnfs_mark_layout_stateid_invalid(lo, &head);
+			spin_unlock(&inode->i_lock);
+			pnfs_free_lseg_list(&head);
+			nfs_commit_inode(inode, 0);
+		} else
+			spin_unlock(&inode->i_lock);
+		break;
+	case -NFS4ERR_OLD_STATEID:
+		spin_lock(&inode->i_lock);
+		if (pnfs_layout_is_valid(lo) &&
+		    nfs4_stateid_match_other(&data->args.errors[0].stateid,
+					&lo->plh_stateid)) {
+			/* Do we need to delay before resending? */
+			if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+						&data->args.errors[0].stateid))
+				rpc_delay(task, HZ);
+			rpc_restart_call_prepare(task);
+		}
+		spin_unlock(&inode->i_lock);
+		break;
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
+	}
+}
+
+static void
+nfs42_layouterror_release(void *calldata)
+{
+	struct nfs42_layouterror_data *data = calldata;
+
+	nfs42_free_layouterror_data(data);
+}
+
+static const struct rpc_call_ops nfs42_layouterror_ops = {
+	.rpc_call_prepare = nfs42_layouterror_prepare,
+	.rpc_call_done = nfs42_layouterror_done,
+	.rpc_release = nfs42_layouterror_release,
+};
+
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+		const struct nfs42_layout_error *errors, size_t n)
+{
+	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct nfs42_layouterror_data *data;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR],
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_layouterror_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	unsigned int i;
+
+	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR))
+		return -EOPNOTSUPP;
+	if (n > NFS42_LAYOUTERROR_MAX)
+		return -EINVAL;
+	data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+	if (!data)
+		return -ENOMEM;
+	for (i = 0; i < n; i++) {
+		data->args.errors[i] = errors[i];
+		data->args.num_errors++;
+		data->res.num_errors++;
+	}
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	task_setup.callback_data = data;
+	task_setup.rpc_client = NFS_SERVER(inode)->client;
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+	task = rpc_run_task(&task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs42_proc_layouterror);
+
 static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
 		struct file *dst_f, struct nfs_lock_context *src_lock,
 		struct nfs_lock_context *dst_lock, loff_t src_offset,
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 7d596e8a0941..aed865a84629 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -51,6 +51,15 @@
 					1 /* opaque devaddr4 length */ + \
 					XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
 #define decode_layoutstats_maxsz	(op_decode_hdr_maxsz)
+#define encode_device_error_maxsz	(XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+					1 /* status */ + 1 /* opnum */)
+#define encode_layouterror_maxsz	(op_decode_hdr_maxsz + \
+					2 /* offset */ + \
+					2 /* length */ + \
+					encode_stateid_maxsz + \
+					1 /* Array size */ + \
+					encode_device_error_maxsz)
+#define decode_layouterror_maxsz	(op_decode_hdr_maxsz)
 #define encode_clone_maxsz		(encode_stateid_maxsz + \
 					encode_stateid_maxsz + \
 					2 /* src offset */ + \
@@ -116,6 +125,16 @@
 					 decode_sequence_maxsz + \
 					 decode_putfh_maxsz + \
 					 PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
+#define NFS4_enc_layouterror_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putfh_maxsz + \
+					 NFS42_LAYOUTERROR_MAX * \
+					 encode_layouterror_maxsz)
+#define NFS4_dec_layouterror_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putfh_maxsz + \
+					 NFS42_LAYOUTERROR_MAX * \
+					 decode_layouterror_maxsz)
 #define NFS4_enc_clone_sz		(compound_encode_hdr_maxsz + \
 					 encode_sequence_maxsz + \
 					 encode_putfh_maxsz + \
@@ -233,6 +252,34 @@ static void encode_clone(struct xdr_stream *xdr,
 	xdr_encode_hyper(p, args->count);
 }
 
+static void encode_device_error(struct xdr_stream *xdr,
+				const struct nfs42_device_error *error)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4);
+	p = xdr_encode_opaque_fixed(p, error->dev_id.data,
+			NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(error->status);
+	*p = cpu_to_be32(error->opnum);
+}
+
+static void encode_layouterror(struct xdr_stream *xdr,
+			       const struct nfs42_layout_error *args,
+			       struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr);
+	p = reserve_space(xdr, 8 + 8);
+	p = xdr_encode_hyper(p, args->offset);
+	p = xdr_encode_hyper(p, args->length);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(1);
+	encode_device_error(xdr, &args->errors[0]);
+}
+
 /*
  * Encode ALLOCATE request
  */
@@ -391,6 +438,27 @@ static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode LAYOUTERROR request
+ */
+static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const void *data)
+{
+	const struct nfs42_layouterror_args *args = data;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+	int i;
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	for (i = 0; i < args->num_errors; i++)
+		encode_layouterror(xdr, &args->errors[i], &hdr);
+	encode_nops(&hdr);
+}
+
 static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
 	return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -494,6 +562,11 @@ static int decode_clone(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_CLONE);
 }
 
+static int decode_layouterror(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_LAYOUTERROR);
+}
+
 /*
  * Decode ALLOCATE request
  */
@@ -703,4 +776,30 @@ out:
 	return status;
 }
 
+/*
+ * Decode LAYOUTERROR request
+ */
+static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    void *data)
+{
+	struct nfs42_layouterror_res *res = data;
+	struct compound_hdr hdr;
+	int status, i;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+
+	for (i = 0; i < res->num_errors && status == 0; i++)
+		status = decode_layouterror(xdr);
+out:
+	res->rpc_status = status;
+	return status;
+}
+
 #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5b980246b035..73889ea7d196 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -9690,7 +9690,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_DEALLOCATE
 		| NFS_CAP_SEEK
 		| NFS_CAP_LAYOUTSTATS
-		| NFS_CAP_CLONE,
+		| NFS_CAP_CLONE
+		| NFS_CAP_LAYOUTERROR,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 6d9d5e2f6308..cfcabc33e24d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7572,6 +7572,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
 	PROC42(COPY,		enc_copy,		dec_copy),
 	PROC42(OFFLOAD_CANCEL,	enc_offload_cancel,	dec_offload_cancel),
 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
+	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror),
 };
 
 static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 1b06f0b28453..22494d170619 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -538,6 +538,7 @@ enum {
 	NFSPROC4_CLNT_OFFLOAD_CANCEL,
 
 	NFSPROC4_CLNT_LOOKUPP,
+	NFSPROC4_CLNT_LAYOUTERROR,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 6aa8cc83c3b6..c827d31298cc 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -261,5 +261,6 @@ struct nfs_server {
 #define NFS_CAP_CLONE		(1U << 23)
 #define NFS_CAP_COPY		(1U << 24)
 #define NFS_CAP_OFFLOAD_CANCEL	(1U << 25)
+#define NFS_CAP_LAYOUTERROR	(1U << 26)
 
 #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b4bd2bf5f585..9b8324ec08f3 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -383,6 +383,41 @@ struct nfs42_layoutstat_data {
 	struct nfs42_layoutstat_res res;
 };
 
+struct nfs42_device_error {
+	struct nfs4_deviceid dev_id;
+	int status;
+	enum nfs_opnum4 opnum;
+};
+
+struct nfs42_layout_error {
+	__u64 offset;
+	__u64 length;
+	nfs4_stateid stateid;
+	struct nfs42_device_error errors[1];
+};
+
+#define NFS42_LAYOUTERROR_MAX 5
+
+struct nfs42_layouterror_args {
+	struct nfs4_sequence_args seq_args;
+	struct inode *inode;
+	unsigned int num_errors;
+	struct nfs42_layout_error errors[NFS42_LAYOUTERROR_MAX];
+};
+
+struct nfs42_layouterror_res {
+	struct nfs4_sequence_res seq_res;
+	unsigned int num_errors;
+	int rpc_status;
+};
+
+struct nfs42_layouterror_data {
+	struct nfs42_layouterror_args args;
+	struct nfs42_layouterror_res res;
+	struct inode *inode;
+	struct pnfs_layout_segment *lseg;
+};
+
 struct nfs42_clone_args {
 	struct nfs4_sequence_args	seq_args;
 	struct nfs_fh			*src_fh;
-- 
cgit v1.2.3


From 9f03161a1bd8cd9ccf11533e52326718c656036e Mon Sep 17 00:00:00 2001
From: Michael Shych <michaelsh@mellanox.com>
Date: Wed, 20 Feb 2019 09:34:22 +0000
Subject: platform_data/mlxreg: additions for Mellanox watchdog driver.

There are two new fields added to mlxreg core structure:
features - supported features of device and
identity - device identity name.
Add new defines for watchdog features.

Signed-off-by: Michael Shych <michaelsh@mellanox.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 include/linux/platform_data/mlxreg.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index 19f5cb618c55..31f7c25a44da 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -35,6 +35,19 @@
 #define __LINUX_PLATFORM_DATA_MLXREG_H
 
 #define MLXREG_CORE_LABEL_MAX_SIZE	32
+#define MLXREG_CORE_WD_FEATURE_NOWAYOUT		BIT(0)
+#define MLXREG_CORE_WD_FEATURE_START_AT_BOOT	BIT(1)
+
+/**
+ * enum mlxreg_wdt_type - type of HW watchdog
+ *
+ * TYPE1 HW watchdog implementation exist in old systems.
+ * All new systems have TYPE2 HW watchdog.
+ */
+enum mlxreg_wdt_type {
+	MLX_WDT_TYPE1,
+	MLX_WDT_TYPE2,
+};
 
 /**
  * struct mlxreg_hotplug_device - I2C device data:
@@ -110,11 +123,17 @@ struct mlxreg_core_item {
  * @led_data: led private data;
  * @regmap: register map of parent device;
  * @counter: number of led instances;
+ * @features: supported features of device;
+ * @version: implementation version;
+ * @identity: device identity name;
  */
 struct mlxreg_core_platform_data {
 	struct mlxreg_core_data *data;
 	void *regmap;
 	int counter;
+	u32 features;
+	u32 version;
+	char identity[MLXREG_CORE_LABEL_MAX_SIZE];
 };
 
 /**
-- 
cgit v1.2.3


From e36202a844d4eff2ab07bcef998d7b4beda9761f Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 22 Feb 2019 18:59:40 +0900
Subject: printk: Remove no longer used LOG_PREFIX.

When commit 5becfb1df5ac8e49 ("kmsg: merge continuation records while
printing") introduced LOG_PREFIX, we used KERN_DEFAULT etc. as a flag
for setting LOG_PREFIX in order to tell whether to call cont_add()
(i.e. whether to append the message to "struct cont").

But since commit 4bcc595ccd80decb ("printk: reinstate KERN_CONT for
printing continuation lines") inverted the behavior (i.e. don't append
the message to "struct cont" unless KERN_CONT is specified) and commit
5aa068ea4082b39e ("printk: remove games with previous record flags")
removed the last LOG_PREFIX check, setting LOG_PREFIX via KERN_DEFAULT
etc. is no longer meaningful.

Therefore, we can remove LOG_PREFIX and make KERN_DEFAULT empty string.

Link: http://lkml.kernel.org/r/1550829580-9189-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
To: Steven Rostedt <rostedt@goodmis.org>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/kern_levels.h | 2 +-
 include/linux/printk.h      | 1 -
 kernel/printk/printk.c      | 6 +-----
 3 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/kern_levels.h b/include/linux/kern_levels.h
index d237fe854ad9..bf2389c26ae3 100644
--- a/include/linux/kern_levels.h
+++ b/include/linux/kern_levels.h
@@ -14,7 +14,7 @@
 #define KERN_INFO	KERN_SOH "6"	/* informational */
 #define KERN_DEBUG	KERN_SOH "7"	/* debug-level messages */
 
-#define KERN_DEFAULT	KERN_SOH "d"	/* the default kernel loglevel */
+#define KERN_DEFAULT	""		/* the default kernel loglevel */
 
 /*
  * Annotation for a "continued" line of log printout (only done after a
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 55aa96975fa2..97aa12c928d4 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -18,7 +18,6 @@ static inline int printk_get_level(const char *buffer)
 	if (buffer[0] == KERN_SOH_ASCII && buffer[1]) {
 		switch (buffer[1]) {
 		case '0' ... '7':
-		case 'd':	/* KERN_DEFAULT */
 		case 'c':	/* KERN_CONT */
 			return buffer[1];
 		}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b4d26388bc62..9b6783c158f9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -345,7 +345,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
 
 enum log_flags {
 	LOG_NEWLINE	= 2,	/* text ended with a newline */
-	LOG_PREFIX	= 4,	/* text started with a prefix */
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
 
@@ -1922,9 +1921,6 @@ int vprintk_store(int facility, int level,
 			case '0' ... '7':
 				if (level == LOGLEVEL_DEFAULT)
 					level = kern_level - '0';
-				/* fallthrough */
-			case 'd':	/* KERN_DEFAULT */
-				lflags |= LOG_PREFIX;
 				break;
 			case 'c':	/* KERN_CONT */
 				lflags |= LOG_CONT;
@@ -1939,7 +1935,7 @@ int vprintk_store(int facility, int level,
 		level = default_message_loglevel;
 
 	if (dict)
-		lflags |= LOG_PREFIX|LOG_NEWLINE;
+		lflags |= LOG_NEWLINE;
 
 	return log_output(facility, level, lflags,
 			  dict, dictlen, text, text_len);
-- 
cgit v1.2.3


From 3eb39f47934f9d5a3027fe00d906a45fe3a15fad Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Mon, 19 Nov 2018 00:51:56 +0100
Subject: signal: add pidfd_send_signal() syscall

The kill() syscall operates on process identifiers (pid). After a process
has exited its pid can be reused by another process. If a caller sends a
signal to a reused pid it will end up signaling the wrong process. This
issue has often surfaced and there has been a push to address this problem [1].

This patch uses file descriptors (fd) from proc/<pid> as stable handles on
struct pid. Even if a pid is recycled the handle will not change. The fd
can be used to send signals to the process it refers to.
Thus, the new syscall pidfd_send_signal() is introduced to solve this
problem. Instead of pids it operates on process fds (pidfd).

/* prototype and argument /*
long pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags);

/* syscall number 424 */
The syscall number was chosen to be 424 to align with Arnd's rework in his
y2038 to minimize merge conflicts (cf. [25]).

In addition to the pidfd and signal argument it takes an additional
siginfo_t and flags argument. If the siginfo_t argument is NULL then
pidfd_send_signal() is equivalent to kill(<positive-pid>, <signal>). If it
is not NULL pidfd_send_signal() is equivalent to rt_sigqueueinfo().
The flags argument is added to allow for future extensions of this syscall.
It currently needs to be passed as 0. Failing to do so will cause EINVAL.

/* pidfd_send_signal() replaces multiple pid-based syscalls */
The pidfd_send_signal() syscall currently takes on the job of
rt_sigqueueinfo(2) and parts of the functionality of kill(2), Namely, when a
positive pid is passed to kill(2). It will however be possible to also
replace tgkill(2) and rt_tgsigqueueinfo(2) if this syscall is extended.

/* sending signals to threads (tid) and process groups (pgid) */
Specifically, the pidfd_send_signal() syscall does currently not operate on
process groups or threads. This is left for future extensions.
In order to extend the syscall to allow sending signal to threads and
process groups appropriately named flags (e.g. PIDFD_TYPE_PGID, and
PIDFD_TYPE_TID) should be added. This implies that the flags argument will
determine what is signaled and not the file descriptor itself. Put in other
words, grouping in this api is a property of the flags argument not a
property of the file descriptor (cf. [13]). Clarification for this has been
requested by Eric (cf. [19]).
When appropriate extensions through the flags argument are added then
pidfd_send_signal() can additionally replace the part of kill(2) which
operates on process groups as well as the tgkill(2) and
rt_tgsigqueueinfo(2) syscalls.
How such an extension could be implemented has been very roughly sketched
in [14], [15], and [16]. However, this should not be taken as a commitment
to a particular implementation. There might be better ways to do it.
Right now this is intentionally left out to keep this patchset as simple as
possible (cf. [4]).

/* naming */
The syscall had various names throughout iterations of this patchset:
- procfd_signal()
- procfd_send_signal()
- taskfd_send_signal()
In the last round of reviews it was pointed out that given that if the
flags argument decides the scope of the signal instead of different types
of fds it might make sense to either settle for "procfd_" or "pidfd_" as
prefix. The community was willing to accept either (cf. [17] and [18]).
Given that one developer expressed strong preference for the "pidfd_"
prefix (cf. [13]) and with other developers less opinionated about the name
we should settle for "pidfd_" to avoid further bikeshedding.

The  "_send_signal" suffix was chosen to reflect the fact that the syscall
takes on the job of multiple syscalls. It is therefore intentional that the
name is not reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the
fomer because it might imply that pidfd_send_signal() is a replacement for
kill(2), and not the latter because it is a hassle to remember the correct
spelling - especially for non-native speakers - and because it is not
descriptive enough of what the syscall actually does. The name
"pidfd_send_signal" makes it very clear that its job is to send signals.

/* zombies */
Zombies can be signaled just as any other process. No special error will be
reported since a zombie state is an unreliable state (cf. [3]). However,
this can be added as an extension through the @flags argument if the need
ever arises.

/* cross-namespace signals */
The patch currently enforces that the signaler and signalee either are in
the same pid namespace or that the signaler's pid namespace is an ancestor
of the signalee's pid namespace. This is done for the sake of simplicity
and because it is unclear to what values certain members of struct
siginfo_t would need to be set to (cf. [5], [6]).

/* compat syscalls */
It became clear that we would like to avoid adding compat syscalls
(cf. [7]).  The compat syscall handling is now done in kernel/signal.c
itself by adding __copy_siginfo_from_user_generic() which lets us avoid
compat syscalls (cf. [8]). It should be noted that the addition of
__copy_siginfo_from_user_any() is caused by a bug in the original
implementation of rt_sigqueueinfo(2) (cf. 12).
With upcoming rework for syscall handling things might improve
significantly (cf. [11]) and __copy_siginfo_from_user_any() will not gain
any additional callers.

/* testing */
This patch was tested on x64 and x86.

/* userspace usage */
An asciinema recording for the basic functionality can be found under [9].
With this patch a process can be killed via:

 #define _GNU_SOURCE
 #include <errno.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <unistd.h>

 static inline int do_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
                                         unsigned int flags)
 {
 #ifdef __NR_pidfd_send_signal
         return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
 #else
         return -ENOSYS;
 #endif
 }

 int main(int argc, char *argv[])
 {
         int fd, ret, saved_errno, sig;

         if (argc < 3)
                 exit(EXIT_FAILURE);

         fd = open(argv[1], O_DIRECTORY | O_CLOEXEC);
         if (fd < 0) {
                 printf("%s - Failed to open \"%s\"\n", strerror(errno), argv[1]);
                 exit(EXIT_FAILURE);
         }

         sig = atoi(argv[2]);

         printf("Sending signal %d to process %s\n", sig, argv[1]);
         ret = do_pidfd_send_signal(fd, sig, NULL, 0);

         saved_errno = errno;
         close(fd);
         errno = saved_errno;

         if (ret < 0) {
                 printf("%s - Failed to send signal %d to process %s\n",
                        strerror(errno), sig, argv[1]);
                 exit(EXIT_FAILURE);
         }

         exit(EXIT_SUCCESS);
 }

/* Q&A
 * Given that it seems the same questions get asked again by people who are
 * late to the party it makes sense to add a Q&A section to the commit
 * message so it's hopefully easier to avoid duplicate threads.
 *
 * For the sake of progress please consider these arguments settled unless
 * there is a new point that desperately needs to be addressed. Please make
 * sure to check the links to the threads in this commit message whether
 * this has not already been covered.
 */
Q-01: (Florian Weimer [20], Andrew Morton [21])
      What happens when the target process has exited?
A-01: Sending the signal will fail with ESRCH (cf. [22]).

Q-02:  (Andrew Morton [21])
       Is the task_struct pinned by the fd?
A-02:  No. A reference to struct pid is kept. struct pid - as far as I
       understand - was created exactly for the reason to not require to
       pin struct task_struct (cf. [22]).

Q-03: (Andrew Morton [21])
      Does the entire procfs directory remain visible? Just one entry
      within it?
A-03: The same thing that happens right now when you hold a file descriptor
      to /proc/<pid> open (cf. [22]).

Q-04: (Andrew Morton [21])
      Does the pid remain reserved?
A-04: No. This patchset guarantees a stable handle not that pids are not
      recycled (cf. [22]).

Q-05: (Andrew Morton [21])
      Do attempts to signal that fd return errors?
A-05: See {Q,A}-01.

Q-06: (Andrew Morton [22])
      Is there a cleaner way of obtaining the fd? Another syscall perhaps.
A-06: Userspace can already trivially retrieve file descriptors from procfs
      so this is something that we will need to support anyway. Hence,
      there's no immediate need to add another syscalls just to make
      pidfd_send_signal() not dependent on the presence of procfs. However,
      adding a syscalls to get such file descriptors is planned for a
      future patchset (cf. [22]).

Q-07: (Andrew Morton [21] and others)
      This fd-for-a-process sounds like a handy thing and people may well
      think up other uses for it in the future, probably unrelated to
      signals. Are the code and the interface designed to permit such
      future applications?
A-07: Yes (cf. [22]).

Q-08: (Andrew Morton [21] and others)
      Now I think about it, why a new syscall? This thing is looking
      rather like an ioctl?
A-08: This has been extensively discussed. It was agreed that a syscall is
      preferred for a variety or reasons. Here are just a few taken from
      prior threads. Syscalls are safer than ioctl()s especially when
      signaling to fds. Processes are a core kernel concept so a syscall
      seems more appropriate. The layout of the syscall with its four
      arguments would require the addition of a custom struct for the
      ioctl() thereby causing at least the same amount or even more
      complexity for userspace than a simple syscall. The new syscall will
      replace multiple other pid-based syscalls (see description above).
      The file-descriptors-for-processes concept introduced with this
      syscall will be extended with other syscalls in the future. See also
      [22], [23] and various other threads already linked in here.

Q-09: (Florian Weimer [24])
      What happens if you use the new interface with an O_PATH descriptor?
A-09:
      pidfds opened as O_PATH fds cannot be used to send signals to a
      process (cf. [2]). Signaling processes through pidfds is the
      equivalent of writing to a file. Thus, this is not an operation that
      operates "purely at the file descriptor level" as required by the
      open(2) manpage. See also [4].

/* References */
[1]:  https://lore.kernel.org/lkml/20181029221037.87724-1-dancol@google.com/
[2]:  https://lore.kernel.org/lkml/874lbtjvtd.fsf@oldenburg2.str.redhat.com/
[3]:  https://lore.kernel.org/lkml/20181204132604.aspfupwjgjx6fhva@brauner.io/
[4]:  https://lore.kernel.org/lkml/20181203180224.fkvw4kajtbvru2ku@brauner.io/
[5]:  https://lore.kernel.org/lkml/20181121213946.GA10795@mail.hallyn.com/
[6]:  https://lore.kernel.org/lkml/20181120103111.etlqp7zop34v6nv4@brauner.io/
[7]:  https://lore.kernel.org/lkml/36323361-90BD-41AF-AB5B-EE0D7BA02C21@amacapital.net/
[8]:  https://lore.kernel.org/lkml/87tvjxp8pc.fsf@xmission.com/
[9]:  https://asciinema.org/a/IQjuCHew6bnq1cr78yuMv16cy
[11]: https://lore.kernel.org/lkml/F53D6D38-3521-4C20-9034-5AF447DF62FF@amacapital.net/
[12]: https://lore.kernel.org/lkml/87zhtjn8ck.fsf@xmission.com/
[13]: https://lore.kernel.org/lkml/871s6u9z6u.fsf@xmission.com/
[14]: https://lore.kernel.org/lkml/20181206231742.xxi4ghn24z4h2qki@brauner.io/
[15]: https://lore.kernel.org/lkml/20181207003124.GA11160@mail.hallyn.com/
[16]: https://lore.kernel.org/lkml/20181207015423.4miorx43l3qhppfz@brauner.io/
[17]: https://lore.kernel.org/lkml/CAGXu5jL8PciZAXvOvCeCU3wKUEB_dU-O3q0tDw4uB_ojMvDEew@mail.gmail.com/
[18]: https://lore.kernel.org/lkml/20181206222746.GB9224@mail.hallyn.com/
[19]: https://lore.kernel.org/lkml/20181208054059.19813-1-christian@brauner.io/
[20]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/
[21]: https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2bbe@linux-foundation.org/
[22]: https://lore.kernel.org/lkml/20181228233725.722tdfgijxcssg76@brauner.io/
[23]: https://lwn.net/Articles/773459/
[24]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/
[25]: https://lore.kernel.org/lkml/CAK8P3a0ej9NcJM8wXNPbcGUyOUZYX+VLoDFdbenW3s3114oQZw@mail.gmail.com/

Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Jann Horn <jannh@google.com>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Tycho Andersen <tycho@tycho.ws>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Aleksa Sarai <cyphar@cyphar.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/proc/base.c                         |   9 +++
 include/linux/proc_fs.h                |   6 ++
 include/linux/syscalls.h               |   3 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 kernel/signal.c                        | 133 +++++++++++++++++++++++++++++++--
 kernel/sys_ni.c                        |   1 +
 8 files changed, 151 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..234d91df8ca6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,4 @@
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
+424	i386	pidfd_send_signal	sys_pidfd_send_signal		__ia32_sys_pidfd_send_signal
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..58f4b3ad4fe0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+424	common	pidfd_send_signal	__x64_sys_pidfd_send_signal
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 633a63462573..b6627c471078 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3046,6 +3046,15 @@ static const struct file_operations proc_tgid_base_operations = {
 	.llseek		= generic_file_llseek,
 };
 
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+	if (!d_is_dir(file->f_path.dentry) ||
+	    (file->f_op != &proc_tgid_base_operations))
+		return ERR_PTR(-EBADF);
+
+	return proc_pid(file_inode(file));
+}
+
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
 	return proc_pident_lookup(dir, dentry,
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d0e1f1522a78..52a283ba0465 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -73,6 +73,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    int (*show)(struct seq_file *, void *),
 						    proc_write_t write,
 						    void *data);
+extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
 #else /* CONFIG_PROC_FS */
 
@@ -114,6 +115,11 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
 #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
 #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
 
+static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+	return ERR_PTR(-EBADF);
+}
+
 #endif /* CONFIG_PROC_FS */
 
 struct net;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 257cccba3062..5eb2e351675e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -926,6 +926,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
 			 int flags, uint32_t sig);
+asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
+				       siginfo_t __user *info,
+				       unsigned int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d90127298f12..c861e7d1053b 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -740,9 +740,11 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
+#define __NR_pidfd_send_signal 424
+__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal)
 
 #undef __NR_syscalls
-#define __NR_syscalls 295
+#define __NR_syscalls 425
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/signal.c b/kernel/signal.c
index e1d7ad8e6ab1..268bed80244f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -19,7 +19,9 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/coredump.h>
@@ -3429,6 +3431,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 #endif
 #endif
 
+static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+{
+	clear_siginfo(info);
+	info->si_signo = sig;
+	info->si_errno = 0;
+	info->si_code = SI_USER;
+	info->si_pid = task_tgid_vnr(current);
+	info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
+}
+
 /**
  *  sys_kill - send a signal to a process
  *  @pid: the PID of the process
@@ -3438,16 +3450,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 {
 	struct kernel_siginfo info;
 
-	clear_siginfo(&info);
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_USER;
-	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+	prepare_kill_siginfo(sig, &info);
 
 	return kill_something_info(sig, &info, pid);
 }
 
+#ifdef CONFIG_PROC_FS
+/*
+ * Verify that the signaler and signalee either are in the same pid namespace
+ * or that the signaler's pid namespace is an ancestor of the signalee's pid
+ * namespace.
+ */
+static bool access_pidfd_pidns(struct pid *pid)
+{
+	struct pid_namespace *active = task_active_pid_ns(current);
+	struct pid_namespace *p = ns_of_pid(pid);
+
+	for (;;) {
+		if (!p)
+			return false;
+		if (p == active)
+			break;
+		p = p->parent;
+	}
+
+	return true;
+}
+
+static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
+{
+#ifdef CONFIG_COMPAT
+	/*
+	 * Avoid hooking up compat syscalls and instead handle necessary
+	 * conversions here. Note, this is a stop-gap measure and should not be
+	 * considered a generic solution.
+	 */
+	if (in_compat_syscall())
+		return copy_siginfo_from_user32(
+			kinfo, (struct compat_siginfo __user *)info);
+#endif
+	return copy_siginfo_from_user(kinfo, info);
+}
+
+/**
+ * sys_pidfd_send_signal - send a signal to a process through a task file
+ *                          descriptor
+ * @pidfd:  the file descriptor of the process
+ * @sig:    signal to be sent
+ * @info:   the signal info
+ * @flags:  future flags to be passed
+ *
+ * The syscall currently only signals via PIDTYPE_PID which covers
+ * kill(<positive-pid>, <signal>. It does not signal threads or process
+ * groups.
+ * In order to extend the syscall to threads and process groups the @flags
+ * argument should be used. In essence, the @flags argument will determine
+ * what is signaled and not the file descriptor itself. Put in other words,
+ * grouping is a property of the flags argument not a property of the file
+ * descriptor.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
+		siginfo_t __user *, info, unsigned int, flags)
+{
+	int ret;
+	struct fd f;
+	struct pid *pid;
+	kernel_siginfo_t kinfo;
+
+	/* Enforce flags be set to 0 until we add an extension. */
+	if (flags)
+		return -EINVAL;
+
+	f = fdget_raw(pidfd);
+	if (!f.file)
+		return -EBADF;
+
+	/* Is this a pidfd? */
+	pid = tgid_pidfd_to_pid(f.file);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto err;
+	}
+
+	ret = -EINVAL;
+	if (!access_pidfd_pidns(pid))
+		goto err;
+
+	if (info) {
+		ret = copy_siginfo_from_user_any(&kinfo, info);
+		if (unlikely(ret))
+			goto err;
+
+		ret = -EINVAL;
+		if (unlikely(sig != kinfo.si_signo))
+			goto err;
+
+		if ((task_pid(current) != pid) &&
+		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) {
+			/* Only allow sending arbitrary signals to yourself. */
+			ret = -EPERM;
+			if (kinfo.si_code != SI_USER)
+				goto err;
+
+			/* Turn this into a regular kill signal. */
+			prepare_kill_siginfo(sig, &kinfo);
+		}
+	} else {
+		prepare_kill_siginfo(sig, &kinfo);
+	}
+
+	ret = kill_pid_info(sig, &kinfo, pid);
+
+err:
+	fdput(f);
+	return ret;
+}
+#endif /* CONFIG_PROC_FS */
+
 static int
 do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ab9d0e3c6d50..f905f4f9f677 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -163,6 +163,7 @@ COND_SYSCALL(syslog);
 /* kernel/sched/core.c */
 
 /* kernel/signal.c */
+COND_SYSCALL(pidfd_send_signal);
 
 /* kernel/sys.c */
 COND_SYSCALL(setregid);
-- 
cgit v1.2.3


From fe33032daae2e584d9e7e33bab44c9eafced1f8f Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 1 Feb 2019 14:57:15 +0800
Subject: ceph: add mount option to limit caps count

If number of caps exceed the limit, ceph_trim_dentires() also trim
dentries with valid leases. Trimming dentry releases references to
associated inode, which may evict inode and release caps.

By default, there is no limit for caps count.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 Documentation/filesystems/ceph.txt |  4 ++++
 fs/ceph/caps.c                     | 33 ++++++++++++++++++++++++++-------
 fs/ceph/dir.c                      | 20 +++++++++++++++++++-
 fs/ceph/mds_client.c               | 34 ++++++++++++++++++++++++++--------
 fs/ceph/mds_client.h               |  3 +++
 fs/ceph/super.c                    | 12 +++++++++---
 fs/ceph/super.h                    |  5 +++--
 include/linux/ceph/types.h         |  1 +
 8 files changed, 91 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index 1177052701e1..bc4145ee5dba 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -118,6 +118,10 @@ Mount Options
 	of a non-responsive Ceph file system.  The default is 30
 	seconds.
 
+  caps_max=X
+	Specify the maximum number of caps to hold. Unused caps are released
+	when number of caps exceeds the limit. The default is 0 (no limit)
+
   rbytes
 	When stat() is called on a directory, set st_size to 'rbytes',
 	the summation of file sizes over all files nested beneath that
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6fbdc1a0afbe..36a8dc699448 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -148,11 +148,17 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc)
 	spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+			      struct ceph_mount_options *fsopt)
 {
 	spin_lock(&mdsc->caps_list_lock);
-	mdsc->caps_min_count += delta;
-	BUG_ON(mdsc->caps_min_count < 0);
+	mdsc->caps_min_count = fsopt->max_readdir;
+	if (mdsc->caps_min_count < 1024)
+		mdsc->caps_min_count = 1024;
+	mdsc->caps_use_max = fsopt->caps_max;
+	if (mdsc->caps_use_max > 0 &&
+	    mdsc->caps_use_max < mdsc->caps_min_count)
+		mdsc->caps_use_max = mdsc->caps_min_count;
 	spin_unlock(&mdsc->caps_list_lock);
 }
 
@@ -272,6 +278,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 	if (!err) {
 		BUG_ON(have + alloc != need);
 		ctx->count = need;
+		ctx->used = 0;
 	}
 
 	spin_lock(&mdsc->caps_list_lock);
@@ -295,13 +302,24 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 }
 
 void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
-			struct ceph_cap_reservation *ctx)
+			 struct ceph_cap_reservation *ctx)
 {
+	bool reclaim = false;
+	if (!ctx->count)
+		return;
+
 	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
 	spin_lock(&mdsc->caps_list_lock);
 	__ceph_unreserve_caps(mdsc, ctx->count);
 	ctx->count = 0;
+
+	if (mdsc->caps_use_max > 0 &&
+	    mdsc->caps_use_count > mdsc->caps_use_max)
+		reclaim = true;
 	spin_unlock(&mdsc->caps_list_lock);
+
+	if (reclaim)
+		ceph_reclaim_caps_nr(mdsc, ctx->used);
 }
 
 struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
@@ -346,6 +364,7 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
 	BUG_ON(list_empty(&mdsc->caps_list));
 
 	ctx->count--;
+	ctx->used++;
 	mdsc->caps_reserve_count--;
 	mdsc->caps_use_count++;
 
@@ -500,12 +519,12 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
-	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
 
 	ci->i_hold_caps_min = round_jiffies(jiffies +
-					    ma->caps_wanted_delay_min * HZ);
+					    opt->caps_wanted_delay_min * HZ);
 	ci->i_hold_caps_max = round_jiffies(jiffies +
-					    ma->caps_wanted_delay_max * HZ);
+					    opt->caps_wanted_delay_max * HZ);
 	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
 	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
 }
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index eba283557653..a8f429882249 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1224,6 +1224,7 @@ enum {
 
 struct ceph_lease_walk_control {
 	bool dir_lease;
+	bool expire_dir_lease;
 	unsigned long nr_to_scan;
 	unsigned long dir_lease_ttl;
 };
@@ -1345,7 +1346,13 @@ static int __dir_lease_check(struct dentry *dentry, void *arg)
 		/* Move dentry to tail of dir lease list if we don't want
 		 * to delete it. So dentries in the list are checked in a
 		 * round robin manner */
-		return TOUCH;
+		if (!lwc->expire_dir_lease)
+			return TOUCH;
+		if (dentry->d_lockref.count > 0 ||
+		    (di->flags & CEPH_DENTRY_REFERENCED))
+			return TOUCH;
+		/* invalidate dir lease */
+		di->lease_shared_gen = 0;
 	}
 	return DELETE;
 }
@@ -1353,8 +1360,17 @@ static int __dir_lease_check(struct dentry *dentry, void *arg)
 int ceph_trim_dentries(struct ceph_mds_client *mdsc)
 {
 	struct ceph_lease_walk_control lwc;
+	unsigned long count;
 	unsigned long freed;
 
+	spin_lock(&mdsc->caps_list_lock);
+        if (mdsc->caps_use_max > 0 &&
+            mdsc->caps_use_count > mdsc->caps_use_max)
+		count = mdsc->caps_use_count - mdsc->caps_use_max;
+	else
+		count = 0;
+        spin_unlock(&mdsc->caps_list_lock);
+
 	lwc.dir_lease = false;
 	lwc.nr_to_scan  = CEPH_CAPS_PER_RELEASE * 2;
 	freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
@@ -1365,6 +1381,8 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
 		lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
 
 	lwc.dir_lease = true;
+	lwc.expire_dir_lease = freed < count;
+	lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
 	freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
 	if (!lwc.nr_to_scan) /* more to check */
 		return -EAGAIN;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2095e5d038f8..21c33ed048ed 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1965,6 +1965,18 @@ void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
         }
 }
 
+void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
+{
+	int val;
+	if (!nr)
+		return;
+	val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
+	if (!(val % CEPH_CAPS_PER_RELEASE)) {
+		atomic_set(&mdsc->cap_reclaim_pending, 0);
+		ceph_queue_cap_reclaim_work(mdsc);
+	}
+}
+
 /*
  * requests
  */
@@ -2878,7 +2890,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
 				    req->r_op == CEPH_MDS_OP_LSSNAP))
 			ceph_readdir_prepopulate(req, req->r_session);
-		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
 	}
 	current->journal_info = NULL;
 	mutex_unlock(&req->r_fill_mutex);
@@ -2887,12 +2898,18 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	if (realm)
 		ceph_put_snap_realm(mdsc, realm);
 
-	if (err == 0 && req->r_target_inode &&
-	    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
-		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
-		spin_lock(&ci->i_unsafe_lock);
-		list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
-		spin_unlock(&ci->i_unsafe_lock);
+	if (err == 0) {
+		if (req->r_target_inode &&
+		    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+			struct ceph_inode_info *ci =
+				ceph_inode(req->r_target_inode);
+			spin_lock(&ci->i_unsafe_lock);
+			list_add_tail(&req->r_unsafe_target_item,
+				      &ci->i_unsafe_iops);
+			spin_unlock(&ci->i_unsafe_lock);
+		}
+
+		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
 	}
 out_err:
 	mutex_lock(&mdsc->mutex);
@@ -4083,13 +4100,14 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	spin_lock_init(&mdsc->cap_dirty_lock);
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+	atomic_set(&mdsc->cap_reclaim_pending, 0);
 
 	spin_lock_init(&mdsc->dentry_list_lock);
 	INIT_LIST_HEAD(&mdsc->dentry_leases);
 	INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
 
 	ceph_caps_init(mdsc);
-	ceph_adjust_min_caps(mdsc, fsc->min_caps);
+	ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
 
 	spin_lock_init(&mdsc->snapid_map_lock);
 	mdsc->snapid_map_tree = RB_ROOT;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 580b235f343b..50385a481fdb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -379,6 +379,7 @@ struct ceph_mds_client {
 	wait_queue_head_t cap_flushing_wq;
 
 	struct work_struct cap_reclaim_work;
+	atomic_t	   cap_reclaim_pending;
 
 	/*
 	 * Cap reservations
@@ -396,6 +397,7 @@ struct ceph_mds_client {
 						unreserved) */
 	int		caps_total_count;    /* total caps allocated */
 	int		caps_use_count;      /* in use */
+	int		caps_use_max;	     /* max used caps */
 	int		caps_reserve_count;  /* unused, reserved */
 	int		caps_avail_count;    /* unused, unreserved */
 	int		caps_min_count;      /* keep at least this many
@@ -465,6 +467,7 @@ extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
 extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
 				    struct ceph_mds_session *session);
 extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
+extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 
 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 200836bcf542..6d5bb2f74612 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -133,6 +133,7 @@ enum {
 	Opt_rasize,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
+	Opt_caps_max,
 	Opt_readdir_max_entries,
 	Opt_readdir_max_bytes,
 	Opt_congestion_kb,
@@ -175,6 +176,7 @@ static match_table_t fsopt_tokens = {
 	{Opt_rasize, "rasize=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+	{Opt_caps_max, "caps_max=%d"},
 	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
 	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
 	{Opt_congestion_kb, "write_congestion_kb=%d"},
@@ -286,6 +288,11 @@ static int parse_fsopt_token(char *c, void *private)
 			return -EINVAL;
 		fsopt->caps_wanted_delay_max = intval;
 		break;
+	case Opt_caps_max:
+		if (intval < 0)
+			return -EINVAL;
+		fsopt->caps_max = intval;
+		break;
 	case Opt_readdir_max_entries:
 		if (intval < 1)
 			return -EINVAL;
@@ -576,6 +583,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",rasize=%d", fsopt->rasize);
 	if (fsopt->congestion_kb != default_congestion_kb())
 		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+	if (fsopt->caps_max)
+		seq_printf(m, ",caps_max=%d", fsopt->caps_max);
 	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
 		seq_printf(m, ",caps_wanted_delay_min=%d",
 			 fsopt->caps_wanted_delay_min);
@@ -683,9 +692,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 	if (!fsc->wb_pagevec_pool)
 		goto fail_cap_wq;
 
-	/* caps */
-	fsc->min_caps = fsopt->max_readdir;
-
 	return fsc;
 
 fail_cap_wq:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b3bcfb3c27bd..16c03188578e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -79,6 +79,7 @@ struct ceph_mount_options {
 	int rasize;           /* max readahead */
 	int congestion_kb;    /* max writeback in flight */
 	int caps_wanted_delay_min, caps_wanted_delay_max;
+	int caps_max;
 	int max_readdir;       /* max readdir result (entires) */
 	int max_readdir_bytes; /* max readdir result (bytes) */
 
@@ -100,7 +101,6 @@ struct ceph_fs_client {
 	struct ceph_client *client;
 
 	unsigned long mount_state;
-	int min_caps;                  /* min caps i added */
 	loff_t max_file_size;
 
 	struct ceph_mds_client *mdsc;
@@ -668,7 +668,8 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
 
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+				     struct ceph_mount_options *fsopt);
 extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 			     struct ceph_cap_reservation *ctx, int need);
 extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h
index 27cd973d3881..bd3d532902d7 100644
--- a/include/linux/ceph/types.h
+++ b/include/linux/ceph/types.h
@@ -24,6 +24,7 @@ struct ceph_vino {
 /* context for the caps reservation mechanism */
 struct ceph_cap_reservation {
 	int count;
+	int used;
 };
 
 
-- 
cgit v1.2.3


From 0bdb50c531f7377a9da80d3ce2d61f389c84cb30 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Sun, 6 Jan 2019 21:06:25 +1100
Subject: dm: fix to_sector() for 32bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A dm-raid array with devices larger than 4GB won't assemble on
a 32 bit host since _check_data_dev_sectors() was added in 4.16.
This is because to_sector() treats its argument as an "unsigned long"
which is 32bits (4GB) on a 32bit host.  Using "unsigned long long"
is more correct.

Kernels as early as 4.2 can have other problems due to to_sector()
being used on the size of a device.

Fixes: 0cf4503174c1 ("dm raid: add support for the MD RAID0 personality")
cc: stable@vger.kernel.org (v4.2+)
Reported-and-tested-by: Guillaume Perréal <gperreal@free.fr>
Signed-off-by: NeilBrown <neil@brown.name>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 include/linux/device-mapper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 0f5b3d7c6cb3..52e8709c6df0 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -603,7 +603,7 @@ do {									\
  */
 #define dm_target_offset(ti, sector) ((sector) - (ti)->begin)
 
-static inline sector_t to_sector(unsigned long n)
+static inline sector_t to_sector(unsigned long long n)
 {
 	return (n >> SECTOR_SHIFT);
 }
-- 
cgit v1.2.3


From 6bbc923dfcf57d6b97388819a7393835664c7a8e Mon Sep 17 00:00:00 2001
From: Helen Koike <helen.koike@collabora.com>
Date: Thu, 21 Feb 2019 17:33:34 -0300
Subject: dm: add support to directly boot to a mapped device

Add a "create" module parameter, which allows device-mapper targets to
be configured at boot time. This enables early use of DM targets in the
boot process (as the root device or otherwise) without the need of an
initramfs.

The syntax used in the boot param is based on the concise format from
the dmsetup tool to follow the rule of least surprise:

	dmsetup table --concise /dev/mapper/lroot

Which is:
	dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]

Where,
	<name>		::= The device name.
	<uuid>		::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | ""
	<minor>		::= The device minor number | ""
	<flags>		::= "ro" | "rw"
	<table>		::= <start_sector> <num_sectors> <target_type> <target_args>
	<target_type>	::= "verity" | "linear" | ...

For example, the following could be added in the boot parameters:
dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0

Only the targets that were tested are allowed and the ones that don't
change any block device when the device is create as read-only. For
example, mirror and cache targets are not allowed. The rationale behind
this is that if the user makes a mistake, choosing the wrong device to
be the mirror or the cache can corrupt data.

The only targets initially allowed are:
* crypt
* delay
* linear
* snapshot-origin
* striped
* verity

Co-developed-by: Will Drewry <wad@chromium.org>
Co-developed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Signed-off-by: Helen Koike <helen.koike@collabora.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/dm-init.txt | 114 ++++++++++++
 drivers/md/Kconfig                      |  12 ++
 drivers/md/Makefile                     |   4 +
 drivers/md/dm-init.c                    | 303 ++++++++++++++++++++++++++++++++
 drivers/md/dm-ioctl.c                   | 103 +++++++++++
 include/linux/device-mapper.h           |   9 +
 6 files changed, 545 insertions(+)
 create mode 100644 Documentation/device-mapper/dm-init.txt
 create mode 100644 drivers/md/dm-init.c

(limited to 'include')

diff --git a/Documentation/device-mapper/dm-init.txt b/Documentation/device-mapper/dm-init.txt
new file mode 100644
index 000000000000..8464ee7c01b8
--- /dev/null
+++ b/Documentation/device-mapper/dm-init.txt
@@ -0,0 +1,114 @@
+Early creation of mapped devices
+====================================
+
+It is possible to configure a device-mapper device to act as the root device for
+your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal userspace
+which configures the device, then pivot_root(8) in to it.
+
+The second is to create one or more device-mappers using the module parameter
+"dm-mod.create=" through the kernel boot command line argument.
+
+The format is specified as a string of data separated by commas and optionally
+semi-colons, where:
+ - a comma is used to separate fields like name, uuid, flags and table
+   (specifies one device)
+ - a semi-colon is used to separate devices.
+
+So the format will look like this:
+
+ dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]
+
+Where,
+	<name>		::= The device name.
+	<uuid>		::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | ""
+	<minor>		::= The device minor number | ""
+	<flags>		::= "ro" | "rw"
+	<table>		::= <start_sector> <num_sectors> <target_type> <target_args>
+	<target_type>	::= "verity" | "linear" | ... (see list below)
+
+The dm line should be equivalent to the one used by the dmsetup tool with the
+--concise argument.
+
+Target types
+============
+
+Not all target types are available as there are serious risks in allowing
+activation of certain DM targets without first using userspace tools to check
+the validity of associated metadata.
+
+	"cache":		constrained, userspace should verify cache device
+	"crypt":		allowed
+	"delay":		allowed
+	"era":			constrained, userspace should verify metadata device
+	"flakey":		constrained, meant for test
+	"linear":		allowed
+	"log-writes":		constrained, userspace should verify metadata device
+	"mirror":		constrained, userspace should verify main/mirror device
+	"raid":			constrained, userspace should verify metadata device
+	"snapshot":		constrained, userspace should verify src/dst device
+	"snapshot-origin":	allowed
+	"snapshot-merge":	constrained, userspace should verify src/dst device
+	"striped":		allowed
+	"switch":		constrained, userspace should verify dev path
+	"thin":			constrained, requires dm target message from userspace
+	"thin-pool":		constrained, requires dm target message from userspace
+	"verity":		allowed
+	"writecache":		constrained, userspace should verify cache device
+	"zero":			constrained, not meant for rootfs
+
+If the target is not listed above, it is constrained by default (not tested).
+
+Examples
+========
+An example of booting to a linear array made up of user-mode linux block
+devices:
+
+  dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0
+
+This will boot to a rw dm-linear target of 8192 sectors split across two block
+devices identified by their major:minor numbers.  After boot, udev will rename
+this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned.
+
+An example of multiple device-mappers, with the dm-mod.create="..." contents is shown here
+split on multiple lines for readability:
+
+  vroot,,,ro,
+    0 1740800 verity 254:0 254:0 1740800 sha1
+      76e9be054b15884a9fa85973e9cb274c93afadb6
+      5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe;
+  vram,,,rw,
+    0 32768 linear 1:0 0,
+    32768 32768 linear 1:1 0
+
+Other examples (per target):
+
+"crypt":
+  dm-crypt,,8,ro,
+    0 1048576 crypt aes-xts-plain64
+    babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0
+    /dev/sda 0 1 allow_discards
+
+"delay":
+  dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500
+
+"linear":
+  dm-linear,,,rw,
+    0 32768 linear /dev/sda1 0,
+    32768 1024000 linear /dev/sda2 0,
+    1056768 204800 linear /dev/sda3 0,
+    1261568 512000 linear /dev/sda4 0
+
+"snapshot-origin":
+  dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2
+
+"striped":
+  dm-striped,,4,ro,0 1638400 striped 4 4096
+  /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0
+
+"verity":
+  dm-verity,,4,ro,
+    0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256
+    fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd
+    51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3db222509e44..2557f198e175 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -436,6 +436,18 @@ config DM_DELAY
 
 	If unsure, say N.
 
+config DM_INIT
+	bool "DM \"dm-mod.create=\" parameter support"
+	depends on BLK_DEV_DM=y
+	---help---
+	Enable "dm-mod.create=" parameter to create mapped devices at init time.
+	This option is useful to allow mounting rootfs without requiring an
+	initramfs.
+	See Documentation/device-mapper/dm-init.txt for dm-mod.create="..."
+	format.
+
+	If unsure, say N.
+
 config DM_UEVENT
 	bool "DM uevents"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 822f4e8753bc..a52b703e588e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -69,6 +69,10 @@ obj-$(CONFIG_DM_INTEGRITY)	+= dm-integrity.o
 obj-$(CONFIG_DM_ZONED)		+= dm-zoned.o
 obj-$(CONFIG_DM_WRITECACHE)	+= dm-writecache.o
 
+ifeq ($(CONFIG_DM_INIT),y)
+dm-mod-objs			+= dm-init.o
+endif
+
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
 endif
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
new file mode 100644
index 000000000000..b53f30f16b4d
--- /dev/null
+++ b/drivers/md/dm-init.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * dm-init.c
+ * Copyright (C) 2017 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+
+#define DM_MSG_PREFIX "init"
+#define DM_MAX_DEVICES 256
+#define DM_MAX_TARGETS 256
+#define DM_MAX_STR_SIZE 4096
+
+static char *create;
+
+/*
+ * Format: dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]
+ * Table format: <start_sector> <num_sectors> <target_type> <target_args>
+ *
+ * See Documentation/device-mapper/dm-init.txt for dm-mod.create="..." format
+ * details.
+ */
+
+struct dm_device {
+	struct dm_ioctl dmi;
+	struct dm_target_spec *table[DM_MAX_TARGETS];
+	char *target_args_array[DM_MAX_TARGETS];
+	struct list_head list;
+};
+
+const char *dm_allowed_targets[] __initconst = {
+	"crypt",
+	"delay",
+	"linear",
+	"snapshot-origin",
+	"striped",
+	"verity",
+};
+
+static int __init dm_verify_target_type(const char *target)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(dm_allowed_targets); i++) {
+		if (!strcmp(dm_allowed_targets[i], target))
+			return 0;
+	}
+	return -EINVAL;
+}
+
+static void __init dm_setup_cleanup(struct list_head *devices)
+{
+	struct dm_device *dev, *tmp;
+	unsigned int i;
+
+	list_for_each_entry_safe(dev, tmp, devices, list) {
+		list_del(&dev->list);
+		for (i = 0; i < dev->dmi.target_count; i++) {
+			kfree(dev->table[i]);
+			kfree(dev->target_args_array[i]);
+		}
+		kfree(dev);
+	}
+}
+
+/**
+ * str_field_delimit - delimit a string based on a separator char.
+ * @str: the pointer to the string to delimit.
+ * @separator: char that delimits the field
+ *
+ * Find a @separator and replace it by '\0'.
+ * Remove leading and trailing spaces.
+ * Return the remainder string after the @separator.
+ */
+static char __init *str_field_delimit(char **str, char separator)
+{
+	char *s;
+
+	/* TODO: add support for escaped characters */
+	*str = skip_spaces(*str);
+	s = strchr(*str, separator);
+	/* Delimit the field and remove trailing spaces */
+	if (s)
+		*s = '\0';
+	*str = strim(*str);
+	return s ? ++s : NULL;
+}
+
+/**
+ * dm_parse_table_entry - parse a table entry
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ *	<start_sector> <num_sectors> <target_type> <target_args>[, ...]
+ *
+ * Return the remainder string after the table entry, i.e, after the comma which
+ * delimits the entry or NULL if reached the end of the string.
+ */
+static char __init *dm_parse_table_entry(struct dm_device *dev, char *str)
+{
+	const unsigned int n = dev->dmi.target_count - 1;
+	struct dm_target_spec *sp;
+	unsigned int i;
+	/* fields:  */
+	char *field[4];
+	char *next;
+
+	field[0] = str;
+	/* Delimit first 3 fields that are separated by space */
+	for (i = 0; i < ARRAY_SIZE(field) - 1; i++) {
+		field[i + 1] = str_field_delimit(&field[i], ' ');
+		if (!field[i + 1])
+			return ERR_PTR(-EINVAL);
+	}
+	/* Delimit last field that can be terminated by comma */
+	next = str_field_delimit(&field[i], ',');
+
+	sp = kzalloc(sizeof(*sp), GFP_KERNEL);
+	if (!sp)
+		return ERR_PTR(-ENOMEM);
+	dev->table[n] = sp;
+
+	/* start_sector */
+	if (kstrtoull(field[0], 0, &sp->sector_start))
+		return ERR_PTR(-EINVAL);
+	/* num_sector */
+	if (kstrtoull(field[1], 0, &sp->length))
+		return ERR_PTR(-EINVAL);
+	/* target_type */
+	strscpy(sp->target_type, field[2], sizeof(sp->target_type));
+	if (dm_verify_target_type(sp->target_type)) {
+		DMERR("invalid type \"%s\"", sp->target_type);
+		return ERR_PTR(-EINVAL);
+	}
+	/* target_args */
+	dev->target_args_array[n] = kstrndup(field[3], GFP_KERNEL,
+					     DM_MAX_STR_SIZE);
+	if (!dev->target_args_array[n])
+		return ERR_PTR(-ENOMEM);
+
+	return next;
+}
+
+/**
+ * dm_parse_table - parse "dm-mod.create=" table field
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ *	<table>[,<table>+]
+ */
+static int __init dm_parse_table(struct dm_device *dev, char *str)
+{
+	char *table_entry = str;
+
+	while (table_entry) {
+		DMDEBUG("parsing table \"%s\"", str);
+		if (++dev->dmi.target_count >= DM_MAX_TARGETS) {
+			DMERR("too many targets %u > %d",
+			      dev->dmi.target_count, DM_MAX_TARGETS);
+			return -EINVAL;
+		}
+		table_entry = dm_parse_table_entry(dev, table_entry);
+		if (IS_ERR(table_entry)) {
+			DMERR("couldn't parse table");
+			return PTR_ERR(table_entry);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * dm_parse_device_entry - parse a device entry
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ *	name,uuid,minor,flags,table[; ...]
+ *
+ * Return the remainder string after the table entry, i.e, after the semi-colon
+ * which delimits the entry or NULL if reached the end of the string.
+ */
+static char __init *dm_parse_device_entry(struct dm_device *dev, char *str)
+{
+	/* There are 5 fields: name,uuid,minor,flags,table; */
+	char *field[5];
+	unsigned int i;
+	char *next;
+
+	field[0] = str;
+	/* Delimit first 4 fields that are separated by comma */
+	for (i = 0; i < ARRAY_SIZE(field) - 1; i++) {
+		field[i+1] = str_field_delimit(&field[i], ',');
+		if (!field[i+1])
+			return ERR_PTR(-EINVAL);
+	}
+	/* Delimit last field that can be delimited by semi-colon */
+	next = str_field_delimit(&field[i], ';');
+
+	/* name */
+	strscpy(dev->dmi.name, field[0], sizeof(dev->dmi.name));
+	/* uuid */
+	strscpy(dev->dmi.uuid, field[1], sizeof(dev->dmi.uuid));
+	/* minor */
+	if (strlen(field[2])) {
+		if (kstrtoull(field[2], 0, &dev->dmi.dev))
+			return ERR_PTR(-EINVAL);
+		dev->dmi.flags |= DM_PERSISTENT_DEV_FLAG;
+	}
+	/* flags */
+	if (!strcmp(field[3], "ro"))
+		dev->dmi.flags |= DM_READONLY_FLAG;
+	else if (strcmp(field[3], "rw"))
+		return ERR_PTR(-EINVAL);
+	/* table */
+	if (dm_parse_table(dev, field[4]))
+		return ERR_PTR(-EINVAL);
+
+	return next;
+}
+
+/**
+ * dm_parse_devices - parse "dm-mod.create=" argument
+ * @devices: list of struct dm_device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ *	<device>[;<device>+]
+ */
+static int __init dm_parse_devices(struct list_head *devices, char *str)
+{
+	unsigned long ndev = 0;
+	struct dm_device *dev;
+	char *device = str;
+
+	DMDEBUG("parsing \"%s\"", str);
+	while (device) {
+		dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+		if (!dev)
+			return -ENOMEM;
+		list_add_tail(&dev->list, devices);
+
+		if (++ndev >= DM_MAX_DEVICES) {
+			DMERR("too many targets %u > %d",
+			      dev->dmi.target_count, DM_MAX_TARGETS);
+			return -EINVAL;
+		}
+
+		device = dm_parse_device_entry(dev, device);
+		if (IS_ERR(device)) {
+			DMERR("couldn't parse device");
+			return PTR_ERR(device);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * dm_init_init - parse "dm-mod.create=" argument and configure drivers
+ */
+static int __init dm_init_init(void)
+{
+	struct dm_device *dev;
+	LIST_HEAD(devices);
+	char *str;
+	int r;
+
+	if (!create)
+		return 0;
+
+	if (strlen(create) >= DM_MAX_STR_SIZE) {
+		DMERR("Argument is too big. Limit is %d\n", DM_MAX_STR_SIZE);
+		return -EINVAL;
+	}
+	str = kstrndup(create, GFP_KERNEL, DM_MAX_STR_SIZE);
+	if (!str)
+		return -ENOMEM;
+
+	r = dm_parse_devices(&devices, str);
+	if (r)
+		goto out;
+
+	DMINFO("waiting for all devices to be available before creating mapped devices\n");
+	wait_for_device_probe();
+
+	list_for_each_entry(dev, &devices, list) {
+		if (dm_early_create(&dev->dmi, dev->table,
+				    dev->target_args_array))
+			break;
+	}
+out:
+	kfree(str);
+	dm_setup_cleanup(&devices);
+	return r;
+}
+
+late_initcall(dm_init_init);
+
+module_param(create, charp, 0);
+MODULE_PARM_DESC(create, "Create a mapped device in early boot");
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index f666778ad237..c740153b4e52 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -2018,3 +2018,106 @@ out:
 
 	return r;
 }
+
+
+/**
+ * dm_early_create - create a mapped device in early boot.
+ *
+ * @dmi: Contains main information of the device mapping to be created.
+ * @spec_array: array of pointers to struct dm_target_spec. Describes the
+ * mapping table of the device.
+ * @target_params_array: array of strings with the parameters to a specific
+ * target.
+ *
+ * Instead of having the struct dm_target_spec and the parameters for every
+ * target embedded at the end of struct dm_ioctl (as performed in a normal
+ * ioctl), pass them as arguments, so the caller doesn't need to serialize them.
+ * The size of the spec_array and target_params_array is given by
+ * @dmi->target_count.
+ * This function is supposed to be called in early boot, so locking mechanisms
+ * to protect against concurrent loads are not required.
+ */
+int __init dm_early_create(struct dm_ioctl *dmi,
+			   struct dm_target_spec **spec_array,
+			   char **target_params_array)
+{
+	int r, m = DM_ANY_MINOR;
+	struct dm_table *t, *old_map;
+	struct mapped_device *md;
+	unsigned int i;
+
+	if (!dmi->target_count)
+		return -EINVAL;
+
+	r = check_name(dmi->name);
+	if (r)
+		return r;
+
+	if (dmi->flags & DM_PERSISTENT_DEV_FLAG)
+		m = MINOR(huge_decode_dev(dmi->dev));
+
+	/* alloc dm device */
+	r = dm_create(m, &md);
+	if (r)
+		return r;
+
+	/* hash insert */
+	r = dm_hash_insert(dmi->name, *dmi->uuid ? dmi->uuid : NULL, md);
+	if (r)
+		goto err_destroy_dm;
+
+	/* alloc table */
+	r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md);
+	if (r)
+		goto err_destroy_dm;
+
+	/* add targets */
+	for (i = 0; i < dmi->target_count; i++) {
+		r = dm_table_add_target(t, spec_array[i]->target_type,
+					(sector_t) spec_array[i]->sector_start,
+					(sector_t) spec_array[i]->length,
+					target_params_array[i]);
+		if (r) {
+			DMWARN("error adding target to table");
+			goto err_destroy_table;
+		}
+	}
+
+	/* finish table */
+	r = dm_table_complete(t);
+	if (r)
+		goto err_destroy_table;
+
+	md->type = dm_table_get_type(t);
+	/* setup md->queue to reflect md's type (may block) */
+	r = dm_setup_md_queue(md, t);
+	if (r) {
+		DMWARN("unable to set up device queue for new table.");
+		goto err_destroy_table;
+	}
+
+	/* Set new map */
+	dm_suspend(md, 0);
+	old_map = dm_swap_table(md, t);
+	if (IS_ERR(old_map)) {
+		r = PTR_ERR(old_map);
+		goto err_destroy_table;
+	}
+	set_disk_ro(dm_disk(md), !!(dmi->flags & DM_READONLY_FLAG));
+
+	/* resume device */
+	r = dm_resume(md);
+	if (r)
+		goto err_destroy_table;
+
+	DMINFO("%s (%s) is ready", md->disk->disk_name, dmi->name);
+	dm_put(md);
+	return 0;
+
+err_destroy_table:
+	dm_table_destroy(t);
+err_destroy_dm:
+	dm_put(md);
+	dm_destroy(md);
+	return r;
+}
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 52e8709c6df0..b0672756d056 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -10,6 +10,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/dm-ioctl.h>
 #include <linux/math64.h>
 #include <linux/ratelimit.h>
 
@@ -425,6 +426,14 @@ void dm_remap_zone_report(struct dm_target *ti, sector_t start,
 			  struct blk_zone *zones, unsigned int *nr_zones);
 union map_info *dm_get_rq_mapinfo(struct request *rq);
 
+/*
+ * Device mapper functions to parse and create devices specified by the
+ * parameter "dm-mod.create="
+ */
+int __init dm_early_create(struct dm_ioctl *dmi,
+			   struct dm_target_spec **spec_array,
+			   char **target_params_array);
+
 struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
 
 /*
-- 
cgit v1.2.3


From 500e0b28ecd3c5aade98f3c3a339d18dcb166bb6 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Fri, 15 Feb 2019 00:08:25 +0800
Subject: f2fs: fix to check inline_xattr_size boundary correctly

We use below condition to check inline_xattr_size boundary:

	if (!F2FS_OPTION(sbi).inline_xattr_size ||
		F2FS_OPTION(sbi).inline_xattr_size >=
				DEF_ADDRS_PER_INODE -
				F2FS_TOTAL_EXTRA_ATTR_SIZE -
				DEF_INLINE_RESERVED_SIZE -
				DEF_MIN_INLINE_SIZE)

There is there problems in that check:
- we should allow inline_xattr_size equaling to min size of inline
{data,dentry} area.
- F2FS_TOTAL_EXTRA_ATTR_SIZE and inline_xattr_size are based on
different size unit, previous one is 4 bytes, latter one is 1 bytes.
- DEF_MIN_INLINE_SIZE only indicate min size of inline data area,
however, we need to consider min size of inline dentry area as well,
minimal inline dentry should at least contain two entries: '.' and
'..', so that min inline_dentry size is 40 bytes.

.bitmap		1 * 1 = 1
.reserved	1 * 1 = 1
.dentry		11 * 2 = 22
.filename	8 * 2 = 16
total		40

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h          |  1 -
 fs/f2fs/super.c         | 13 +++++++------
 include/linux/f2fs_fs.h | 13 +++++++------
 3 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4665bff1bf55..f1f0d2810852 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -459,7 +459,6 @@ struct f2fs_flush_device {
 
 /* for inline stuff */
 #define DEF_INLINE_RESERVED_SIZE	1
-#define DEF_MIN_INLINE_SIZE		1
 static inline int get_extra_isize(struct inode *inode);
 static inline int get_inline_xattr_addrs(struct inode *inode);
 #define MAX_INLINE_DATA(inode)	(sizeof(__le32) *			\
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 83bbe7424fc1..be8be445c6ed 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -834,12 +834,13 @@ static int parse_options(struct super_block *sb, char *options)
 					"set with inline_xattr option");
 			return -EINVAL;
 		}
-		if (!F2FS_OPTION(sbi).inline_xattr_size ||
-			F2FS_OPTION(sbi).inline_xattr_size >=
-					DEF_ADDRS_PER_INODE -
-					F2FS_TOTAL_EXTRA_ATTR_SIZE -
-					DEF_INLINE_RESERVED_SIZE -
-					DEF_MIN_INLINE_SIZE) {
+		if (F2FS_OPTION(sbi).inline_xattr_size <
+			sizeof(struct f2fs_xattr_header) / sizeof(__le32) ||
+			F2FS_OPTION(sbi).inline_xattr_size >
+			DEF_ADDRS_PER_INODE -
+			F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) -
+			DEF_INLINE_RESERVED_SIZE -
+			MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) {
 			f2fs_msg(sb, KERN_ERR,
 					"inline xattr size is out of range");
 			return -EINVAL;
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 8d57aaee8166..666db8eb71e0 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -490,12 +490,12 @@ typedef __le32	f2fs_hash_t;
 
 /*
  * space utilization of regular dentry and inline dentry (w/o extra reservation)
- *		regular dentry			inline dentry
- * bitmap	1 * 27 = 27			1 * 23 = 23
- * reserved	1 * 3 = 3			1 * 7 = 7
- * dentry	11 * 214 = 2354			11 * 182 = 2002
- * filename	8 * 214 = 1712			8 * 182 = 1456
- * total	4096				3488
+ *		regular dentry		inline dentry (def)	inline dentry (min)
+ * bitmap	1 * 27 = 27		1 * 23 = 23		1 * 1 = 1
+ * reserved	1 * 3 = 3		1 * 7 = 7		1 * 1 = 1
+ * dentry	11 * 214 = 2354		11 * 182 = 2002		11 * 2 = 22
+ * filename	8 * 214 = 1712		8 * 182 = 1456		8 * 2 = 16
+ * total	4096			3488			40
  *
  * Note: there are more reserved space in inline dentry than in regular
  * dentry, when converting inline dentry we should handle this carefully.
@@ -507,6 +507,7 @@ typedef __le32	f2fs_hash_t;
 #define SIZE_OF_RESERVED	(PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \
 				F2FS_SLOT_LEN) * \
 				NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP))
+#define MIN_INLINE_DENTRY_SIZE		40	/* just include '.' and '..' entries */
 
 /* One directory entry slot representing F2FS_SLOT_LEN-sized file name */
 struct f2fs_dir_entry {
-- 
cgit v1.2.3


From abe420bfae528c92bd8cc5ecb62dc95672b1fd6f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 7 Feb 2019 12:59:13 +0100
Subject: swiotlb: Introduce swiotlb_max_mapping_size()

The function returns the maximum size that can be remapped
by the SWIOTLB implementation. This function will be later
exposed to users through the DMA-API.

Cc: stable@vger.kernel.org
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/swiotlb.h | 5 +++++
 kernel/dma/swiotlb.c    | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7c007ed7505f..d3980aeed4a0 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -76,6 +76,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
+size_t swiotlb_max_mapping_size(struct device *dev);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
@@ -95,6 +96,10 @@ static inline unsigned int swiotlb_max_segment(void)
 {
 	return 0;
 }
+static inline size_t swiotlb_max_mapping_size(struct device *dev)
+{
+	return SIZE_MAX;
+}
 #endif /* CONFIG_SWIOTLB */
 
 extern void swiotlb_print_info(void);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1fb6fd68b9c7..9cb21259cb0b 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -662,3 +662,8 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
 	return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
+
+size_t swiotlb_max_mapping_size(struct device *dev)
+{
+	return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
+}
-- 
cgit v1.2.3


From 492366f7b4237257ef50ca9c431a6a0d50225aca Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 7 Feb 2019 12:59:14 +0100
Subject: swiotlb: Add is_swiotlb_active() function

This function will be used from dma_direct code to determine
the maximum segment size of a dma mapping.

Cc: stable@vger.kernel.org
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/swiotlb.h | 6 ++++++
 kernel/dma/swiotlb.c    | 9 +++++++++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d3980aeed4a0..29bc3a203283 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -77,6 +77,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
+bool is_swiotlb_active(void);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
@@ -100,6 +101,11 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev)
 {
 	return SIZE_MAX;
 }
+
+static inline bool is_swiotlb_active(void)
+{
+	return false;
+}
 #endif /* CONFIG_SWIOTLB */
 
 extern void swiotlb_print_info(void);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9cb21259cb0b..c873f9cc2146 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -667,3 +667,12 @@ size_t swiotlb_max_mapping_size(struct device *dev)
 {
 	return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
 }
+
+bool is_swiotlb_active(void)
+{
+	/*
+	 * When SWIOTLB is initialized, even if io_tlb_start points to physical
+	 * address zero, io_tlb_end surely doesn't.
+	 */
+	return io_tlb_end != 0;
+}
-- 
cgit v1.2.3


From 133d624b1cee16906134e92d5befb843b58bcf31 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 7 Feb 2019 12:59:15 +0100
Subject: dma: Introduce dma_max_mapping_size()

The function returns the maximum size that can be mapped
using DMA-API functions. The patch also adds the
implementation for direct DMA and a new dma_map_ops pointer
so that other implementations can expose their limit.

Cc: stable@vger.kernel.org
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 Documentation/DMA-API.txt   |  8 ++++++++
 include/linux/dma-mapping.h |  8 ++++++++
 kernel/dma/direct.c         | 11 +++++++++++
 kernel/dma/mapping.c        | 14 ++++++++++++++
 4 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index e133ccd60228..acfe3d0f78d1 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -195,6 +195,14 @@ Requesting the required mask does not alter the current mask.  If you
 wish to take advantage of it, you should issue a dma_set_mask()
 call to set the mask to the value returned.
 
+::
+
+	size_t
+	dma_direct_max_mapping_size(struct device *dev);
+
+Returns the maximum size of a mapping for the device. The size parameter
+of the mapping functions like dma_map_single(), dma_map_page() and
+others should not be larger than the returned value.
 
 Part Id - Streaming DMA mappings
 --------------------------------
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f6ded992c183..5b21f14802e1 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -130,6 +130,7 @@ struct dma_map_ops {
 			enum dma_data_direction direction);
 	int (*dma_supported)(struct device *dev, u64 mask);
 	u64 (*get_required_mask)(struct device *dev);
+	size_t (*max_mapping_size)(struct device *dev);
 };
 
 #define DMA_MAPPING_ERROR		(~(dma_addr_t)0)
@@ -257,6 +258,8 @@ static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
 }
 #endif
 
+size_t dma_direct_max_mapping_size(struct device *dev);
+
 #ifdef CONFIG_HAS_DMA
 #include <asm/dma-mapping.h>
 
@@ -460,6 +463,7 @@ int dma_supported(struct device *dev, u64 mask);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
+size_t dma_max_mapping_size(struct device *dev);
 #else /* CONFIG_HAS_DMA */
 static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 		struct page *page, size_t offset, size_t size,
@@ -561,6 +565,10 @@ static inline u64 dma_get_required_mask(struct device *dev)
 {
 	return 0;
 }
+static inline size_t dma_max_mapping_size(struct device *dev)
+{
+	return 0;
+}
 #endif /* CONFIG_HAS_DMA */
 
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 355d16acee6d..6310ad01f915 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -380,3 +380,14 @@ int dma_direct_supported(struct device *dev, u64 mask)
 	 */
 	return mask >= __phys_to_dma(dev, min_mask);
 }
+
+size_t dma_direct_max_mapping_size(struct device *dev)
+{
+	size_t size = SIZE_MAX;
+
+	/* If SWIOTLB is active, use its maximum mapping size */
+	if (is_swiotlb_active())
+		size = swiotlb_max_mapping_size(dev);
+
+	return size;
+}
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a11006b6d8e8..5753008ab286 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -357,3 +357,17 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		ops->cache_sync(dev, vaddr, size, dir);
 }
 EXPORT_SYMBOL(dma_cache_sync);
+
+size_t dma_max_mapping_size(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	size_t size = SIZE_MAX;
+
+	if (dma_is_direct(ops))
+		size = dma_direct_max_mapping_size(dev);
+	else if (ops && ops->max_mapping_size)
+		size = ops->max_mapping_size(dev);
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(dma_max_mapping_size);
-- 
cgit v1.2.3


From e6d6dd6c875eb3c9b69bb640419405726e6e0bbe Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 7 Feb 2019 12:59:16 +0100
Subject: virtio: Introduce virtio_max_dma_size()

This function returns the maximum segment size for a single
dma transaction of a virtio device. The possible limit comes
from the SWIOTLB implementation in the Linux kernel, that
has an upper limit of (currently) 256kb of contiguous
memory it can map. Other DMA-API implementations might also
have limits.

Use the new dma_max_mapping_size() function to determine the
maximum mapping size when DMA-API is in use for virtio.

Cc: stable@vger.kernel.org
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 11 +++++++++++
 include/linux/virtio.h       |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a0b07c331255..18846afb39da 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -271,6 +271,17 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
 	return false;
 }
 
+size_t virtio_max_dma_size(struct virtio_device *vdev)
+{
+	size_t max_segment_size = SIZE_MAX;
+
+	if (vring_use_dma_api(vdev))
+		max_segment_size = dma_max_mapping_size(&vdev->dev);
+
+	return max_segment_size;
+}
+EXPORT_SYMBOL_GPL(virtio_max_dma_size);
+
 static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
 			      dma_addr_t *dma_handle, gfp_t flag)
 {
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index fa1b5da2804e..673fe3ef3607 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -157,6 +157,8 @@ int virtio_device_freeze(struct virtio_device *dev);
 int virtio_device_restore(struct virtio_device *dev);
 #endif
 
+size_t virtio_max_dma_size(struct virtio_device *vdev);
+
 #define virtio_device_for_each_vq(vdev, vq) \
 	list_for_each_entry(vq, &vdev->vqs, list)
 
-- 
cgit v1.2.3


From ab7a2375fb8e83f8744c34442f476fa5a9df5e35 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@redhat.com>
Date: Thu, 31 Jan 2019 13:53:14 +0100
Subject: virtio: hint if callbacks surprisingly might sleep

A virtio transport is free to implement some of the callbacks in
virtio_config_ops in a matter that they cannot be called from
atomic context (e.g. virtio-ccw, which maps a lot of the callbacks
to channel I/O, which is an inherently asynchronous mechanism).
This can be very surprising for developers using the much more
common virtio-pci transport, just to find out that things break
when used on s390.

The documentation for virtio_config_ops now contains a comment
explaining this, but it makes sense to add a might_sleep() annotation
to various wrapper functions in the virtio core to avoid surprises
later.

Note that annotations are NOT added to two classes of calls:
- direct calls from device drivers (all current callers should be
  fine, however)
- calls which clearly won't be made from atomic context (such as
  those ultimately coming in via the driver core)

Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio.c       |  2 ++
 include/linux/virtio_config.h | 13 +++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 59e36ef4920f..98b30f54342c 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -161,6 +161,7 @@ EXPORT_SYMBOL_GPL(virtio_config_enable);
 
 void virtio_add_status(struct virtio_device *dev, unsigned int status)
 {
+	might_sleep();
 	dev->config->set_status(dev, dev->config->get_status(dev) | status);
 }
 EXPORT_SYMBOL_GPL(virtio_add_status);
@@ -170,6 +171,7 @@ int virtio_finalize_features(struct virtio_device *dev)
 	int ret = dev->config->finalize_features(dev);
 	unsigned status;
 
+	might_sleep();
 	if (ret)
 		return ret;
 
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 987b6491b946..bb4cc4910750 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -290,6 +290,7 @@ static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val)
 /* Config space accessors. */
 #define virtio_cread(vdev, structname, member, ptr)			\
 	do {								\
+		might_sleep();						\
 		/* Must match the member's type, and be integer */	\
 		if (!typecheck(typeof((((structname*)0)->member)), *(ptr))) \
 			(*ptr) = 1;					\
@@ -319,6 +320,7 @@ static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val)
 /* Config space accessors. */
 #define virtio_cwrite(vdev, structname, member, ptr)			\
 	do {								\
+		might_sleep();						\
 		/* Must match the member's type, and be integer */	\
 		if (!typecheck(typeof((((structname*)0)->member)), *(ptr))) \
 			BUG_ON((*ptr) == 1);				\
@@ -358,6 +360,7 @@ static inline void __virtio_cread_many(struct virtio_device *vdev,
 		vdev->config->generation(vdev) : 0;
 	int i;
 
+	might_sleep();
 	do {
 		old = gen;
 
@@ -380,6 +383,8 @@ static inline void virtio_cread_bytes(struct virtio_device *vdev,
 static inline u8 virtio_cread8(struct virtio_device *vdev, unsigned int offset)
 {
 	u8 ret;
+
+	might_sleep();
 	vdev->config->get(vdev, offset, &ret, sizeof(ret));
 	return ret;
 }
@@ -387,6 +392,7 @@ static inline u8 virtio_cread8(struct virtio_device *vdev, unsigned int offset)
 static inline void virtio_cwrite8(struct virtio_device *vdev,
 				  unsigned int offset, u8 val)
 {
+	might_sleep();
 	vdev->config->set(vdev, offset, &val, sizeof(val));
 }
 
@@ -394,6 +400,8 @@ static inline u16 virtio_cread16(struct virtio_device *vdev,
 				 unsigned int offset)
 {
 	u16 ret;
+
+	might_sleep();
 	vdev->config->get(vdev, offset, &ret, sizeof(ret));
 	return virtio16_to_cpu(vdev, (__force __virtio16)ret);
 }
@@ -401,6 +409,7 @@ static inline u16 virtio_cread16(struct virtio_device *vdev,
 static inline void virtio_cwrite16(struct virtio_device *vdev,
 				   unsigned int offset, u16 val)
 {
+	might_sleep();
 	val = (__force u16)cpu_to_virtio16(vdev, val);
 	vdev->config->set(vdev, offset, &val, sizeof(val));
 }
@@ -409,6 +418,8 @@ static inline u32 virtio_cread32(struct virtio_device *vdev,
 				 unsigned int offset)
 {
 	u32 ret;
+
+	might_sleep();
 	vdev->config->get(vdev, offset, &ret, sizeof(ret));
 	return virtio32_to_cpu(vdev, (__force __virtio32)ret);
 }
@@ -416,6 +427,7 @@ static inline u32 virtio_cread32(struct virtio_device *vdev,
 static inline void virtio_cwrite32(struct virtio_device *vdev,
 				   unsigned int offset, u32 val)
 {
+	might_sleep();
 	val = (__force u32)cpu_to_virtio32(vdev, val);
 	vdev->config->set(vdev, offset, &val, sizeof(val));
 }
@@ -431,6 +443,7 @@ static inline u64 virtio_cread64(struct virtio_device *vdev,
 static inline void virtio_cwrite64(struct virtio_device *vdev,
 				   unsigned int offset, u64 val)
 {
+	might_sleep();
 	val = (__force u64)cpu_to_virtio64(vdev, val);
 	vdev->config->set(vdev, offset, &val, sizeof(val));
 }
-- 
cgit v1.2.3


From 27da0d2ef998e222a876c0cec72aa7829a626266 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 6 Mar 2019 11:52:36 +0100
Subject: appletalk: Fix compile regression

A bugfix just broke compilation of appletalk when CONFIG_SYSCTL
is disabled:

In file included from net/appletalk/ddp.c:65:
net/appletalk/ddp.c: In function 'atalk_init':
include/linux/atalk.h:164:34: error: expected expression before 'do'
 #define atalk_register_sysctl()  do { } while(0)
                                  ^~
net/appletalk/ddp.c:1934:7: note: in expansion of macro 'atalk_register_sysctl'
  rc = atalk_register_sysctl();

This is easier to avoid by using conventional inline functions
as stubs rather than macros. The header already has inline
functions for other purposes, so I'm changing over all the
macros for consistency.

Fixes: 6377f787aeb9 ("appletalk: Fix use-after-free in atalk_proc_exit")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/atalk.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/atalk.h b/include/linux/atalk.h
index 5a90f28d5ff2..d5cfc0b15b76 100644
--- a/include/linux/atalk.h
+++ b/include/linux/atalk.h
@@ -161,16 +161,26 @@ extern int sysctl_aarp_resolve_time;
 extern int atalk_register_sysctl(void);
 extern void atalk_unregister_sysctl(void);
 #else
-#define atalk_register_sysctl()		do { } while(0)
-#define atalk_unregister_sysctl()	do { } while(0)
+static inline int atalk_register_sysctl(void)
+{
+	return 0;
+}
+static inline void atalk_unregister_sysctl(void)
+{
+}
 #endif
 
 #ifdef CONFIG_PROC_FS
 extern int atalk_proc_init(void);
 extern void atalk_proc_exit(void);
 #else
-#define atalk_proc_init()	({ 0; })
-#define atalk_proc_exit()	do { } while(0)
+static inline int atalk_proc_init(void)
+{
+	return 0;
+}
+static inline void atalk_proc_exit(void)
+{
+}
 #endif /* CONFIG_PROC_FS */
 
 #endif /* __LINUX_ATALK_H__ */
-- 
cgit v1.2.3


From f76a16adc485699f95bb71fce114f97c832fe664 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 6 Mar 2019 11:07:24 -0600
Subject: x86/unwind/orc: Fix ORC unwind table alignment

The .orc_unwind section is a packed array of 6-byte structs.  It's
currently aligned to 6 bytes, which is causing warnings in the LLD
linker.

Six isn't a power of two, so it's not a valid alignment value.  The
actual alignment doesn't matter much because it's an array of packed
structs.  An alignment of two is sufficient.  In reality it always gets
aligned to four bytes because it comes immediately after the
4-byte-aligned .orc_unwind_ip section.

Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder")
Reported-by: Nick Desaulniers <ndesaulniers@google.com>
Reported-by: Dmitry Golovin <dima@golovin.in>
Reported-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://github.com/ClangBuiltLinux/linux/issues/218
Link: https://lkml.kernel.org/r/d55027ee95fe73e952dcd8be90aebd31b0095c45.1551892041.git.jpoimboe@redhat.com
---
 include/asm-generic/vmlinux.lds.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 3d7a6a9c2370..f8f6f04c4453 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -733,7 +733,7 @@
 		KEEP(*(.orc_unwind_ip))					\
 		__stop_orc_unwind_ip = .;				\
 	}								\
-	. = ALIGN(6);							\
+	. = ALIGN(2);							\
 	.orc_unwind : AT(ADDR(.orc_unwind) - LOAD_OFFSET) {		\
 		__start_orc_unwind = .;					\
 		KEEP(*(.orc_unwind))					\
-- 
cgit v1.2.3


From 221c5eb2338232f7340386de1c43decc32682e58 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 17 Jan 2019 09:41:58 -0700
Subject: io_uring: add support for IORING_OP_POLL

This is basically a direct port of bfe4037e722e, which implements a
one-shot poll command through aio. Description below is based on that
commit as well. However, instead of adding a POLL command and relying
on io_cancel(2) to remove it, we mimic the epoll(2) interface of
having a command to add a poll notification, IORING_OP_POLL_ADD,
and one to remove it again, IORING_OP_POLL_REMOVE.

To poll for a file descriptor the application should submit an sqe of
type IORING_OP_POLL. It will poll the fd for the events specified in the
poll_events field.

Unlike poll or epoll without EPOLLONESHOT this interface always works in
one shot mode, that is once the sqe is completed, it will have to be
resubmitted.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Based-on-code-from: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 263 +++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/io_uring.h |   3 +
 2 files changed, 265 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2a3a1bc85cc..85b914bd823c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -161,6 +161,7 @@ struct io_ring_ctx {
 		 * manipulate the list, hence no extra locking is needed there.
 		 */
 		struct list_head	poll_list;
+		struct list_head	cancel_list;
 	} ____cacheline_aligned_in_smp;
 
 #if defined(CONFIG_UNIX)
@@ -176,8 +177,20 @@ struct sqe_submit {
 	bool				needs_fixed_file;
 };
 
+struct io_poll_iocb {
+	struct file			*file;
+	struct wait_queue_head		*head;
+	__poll_t			events;
+	bool				woken;
+	bool				canceled;
+	struct wait_queue_entry		wait;
+};
+
 struct io_kiocb {
-	struct kiocb		rw;
+	union {
+		struct kiocb		rw;
+		struct io_poll_iocb	poll;
+	};
 
 	struct sqe_submit	submit;
 
@@ -261,6 +274,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
+	INIT_LIST_HEAD(&ctx->cancel_list);
 	return ctx;
 }
 
@@ -1058,6 +1072,246 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static void io_poll_remove_one(struct io_kiocb *req)
+{
+	struct io_poll_iocb *poll = &req->poll;
+
+	spin_lock(&poll->head->lock);
+	WRITE_ONCE(poll->canceled, true);
+	if (!list_empty(&poll->wait.entry)) {
+		list_del_init(&poll->wait.entry);
+		queue_work(req->ctx->sqo_wq, &req->work);
+	}
+	spin_unlock(&poll->head->lock);
+
+	list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&ctx->cancel_list)) {
+		req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
+		io_poll_remove_one(req);
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *poll_req, *next;
+	int ret = -ENOENT;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+	    sqe->poll_events)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+			io_poll_remove_one(poll_req);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_free_req(req);
+	return 0;
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+{
+	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+	io_fput(req);
+	io_free_req(req);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_poll_iocb *poll = &req->poll;
+	struct poll_table_struct pt = { ._key = poll->events };
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = 0;
+
+	if (!READ_ONCE(poll->canceled))
+		mask = vfs_poll(poll->file, &pt) & poll->events;
+
+	/*
+	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
+	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
+	 * synchronize with them.  In the cancellation case the list_del_init
+	 * itself is not actually needed, but harmless so we keep it in to
+	 * avoid further branches in the fast path.
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	if (!mask && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		spin_unlock_irq(&ctx->completion_lock);
+		return;
+	}
+	list_del_init(&req->list);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_poll_complete(req, mask);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key)
+{
+	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+							wait);
+	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = key_to_poll(key);
+
+	poll->woken = true;
+
+	/* for instances that support it check for an event match first: */
+	if (mask) {
+		unsigned long flags;
+
+		if (!(mask & poll->events))
+			return 0;
+
+		/* try to complete the iocb inline if we can: */
+		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+			list_del(&req->list);
+			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+			list_del_init(&poll->wait.entry);
+			io_poll_complete(req, mask);
+			return 1;
+		}
+	}
+
+	list_del_init(&poll->wait.entry);
+	queue_work(ctx->sqo_wq, &req->work);
+	return 1;
+}
+
+struct io_poll_table {
+	struct poll_table_struct pt;
+	struct io_kiocb *req;
+	int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+			       struct poll_table_struct *p)
+{
+	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+	if (unlikely(pt->req->poll.head)) {
+		pt->error = -EINVAL;
+		return;
+	}
+
+	pt->error = 0;
+	pt->req->poll.head = head;
+	add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_poll_iocb *poll = &req->poll;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_poll_table ipt;
+	unsigned flags;
+	__poll_t mask;
+	u16 events;
+	int fd;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+		return -EINVAL;
+
+	INIT_WORK(&req->work, io_poll_complete_work);
+	events = READ_ONCE(sqe->poll_events);
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+	flags = READ_ONCE(sqe->flags);
+	fd = READ_ONCE(sqe->fd);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+			return -EBADF;
+		poll->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		poll->file = fget(fd);
+	}
+	if (unlikely(!poll->file))
+		return -EBADF;
+
+	poll->head = NULL;
+	poll->woken = false;
+	poll->canceled = false;
+
+	ipt.pt._qproc = io_poll_queue_proc;
+	ipt.pt._key = poll->events;
+	ipt.req = req;
+	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+	/* initialized the list so that we can do list_empty checks */
+	INIT_LIST_HEAD(&poll->wait.entry);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+	/* one for removal from waitqueue, one for this function */
+	refcount_set(&req->refs, 2);
+
+	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+	if (unlikely(!poll->head)) {
+		/* we did not manage to set up a waitqueue, done */
+		goto out;
+	}
+
+	spin_lock_irq(&ctx->completion_lock);
+	spin_lock(&poll->head->lock);
+	if (poll->woken) {
+		/* wake_up context handles the rest */
+		mask = 0;
+		ipt.error = 0;
+	} else if (mask || ipt.error) {
+		/* if we get an error or a mask we are done */
+		WARN_ON_ONCE(list_empty(&poll->wait.entry));
+		list_del_init(&poll->wait.entry);
+	} else {
+		/* actually waiting for an event */
+		list_add_tail(&req->list, &ctx->cancel_list);
+	}
+	spin_unlock(&poll->head->lock);
+	spin_unlock_irq(&ctx->completion_lock);
+
+out:
+	if (unlikely(ipt.error)) {
+		if (!(flags & IOSQE_FIXED_FILE))
+			fput(poll->file);
+		/*
+		 * Drop one of our refs to this req, __io_submit_sqe() will
+		 * drop the other one since we're returning an error.
+		 */
+		io_free_req(req);
+		return ipt.error;
+	}
+
+	if (mask)
+		io_poll_complete(req, mask);
+	io_free_req(req);
+	return 0;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
@@ -1093,6 +1347,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_FSYNC:
 		ret = io_fsync(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_POLL_ADD:
+		ret = io_poll_add(req, s->sqe);
+		break;
+	case IORING_OP_POLL_REMOVE:
+		ret = io_poll_remove(req, s->sqe);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -2131,6 +2391,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);
 
+	io_poll_remove_all(ctx);
 	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0ec74bab8dbe..e23408692118 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -25,6 +25,7 @@ struct io_uring_sqe {
 	union {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
+		__u16		poll_events;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -51,6 +52,8 @@ struct io_uring_sqe {
 #define IORING_OP_FSYNC		3
 #define IORING_OP_READ_FIXED	4
 #define IORING_OP_WRITE_FIXED	5
+#define IORING_OP_POLL_ADD	6
+#define IORING_OP_POLL_REMOVE	7
 
 /*
  * sqe->fsync_flags
-- 
cgit v1.2.3


From 4981b82ba2ff87df6a711fcd7a233c615df5fc79 Mon Sep 17 00:00:00 2001
From: Wendy Liang <wendy.liang@xilinx.com>
Date: Thu, 21 Feb 2019 16:36:33 -0800
Subject: mailbox: ZynqMP IPI mailbox controller

This patch is to introduce ZynqMP IPI mailbox controller driver
to use the ZynqMP IPI block as mailboxes.

Signed-off-by: Wendy Liang <wendy.liang@xilinx.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/Kconfig                    |  11 +
 drivers/mailbox/Makefile                   |   2 +
 drivers/mailbox/zynqmp-ipi-mailbox.c       | 725 +++++++++++++++++++++++++++++
 include/linux/mailbox/zynqmp-ipi-message.h |  20 +
 4 files changed, 758 insertions(+)
 create mode 100644 drivers/mailbox/zynqmp-ipi-mailbox.c
 create mode 100644 include/linux/mailbox/zynqmp-ipi-message.h

(limited to 'include')

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index 3eeb12e93e98..d86e7a4ac04d 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -205,4 +205,15 @@ config MTK_CMDQ_MBOX
 	  mailbox driver. The CMDQ is used to help read/write registers with
 	  critical time limitation, such as updating display configuration
 	  during the vblank.
+
+config ZYNQMP_IPI_MBOX
+	bool "Xilinx ZynqMP IPI Mailbox"
+	depends on ARCH_ZYNQMP && OF
+	help
+	  Say yes here to add support for Xilinx IPI mailbox driver.
+	  This mailbox driver is used to send notification or short message
+	  between processors with Xilinx ZynqMP IPI. It will place the
+	  message to the IPI buffer and will access the IPI control
+	  registers to kick the other processor or enquire status.
+
 endif
diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile
index c818b5d011ae..8be3bcbcf882 100644
--- a/drivers/mailbox/Makefile
+++ b/drivers/mailbox/Makefile
@@ -44,3 +44,5 @@ obj-$(CONFIG_TEGRA_HSP_MBOX)	+= tegra-hsp.o
 obj-$(CONFIG_STM32_IPCC) 	+= stm32-ipcc.o
 
 obj-$(CONFIG_MTK_CMDQ_MBOX)	+= mtk-cmdq-mailbox.o
+
+obj-$(CONFIG_ZYNQMP_IPI_MBOX)	+= zynqmp-ipi-mailbox.o
diff --git a/drivers/mailbox/zynqmp-ipi-mailbox.c b/drivers/mailbox/zynqmp-ipi-mailbox.c
new file mode 100644
index 000000000000..86887c9a349a
--- /dev/null
+++ b/drivers/mailbox/zynqmp-ipi-mailbox.c
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xilinx Inter Processor Interrupt(IPI) Mailbox Driver
+ *
+ * Copyright (C) 2018 Xilinx, Inc.
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/mailbox_controller.h>
+#include <linux/mailbox/zynqmp-ipi-message.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+
+/* IPI agent ID any */
+#define IPI_ID_ANY 0xFFUL
+
+/* indicate if ZynqMP IPI mailbox driver uses SMC calls or HVC calls */
+#define USE_SMC 0
+#define USE_HVC 1
+
+/* Default IPI SMC function IDs */
+#define SMC_IPI_MAILBOX_OPEN		0x82001000U
+#define SMC_IPI_MAILBOX_RELEASE		0x82001001U
+#define SMC_IPI_MAILBOX_STATUS_ENQUIRY	0x82001002U
+#define SMC_IPI_MAILBOX_NOTIFY		0x82001003U
+#define SMC_IPI_MAILBOX_ACK		0x82001004U
+#define SMC_IPI_MAILBOX_ENABLE_IRQ	0x82001005U
+#define SMC_IPI_MAILBOX_DISABLE_IRQ	0x82001006U
+
+/* IPI SMC Macros */
+#define IPI_SMC_ENQUIRY_DIRQ_MASK	0x00000001UL /* Flag to indicate if
+						      * notification interrupt
+						      * to be disabled.
+						      */
+#define IPI_SMC_ACK_EIRQ_MASK		0x00000001UL /* Flag to indicate if
+						      * notification interrupt
+						      * to be enabled.
+						      */
+
+/* IPI mailbox status */
+#define IPI_MB_STATUS_IDLE		0
+#define IPI_MB_STATUS_SEND_PENDING	1
+#define IPI_MB_STATUS_RECV_PENDING	2
+
+#define IPI_MB_CHNL_TX	0 /* IPI mailbox TX channel */
+#define IPI_MB_CHNL_RX	1 /* IPI mailbox RX channel */
+
+/**
+ * struct zynqmp_ipi_mchan - Description of a Xilinx ZynqMP IPI mailbox channel
+ * @is_opened: indicate if the IPI channel is opened
+ * @req_buf: local to remote request buffer start address
+ * @resp_buf: local to remote response buffer start address
+ * @req_buf_size: request buffer size
+ * @resp_buf_size: response buffer size
+ * @rx_buf: receive buffer to pass received message to client
+ * @chan_type: channel type
+ */
+struct zynqmp_ipi_mchan {
+	int is_opened;
+	void __iomem *req_buf;
+	void __iomem *resp_buf;
+	void *rx_buf;
+	size_t req_buf_size;
+	size_t resp_buf_size;
+	unsigned int chan_type;
+};
+
+/**
+ * struct zynqmp_ipi_mbox - Description of a ZynqMP IPI mailbox
+ *                          platform data.
+ * @pdata:		  pointer to the IPI private data
+ * @dev:                  device pointer corresponding to the Xilinx ZynqMP
+ *                        IPI mailbox
+ * @remote_id:            remote IPI agent ID
+ * @mbox:                 mailbox Controller
+ * @mchans:               array for channels, tx channel and rx channel.
+ * @irq:                  IPI agent interrupt ID
+ */
+struct zynqmp_ipi_mbox {
+	struct zynqmp_ipi_pdata *pdata;
+	struct device dev;
+	u32 remote_id;
+	struct mbox_controller mbox;
+	struct zynqmp_ipi_mchan mchans[2];
+};
+
+/**
+ * struct zynqmp_ipi_pdata - Description of z ZynqMP IPI agent platform data.
+ *
+ * @dev:                  device pointer corresponding to the Xilinx ZynqMP
+ *                        IPI agent
+ * @irq:                  IPI agent interrupt ID
+ * @method:               IPI SMC or HVC is going to be used
+ * @local_id:             local IPI agent ID
+ * @num_mboxes:           number of mailboxes of this IPI agent
+ * @ipi_mboxes:           IPI mailboxes of this IPI agent
+ */
+struct zynqmp_ipi_pdata {
+	struct device *dev;
+	int irq;
+	unsigned int method;
+	u32 local_id;
+	int num_mboxes;
+	struct zynqmp_ipi_mbox *ipi_mboxes;
+};
+
+static struct device_driver zynqmp_ipi_mbox_driver = {
+	.owner = THIS_MODULE,
+	.name = "zynqmp-ipi-mbox",
+};
+
+static void zynqmp_ipi_fw_call(struct zynqmp_ipi_mbox *ipi_mbox,
+			       unsigned long a0, unsigned long a3,
+			       struct arm_smccc_res *res)
+{
+	struct zynqmp_ipi_pdata *pdata = ipi_mbox->pdata;
+	unsigned long a1, a2;
+
+	a1 = pdata->local_id;
+	a2 = ipi_mbox->remote_id;
+	if (pdata->method == USE_SMC)
+		arm_smccc_smc(a0, a1, a2, a3, 0, 0, 0, 0, res);
+	else
+		arm_smccc_hvc(a0, a1, a2, a3, 0, 0, 0, 0, res);
+}
+
+/**
+ * zynqmp_ipi_interrupt - Interrupt handler for IPI notification
+ *
+ * @irq:  Interrupt number
+ * @data: ZynqMP IPI mailbox platform data.
+ *
+ * Return: -EINVAL if there is no instance
+ * IRQ_NONE if the interrupt is not ours.
+ * IRQ_HANDLED if the rx interrupt was successfully handled.
+ */
+static irqreturn_t zynqmp_ipi_interrupt(int irq, void *data)
+{
+	struct zynqmp_ipi_pdata *pdata = data;
+	struct mbox_chan *chan;
+	struct zynqmp_ipi_mbox *ipi_mbox;
+	struct zynqmp_ipi_mchan *mchan;
+	struct zynqmp_ipi_message *msg;
+	u64 arg0, arg3;
+	struct arm_smccc_res res;
+	int ret, i;
+
+	(void)irq;
+	arg0 = SMC_IPI_MAILBOX_STATUS_ENQUIRY;
+	arg3 = IPI_SMC_ENQUIRY_DIRQ_MASK;
+	for (i = 0; i < pdata->num_mboxes; i++) {
+		ipi_mbox = &pdata->ipi_mboxes[i];
+		mchan = &ipi_mbox->mchans[IPI_MB_CHNL_RX];
+		chan = &ipi_mbox->mbox.chans[IPI_MB_CHNL_RX];
+		zynqmp_ipi_fw_call(ipi_mbox, arg0, arg3, &res);
+		ret = (int)(res.a0 & 0xFFFFFFFF);
+		if (ret > 0 && ret & IPI_MB_STATUS_RECV_PENDING) {
+			if (mchan->is_opened) {
+				msg = mchan->rx_buf;
+				msg->len = mchan->req_buf_size;
+				memcpy_fromio(msg->data, mchan->req_buf,
+					      msg->len);
+				mbox_chan_received_data(chan, (void *)msg);
+				return IRQ_HANDLED;
+			}
+		}
+	}
+	return IRQ_NONE;
+}
+
+/**
+ * zynqmp_ipi_peek_data - Peek to see if there are any rx messages.
+ *
+ * @chan: Channel Pointer
+ *
+ * Return: 'true' if there is pending rx data, 'false' if there is none.
+ */
+static bool zynqmp_ipi_peek_data(struct mbox_chan *chan)
+{
+	struct device *dev = chan->mbox->dev;
+	struct zynqmp_ipi_mbox *ipi_mbox = dev_get_drvdata(dev);
+	struct zynqmp_ipi_mchan *mchan = chan->con_priv;
+	int ret;
+	u64 arg0;
+	struct arm_smccc_res res;
+
+	if (WARN_ON(!ipi_mbox)) {
+		dev_err(dev, "no platform drv data??\n");
+		return false;
+	}
+
+	arg0 = SMC_IPI_MAILBOX_STATUS_ENQUIRY;
+	zynqmp_ipi_fw_call(ipi_mbox, arg0, 0, &res);
+	ret = (int)(res.a0 & 0xFFFFFFFF);
+
+	if (mchan->chan_type == IPI_MB_CHNL_TX) {
+		/* TX channel, check if the message has been acked
+		 * by the remote, if yes, response is available.
+		 */
+		if (ret < 0 || ret & IPI_MB_STATUS_SEND_PENDING)
+			return false;
+		else
+			return true;
+	} else if (ret > 0 && ret & IPI_MB_STATUS_RECV_PENDING) {
+		/* RX channel, check if there is message arrived. */
+		return true;
+	}
+	return false;
+}
+
+/**
+ * zynqmp_ipi_last_tx_done - See if the last tx message is sent
+ *
+ * @chan: Channel pointer
+ *
+ * Return: 'true' is no pending tx data, 'false' if there are any.
+ */
+static bool zynqmp_ipi_last_tx_done(struct mbox_chan *chan)
+{
+	struct device *dev = chan->mbox->dev;
+	struct zynqmp_ipi_mbox *ipi_mbox = dev_get_drvdata(dev);
+	struct zynqmp_ipi_mchan *mchan = chan->con_priv;
+	int ret;
+	u64 arg0;
+	struct arm_smccc_res res;
+
+	if (WARN_ON(!ipi_mbox)) {
+		dev_err(dev, "no platform drv data??\n");
+		return false;
+	}
+
+	if (mchan->chan_type == IPI_MB_CHNL_TX) {
+		/* We only need to check if the message been taken
+		 * by the remote in the TX channel
+		 */
+		arg0 = SMC_IPI_MAILBOX_STATUS_ENQUIRY;
+		zynqmp_ipi_fw_call(ipi_mbox, arg0, 0, &res);
+		/* Check the SMC call status, a0 of the result */
+		ret = (int)(res.a0 & 0xFFFFFFFF);
+		if (ret < 0 || ret & IPI_MB_STATUS_SEND_PENDING)
+			return false;
+		return true;
+	}
+	/* Always true for the response message in RX channel */
+	return true;
+}
+
+/**
+ * zynqmp_ipi_send_data - Send data
+ *
+ * @chan: Channel Pointer
+ * @data: Message Pointer
+ *
+ * Return: 0 if all goes good, else appropriate error messages.
+ */
+static int zynqmp_ipi_send_data(struct mbox_chan *chan, void *data)
+{
+	struct device *dev = chan->mbox->dev;
+	struct zynqmp_ipi_mbox *ipi_mbox = dev_get_drvdata(dev);
+	struct zynqmp_ipi_mchan *mchan = chan->con_priv;
+	struct zynqmp_ipi_message *msg = data;
+	u64 arg0;
+	struct arm_smccc_res res;
+
+	if (WARN_ON(!ipi_mbox)) {
+		dev_err(dev, "no platform drv data??\n");
+		return -EINVAL;
+	}
+
+	if (mchan->chan_type == IPI_MB_CHNL_TX) {
+		/* Send request message */
+		if (msg && msg->len > mchan->req_buf_size) {
+			dev_err(dev, "channel %d message length %u > max %lu\n",
+				mchan->chan_type, (unsigned int)msg->len,
+				mchan->req_buf_size);
+			return -EINVAL;
+		}
+		if (msg && msg->len)
+			memcpy_toio(mchan->req_buf, msg->data, msg->len);
+		/* Kick IPI mailbox to send message */
+		arg0 = SMC_IPI_MAILBOX_NOTIFY;
+		zynqmp_ipi_fw_call(ipi_mbox, arg0, 0, &res);
+	} else {
+		/* Send response message */
+		if (msg && msg->len > mchan->resp_buf_size) {
+			dev_err(dev, "channel %d message length %u > max %lu\n",
+				mchan->chan_type, (unsigned int)msg->len,
+				mchan->resp_buf_size);
+			return -EINVAL;
+		}
+		if (msg && msg->len)
+			memcpy_toio(mchan->resp_buf, msg->data, msg->len);
+		arg0 = SMC_IPI_MAILBOX_ACK;
+		zynqmp_ipi_fw_call(ipi_mbox, arg0, IPI_SMC_ACK_EIRQ_MASK,
+				   &res);
+	}
+	return 0;
+}
+
+/**
+ * zynqmp_ipi_startup - Startup the IPI channel
+ *
+ * @chan: Channel pointer
+ *
+ * Return: 0 if all goes good, else return corresponding error message
+ */
+static int zynqmp_ipi_startup(struct mbox_chan *chan)
+{
+	struct device *dev = chan->mbox->dev;
+	struct zynqmp_ipi_mbox *ipi_mbox = dev_get_drvdata(dev);
+	struct zynqmp_ipi_mchan *mchan = chan->con_priv;
+	u64 arg0;
+	struct arm_smccc_res res;
+	int ret = 0;
+	unsigned int nchan_type;
+
+	if (mchan->is_opened)
+		return 0;
+
+	/* If no channel has been opened, open the IPI mailbox */
+	nchan_type = (mchan->chan_type + 1) % 2;
+	if (!ipi_mbox->mchans[nchan_type].is_opened) {
+		arg0 = SMC_IPI_MAILBOX_OPEN;
+		zynqmp_ipi_fw_call(ipi_mbox, arg0, 0, &res);
+		/* Check the SMC call status, a0 of the result */
+		ret = (int)(res.a0 & 0xFFFFFFFF);
+		if (ret < 0) {
+			dev_err(dev, "SMC to open the IPI channel failed.\n");
+			return ret;
+		}
+		ret = 0;
+	}
+
+	/* If it is RX channel, enable the IPI notification interrupt */
+	if (mchan->chan_type == IPI_MB_CHNL_RX) {
+		arg0 = SMC_IPI_MAILBOX_ENABLE_IRQ;
+		zynqmp_ipi_fw_call(ipi_mbox, arg0, 0, &res);
+	}
+	mchan->is_opened = 1;
+
+	return ret;
+}
+
+/**
+ * zynqmp_ipi_shutdown - Shutdown the IPI channel
+ *
+ * @chan: Channel pointer
+ */
+static void zynqmp_ipi_shutdown(struct mbox_chan *chan)
+{
+	struct device *dev = chan->mbox->dev;
+	struct zynqmp_ipi_mbox *ipi_mbox = dev_get_drvdata(dev);
+	struct zynqmp_ipi_mchan *mchan = chan->con_priv;
+	u64 arg0;
+	struct arm_smccc_res res;
+	unsigned int chan_type;
+
+	if (!mchan->is_opened)
+		return;
+
+	/* If it is RX channel, disable notification interrupt */
+	chan_type = mchan->chan_type;
+	if (chan_type == IPI_MB_CHNL_RX) {
+		arg0 = SMC_IPI_MAILBOX_DISABLE_IRQ;
+		zynqmp_ipi_fw_call(ipi_mbox, arg0, 0, &res);
+	}
+	/* Release IPI mailbox if no other channel is opened */
+	chan_type = (chan_type + 1) % 2;
+	if (!ipi_mbox->mchans[chan_type].is_opened) {
+		arg0 = SMC_IPI_MAILBOX_RELEASE;
+		zynqmp_ipi_fw_call(ipi_mbox, arg0, 0, &res);
+	}
+
+	mchan->is_opened = 0;
+}
+
+/* ZynqMP IPI mailbox operations */
+static const struct mbox_chan_ops zynqmp_ipi_chan_ops = {
+	.startup = zynqmp_ipi_startup,
+	.shutdown = zynqmp_ipi_shutdown,
+	.peek_data = zynqmp_ipi_peek_data,
+	.last_tx_done = zynqmp_ipi_last_tx_done,
+	.send_data = zynqmp_ipi_send_data,
+};
+
+/**
+ * zynqmp_ipi_of_xlate - Translate of phandle to IPI mailbox channel
+ *
+ * @mbox: mailbox controller pointer
+ * @p:    phandle pointer
+ *
+ * Return: Mailbox channel, else return error pointer.
+ */
+static struct mbox_chan *zynqmp_ipi_of_xlate(struct mbox_controller *mbox,
+					     const struct of_phandle_args *p)
+{
+	struct mbox_chan *chan;
+	struct device *dev = mbox->dev;
+	unsigned int chan_type;
+
+	/* Only supports TX and RX channels */
+	chan_type = p->args[0];
+	if (chan_type != IPI_MB_CHNL_TX && chan_type != IPI_MB_CHNL_RX) {
+		dev_err(dev, "req chnl failure: invalid chnl type %u.\n",
+			chan_type);
+		return ERR_PTR(-EINVAL);
+	}
+	chan = &mbox->chans[chan_type];
+	return chan;
+}
+
+static const struct of_device_id zynqmp_ipi_of_match[] = {
+	{ .compatible = "xlnx,zynqmp-ipi-mailbox" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, zynqmp_ipi_of_match);
+
+/**
+ * zynqmp_ipi_mbox_get_buf_res - Get buffer resource from the IPI dev node
+ *
+ * @node: IPI mbox device child node
+ * @name: name of the IPI buffer
+ * @res: pointer to where the resource information will be stored.
+ *
+ * Return: 0 for success, negative value for failure
+ */
+static int zynqmp_ipi_mbox_get_buf_res(struct device_node *node,
+				       const char *name,
+				       struct resource *res)
+{
+	int ret, index;
+
+	index = of_property_match_string(node, "reg-names", name);
+	if (index >= 0) {
+		ret = of_address_to_resource(node, index, res);
+		if (ret < 0)
+			return -EINVAL;
+		return 0;
+	}
+	return -ENODEV;
+}
+
+/**
+ * zynqmp_ipi_mbox_dev_release() - release the existence of a ipi mbox dev
+ *
+ * @dev: the ipi mailbox device
+ *
+ * This is to avoid the no device release() function kernel warning.
+ *
+ */
+static void zynqmp_ipi_mbox_dev_release(struct device *dev)
+{
+	(void)dev;
+}
+
+/**
+ * zynqmp_ipi_mbox_probe - probe IPI mailbox resource from device node
+ *
+ * @ipi_mbox: pointer to IPI mailbox private data structure
+ * @node: IPI mailbox device node
+ *
+ * Return: 0 for success, negative value for failure
+ */
+static int zynqmp_ipi_mbox_probe(struct zynqmp_ipi_mbox *ipi_mbox,
+				 struct device_node *node)
+{
+	struct zynqmp_ipi_mchan *mchan;
+	struct mbox_chan *chans;
+	struct mbox_controller *mbox;
+	struct resource res;
+	struct device *dev, *mdev;
+	const char *name;
+	int ret;
+
+	dev = ipi_mbox->pdata->dev;
+	/* Initialize dev for IPI mailbox */
+	ipi_mbox->dev.parent = dev;
+	ipi_mbox->dev.release = NULL;
+	ipi_mbox->dev.of_node = node;
+	dev_set_name(&ipi_mbox->dev, "%s", of_node_full_name(node));
+	dev_set_drvdata(&ipi_mbox->dev, ipi_mbox);
+	ipi_mbox->dev.release = zynqmp_ipi_mbox_dev_release;
+	ipi_mbox->dev.driver = &zynqmp_ipi_mbox_driver;
+	ret = device_register(&ipi_mbox->dev);
+	if (ret) {
+		dev_err(dev, "Failed to register ipi mbox dev.\n");
+		return ret;
+	}
+	mdev = &ipi_mbox->dev;
+
+	mchan = &ipi_mbox->mchans[IPI_MB_CHNL_TX];
+	name = "local_request_region";
+	ret = zynqmp_ipi_mbox_get_buf_res(node, name, &res);
+	if (!ret) {
+		mchan->req_buf_size = resource_size(&res);
+		mchan->req_buf = devm_ioremap(mdev, res.start,
+					      mchan->req_buf_size);
+		if (IS_ERR(mchan->req_buf)) {
+			dev_err(mdev, "Unable to map IPI buffer I/O memory\n");
+			ret = PTR_ERR(mchan->req_buf);
+			return ret;
+		}
+	} else if (ret != -ENODEV) {
+		dev_err(mdev, "Unmatched resource %s, %d.\n", name, ret);
+		return ret;
+	}
+
+	name = "remote_response_region";
+	ret = zynqmp_ipi_mbox_get_buf_res(node, name, &res);
+	if (!ret) {
+		mchan->resp_buf_size = resource_size(&res);
+		mchan->resp_buf = devm_ioremap(mdev, res.start,
+					       mchan->resp_buf_size);
+		if (IS_ERR(mchan->resp_buf)) {
+			dev_err(mdev, "Unable to map IPI buffer I/O memory\n");
+			ret = PTR_ERR(mchan->resp_buf);
+			return ret;
+		}
+	} else if (ret != -ENODEV) {
+		dev_err(mdev, "Unmatched resource %s.\n", name);
+		return ret;
+	}
+	mchan->rx_buf = devm_kzalloc(mdev,
+				     mchan->resp_buf_size +
+				     sizeof(struct zynqmp_ipi_message),
+				     GFP_KERNEL);
+	if (!mchan->rx_buf)
+		return -ENOMEM;
+
+	mchan = &ipi_mbox->mchans[IPI_MB_CHNL_RX];
+	name = "remote_request_region";
+	ret = zynqmp_ipi_mbox_get_buf_res(node, name, &res);
+	if (!ret) {
+		mchan->req_buf_size = resource_size(&res);
+		mchan->req_buf = devm_ioremap(mdev, res.start,
+					      mchan->req_buf_size);
+		if (IS_ERR(mchan->req_buf)) {
+			dev_err(mdev, "Unable to map IPI buffer I/O memory\n");
+			ret = PTR_ERR(mchan->req_buf);
+			return ret;
+		}
+	} else if (ret != -ENODEV) {
+		dev_err(mdev, "Unmatched resource %s.\n", name);
+		return ret;
+	}
+
+	name = "local_response_region";
+	ret = zynqmp_ipi_mbox_get_buf_res(node, name, &res);
+	if (!ret) {
+		mchan->resp_buf_size = resource_size(&res);
+		mchan->resp_buf = devm_ioremap(mdev, res.start,
+					       mchan->resp_buf_size);
+		if (IS_ERR(mchan->resp_buf)) {
+			dev_err(mdev, "Unable to map IPI buffer I/O memory\n");
+			ret = PTR_ERR(mchan->resp_buf);
+			return ret;
+		}
+	} else if (ret != -ENODEV) {
+		dev_err(mdev, "Unmatched resource %s.\n", name);
+		return ret;
+	}
+	mchan->rx_buf = devm_kzalloc(mdev,
+				     mchan->resp_buf_size +
+				     sizeof(struct zynqmp_ipi_message),
+				     GFP_KERNEL);
+	if (!mchan->rx_buf)
+		return -ENOMEM;
+
+	/* Get the IPI remote agent ID */
+	ret = of_property_read_u32(node, "xlnx,ipi-id", &ipi_mbox->remote_id);
+	if (ret < 0) {
+		dev_err(dev, "No IPI remote ID is specified.\n");
+		return ret;
+	}
+
+	mbox = &ipi_mbox->mbox;
+	mbox->dev = mdev;
+	mbox->ops = &zynqmp_ipi_chan_ops;
+	mbox->num_chans = 2;
+	mbox->txdone_irq = false;
+	mbox->txdone_poll = true;
+	mbox->txpoll_period = 5;
+	mbox->of_xlate = zynqmp_ipi_of_xlate;
+	chans = devm_kzalloc(mdev, 2 * sizeof(*chans), GFP_KERNEL);
+	if (!chans)
+		return -ENOMEM;
+	mbox->chans = chans;
+	chans[IPI_MB_CHNL_TX].con_priv = &ipi_mbox->mchans[IPI_MB_CHNL_TX];
+	chans[IPI_MB_CHNL_RX].con_priv = &ipi_mbox->mchans[IPI_MB_CHNL_RX];
+	ipi_mbox->mchans[IPI_MB_CHNL_TX].chan_type = IPI_MB_CHNL_TX;
+	ipi_mbox->mchans[IPI_MB_CHNL_RX].chan_type = IPI_MB_CHNL_RX;
+	ret = devm_mbox_controller_register(mdev, mbox);
+	if (ret)
+		dev_err(mdev,
+			"Failed to register mbox_controller(%d)\n", ret);
+	else
+		dev_info(mdev,
+			 "Registered ZynqMP IPI mbox with TX/RX channels.\n");
+	return ret;
+}
+
+/**
+ * zynqmp_ipi_free_mboxes - Free IPI mailboxes devices
+ *
+ * @pdata: IPI private data
+ */
+static void zynqmp_ipi_free_mboxes(struct zynqmp_ipi_pdata *pdata)
+{
+	struct zynqmp_ipi_mbox *ipi_mbox;
+	int i;
+
+	i = pdata->num_mboxes;
+	for (; i >= 0; i--) {
+		ipi_mbox = &pdata->ipi_mboxes[i];
+		if (ipi_mbox->dev.parent) {
+			mbox_controller_unregister(&ipi_mbox->mbox);
+			device_unregister(&ipi_mbox->dev);
+		}
+	}
+}
+
+static int zynqmp_ipi_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct device_node *nc, *np = pdev->dev.of_node;
+	struct zynqmp_ipi_pdata *pdata;
+	struct zynqmp_ipi_mbox *mbox;
+	int num_mboxes, ret = -EINVAL;
+
+	num_mboxes = of_get_child_count(np);
+	pdata = devm_kzalloc(dev, sizeof(*pdata) + (num_mboxes * sizeof(*mbox)),
+			     GFP_KERNEL);
+	if (!pdata)
+		return -ENOMEM;
+	pdata->dev = dev;
+
+	/* Get the IPI local agents ID */
+	ret = of_property_read_u32(np, "xlnx,ipi-id", &pdata->local_id);
+	if (ret < 0) {
+		dev_err(dev, "No IPI local ID is specified.\n");
+		return ret;
+	}
+
+	pdata->num_mboxes = num_mboxes;
+	pdata->ipi_mboxes = (struct zynqmp_ipi_mbox *)
+			    ((char *)pdata + sizeof(*pdata));
+
+	mbox = pdata->ipi_mboxes;
+	for_each_available_child_of_node(np, nc) {
+		mbox->pdata = pdata;
+		ret = zynqmp_ipi_mbox_probe(mbox, nc);
+		if (ret) {
+			dev_err(dev, "failed to probe subdev.\n");
+			ret = -EINVAL;
+			goto free_mbox_dev;
+		}
+		mbox++;
+	}
+
+	/* IPI IRQ */
+	ret = platform_get_irq(pdev, 0);
+	if (ret < 0) {
+		dev_err(dev, "unable to find IPI IRQ.\n");
+		goto free_mbox_dev;
+	}
+	pdata->irq = ret;
+	ret = devm_request_irq(dev, pdata->irq, zynqmp_ipi_interrupt,
+			       IRQF_SHARED, dev_name(dev), pdata);
+	if (ret) {
+		dev_err(dev, "IRQ %d is not requested successfully.\n",
+			pdata->irq);
+		goto free_mbox_dev;
+	}
+
+	platform_set_drvdata(pdev, pdata);
+	return ret;
+
+free_mbox_dev:
+	zynqmp_ipi_free_mboxes(pdata);
+	return ret;
+}
+
+static int zynqmp_ipi_remove(struct platform_device *pdev)
+{
+	struct zynqmp_ipi_pdata *pdata;
+
+	pdata = platform_get_drvdata(pdev);
+	zynqmp_ipi_free_mboxes(pdata);
+
+	return 0;
+}
+
+static struct platform_driver zynqmp_ipi_driver = {
+	.probe = zynqmp_ipi_probe,
+	.remove = zynqmp_ipi_remove,
+	.driver = {
+		   .name = "zynqmp-ipi",
+		   .of_match_table = of_match_ptr(zynqmp_ipi_of_match),
+	},
+};
+
+static int __init zynqmp_ipi_init(void)
+{
+	return platform_driver_register(&zynqmp_ipi_driver);
+}
+subsys_initcall(zynqmp_ipi_init);
+
+static void __exit zynqmp_ipi_exit(void)
+{
+	platform_driver_unregister(&zynqmp_ipi_driver);
+}
+module_exit(zynqmp_ipi_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Xilinx ZynqMP IPI Mailbox driver");
+MODULE_AUTHOR("Xilinx Inc.");
diff --git a/include/linux/mailbox/zynqmp-ipi-message.h b/include/linux/mailbox/zynqmp-ipi-message.h
new file mode 100644
index 000000000000..9542b41eacfd
--- /dev/null
+++ b/include/linux/mailbox/zynqmp-ipi-message.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_ZYNQMP_IPI_MESSAGE_H_
+#define _LINUX_ZYNQMP_IPI_MESSAGE_H_
+
+/**
+ * struct zynqmp_ipi_message - ZynqMP IPI message structure
+ * @len:  Length of message
+ * @data: message payload
+ *
+ * This is the structure for data used in mbox_send_message
+ * the maximum length of data buffer is fixed to 12 bytes.
+ * Client is supposed to be aware of this.
+ */
+struct zynqmp_ipi_message {
+	size_t len;
+	u8 data[0];
+};
+
+#endif /* _LINUX_ZYNQMP_IPI_MESSAGE_H_ */
-- 
cgit v1.2.3


From 4c3024debf62de4c6ac6d3cb4c0063be21d4f652 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 6 Mar 2019 14:35:15 -0500
Subject: bpf: only test gso type on gso packets

BPF can adjust gso only for tcp bytestreams. Fail on other gso types.

But only on gso packets. It does not touch this field if !gso_size.

Fixes: b90efd225874 ("bpf: only adjust gso_size on bytestream protocols")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skbuff.h | 4 ++--
 net/core/filter.c      | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 27beb549ffbe..f32f32407dc4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4232,10 +4232,10 @@ static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
 	return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP;
 }
 
+/* Note: Should be called only if skb_is_gso(skb) is true */
 static inline bool skb_is_gso_tcp(const struct sk_buff *skb)
 {
-	return skb_is_gso(skb) &&
-	       skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6);
+	return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6);
 }
 
 static inline void skb_gso_reset(struct sk_buff *skb)
diff --git a/net/core/filter.c b/net/core/filter.c
index 5ceba98069d4..f274620945ff 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2804,7 +2804,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
-	if (!skb_is_gso_tcp(skb))
+	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
 		return -ENOTSUPP;
 
 	ret = skb_cow(skb, len_diff);
@@ -2845,7 +2845,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
-	if (!skb_is_gso_tcp(skb))
+	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
 		return -ENOTSUPP;
 
 	ret = skb_unclone(skb, GFP_ATOMIC);
@@ -2970,7 +2970,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
 	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
 	int ret;
 
-	if (!skb_is_gso_tcp(skb))
+	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
 		return -ENOTSUPP;
 
 	ret = skb_cow(skb, len_diff);
@@ -2999,7 +2999,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
 	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
 	int ret;
 
-	if (!skb_is_gso_tcp(skb))
+	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
 		return -ENOTSUPP;
 
 	ret = skb_unclone(skb, GFP_ATOMIC);
-- 
cgit v1.2.3


From c6b38fbbde91ee7b072febe4b83022e4850f934f Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 1 Mar 2019 10:24:59 +0100
Subject: drm: move i915_kick_out_vgacon to vgaarb

Also rename it to vga_remove_vgacon and add kerneldoc text.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/20190301092502.30948-2-kraxel@redhat.com
---
 drivers/gpu/drm/i915/i915_drv.c | 35 +----------------------------
 drivers/gpu/vga/vgaarb.c        | 49 +++++++++++++++++++++++++++++++++++++++++
 include/linux/vgaarb.h          |  2 ++
 3 files changed, 52 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 6630212f2faf..9df65d386d11 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -757,39 +757,6 @@ static int i915_kick_out_firmware_fb(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
-#if !defined(CONFIG_VGA_CONSOLE)
-static int i915_kick_out_vgacon(struct drm_i915_private *dev_priv)
-{
-	return 0;
-}
-#elif !defined(CONFIG_DUMMY_CONSOLE)
-static int i915_kick_out_vgacon(struct drm_i915_private *dev_priv)
-{
-	return -ENODEV;
-}
-#else
-static int i915_kick_out_vgacon(struct drm_i915_private *dev_priv)
-{
-	int ret = 0;
-
-	DRM_INFO("Replacing VGA console driver\n");
-
-	console_lock();
-	if (con_is_bound(&vga_con))
-		ret = do_take_over_console(&dummy_con, 0, MAX_NR_CONSOLES - 1, 1);
-	if (ret == 0) {
-		ret = do_unregister_con_driver(&vga_con);
-
-		/* Ignore "already unregistered". */
-		if (ret == -ENODEV)
-			ret = 0;
-	}
-	console_unlock();
-
-	return ret;
-}
-#endif
-
 static void intel_init_dpio(struct drm_i915_private *dev_priv)
 {
 	/*
@@ -1420,7 +1387,7 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
 		goto err_ggtt;
 	}
 
-	ret = i915_kick_out_vgacon(dev_priv);
+	ret = vga_remove_vgacon(pdev);
 	if (ret) {
 		DRM_ERROR("failed to remove conflicting VGA console\n");
 		goto err_ggtt;
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index dc8e039bfab5..f2f3ef8af271 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -48,6 +48,8 @@
 #include <linux/miscdevice.h>
 #include <linux/slab.h>
 #include <linux/screen_info.h>
+#include <linux/vt.h>
+#include <linux/console.h>
 
 #include <linux/uaccess.h>
 
@@ -168,6 +170,53 @@ void vga_set_default_device(struct pci_dev *pdev)
 	vga_default = pci_dev_get(pdev);
 }
 
+/**
+ * vga_remove_vgacon - deactivete vga console
+ *
+ * Unbind and unregister vgacon in case pdev is the default vga
+ * device.  Can be called by gpu drivers on initialization to make
+ * sure vga register access done by vgacon will not disturb the
+ * device.
+ *
+ * @pdev: pci device.
+ */
+#if !defined(CONFIG_VGA_CONSOLE)
+int vga_remove_vgacon(struct pci_dev *pdev)
+{
+	return 0;
+}
+#elif !defined(CONFIG_DUMMY_CONSOLE)
+int vga_remove_vgacon(struct pci_dev *pdev)
+{
+	return -ENODEV;
+}
+#else
+int vga_remove_vgacon(struct pci_dev *pdev)
+{
+	int ret = 0;
+
+	if (pdev != vga_default)
+		return 0;
+	vgaarb_info(&pdev->dev, "deactivate vga console\n");
+
+	console_lock();
+	if (con_is_bound(&vga_con))
+		ret = do_take_over_console(&dummy_con, 0,
+					   MAX_NR_CONSOLES - 1, 1);
+	if (ret == 0) {
+		ret = do_unregister_con_driver(&vga_con);
+
+		/* Ignore "already unregistered". */
+		if (ret == -ENODEV)
+			ret = 0;
+	}
+	console_unlock();
+
+	return ret;
+}
+#endif
+EXPORT_SYMBOL(vga_remove_vgacon);
+
 static inline void vga_irq_set_state(struct vga_device *vgadev, bool state)
 {
 	if (vgadev->irq_set_state)
diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
index ee162e3e879b..553b34c8b5f7 100644
--- a/include/linux/vgaarb.h
+++ b/include/linux/vgaarb.h
@@ -125,9 +125,11 @@ extern void vga_put(struct pci_dev *pdev, unsigned int rsrc);
 #ifdef CONFIG_VGA_ARB
 extern struct pci_dev *vga_default_device(void);
 extern void vga_set_default_device(struct pci_dev *pdev);
+extern int vga_remove_vgacon(struct pci_dev *pdev);
 #else
 static inline struct pci_dev *vga_default_device(void) { return NULL; };
 static inline void vga_set_default_device(struct pci_dev *pdev) { };
+static inline int vga_remove_vgacon(struct pci_dev *pdev) { return 0; };
 #endif
 
 /*
-- 
cgit v1.2.3


From 1c74ca7a1a9ae6ce71032c997a586f061a0029f8 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 1 Mar 2019 10:25:00 +0100
Subject: drm/fb-helper: call vga_remove_vgacon automatically.

Add vga_remove_vgacon() call to
drm_fb_helper_remove_conflicting_pci_framebuffers().

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/20190301092502.30948-3-kraxel@redhat.com
---
 include/drm/drm_fb_helper.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h
index bb9acea61369..286d58efed5d 100644
--- a/include/drm/drm_fb_helper.h
+++ b/include/drm/drm_fb_helper.h
@@ -36,6 +36,7 @@ struct drm_fb_helper;
 #include <drm/drm_crtc.h>
 #include <drm/drm_device.h>
 #include <linux/kgdb.h>
+#include <linux/vgaarb.h>
 
 enum mode_set_atomic {
 	LEAVE_ATOMIC_MODE_SET,
@@ -642,11 +643,18 @@ drm_fb_helper_remove_conflicting_pci_framebuffers(struct pci_dev *pdev,
 						  int resource_id,
 						  const char *name)
 {
+	int ret = 0;
+
+	/*
+	 * WARNING: Apparently we must kick fbdev drivers before vgacon,
+	 * otherwise the vga fbdev driver falls over.
+	 */
 #if IS_REACHABLE(CONFIG_FB)
-	return remove_conflicting_pci_framebuffers(pdev, resource_id, name);
-#else
-	return 0;
+	ret = remove_conflicting_pci_framebuffers(pdev, resource_id, name);
 #endif
+	if (ret == 0)
+		ret = vga_remove_vgacon(pdev);
+	return ret;
 }
 
 #endif
-- 
cgit v1.2.3


From 0996584b3026bed7f38abe02e8535e6a6c474118 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 5 Mar 2019 13:55:35 +0100
Subject: PM-runtime: Call pm_runtime_active|suspended_time() from sysfs

Avoid the open-coding of the accounted time acquisition in
runtime_active|suspend_time_show() and make them call
pm_runtime_active|suspended_time() instead.

Note that this change also indirectly avoids holding dev->power.lock
around the do_div() computation and the sprintf() call which is an
additional improvement.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
[ rjw: Changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c |  2 +-
 drivers/base/power/sysfs.c   | 12 ++----------
 include/linux/pm.h           |  1 -
 3 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 32f6bf076bd7..a2d22e3ecf3a 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -64,7 +64,7 @@ static int rpm_suspend(struct device *dev, int rpmflags);
  * runtime_status field is updated, to account the time in the old state
  * correctly.
  */
-void update_pm_runtime_accounting(struct device *dev)
+static void update_pm_runtime_accounting(struct device *dev)
 {
 	u64 now, last, delta;
 
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index c6bf76124184..1226e441ddfe 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -125,13 +125,9 @@ static ssize_t runtime_active_time_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	int ret;
-	u64 tmp;
-	spin_lock_irq(&dev->power.lock);
-	update_pm_runtime_accounting(dev);
-	tmp = dev->power.active_time;
+	u64 tmp = pm_runtime_active_time(dev);
 	do_div(tmp, NSEC_PER_MSEC);
 	ret = sprintf(buf, "%llu\n", tmp);
-	spin_unlock_irq(&dev->power.lock);
 	return ret;
 }
 
@@ -141,13 +137,9 @@ static ssize_t runtime_suspended_time_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	int ret;
-	u64 tmp;
-	spin_lock_irq(&dev->power.lock);
-	update_pm_runtime_accounting(dev);
-	tmp = dev->power.suspended_time;
+	u64 tmp = pm_runtime_suspended_time(dev);
 	do_div(tmp, NSEC_PER_MSEC);
 	ret = sprintf(buf, "%llu\n", tmp);
-	spin_unlock_irq(&dev->power.lock);
 	return ret;
 }
 
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 06f7ed893928..66c19a65a514 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -643,7 +643,6 @@ struct dev_pm_info {
 	struct dev_pm_qos	*qos;
 };
 
-extern void update_pm_runtime_accounting(struct device *dev);
 extern int dev_pm_get_subsys_data(struct device *dev);
 extern void dev_pm_put_subsys_data(struct device *dev);
 
-- 
cgit v1.2.3


From eacc95eae6837d3f41aed7d30b855a79ab2cb101 Mon Sep 17 00:00:00 2001
From: Mattias Jacobsson <2pi@mok.nu>
Date: Tue, 19 Feb 2019 20:59:49 +0100
Subject: platform/x86: wmi: move struct wmi_device_id to mod_devicetable.h

In preparation for adding WMI support to MODULE_DEVICE_TABLE() move the
definition of struct wmi_device_id to mod_devicetable.h and inline
guid_string in the struct.

Changing guid_string to an inline char array changes the loop conditions
when looping over an array of struct wmi_device_id. Therefore update
wmi_dev_match()'s loop to check for an empty guid_string instead of a
NULL pointer.

Signed-off-by: Mattias Jacobsson <2pi@mok.nu>
[dvhart: Move UUID_STRING_LEN define to this patch]
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/wmi.c      |  2 +-
 include/linux/mod_devicetable.h | 12 ++++++++++++
 include/linux/wmi.h             |  5 +----
 scripts/mod/file2alias.c        |  1 +
 4 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index b0f3d8ecd898..7b26b6ccf1a0 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -771,7 +771,7 @@ static int wmi_dev_match(struct device *dev, struct device_driver *driver)
 	if (id == NULL)
 		return 0;
 
-	while (id->guid_string) {
+	while (*id->guid_string) {
 		uuid_le driver_guid;
 
 		if (WARN_ON(uuid_le_to_bin(id->guid_string, &driver_guid)))
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index f9bd2f34b99f..e44b90fa0aef 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -779,4 +779,16 @@ struct typec_device_id {
 	kernel_ulong_t driver_data;
 };
 
+/* WMI */
+
+#define WMI_MODULE_PREFIX	"wmi:"
+
+/**
+ * struct wmi_device_id - WMI device identifier
+ * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
+ */
+struct wmi_device_id {
+	const char guid_string[UUID_STRING_LEN+1];
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 4757cb5077e5..592f81afecbb 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -18,6 +18,7 @@
 
 #include <linux/device.h>
 #include <linux/acpi.h>
+#include <linux/mod_devicetable.h>
 #include <uapi/linux/wmi.h>
 
 struct wmi_device {
@@ -39,10 +40,6 @@ extern union acpi_object *wmidev_block_query(struct wmi_device *wdev,
 
 extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
 
-struct wmi_device_id {
-	const char *guid_string;
-};
-
 struct wmi_driver {
 	struct device_driver driver;
 	const struct wmi_device_id *id_table;
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index afe22af20d7d..4e4f03a12cc0 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -37,6 +37,7 @@ typedef unsigned char	__u8;
 typedef struct {
 	__u8 b[16];
 } uuid_le;
+#define	UUID_STRING_LEN		36
 
 /* Big exception to the "don't include kernel headers into userspace, which
  * even potentially has different endianness and word sizes, since
-- 
cgit v1.2.3


From 40ba1d9b4d19796afc9b7ece872f5f3e8f5e2c13 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 8 Mar 2019 00:58:53 +0100
Subject: netfilter: nf_tables: fix set double-free in abort path

The abort path can cause a double-free of an anonymous set.
Added-and-to-be-aborted rule looks like this:

udp dport { 137, 138 } drop

The to-be-aborted transaction list looks like this:

newset
newsetelem
newsetelem
rule

This gets walked in reverse order, so first pass disables the rule, the
set elements, then the set.

After synchronize_rcu(), we then destroy those in same order: rule, set
element, set element, newset.

Problem is that the anonymous set has already been bound to the rule, so
the rule (lookup expression destructor) already frees the set, when then
cause use-after-free when trying to delete the elements from this set,
then try to free the set again when handling the newset expression.

Rule releases the bound set in first place from the abort path, this
causes the use-after-free on set element removal when undoing the new
element transactions. To handle this, skip new element transaction if
set is bound from the abort path.

This is still causes the use-after-free on set element removal.  To
handle this, remove transaction from the list when the set is already
bound.

Joint work with Florian Westphal.

Fixes: f6ac85858976 ("netfilter: nf_tables: unbind set in rule from commit path")
Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1325
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  6 ++----
 net/netfilter/nf_tables_api.c     | 17 +++++++++++------
 2 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index c331e96a713b..ed0687b0e0f4 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -416,7 +416,8 @@ struct nft_set {
 	unsigned char			*udata;
 	/* runtime data below here */
 	const struct nft_set_ops	*ops ____cacheline_aligned;
-	u16				flags:14,
+	u16				flags:13,
+					bound:1,
 					genmask:2;
 	u8				klen;
 	u8				dlen;
@@ -1344,15 +1345,12 @@ struct nft_trans_rule {
 struct nft_trans_set {
 	struct nft_set			*set;
 	u32				set_id;
-	bool				bound;
 };
 
 #define nft_trans_set(trans)	\
 	(((struct nft_trans_set *)trans->data)->set)
 #define nft_trans_set_id(trans)	\
 	(((struct nft_trans_set *)trans->data)->set_id)
-#define nft_trans_set_bound(trans)	\
-	(((struct nft_trans_set *)trans->data)->bound)
 
 struct nft_trans_chain {
 	bool				update;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index faf6bd10a19f..1333bf97dc26 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -142,7 +142,7 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
 	list_for_each_entry_reverse(trans, &net->nft.commit_list, list) {
 		if (trans->msg_type == NFT_MSG_NEWSET &&
 		    nft_trans_set(trans) == set) {
-			nft_trans_set_bound(trans) = true;
+			set->bound = true;
 			break;
 		}
 	}
@@ -6709,8 +6709,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
 		break;
 	case NFT_MSG_NEWSET:
-		if (!nft_trans_set_bound(trans))
-			nft_set_destroy(nft_trans_set(trans));
+		nft_set_destroy(nft_trans_set(trans));
 		break;
 	case NFT_MSG_NEWSETELEM:
 		nft_set_elem_destroy(nft_trans_elem_set(trans),
@@ -6783,8 +6782,11 @@ static int __nf_tables_abort(struct net *net)
 			break;
 		case NFT_MSG_NEWSET:
 			trans->ctx.table->use--;
-			if (!nft_trans_set_bound(trans))
-				list_del_rcu(&nft_trans_set(trans)->list);
+			if (nft_trans_set(trans)->bound) {
+				nft_trans_destroy(trans);
+				break;
+			}
+			list_del_rcu(&nft_trans_set(trans)->list);
 			break;
 		case NFT_MSG_DELSET:
 			trans->ctx.table->use++;
@@ -6792,8 +6794,11 @@ static int __nf_tables_abort(struct net *net)
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWSETELEM:
+			if (nft_trans_elem_set(trans)->bound) {
+				nft_trans_destroy(trans);
+				break;
+			}
 			te = (struct nft_trans_elem *)trans->data;
-
 			te->set->ops->remove(net, te->set, &te->elem);
 			atomic_dec(&te->set->nelems);
 			break;
-- 
cgit v1.2.3


From 930c9f9125c85b5134b3e711bc252ecc094708e3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 8 Mar 2019 12:48:39 +0000
Subject: rxrpc: Fix client call connect/disconnect race

rxrpc_disconnect_client_call() reads the call's connection ID protocol
value (call->cid) as part of that function's variable declarations.  This
is bad because it's not inside the locked section and so may race with
someone granting use of the channel to the call.

This manifests as an assertion failure (see below) where the call in the
presumed channel (0 because call->cid wasn't set when we read it) doesn't
match the call attached to the channel we were actually granted (if 1, 2 or
3).

Fix this by moving the read and dependent calculations inside of the
channel_lock section.  Also, only set the channel number and pointer
variables if cid is not zero (ie. unset).

This problem can be induced by injecting an occasional error in
rxrpc_wait_for_channel() before the call to schedule().

Make two further changes also:

 (1) Add a trace for wait failure in rxrpc_connect_call().

 (2) Drop channel_lock before BUG'ing in the case of the assertion failure.

The failure causes a trace akin to the following:

rxrpc: Assertion failed - 18446612685268945920(0xffff8880beab8c00) == 18446612685268621312(0xffff8880bea69800) is false
------------[ cut here ]------------
kernel BUG at net/rxrpc/conn_client.c:824!
...
RIP: 0010:rxrpc_disconnect_client_call+0x2bf/0x99d
...
Call Trace:
 rxrpc_connect_call+0x902/0x9b3
 ? wake_up_q+0x54/0x54
 rxrpc_new_client_call+0x3a0/0x751
 ? rxrpc_kernel_begin_call+0x141/0x1bc
 ? afs_alloc_call+0x1b5/0x1b5
 rxrpc_kernel_begin_call+0x141/0x1bc
 afs_make_call+0x20c/0x525
 ? afs_alloc_call+0x1b5/0x1b5
 ? __lock_is_held+0x40/0x71
 ? lockdep_init_map+0xaf/0x193
 ? lockdep_init_map+0xaf/0x193
 ? __lock_is_held+0x40/0x71
 ? yfs_fs_fetch_data+0x33b/0x34a
 yfs_fs_fetch_data+0x33b/0x34a
 afs_fetch_data+0xdc/0x3b7
 afs_read_dir+0x52d/0x97f
 afs_dir_iterate+0xa0/0x661
 ? iterate_dir+0x63/0x141
 iterate_dir+0xa2/0x141
 ksys_getdents64+0x9f/0x11b
 ? filldir+0x111/0x111
 ? do_syscall_64+0x3e/0x1a0
 __x64_sys_getdents64+0x16/0x19
 do_syscall_64+0x7d/0x1a0
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fixes: 45025bceef17 ("rxrpc: Improve management and caching of client connection objects")
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/rxrpc.h |  2 ++
 net/rxrpc/conn_client.c      | 20 +++++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 5b50fe4906d2..7b60fd186cfe 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -76,6 +76,7 @@ enum rxrpc_client_trace {
 	rxrpc_client_chan_disconnect,
 	rxrpc_client_chan_pass,
 	rxrpc_client_chan_unstarted,
+	rxrpc_client_chan_wait_failed,
 	rxrpc_client_cleanup,
 	rxrpc_client_count,
 	rxrpc_client_discard,
@@ -276,6 +277,7 @@ enum rxrpc_tx_point {
 	EM(rxrpc_client_chan_disconnect,	"ChDisc") \
 	EM(rxrpc_client_chan_pass,		"ChPass") \
 	EM(rxrpc_client_chan_unstarted,		"ChUnst") \
+	EM(rxrpc_client_chan_wait_failed,	"ChWtFl") \
 	EM(rxrpc_client_cleanup,		"Clean ") \
 	EM(rxrpc_client_count,			"Count ") \
 	EM(rxrpc_client_discard,		"Discar") \
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index b2adfa825363..f307a05076e1 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -704,6 +704,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
 
 	ret = rxrpc_wait_for_channel(call, gfp);
 	if (ret < 0) {
+		trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed);
 		rxrpc_disconnect_client_call(call);
 		goto out;
 	}
@@ -774,16 +775,22 @@ static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet)
  */
 void rxrpc_disconnect_client_call(struct rxrpc_call *call)
 {
-	unsigned int channel = call->cid & RXRPC_CHANNELMASK;
 	struct rxrpc_connection *conn = call->conn;
-	struct rxrpc_channel *chan = &conn->channels[channel];
+	struct rxrpc_channel *chan = NULL;
 	struct rxrpc_net *rxnet = conn->params.local->rxnet;
+	unsigned int channel = -1;
+	u32 cid;
 
+	spin_lock(&conn->channel_lock);
+
+	cid = call->cid;
+	if (cid) {
+		channel = cid & RXRPC_CHANNELMASK;
+		chan = &conn->channels[channel];
+	}
 	trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
 	call->conn = NULL;
 
-	spin_lock(&conn->channel_lock);
-
 	/* Calls that have never actually been assigned a channel can simply be
 	 * discarded.  If the conn didn't get used either, it will follow
 	 * immediately unless someone else grabs it in the meantime.
@@ -807,7 +814,10 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
 		goto out;
 	}
 
-	ASSERTCMP(rcu_access_pointer(chan->call), ==, call);
+	if (rcu_access_pointer(chan->call) != call) {
+		spin_unlock(&conn->channel_lock);
+		BUG();
+	}
 
 	/* If a client call was exposed to the world, we save the result for
 	 * retransmission.
-- 
cgit v1.2.3


From 71b91a506bb05f9aef3acd57af2e835d85721942 Mon Sep 17 00:00:00 2001
From: Bo YU <tsu.yubo@gmail.com>
Date: Fri, 8 Mar 2019 01:45:51 -0500
Subject: bpf: fix warning about using plain integer as NULL

Sparse warning below:

sudo make C=2 CF=-D__CHECK_ENDIAN__ M=net/bpf/
CHECK   net/bpf//test_run.c
net/bpf//test_run.c:19:77: warning: Using plain integer as NULL pointer
./include/linux/bpf-cgroup.h:295:77: warning: Using plain integer as NULL pointer

Fixes: 8bad74f9840f ("bpf: extend cgroup bpf core to allow multiple cgroup storage types")
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Bo YU <tsu.yubo@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 2 +-
 net/bpf/test_run.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 695b2a880d9a..a4c644c1c091 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -292,7 +292,7 @@ static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
 static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
 					      struct bpf_map *map) {}
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
-	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
+	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; }
 static inline void bpf_cgroup_storage_free(
 	struct bpf_cgroup_storage *storage) {}
 static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index da7051d62727..fab142b796ef 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -16,7 +16,7 @@
 static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 			u32 *retval, u32 *time)
 {
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 };
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
 	enum bpf_cgroup_storage_type stype;
 	u64 time_start, time_spent = 0;
 	int ret = 0;
-- 
cgit v1.2.3


From 161e613755e93c45cc47e75ab046f0f8de9e6d49 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@gmail.com>
Date: Tue, 5 Mar 2019 11:35:54 -0300
Subject: net: add missing documentation in linux/skbuff.h

This patch adds missing documentation for some inline functions on
linux/skbuff.h. The patch is incomplete and a lot more can be added,
just wondering if it's of interest of the netdev developers.

Also fixed some whitespaces.

Signed-off-by: Pedro Tammela <pctammela@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 59 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 27beb549ffbe..730b333be591 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -327,26 +327,49 @@ struct skb_frag_struct {
 #endif
 };
 
+/**
+ * skb_frag_size - Returns the size of a skb fragment
+ * @frag: skb fragment
+ */
 static inline unsigned int skb_frag_size(const skb_frag_t *frag)
 {
 	return frag->size;
 }
 
+/**
+ * skb_frag_size_set - Sets the size of a skb fragment
+ * @frag: skb fragment
+ * @size: size of fragment
+ */
 static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
 {
 	frag->size = size;
 }
 
+/**
+ * skb_frag_size_add - Incrementes the size of a skb fragment by %delta
+ * @frag: skb fragment
+ * @delta: value to add
+ */
 static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
 {
 	frag->size += delta;
 }
 
+/**
+ * skb_frag_size_sub - Decrements the size of a skb fragment by %delta
+ * @frag: skb fragment
+ * @delta: value to subtract
+ */
 static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
 {
 	frag->size -= delta;
 }
 
+/**
+ * skb_frag_must_loop - Test if %p is a high memory page
+ * @p: fragment's page
+ */
 static inline bool skb_frag_must_loop(struct page *p)
 {
 #if defined(CONFIG_HIGHMEM)
@@ -590,7 +613,7 @@ typedef unsigned int sk_buff_data_t;
 typedef unsigned char *sk_buff_data_t;
 #endif
 
-/** 
+/**
  *	struct sk_buff - socket buffer
  *	@next: Next buffer in list
  *	@prev: Previous buffer in list
@@ -648,7 +671,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
  *	@dst_pending_confirm: need to confirm neighbour
  *	@decrypted: Decrypted SKB
-  *	@napi_id: id of the NAPI struct this skb came from
+ *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
  *	@mark: Generic packet mark
  *	@vlan_proto: vlan encapsulation protocol
@@ -883,7 +906,10 @@ struct sk_buff {
 #define SKB_ALLOC_RX		0x02
 #define SKB_ALLOC_NAPI		0x04
 
-/* Returns true if the skb was allocated from PFMEMALLOC reserves */
+/**
+ * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves
+ * @skb: buffer
+ */
 static inline bool skb_pfmemalloc(const struct sk_buff *skb)
 {
 	return unlikely(skb->pfmemalloc);
@@ -905,7 +931,7 @@ static inline bool skb_pfmemalloc(const struct sk_buff *skb)
  */
 static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
 {
-	/* If refdst was not refcounted, check we still are in a 
+	/* If refdst was not refcounted, check we still are in a
 	 * rcu_read_lock section
 	 */
 	WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
@@ -952,6 +978,10 @@ static inline bool skb_dst_is_noref(const struct sk_buff *skb)
 	return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
 }
 
+/**
+ * skb_rtable - Returns the skb &rtable
+ * @skb: buffer
+ */
 static inline struct rtable *skb_rtable(const struct sk_buff *skb)
 {
 	return (struct rtable *)skb_dst(skb);
@@ -966,6 +996,10 @@ static inline bool skb_pkt_type_ok(u32 ptype)
 	return ptype <= PACKET_OTHERHOST;
 }
 
+/**
+ * skb_napi_id - Returns the skb's NAPI id
+ * @skb: buffer
+ */
 static inline unsigned int skb_napi_id(const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
@@ -975,7 +1009,12 @@ static inline unsigned int skb_napi_id(const struct sk_buff *skb)
 #endif
 }
 
-/* decrement the reference count and return true if we can free the skb */
+/**
+ * skb_unref - decrement the skb's reference count
+ * @skb: buffer
+ *
+ * Returns true if we can free the skb.
+ */
 static inline bool skb_unref(struct sk_buff *skb)
 {
 	if (unlikely(!skb))
@@ -1005,6 +1044,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
 			    int node);
 struct sk_buff *__build_skb(void *data, unsigned int frag_size);
 struct sk_buff *build_skb(void *data, unsigned int frag_size);
+
+/**
+ * alloc_skb - allocate a network buffer
+ * @size: size to allocate
+ * @priority: allocation mask
+ *
+ * This function is a convenient wrapper around __alloc_skb().
+ */
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
@@ -1047,6 +1094,13 @@ static inline bool skb_fclone_busy(const struct sock *sk,
 	       fclones->skb2.sk == sk;
 }
 
+/**
+ * alloc_skb_fclone - allocate a network buffer from fclone cache
+ * @size: size to allocate
+ * @priority: allocation mask
+ *
+ * This function is a convenient wrapper around __alloc_skb().
+ */
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-- 
cgit v1.2.3


From 1039c6e1936ef2be0f342bd56fcf0fb25c1df5fe Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Sat, 9 Mar 2019 10:26:53 +0100
Subject: net: keep refcount warning in reqsk_free()

As Eric Dumazet said, "We do not have a way to tell if the req was ever
inserted in a hash table, so better play safe.".
Let's remove this comment, so that nobody will be tempted to drop the
WARN_ON_ONCE() line.

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/request_sock.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 347015515a7d..21a5243fecd1 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -108,7 +108,6 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
 
 static inline void reqsk_free(struct request_sock *req)
 {
-	/* temporary debugging */
 	WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0);
 
 	req->rsk_ops->destructor(req);
-- 
cgit v1.2.3


From 083b78a9ed64bc71957dd7da866c128a307ea062 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 9 Mar 2019 14:43:38 -0800
Subject: ip: fix ip_mc_may_pull() return value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ip_mc_may_pull() must return 0 if there is a problem, not an errno.

syzbot reported :

BUG: KASAN: use-after-free in br_ip4_multicast_igmp3_report net/bridge/br_multicast.c:947 [inline]
BUG: KASAN: use-after-free in br_multicast_ipv4_rcv net/bridge/br_multicast.c:1631 [inline]
BUG: KASAN: use-after-free in br_multicast_rcv+0x3cd8/0x4440 net/bridge/br_multicast.c:1741
Read of size 4 at addr ffff88820a4084ee by task syz-executor.2/11183

CPU: 1 PID: 11183 Comm: syz-executor.2 Not tainted 5.0.0+ #14
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:187
 kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
 __asan_report_load4_noabort+0x14/0x20 mm/kasan/generic_report.c:131
 br_ip4_multicast_igmp3_report net/bridge/br_multicast.c:947 [inline]
 br_multicast_ipv4_rcv net/bridge/br_multicast.c:1631 [inline]
 br_multicast_rcv+0x3cd8/0x4440 net/bridge/br_multicast.c:1741
 br_handle_frame_finish+0xa3a/0x14c0 net/bridge/br_input.c:108
 br_nf_hook_thresh+0x2ec/0x380 net/bridge/br_netfilter_hooks.c:1005
 br_nf_pre_routing_finish+0x8e2/0x1750 net/bridge/br_netfilter_hooks.c:410
 NF_HOOK include/linux/netfilter.h:289 [inline]
 NF_HOOK include/linux/netfilter.h:283 [inline]
 br_nf_pre_routing+0x7e7/0x13a0 net/bridge/br_netfilter_hooks.c:506
 nf_hook_entry_hookfn include/linux/netfilter.h:119 [inline]
 nf_hook_slow+0xbf/0x1f0 net/netfilter/core.c:511
 nf_hook include/linux/netfilter.h:244 [inline]
 NF_HOOK include/linux/netfilter.h:287 [inline]
 br_handle_frame+0x95b/0x1450 net/bridge/br_input.c:305
 __netif_receive_skb_core+0xa96/0x3040 net/core/dev.c:4902
 __netif_receive_skb_one_core+0xa8/0x1a0 net/core/dev.c:4971
 __netif_receive_skb+0x2c/0x1c0 net/core/dev.c:5083
 netif_receive_skb_internal+0x117/0x660 net/core/dev.c:5186
 netif_receive_skb+0x6e/0x5a0 net/core/dev.c:5261

Fixes: ba5ea614622d ("bridge: simplify ip_mc_check_igmp() and ipv6_mc_check_mld() calls")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index cc85f4524dbf..9c94b2ea789c 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -110,7 +110,7 @@ struct ip_mc_list {
 static inline int ip_mc_may_pull(struct sk_buff *skb, unsigned int len)
 {
 	if (skb_transport_offset(skb) + ip_transport_len(skb) < len)
-		return -EINVAL;
+		return 0;
 
 	return pskb_may_pull(skb, len);
 }
-- 
cgit v1.2.3


From 009a82f6437490c262584d65a14094a818bcb747 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 9 Mar 2019 12:07:17 -0500
Subject: SUNRPC: Micro-optimise when the task is known not to be sleeping

In cases where we know the task is not sleeping, try to optimise
away the indirect call to task->tk_action() by replacing it with
a direct call.
Only change tail calls, to allow gcc to perform tail call
elimination.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/sched.h |  8 ++++
 net/sunrpc/clnt.c            | 99 +++++++++++++++++++++++++++++---------------
 2 files changed, 73 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 52d41d0c1ae1..ec861cd0cfe8 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -304,4 +304,12 @@ rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
 }
 #endif /* CONFIG_SUNRPC_SWAP */
 
+static inline bool
+rpc_task_need_resched(const struct rpc_task *task)
+{
+	if (RPC_IS_QUEUED(task) || task->tk_callback)
+		return true;
+	return false;
+}
+
 #endif /* _LINUX_SUNRPC_SCHED_H_ */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 67c955d8b21b..498dd6ad5bc5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1540,6 +1540,7 @@ call_start(struct rpc_task *task)
 	clnt->cl_stats->rpccnt++;
 	task->tk_action = call_reserve;
 	rpc_task_set_transport(task, clnt);
+	call_reserve(task);
 }
 
 /*
@@ -1553,6 +1554,9 @@ call_reserve(struct rpc_task *task)
 	task->tk_status  = 0;
 	task->tk_action  = call_reserveresult;
 	xprt_reserve(task);
+	if (rpc_task_need_resched(task))
+		return;
+	 call_reserveresult(task);
 }
 
 static void call_retry_reserve(struct rpc_task *task);
@@ -1575,6 +1579,7 @@ call_reserveresult(struct rpc_task *task)
 	if (status >= 0) {
 		if (task->tk_rqstp) {
 			task->tk_action = call_refresh;
+			call_refresh(task);
 			return;
 		}
 
@@ -1600,6 +1605,7 @@ call_reserveresult(struct rpc_task *task)
 		/* fall through */
 	case -EAGAIN:	/* woken up; retry */
 		task->tk_action = call_retry_reserve;
+		call_retry_reserve(task);
 		return;
 	case -EIO:	/* probably a shutdown */
 		break;
@@ -1622,6 +1628,9 @@ call_retry_reserve(struct rpc_task *task)
 	task->tk_status  = 0;
 	task->tk_action  = call_reserveresult;
 	xprt_retry_reserve(task);
+	if (rpc_task_need_resched(task))
+		return;
+	call_reserveresult(task);
 }
 
 /*
@@ -1636,6 +1645,9 @@ call_refresh(struct rpc_task *task)
 	task->tk_status = 0;
 	task->tk_client->cl_stats->rpcauthrefresh++;
 	rpcauth_refreshcred(task);
+	if (rpc_task_need_resched(task))
+		return;
+	call_refreshresult(task);
 }
 
 /*
@@ -1654,6 +1666,7 @@ call_refreshresult(struct rpc_task *task)
 	case 0:
 		if (rpcauth_uptodatecred(task)) {
 			task->tk_action = call_allocate;
+			call_allocate(task);
 			return;
 		}
 		/* Use rate-limiting and a max number of retries if refresh
@@ -1672,6 +1685,7 @@ call_refreshresult(struct rpc_task *task)
 		task->tk_cred_retry--;
 		dprintk("RPC: %5u %s: retry refresh creds\n",
 				task->tk_pid, __func__);
+		call_refresh(task);
 		return;
 	}
 	dprintk("RPC: %5u %s: refresh creds failed with error %d\n",
@@ -1697,8 +1711,10 @@ call_allocate(struct rpc_task *task)
 	task->tk_status = 0;
 	task->tk_action = call_encode;
 
-	if (req->rq_buffer)
+	if (req->rq_buffer) {
+		call_encode(task);
 		return;
+	}
 
 	if (proc->p_proc != 0) {
 		BUG_ON(proc->p_arglen == 0);
@@ -1719,8 +1735,12 @@ call_allocate(struct rpc_task *task)
 
 	status = xprt->ops->buf_alloc(task);
 	xprt_inject_disconnect(xprt);
-	if (status == 0)
+	if (status == 0) {
+		if (rpc_task_need_resched(task))
+			return;
+		call_encode(task);
 		return;
+	}
 	if (status != -ENOMEM) {
 		rpc_exit(task, status);
 		return;
@@ -1803,12 +1823,8 @@ call_encode(struct rpc_task *task)
 		xprt_request_enqueue_receive(task);
 	xprt_request_enqueue_transmit(task);
 out:
-	task->tk_action = call_transmit;
-	/* Check that the connection is OK */
-	if (!xprt_bound(task->tk_xprt))
-		task->tk_action = call_bind;
-	else if (!xprt_connected(task->tk_xprt))
-		task->tk_action = call_connect;
+	task->tk_action = call_bind;
+	call_bind(task);
 }
 
 /*
@@ -1842,14 +1858,17 @@ call_bind(struct rpc_task *task)
 		return;
 	}
 
+	if (xprt_bound(xprt)) {
+		task->tk_action = call_connect;
+		call_connect(task);
+		return;
+	}
+
 	dprint_status(task);
 
-	task->tk_action = call_connect;
-	if (!xprt_bound(xprt)) {
-		task->tk_action = call_bind_status;
-		task->tk_timeout = xprt->bind_timeout;
-		xprt->ops->rpcbind(task);
-	}
+	task->tk_action = call_bind_status;
+	task->tk_timeout = xprt->bind_timeout;
+	xprt->ops->rpcbind(task);
 }
 
 /*
@@ -1869,6 +1888,7 @@ call_bind_status(struct rpc_task *task)
 		dprint_status(task);
 		task->tk_status = 0;
 		task->tk_action = call_connect;
+		call_connect(task);
 		return;
 	}
 
@@ -1949,21 +1969,24 @@ call_connect(struct rpc_task *task)
 		return;
 	}
 
+	if (xprt_connected(xprt)) {
+		task->tk_action = call_transmit;
+		call_transmit(task);
+		return;
+	}
+
 	dprintk("RPC: %5u call_connect xprt %p %s connected\n",
 			task->tk_pid, xprt,
 			(xprt_connected(xprt) ? "is" : "is not"));
 
-	task->tk_action = call_transmit;
-	if (!xprt_connected(xprt)) {
-		task->tk_action = call_connect_status;
-		if (task->tk_status < 0)
-			return;
-		if (task->tk_flags & RPC_TASK_NOCONNECT) {
-			rpc_exit(task, -ENOTCONN);
-			return;
-		}
-		xprt_connect(task);
+	task->tk_action = call_connect_status;
+	if (task->tk_status < 0)
+		return;
+	if (task->tk_flags & RPC_TASK_NOCONNECT) {
+		rpc_exit(task, -ENOTCONN);
+		return;
 	}
+	xprt_connect(task);
 }
 
 /*
@@ -2016,6 +2039,7 @@ call_connect_status(struct rpc_task *task)
 	case 0:
 		clnt->cl_stats->netreconn++;
 		task->tk_action = call_transmit;
+		call_transmit(task);
 		return;
 	}
 	rpc_exit(task, status);
@@ -2040,19 +2064,20 @@ call_transmit(struct rpc_task *task)
 	dprint_status(task);
 
 	task->tk_action = call_transmit_status;
+	if (!xprt_prepare_transmit(task))
+		return;
+	task->tk_status = 0;
 	if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) {
-		if (!xprt_prepare_transmit(task))
+		if (!xprt_connected(task->tk_xprt)) {
+			task->tk_status = -ENOTCONN;
 			return;
-		task->tk_status = 0;
-		if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) {
-			if (!xprt_connected(task->tk_xprt)) {
-				task->tk_status = -ENOTCONN;
-				return;
-			}
-			xprt_transmit(task);
 		}
+		xprt_transmit(task);
 	}
 	xprt_end_transmit(task);
+	if (rpc_task_need_resched(task))
+		return;
+	call_transmit_status(task);
 }
 
 /*
@@ -2067,8 +2092,12 @@ call_transmit_status(struct rpc_task *task)
 	 * Common case: success.  Force the compiler to put this
 	 * test first.
 	 */
-	if (task->tk_status == 0) {
-		xprt_request_wait_receive(task);
+	if (rpc_task_transmitted(task)) {
+		if (task->tk_status == 0)
+			xprt_request_wait_receive(task);
+		if (rpc_task_need_resched(task))
+			return;
+		call_status(task);
 		return;
 	}
 
@@ -2129,6 +2158,7 @@ call_bc_encode(struct rpc_task *task)
 {
 	xprt_request_enqueue_transmit(task);
 	task->tk_action = call_bc_transmit;
+	call_bc_transmit(task);
 }
 
 /*
@@ -2219,6 +2249,7 @@ call_status(struct rpc_task *task)
 	status = task->tk_status;
 	if (status >= 0) {
 		task->tk_action = call_decode;
+		call_decode(task);
 		return;
 	}
 
-- 
cgit v1.2.3


From 273fe3f1006ea5ebc63d6729e43e8e45e32b256a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 8 Mar 2019 15:30:03 +0100
Subject: netfilter: nf_tables: bogus EBUSY when deleting set after flush
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Set deletion after flush coming in the same batch results in EBUSY. Add
set use counter to track the number of references to this set from
rules. We cannot rely on the list of bindings for this since such list
is still populated from the preparation phase.

Reported-by: Václav Zindulka <vaclav.zindulka@tlapnet.cz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  6 ++++++
 net/netfilter/nf_tables_api.c     | 28 +++++++++++++++++++++++++++-
 net/netfilter/nft_dynset.c        | 13 +++++++++----
 net/netfilter/nft_lookup.c        | 13 +++++++++----
 net/netfilter/nft_objref.c        | 13 +++++++++----
 5 files changed, 60 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ed0687b0e0f4..3e9ab643eedf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -382,6 +382,7 @@ void nft_unregister_set(struct nft_set_type *type);
  * 	@dtype: data type (verdict or numeric type defined by userspace)
  * 	@objtype: object type (see NFT_OBJECT_* definitions)
  * 	@size: maximum set size
+ *	@use: number of rules references to this set
  * 	@nelems: number of elements
  * 	@ndeact: number of deactivated elements queued for removal
  *	@timeout: default timeout value in jiffies
@@ -407,6 +408,7 @@ struct nft_set {
 	u32				dtype;
 	u32				objtype;
 	u32				size;
+	u32				use;
 	atomic_t			nelems;
 	u32				ndeact;
 	u64				timeout;
@@ -467,6 +469,10 @@ struct nft_set_binding {
 	u32				flags;
 };
 
+enum nft_trans_phase;
+void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
+			      struct nft_set_binding *binding,
+			      enum nft_trans_phase phase);
 int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		       struct nft_set_binding *binding);
 void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 1333bf97dc26..ca09ef037ca5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3672,6 +3672,9 @@ err1:
 
 static void nft_set_destroy(struct nft_set *set)
 {
+	if (WARN_ON(set->use > 0))
+		return;
+
 	set->ops->destroy(set);
 	module_put(to_set_type(set->ops)->owner);
 	kfree(set->name);
@@ -3712,7 +3715,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 		NL_SET_BAD_ATTR(extack, attr);
 		return PTR_ERR(set);
 	}
-	if (!list_empty(&set->bindings) ||
+	if (set->use ||
 	    (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
 		NL_SET_BAD_ATTR(extack, attr);
 		return -EBUSY;
@@ -3742,6 +3745,9 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_set_binding *i;
 	struct nft_set_iter iter;
 
+	if (set->use == UINT_MAX)
+		return -EOVERFLOW;
+
 	if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
 		return -EBUSY;
 
@@ -3769,6 +3775,7 @@ bind:
 	binding->chain = ctx->chain;
 	list_add_tail_rcu(&binding->list, &set->bindings);
 	nft_set_trans_bind(ctx, set);
+	set->use++;
 
 	return 0;
 }
@@ -3788,6 +3795,25 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 }
 EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
 
+void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
+			      struct nft_set_binding *binding,
+			      enum nft_trans_phase phase)
+{
+	switch (phase) {
+	case NFT_TRANS_PREPARE:
+		set->use--;
+		return;
+	case NFT_TRANS_ABORT:
+	case NFT_TRANS_RELEASE:
+		set->use--;
+		/* fall through */
+	default:
+		nf_tables_unbind_set(ctx, set, binding,
+				     phase == NFT_TRANS_COMMIT);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_tables_deactivate_set);
+
 void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
 {
 	if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index a8a74a16f9c4..e461007558e8 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -240,11 +240,15 @@ static void nft_dynset_deactivate(const struct nft_ctx *ctx,
 {
 	struct nft_dynset *priv = nft_expr_priv(expr);
 
-	if (phase == NFT_TRANS_PREPARE)
-		return;
+	nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase);
+}
+
+static void nft_dynset_activate(const struct nft_ctx *ctx,
+				const struct nft_expr *expr)
+{
+	struct nft_dynset *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding,
-			     phase == NFT_TRANS_COMMIT);
+	priv->set->use++;
 }
 
 static void nft_dynset_destroy(const struct nft_ctx *ctx,
@@ -292,6 +296,7 @@ static const struct nft_expr_ops nft_dynset_ops = {
 	.eval		= nft_dynset_eval,
 	.init		= nft_dynset_init,
 	.destroy	= nft_dynset_destroy,
+	.activate	= nft_dynset_activate,
 	.deactivate	= nft_dynset_deactivate,
 	.dump		= nft_dynset_dump,
 };
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 14496da5141d..161c3451a747 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -127,11 +127,15 @@ static void nft_lookup_deactivate(const struct nft_ctx *ctx,
 {
 	struct nft_lookup *priv = nft_expr_priv(expr);
 
-	if (phase == NFT_TRANS_PREPARE)
-		return;
+	nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase);
+}
+
+static void nft_lookup_activate(const struct nft_ctx *ctx,
+				const struct nft_expr *expr)
+{
+	struct nft_lookup *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding,
-			     phase == NFT_TRANS_COMMIT);
+	priv->set->use++;
 }
 
 static void nft_lookup_destroy(const struct nft_ctx *ctx,
@@ -222,6 +226,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
 	.eval		= nft_lookup_eval,
 	.init		= nft_lookup_init,
+	.activate	= nft_lookup_activate,
 	.deactivate	= nft_lookup_deactivate,
 	.destroy	= nft_lookup_destroy,
 	.dump		= nft_lookup_dump,
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 79ef074c18ca..457a9ceb46af 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -162,11 +162,15 @@ static void nft_objref_map_deactivate(const struct nft_ctx *ctx,
 {
 	struct nft_objref_map *priv = nft_expr_priv(expr);
 
-	if (phase == NFT_TRANS_PREPARE)
-		return;
+	nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase);
+}
+
+static void nft_objref_map_activate(const struct nft_ctx *ctx,
+				    const struct nft_expr *expr)
+{
+	struct nft_objref_map *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding,
-			     phase == NFT_TRANS_COMMIT);
+	priv->set->use++;
 }
 
 static void nft_objref_map_destroy(const struct nft_ctx *ctx,
@@ -183,6 +187,7 @@ static const struct nft_expr_ops nft_objref_map_ops = {
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
 	.eval		= nft_objref_map_eval,
 	.init		= nft_objref_map_init,
+	.activate	= nft_objref_map_activate,
 	.deactivate	= nft_objref_map_deactivate,
 	.destroy	= nft_objref_map_destroy,
 	.dump		= nft_objref_map_dump,
-- 
cgit v1.2.3


From b41fdc4a7bf9045e4871c5b15905ea732ffd044f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 11 Mar 2019 15:38:10 +0000
Subject: irqchip/gic: Drop support for secondary GIC in non-DT systems

We do not have any in-tree platform with this pathological setup,
and only a single system (Cavium's cns3xxx) isn't DT aware.

Let's drop the secondary GIC support for now, until we remove
the above horror altogether.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm/mach-cns3xxx/core.c    |  2 +-
 drivers/irqchip/irq-gic.c       | 45 +++++++++++++++--------------------------
 include/linux/irqchip/arm-gic.h |  3 +--
 3 files changed, 18 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-cns3xxx/core.c b/arch/arm/mach-cns3xxx/core.c
index 7d5a44a06648..f676592d8402 100644
--- a/arch/arm/mach-cns3xxx/core.c
+++ b/arch/arm/mach-cns3xxx/core.c
@@ -90,7 +90,7 @@ void __init cns3xxx_map_io(void)
 /* used by entry-macro.S */
 void __init cns3xxx_init_irq(void)
 {
-	gic_init(0, 29, IOMEM(CNS3XXX_TC11MP_GIC_DIST_BASE_VIRT),
+	gic_init(IOMEM(CNS3XXX_TC11MP_GIC_DIST_BASE_VIRT),
 		 IOMEM(CNS3XXX_TC11MP_GIC_CPU_BASE_VIRT));
 }
 
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index ba2a37a27a54..fd3110c171ba 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -1089,11 +1089,10 @@ static void gic_init_chip(struct gic_chip_data *gic, struct device *dev,
 #endif
 }
 
-static int gic_init_bases(struct gic_chip_data *gic, int irq_start,
+static int gic_init_bases(struct gic_chip_data *gic,
 			  struct fwnode_handle *handle)
 {
-	irq_hw_number_t hwirq_base;
-	int gic_irqs, irq_base, ret;
+	int gic_irqs, ret;
 
 	if (IS_ENABLED(CONFIG_GIC_NON_BANKED) && gic->percpu_offset) {
 		/* Frankein-GIC without banked registers... */
@@ -1145,28 +1144,21 @@ static int gic_init_bases(struct gic_chip_data *gic, int irq_start,
 	} else {		/* Legacy support */
 		/*
 		 * For primary GICs, skip over SGIs.
-		 * For secondary GICs, skip over PPIs, too.
+		 * No secondary GIC support whatsoever.
 		 */
-		if (gic == &gic_data[0] && (irq_start & 31) > 0) {
-			hwirq_base = 16;
-			if (irq_start != -1)
-				irq_start = (irq_start & ~31) + 16;
-		} else {
-			hwirq_base = 32;
-		}
+		int irq_base;
 
-		gic_irqs -= hwirq_base; /* calculate # of irqs to allocate */
+		gic_irqs -= 16; /* calculate # of irqs to allocate */
 
-		irq_base = irq_alloc_descs(irq_start, 16, gic_irqs,
+		irq_base = irq_alloc_descs(16, 16, gic_irqs,
 					   numa_node_id());
 		if (irq_base < 0) {
-			WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
-			     irq_start);
-			irq_base = irq_start;
+			WARN(1, "Cannot allocate irq_descs @ IRQ16, assuming pre-allocated\n");
+			irq_base = 16;
 		}
 
 		gic->domain = irq_domain_add_legacy(NULL, gic_irqs, irq_base,
-					hwirq_base, &gic_irq_domain_ops, gic);
+						    16, &gic_irq_domain_ops, gic);
 	}
 
 	if (WARN_ON(!gic->domain)) {
@@ -1195,7 +1187,6 @@ error:
 }
 
 static int __init __gic_init_bases(struct gic_chip_data *gic,
-				   int irq_start,
 				   struct fwnode_handle *handle)
 {
 	char *name;
@@ -1231,32 +1222,28 @@ static int __init __gic_init_bases(struct gic_chip_data *gic,
 		gic_init_chip(gic, NULL, name, false);
 	}
 
-	ret = gic_init_bases(gic, irq_start, handle);
+	ret = gic_init_bases(gic, handle);
 	if (ret)
 		kfree(name);
 
 	return ret;
 }
 
-void __init gic_init(unsigned int gic_nr, int irq_start,
-		     void __iomem *dist_base, void __iomem *cpu_base)
+void __init gic_init(void __iomem *dist_base, void __iomem *cpu_base)
 {
 	struct gic_chip_data *gic;
 
-	if (WARN_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR))
-		return;
-
 	/*
 	 * Non-DT/ACPI systems won't run a hypervisor, so let's not
 	 * bother with these...
 	 */
 	static_branch_disable(&supports_deactivate_key);
 
-	gic = &gic_data[gic_nr];
+	gic = &gic_data[0];
 	gic->raw_dist_base = dist_base;
 	gic->raw_cpu_base = cpu_base;
 
-	__gic_init_bases(gic, irq_start, NULL);
+	__gic_init_bases(gic, NULL);
 }
 
 static void gic_teardown(struct gic_chip_data *gic)
@@ -1399,7 +1386,7 @@ int gic_of_init_child(struct device *dev, struct gic_chip_data **gic, int irq)
 	if (ret)
 		return ret;
 
-	ret = gic_init_bases(*gic, -1, &dev->of_node->fwnode);
+	ret = gic_init_bases(*gic, &dev->of_node->fwnode);
 	if (ret) {
 		gic_teardown(*gic);
 		return ret;
@@ -1459,7 +1446,7 @@ gic_of_init(struct device_node *node, struct device_node *parent)
 	if (gic_cnt == 0 && !gic_check_eoimode(node, &gic->raw_cpu_base))
 		static_branch_disable(&supports_deactivate_key);
 
-	ret = __gic_init_bases(gic, -1, &node->fwnode);
+	ret = __gic_init_bases(gic, &node->fwnode);
 	if (ret) {
 		gic_teardown(gic);
 		return ret;
@@ -1650,7 +1637,7 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
 		return -ENOMEM;
 	}
 
-	ret = __gic_init_bases(gic, -1, domain_handle);
+	ret = __gic_init_bases(gic, domain_handle);
 	if (ret) {
 		pr_err("Failed to initialise GIC\n");
 		irq_domain_free_fwnode(domain_handle);
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 626179077bb0..0f049b384ccd 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -158,8 +158,7 @@ int gic_of_init_child(struct device *dev, struct gic_chip_data **gic, int irq);
  * Legacy platforms not converted to DT yet must use this to init
  * their GIC
  */
-void gic_init(unsigned int nr, int start,
-	      void __iomem *dist , void __iomem *cpu);
+void gic_init(void __iomem *dist , void __iomem *cpu);
 
 int gicv2m_init(struct fwnode_handle *parent_handle,
 		struct irq_domain *parent);
-- 
cgit v1.2.3


From a623a7a1a5670c25a16881f5078072d272d96b71 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 11 Mar 2019 16:38:17 +0100
Subject: y2038: fix socket.h header inclusion

Referencing the __kernel_long_t type caused some user space applications
to stop compiling when they had not already included linux/posix_types.h,
e.g.

s/multicast.c -o ext/sockets/multicast.lo
In file included from /builddir/build/BUILD/php-7.3.3/main/php.h:468,
                 from /builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c:27:
/builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c: In function 'zm_startup_sockets':
/builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c:776:40: error: '__kernel_long_t' undeclared (first use in this function)
  776 |  REGISTER_LONG_CONSTANT("SO_SNDTIMEO", SO_SNDTIMEO, CONST_CS | CONST_PERSISTENT);

It is safe to include that header here, since it only contains kernel
internal types that do not conflict with other user space types.

It's still possible that some related build failures remain, but those
are likely to be for code that is not already y2038 safe.

Reported-by: Laura Abbott <labbott@redhat.com>
Fixes: a9beb86ae6e5 ("sock: Add SO_RCVTIMEO_NEW and SO_SNDTIMEO_NEW")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  | 2 +-
 arch/mips/include/uapi/asm/socket.h   | 2 +-
 arch/parisc/include/uapi/asm/socket.h | 2 +-
 arch/sparc/include/uapi/asm/socket.h  | 2 +-
 include/uapi/asm-generic/socket.h     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 0d0fddb7e738..976e89b116e5 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -2,8 +2,8 @@
 #ifndef _UAPI_ASM_SOCKET_H
 #define _UAPI_ASM_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 /*
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index eb9f33f8a8b3..d41765cfbc6e 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -10,8 +10,8 @@
 #ifndef _UAPI_ASM_SOCKET_H
 #define _UAPI_ASM_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /*
  * For setsockopt(2)
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 16e428f03526..66c5dd245ac7 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -2,8 +2,8 @@
 #ifndef _UAPI_ASM_SOCKET_H
 #define _UAPI_ASM_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	0xffff
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 88fe4f978aca..9265a9eece15 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -2,8 +2,8 @@
 #ifndef _ASM_SOCKET_H
 #define _ASM_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	0xffff
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index c8b430cb6dc4..8c1391c89171 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -2,8 +2,8 @@
 #ifndef __ASM_GENERIC_SOCKET_H
 #define __ASM_GENERIC_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	1
-- 
cgit v1.2.3


From 623217a0cc45a6c179303b3bbfdc594806a464cc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 11 Mar 2019 12:53:59 +0100
Subject: PM / wakeup: Drop wakeup_source_drop()

After commit d856f39ac1cc ("PM / wakeup: Rework wakeup source timer
cancellation") wakeup_source_drop() is a trivial wrapper around
__pm_relax() and it has no users except for wakeup_source_destroy()
and wakeup_source_trash() which also has no users, so drop it along
with the latter and make wakeup_source_destroy() call __pm_relax()
directly.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/base/power/wakeup.c | 18 +-----------------
 include/linux/pm_wakeup.h   |  9 ---------
 2 files changed, 1 insertion(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index a25d2d82f44d..ecbe152d151f 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -106,22 +106,6 @@ struct wakeup_source *wakeup_source_create(const char *name)
 }
 EXPORT_SYMBOL_GPL(wakeup_source_create);
 
-/**
- * wakeup_source_drop - Prepare a struct wakeup_source object for destruction.
- * @ws: Wakeup source to prepare for destruction.
- *
- * Callers must ensure that __pm_stay_awake() or __pm_wakeup_event() will never
- * be run in parallel with this function for the same wakeup source object.
- */
-void wakeup_source_drop(struct wakeup_source *ws)
-{
-	if (!ws)
-		return;
-
-	__pm_relax(ws);
-}
-EXPORT_SYMBOL_GPL(wakeup_source_drop);
-
 /*
  * Record wakeup_source statistics being deleted into a dummy wakeup_source.
  */
@@ -161,7 +145,7 @@ void wakeup_source_destroy(struct wakeup_source *ws)
 	if (!ws)
 		return;
 
-	wakeup_source_drop(ws);
+	__pm_relax(ws);
 	wakeup_source_record(ws);
 	kfree_const(ws->name);
 	kfree(ws);
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 4238dde0aaf0..0ff134d6575a 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -96,7 +96,6 @@ static inline void device_set_wakeup_path(struct device *dev)
 /* drivers/base/power/wakeup.c */
 extern void wakeup_source_prepare(struct wakeup_source *ws, const char *name);
 extern struct wakeup_source *wakeup_source_create(const char *name);
-extern void wakeup_source_drop(struct wakeup_source *ws);
 extern void wakeup_source_destroy(struct wakeup_source *ws);
 extern void wakeup_source_add(struct wakeup_source *ws);
 extern void wakeup_source_remove(struct wakeup_source *ws);
@@ -134,8 +133,6 @@ static inline struct wakeup_source *wakeup_source_create(const char *name)
 	return NULL;
 }
 
-static inline void wakeup_source_drop(struct wakeup_source *ws) {}
-
 static inline void wakeup_source_destroy(struct wakeup_source *ws) {}
 
 static inline void wakeup_source_add(struct wakeup_source *ws) {}
@@ -204,12 +201,6 @@ static inline void wakeup_source_init(struct wakeup_source *ws,
 	wakeup_source_add(ws);
 }
 
-static inline void wakeup_source_trash(struct wakeup_source *ws)
-{
-	wakeup_source_remove(ws);
-	wakeup_source_drop(ws);
-}
-
 static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec)
 {
 	return pm_wakeup_ws_event(ws, msec, false);
-- 
cgit v1.2.3


From b57e622e6da9048c96fa0ed6943834949a398e3f Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Mon, 11 Mar 2019 23:28:10 -0700
Subject: mm/hmm: convert to use vm_fault_t
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert to use vm_fault_t type as return type for fault handler.

kbuild reported warning during testing of
*mm-create-the-new-vm_fault_t-type.patch* available in below link -
https://patchwork.kernel.org/patch/10752741/

  kernel/memremap.c:46:34: warning: incorrect type in return expression
                           (different base types)
  kernel/memremap.c:46:34: expected restricted vm_fault_t
  kernel/memremap.c:46:34: got int

This patch has fixed the warnings and also hmm_devmem_fault() is
converted to return vm_fault_t to avoid further warnings.

[sfr@canb.auug.org.au: drm/nouveau/dmem: update for struct hmm_devmem_ops member change]
  Link: http://lkml.kernel.org/r/20190220174407.753d94e5@canb.auug.org.au
Link: http://lkml.kernel.org/r/20190110145900.GA1317@jordon-HP-15-Notebook-PC
Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +-
 include/linux/hmm.h                    | 4 ++--
 mm/hmm.c                               | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 8be7a83ced9b..aa9fec80492d 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -261,7 +261,7 @@ static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = {
 	.finalize_and_map	= nouveau_dmem_fault_finalize_and_map,
 };
 
-static int
+static vm_fault_t
 nouveau_dmem_fault(struct hmm_devmem *devmem,
 		   struct vm_area_struct *vma,
 		   unsigned long addr,
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 66f9ebbb1df3..ad50b7b4f141 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -468,7 +468,7 @@ struct hmm_devmem_ops {
 	 * Note that mmap semaphore is held in read mode at least when this
 	 * callback occurs, hence the vma is valid upon callback entry.
 	 */
-	int (*fault)(struct hmm_devmem *devmem,
+	vm_fault_t (*fault)(struct hmm_devmem *devmem,
 		     struct vm_area_struct *vma,
 		     unsigned long addr,
 		     const struct page *page,
@@ -511,7 +511,7 @@ struct hmm_devmem_ops {
  * chunk, as an optimization. It must, however, prioritize the faulting address
  * over all the others.
  */
-typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
+typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma,
 				unsigned long addr,
 				const struct page *page,
 				unsigned int flags,
diff --git a/mm/hmm.c b/mm/hmm.c
index a04e4b810610..fe1cd87e49ac 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -990,7 +990,7 @@ static void hmm_devmem_ref_kill(struct percpu_ref *ref)
 	percpu_ref_kill(ref);
 }
 
-static int hmm_devmem_fault(struct vm_area_struct *vma,
+static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
 			    unsigned long addr,
 			    const struct page *page,
 			    unsigned int flags,
-- 
cgit v1.2.3


From b5420237ec817b0b5f729a674c81ace0865c3b3b Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 11 Mar 2019 23:28:13 -0700
Subject: mm: refactor readahead defines in mm.h

All users of VM_MAX_READAHEAD actually convert it to kbytes and then to
pages. Define the macro explicitly as (SZ_128K / PAGE_SIZE). This
simplifies the expression in every filesystem. Also rename the macro to
VM_READAHEAD_PAGES to properly convey its meaning. Finally remove unused
VM_MIN_READAHEAD

[akpm@linux-foundation.org: fix fs/io_uring.c, per Stephen]
Link: http://lkml.kernel.org/r/20181221144053.24318-1-nborisov@suse.com
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Matthew Wilcox <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Cc: Dominique Martinet <asmadeus@codewreck.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/blk-core.c   | 3 +--
 fs/9p/vfs_super.c  | 2 +-
 fs/afs/super.c     | 2 +-
 fs/btrfs/disk-io.c | 2 +-
 fs/fuse/inode.c    | 2 +-
 fs/io_uring.c      | 2 +-
 include/linux/mm.h | 4 ++--
 7 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6b78ec56a4f2..4673ebe42255 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -500,8 +500,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (!q->stats)
 		goto fail_stats;
 
-	q->backing_dev_info->ra_pages =
-			(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+	q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
 	q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
 	q->backing_dev_info->name = "block";
 	q->node = node_id;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48ce50484e80..10d3bd3f534b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 		return ret;
 
 	if (v9ses->cache)
-		sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
+		sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
 
 	sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
 	if (!v9ses->cache)
diff --git a/fs/afs/super.c b/fs/afs/super.c
index dcd07fe99871..e684f6769b15 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -399,7 +399,7 @@ static int afs_fill_super(struct super_block *sb,
 	ret = super_setup_bdi(sb);
 	if (ret)
 		return ret;
-	sb->s_bdi->ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+	sb->s_bdi->ra_pages	= VM_READAHEAD_PAGES;
 
 	/* allocate the root inode and dentry */
 	if (as->dyn_root) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f0cdb53f3e2d..6fe9197f6ee4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2958,7 +2958,7 @@ int open_ctree(struct super_block *sb,
 	sb->s_bdi->congested_fn = btrfs_congested_fn;
 	sb->s_bdi->congested_data = fs_info;
 	sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
-	sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
+	sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
 	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
 	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c2d4099429be..16750ed591ae 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1010,7 +1010,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	if (err)
 		return err;
 
-	sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+	sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
 	/* fuse does it's own writeback accounting */
 	sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5d99376d2369..c88088d92613 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -923,7 +923,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 		/* Use 8x RA size as a decent limiter for both reads/writes */
 		max_pages = filp->f_ra.ra_pages;
 		if (!max_pages)
-			max_pages = VM_MAX_READAHEAD >> (PAGE_SHIFT - 10);
+			max_pages = VM_READAHEAD_PAGES;
 		max_pages *= 8;
 
 		/* If max pages are exceeded, reset the state */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5801ee849f36..76769749b5a5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -26,6 +26,7 @@
 #include <linux/page_ref.h>
 #include <linux/memremap.h>
 #include <linux/overflow.h>
+#include <linux/sizes.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -2402,8 +2403,7 @@ int __must_check write_one_page(struct page *page);
 void task_dirty_inc(struct task_struct *tsk);
 
 /* readahead.c */
-#define VM_MAX_READAHEAD	128	/* kbytes */
-#define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
+#define VM_READAHEAD_PAGES	(SZ_128K / PAGE_SIZE)
 
 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
-- 
cgit v1.2.3


From 53d818d2747ca84f1a87a0006b903523cd5bf0cd Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:29:11 -0700
Subject: memblock: drop memblock_alloc_base_nid()

memblock_alloc_base_nid() is a oneliner wrapper for
memblock_alloc_range_nid() without any side effect.

Replace it's usage by the direct calls to memblock_alloc_range_nid().

Link: http://lkml.kernel.org/r/1548057848-15136-5-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Guo Ren <guoren@kernel.org>
Cc: Guo Ren <ren_guo@c-sky.com>				[c-sky]
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Juergen Gross <jgross@suse.com>			[Xen]
Cc: Mark Salter <msalter@redhat.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  3 ---
 mm/memblock.c            | 15 ++++-----------
 2 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 859b55b66db2..4db53f7c6b17 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -446,9 +446,6 @@ static inline bool memblock_bottom_up(void)
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
 					phys_addr_t start, phys_addr_t end,
 					enum memblock_flags flags);
-phys_addr_t memblock_alloc_base_nid(phys_addr_t size,
-					phys_addr_t align, phys_addr_t max_addr,
-					int nid, enum memblock_flags flags);
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 				phys_addr_t max_addr);
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
diff --git a/mm/memblock.c b/mm/memblock.c
index 470601115892..e9e440cfd210 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1289,21 +1289,14 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
 					flags);
 }
 
-phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
-					phys_addr_t align, phys_addr_t max_addr,
-					int nid, enum memblock_flags flags)
-{
-	return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
-}
-
 phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
 	enum memblock_flags flags = choose_memblock_flags();
 	phys_addr_t ret;
 
 again:
-	ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
-				      nid, flags);
+	ret = memblock_alloc_range_nid(size, align, 0,
+				       MEMBLOCK_ALLOC_ACCESSIBLE, nid, flags);
 
 	if (!ret && (flags & MEMBLOCK_MIRROR)) {
 		flags &= ~MEMBLOCK_MIRROR;
@@ -1314,8 +1307,8 @@ again:
 
 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
 {
-	return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
-				       MEMBLOCK_NONE);
+	return memblock_alloc_range_nid(size, align, 0, max_addr, NUMA_NO_NODE,
+					MEMBLOCK_NONE);
 }
 
 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
-- 
cgit v1.2.3


From 8a770c2a83eaf4c3d493ca4056abd6d6ddce6f18 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:29:16 -0700
Subject: memblock: emphasize that memblock_alloc_range() returns a physical
 address

Rename memblock_alloc_range() to memblock_phys_alloc_range() to
emphasize that it returns a physical address.

While on it, remove the 'enum memblock_flags' parameter from this
function as its only user anyway sets it to MEMBLOCK_NONE, which is the
default for the most of memblock allocations.

Link: http://lkml.kernel.org/r/1548057848-15136-6-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Guo Ren <guoren@kernel.org>
Cc: Guo Ren <ren_guo@c-sky.com>				[c-sky]
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Juergen Gross <jgross@suse.com>			[Xen]
Cc: Mark Salter <msalter@redhat.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  5 ++---
 mm/cma.c                 | 10 ++++------
 mm/memblock.c            |  9 +++++----
 3 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 4db53f7c6b17..251cd66b151b 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -325,6 +325,8 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
 #define ARCH_LOW_ADDRESS_LIMIT  0xffffffffUL
 #endif
 
+phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
+				      phys_addr_t start, phys_addr_t end);
 phys_addr_t memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
 phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
 
@@ -443,9 +445,6 @@ static inline bool memblock_bottom_up(void)
 	return memblock.bottom_up;
 }
 
-phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
-					phys_addr_t start, phys_addr_t end,
-					enum memblock_flags flags);
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 				phys_addr_t max_addr);
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
diff --git a/mm/cma.c b/mm/cma.c
index f4f3a8a57d86..bb2d333ffcb3 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -327,16 +327,14 @@ int __init cma_declare_contiguous(phys_addr_t base,
 		 * memory in case of failure.
 		 */
 		if (base < highmem_start && limit > highmem_start) {
-			addr = memblock_alloc_range(size, alignment,
-						    highmem_start, limit,
-						    MEMBLOCK_NONE);
+			addr = memblock_phys_alloc_range(size, alignment,
+							 highmem_start, limit);
 			limit = highmem_start;
 		}
 
 		if (!addr) {
-			addr = memblock_alloc_range(size, alignment, base,
-						    limit,
-						    MEMBLOCK_NONE);
+			addr = memblock_phys_alloc_range(size, alignment, base,
+							 limit);
 			if (!addr) {
 				ret = -ENOMEM;
 				goto err;
diff --git a/mm/memblock.c b/mm/memblock.c
index e9e440cfd210..eb785ea6757b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1281,12 +1281,13 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 	return 0;
 }
 
-phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
-					phys_addr_t start, phys_addr_t end,
-					enum memblock_flags flags)
+phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
+					     phys_addr_t align,
+					     phys_addr_t start,
+					     phys_addr_t end)
 {
 	return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
-					flags);
+					MEMBLOCK_NONE);
 }
 
 phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
-- 
cgit v1.2.3


From ecc3e771f4ca98c52a072e41804434b4979bdf84 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:29:26 -0700
Subject: memblock: memblock_phys_alloc(): don't panic

Make the memblock_phys_alloc() function an inline wrapper for
memblock_phys_alloc_range() and update the memblock_phys_alloc() callers
to check the returned value and panic in case of error.

Link: http://lkml.kernel.org/r/1548057848-15136-8-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Guo Ren <guoren@kernel.org>
Cc: Guo Ren <ren_guo@c-sky.com>				[c-sky]
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Juergen Gross <jgross@suse.com>			[Xen]
Cc: Mark Salter <msalter@redhat.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/init.c                   | 4 ++++
 arch/arm64/mm/mmu.c                  | 2 ++
 arch/powerpc/sysdev/dart_iommu.c     | 3 +++
 arch/s390/kernel/crash_dump.c        | 3 +++
 arch/s390/kernel/setup.c             | 3 +++
 arch/sh/boards/mach-ap325rxa/setup.c | 3 +++
 arch/sh/boards/mach-ecovec24/setup.c | 6 ++++++
 arch/sh/boards/mach-kfr2r09/setup.c  | 3 +++
 arch/sh/boards/mach-migor/setup.c    | 3 +++
 arch/sh/boards/mach-se/7724/setup.c  | 6 ++++++
 arch/xtensa/mm/kasan_init.c          | 3 +++
 include/linux/memblock.h             | 7 ++++++-
 mm/memblock.c                        | 5 -----
 13 files changed, 45 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index b76b90eb9356..15dddfe43319 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -206,6 +206,10 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align)
 	BUG_ON(!arm_memblock_steal_permitted);
 
 	phys = memblock_phys_alloc(size, align);
+	if (!phys)
+		panic("Failed to steal %pa bytes at %pS\n",
+		      &size, (void *)_RET_IP_);
+
 	memblock_free(phys, size);
 	memblock_remove(phys, size);
 
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 402b6495ff58..e97f018ff740 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -103,6 +103,8 @@ static phys_addr_t __init early_pgtable_alloc(void)
 	void *ptr;
 
 	phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate page table page\n");
 
 	/*
 	 * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index fc5c5c23303e..2a751795ec1e 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -265,6 +265,9 @@ static void allocate_dart(void)
 	 * prefetching into invalid pages and corrupting data
 	 */
 	tmp = memblock_phys_alloc(DART_PAGE_SIZE, DART_PAGE_SIZE);
+	if (!tmp)
+		panic("DART: table allocation failed\n");
+
 	dart_emptyval = DARTMAP_VALID | ((tmp >> DART_PAGE_SHIFT) &
 					 DARTMAP_RPNMASK);
 
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index 97eae3871868..f96a5857bbfd 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -61,6 +61,9 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu)
 	struct save_area *sa;
 
 	sa = (void *) memblock_phys_alloc(sizeof(*sa), 8);
+	if (!sa)
+		panic("Failed to allocate save area\n");
+
 	if (is_boot_cpu)
 		list_add(&sa->list, &dump_save_areas);
 	else
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 12934e8fbb91..d7920f3e76c6 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -966,6 +966,9 @@ static void __init setup_randomness(void)
 
 	vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE,
 							    PAGE_SIZE);
+	if (!vmms)
+		panic("Failed to allocate memory for sysinfo structure\n");
+
 	if (stsi(vmms, 3, 2, 2) == 0 && vmms->count)
 		add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count);
 	memblock_free((unsigned long) vmms, PAGE_SIZE);
diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c
index 97774424fbee..8301a4378f50 100644
--- a/arch/sh/boards/mach-ap325rxa/setup.c
+++ b/arch/sh/boards/mach-ap325rxa/setup.c
@@ -557,6 +557,9 @@ static void __init ap325rxa_mv_mem_reserve(void)
 	phys_addr_t size = CEU_BUFFER_MEMORY_SIZE;
 
 	phys = memblock_phys_alloc(size, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate CEU memory\n");
+
 	memblock_free(phys, size);
 	memblock_remove(phys, size);
 
diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c
index d329bf3be487..34e5414c5563 100644
--- a/arch/sh/boards/mach-ecovec24/setup.c
+++ b/arch/sh/boards/mach-ecovec24/setup.c
@@ -1477,11 +1477,17 @@ static void __init ecovec_mv_mem_reserve(void)
 	phys_addr_t size = CEU_BUFFER_MEMORY_SIZE;
 
 	phys = memblock_phys_alloc(size, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate CEU0 memory\n");
+
 	memblock_free(phys, size);
 	memblock_remove(phys, size);
 	ceu0_dma_membase = phys;
 
 	phys = memblock_phys_alloc(size, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate CEU1 memory\n");
+
 	memblock_free(phys, size);
 	memblock_remove(phys, size);
 	ceu1_dma_membase = phys;
diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c
index 5c258ae9c43a..1cf9a47ac90e 100644
--- a/arch/sh/boards/mach-kfr2r09/setup.c
+++ b/arch/sh/boards/mach-kfr2r09/setup.c
@@ -631,6 +631,9 @@ static void __init kfr2r09_mv_mem_reserve(void)
 	phys_addr_t size = CEU_BUFFER_MEMORY_SIZE;
 
 	phys = memblock_phys_alloc(size, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate CEU memory\n");
+
 	memblock_free(phys, size);
 	memblock_remove(phys, size);
 
diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c
index 193d91bb84bf..90702740f207 100644
--- a/arch/sh/boards/mach-migor/setup.c
+++ b/arch/sh/boards/mach-migor/setup.c
@@ -631,6 +631,9 @@ static void __init migor_mv_mem_reserve(void)
 	phys_addr_t size = CEU_BUFFER_MEMORY_SIZE;
 
 	phys = memblock_phys_alloc(size, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate CEU memory\n");
+
 	memblock_free(phys, size);
 	memblock_remove(phys, size);
 
diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c
index 5c7aa37bfd86..3674064816c7 100644
--- a/arch/sh/boards/mach-se/7724/setup.c
+++ b/arch/sh/boards/mach-se/7724/setup.c
@@ -964,11 +964,17 @@ static void __init ms7724se_mv_mem_reserve(void)
 	phys_addr_t size = CEU_BUFFER_MEMORY_SIZE;
 
 	phys = memblock_phys_alloc(size, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate CEU0 memory\n");
+
 	memblock_free(phys, size);
 	memblock_remove(phys, size);
 	ceu0_dma_membase = phys;
 
 	phys = memblock_phys_alloc(size, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate CEU1 memory\n");
+
 	memblock_free(phys, size);
 	memblock_remove(phys, size);
 	ceu1_dma_membase = phys;
diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c
index 48dbb03f4f6f..4852848a0c28 100644
--- a/arch/xtensa/mm/kasan_init.c
+++ b/arch/xtensa/mm/kasan_init.c
@@ -54,6 +54,9 @@ static void __init populate(void *start, void *end)
 			phys_addr_t phys =
 				memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 
+			if (!phys)
+				panic("Failed to allocate page table page\n");
+
 			set_pte(pte + j, pfn_pte(PHYS_PFN(phys), PAGE_KERNEL));
 		}
 	}
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 251cd66b151b..7caecb42bfea 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -330,7 +330,12 @@ phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
 phys_addr_t memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
 phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
 
-phys_addr_t memblock_phys_alloc(phys_addr_t size, phys_addr_t align);
+static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
+					      phys_addr_t align)
+{
+	return memblock_phys_alloc_range(size, align, 0,
+					 MEMBLOCK_ALLOC_ACCESSIBLE);
+}
 
 void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
 				 phys_addr_t min_addr, phys_addr_t max_addr,
diff --git a/mm/memblock.c b/mm/memblock.c
index ac57bd3082bb..d0b76bb7340d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1325,11 +1325,6 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys
 	return alloc;
 }
 
-phys_addr_t __init memblock_phys_alloc(phys_addr_t size, phys_addr_t align)
-{
-	return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
-}
-
 phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
 	phys_addr_t res = memblock_phys_alloc_nid(size, align, nid);
-- 
cgit v1.2.3


From 42b46aeff2e366bad54bd1c069b7b5381d9be8b3 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:29:31 -0700
Subject: memblock: drop __memblock_alloc_base()

The __memblock_alloc_base() function tries to allocate a memory up to
the limit specified by its max_addr parameter.  Depending on the value
of this parameter, the __memblock_alloc_base() can is replaced with the
appropriate memblock_phys_alloc*() variant.

Link: http://lkml.kernel.org/r/1548057848-15136-9-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Rob Herring <robh@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Guo Ren <guoren@kernel.org>
Cc: Guo Ren <ren_guo@c-sky.com>				[c-sky]
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Juergen Gross <jgross@suse.com>			[Xen]
Cc: Mark Salter <msalter@redhat.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sh/kernel/machine_kexec.c |  3 ++-
 arch/x86/kernel/e820.c         |  2 +-
 arch/x86/mm/numa.c             | 12 ++++--------
 drivers/of/of_reserved_mem.c   |  7 ++-----
 include/linux/memblock.h       |  2 --
 mm/memblock.c                  |  9 ++-------
 6 files changed, 11 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index b9f9f1a5afdc..63d63a36f6f2 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -168,7 +168,8 @@ void __init reserve_crashkernel(void)
 	crash_size = PAGE_ALIGN(resource_size(&crashk_res));
 	if (!crashk_res.start) {
 		unsigned long max = memblock_end_of_DRAM() - memory_limit;
-		crashk_res.start = __memblock_alloc_base(crash_size, PAGE_SIZE, max);
+		crashk_res.start = memblock_phys_alloc_range(crash_size,
+							     PAGE_SIZE, 0, max);
 		if (!crashk_res.start) {
 			pr_err("crashkernel allocation failed\n");
 			goto disable;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a687d10da417..5203ee4e6435 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -775,7 +775,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
 {
 	u64 addr;
 
-	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+	addr = memblock_phys_alloc(size, align);
 	if (addr) {
 		e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
 		pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 12c1b7a83ed7..dfb6c4df639a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -195,15 +195,11 @@ static void __init alloc_node_data(int nid)
 	 * Allocate node data.  Try node-local memory and then any node.
 	 * Never allocate in DMA zone.
 	 */
-	nd_pa = memblock_phys_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
 	if (!nd_pa) {
-		nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
-					      MEMBLOCK_ALLOC_ACCESSIBLE);
-		if (!nd_pa) {
-			pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
-			       nd_size, nid);
-			return;
-		}
+		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
+		       nd_size, nid);
+		return;
 	}
 	nd = __va(nd_pa);
 
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index e773063c6de9..8c07d7da5256 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -31,13 +31,10 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
 	phys_addr_t *res_base)
 {
 	phys_addr_t base;
-	/*
-	 * We use __memblock_alloc_base() because memblock_alloc_base()
-	 * panic()s on allocation failure.
-	 */
+
 	end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end;
 	align = !align ? SMP_CACHE_BYTES : align;
-	base = __memblock_alloc_base(size, align, end);
+	base = memblock_phys_alloc_range(size, align, 0, end);
 	if (!base)
 		return -ENOMEM;
 
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 7caecb42bfea..017aeb223b24 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -452,8 +452,6 @@ static inline bool memblock_bottom_up(void)
 
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 				phys_addr_t max_addr);
-phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
-				  phys_addr_t max_addr);
 phys_addr_t memblock_phys_mem_size(void);
 phys_addr_t memblock_reserved_size(void);
 phys_addr_t memblock_mem_size(unsigned long limit_pfn);
diff --git a/mm/memblock.c b/mm/memblock.c
index d0b76bb7340d..5b6aeb8108d9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1306,17 +1306,12 @@ again:
 	return ret;
 }
 
-phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
-{
-	return memblock_alloc_range_nid(size, align, 0, max_addr, NUMA_NO_NODE,
-					MEMBLOCK_NONE);
-}
-
 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
 {
 	phys_addr_t alloc;
 
-	alloc = __memblock_alloc_base(size, align, max_addr);
+	alloc = memblock_alloc_range_nid(size, align, 0, max_addr, NUMA_NO_NODE,
+					MEMBLOCK_NONE);
 
 	if (alloc == 0)
 		panic("ERROR: Failed to allocate %pa bytes below %pa.\n",
-- 
cgit v1.2.3


From 0ba9e6edd4c2e563a9b34c8a46649218814a363f Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:29:35 -0700
Subject: memblock: drop memblock_alloc_base()

The memblock_alloc_base() function tries to allocate a memory up to the
limit specified by its max_addr parameter and panics if the allocation
fails.  Replace its usage with memblock_phys_alloc_range() and make the
callers check the return value and panic in case of error.

Link: http://lkml.kernel.org/r/1548057848-15136-10-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>		[powerpc]
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Guo Ren <guoren@kernel.org>
Cc: Guo Ren <ren_guo@c-sky.com>				[c-sky]
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Juergen Gross <jgross@suse.com>			[Xen]
Cc: Mark Salter <msalter@redhat.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/rtas.c      |  6 +++++-
 arch/powerpc/mm/hash_utils_64.c |  8 ++++++--
 arch/s390/kernel/smp.c          |  6 +++++-
 drivers/macintosh/smu.c         |  2 +-
 include/linux/memblock.h        |  2 --
 mm/memblock.c                   | 14 --------------
 6 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index de35bd8f047f..fbc676160adf 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1187,7 +1187,11 @@ void __init rtas_initialize(void)
 		ibm_suspend_me_token = rtas_token("ibm,suspend-me");
 	}
 #endif
-	rtas_rmo_buf = memblock_alloc_base(RTAS_RMOBUF_MAX, PAGE_SIZE, rtas_region);
+	rtas_rmo_buf = memblock_phys_alloc_range(RTAS_RMOBUF_MAX, PAGE_SIZE,
+						 0, rtas_region);
+	if (!rtas_rmo_buf)
+		panic("ERROR: RTAS: Failed to allocate %lx bytes below %pa\n",
+		      PAGE_SIZE, &rtas_region);
 
 #ifdef CONFIG_RTAS_ERROR_LOGGING
 	rtas_last_error_token = rtas_token("rtas-last-error");
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 3d4b2399192f..880a366c229c 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -882,8 +882,12 @@ static void __init htab_initialize(void)
 		}
 #endif /* CONFIG_PPC_CELL */
 
-		table = memblock_alloc_base(htab_size_bytes, htab_size_bytes,
-					    limit);
+		table = memblock_phys_alloc_range(htab_size_bytes,
+						  htab_size_bytes,
+						  0, limit);
+		if (!table)
+			panic("ERROR: Failed to allocate %pa bytes below %pa\n",
+			      &htab_size_bytes, &limit);
 
 		DBG("Hash table allocated at %lx, size: %lx\n", table,
 		    htab_size_bytes);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b198ece2aad6..5e3cccc408b8 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -656,7 +656,11 @@ void __init smp_save_dump_cpus(void)
 		/* No previous system present, normal boot. */
 		return;
 	/* Allocate a page as dumping area for the store status sigps */
-	page = memblock_alloc_base(PAGE_SIZE, PAGE_SIZE, 1UL << 31);
+	page = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 1UL << 31);
+	if (!page)
+		panic("ERROR: Failed to allocate %x bytes below %lx\n",
+		      PAGE_SIZE, 1UL << 31);
+
 	/* Set multi-threading state to the previous system. */
 	pcpu_set_smt(sclp.mtid_prev);
 	boot_cpu_addr = stap();
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 0a0b8e1f4236..42cf68d15da3 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -485,7 +485,7 @@ int __init smu_init (void)
 	 * SMU based G5s need some memory below 2Gb. Thankfully this is
 	 * called at a time where memblock is still available.
 	 */
-	smu_cmdbuf_abs = memblock_alloc_base(4096, 4096, 0x80000000UL);
+	smu_cmdbuf_abs = memblock_phys_alloc_range(4096, 4096, 0, 0x80000000UL);
 	if (smu_cmdbuf_abs == 0) {
 		printk(KERN_ERR "SMU: Command buffer allocation failed !\n");
 		ret = -EINVAL;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 017aeb223b24..0c8375120322 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -450,8 +450,6 @@ static inline bool memblock_bottom_up(void)
 	return memblock.bottom_up;
 }
 
-phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
-				phys_addr_t max_addr);
 phys_addr_t memblock_phys_mem_size(void);
 phys_addr_t memblock_reserved_size(void);
 phys_addr_t memblock_mem_size(unsigned long limit_pfn);
diff --git a/mm/memblock.c b/mm/memblock.c
index 5b6aeb8108d9..42fe65447d8b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1306,20 +1306,6 @@ again:
 	return ret;
 }
 
-phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
-{
-	phys_addr_t alloc;
-
-	alloc = memblock_alloc_range_nid(size, align, 0, max_addr, NUMA_NO_NODE,
-					MEMBLOCK_NONE);
-
-	if (alloc == 0)
-		panic("ERROR: Failed to allocate %pa bytes below %pa.\n",
-		      &size, &max_addr);
-
-	return alloc;
-}
-
 phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
 	phys_addr_t res = memblock_phys_alloc_nid(size, align, nid);
-- 
cgit v1.2.3


From 92d12f9544b7b133b54cb64f687f3f45fce0043c Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:29:41 -0700
Subject: memblock: refactor internal allocation functions

Currently, memblock has several internal functions with overlapping
functionality.  They all call memblock_find_in_range_node() to find free
memory and then reserve the allocated range and mark it with kmemleak.
However, there is difference in the allocation constraints and in
fallback strategies.

The allocations returning physical address first attempt to find free
memory on the specified node within mirrored memory regions, then retry
on the same node without the requirement for memory mirroring and
finally fall back to all available memory.

The allocations returning virtual address start with clamping the
allowed range to memblock.current_limit, attempt to allocate from the
specified node from regions with mirroring and with user defined minimal
address.  If such allocation fails, next attempt is done with node
restriction lifted.  Next, the allocation is retried with minimal
address reset to zero and at last without the requirement for mirrored
regions.

Let's consolidate various fallbacks handling and make them more
consistent for physical and virtual variants.  Most of the fallback
handling is moved to memblock_alloc_range_nid() and it now handles node
and mirror fallbacks.

The memblock_alloc_internal() uses memblock_alloc_range_nid() to get a
physical address of the allocated range and converts it to virtual
address.

The fallback for allocation below the specified minimal address remains
in memblock_alloc_internal() because memblock_alloc_range_nid() is used
by CMA with exact requirement for lower bounds.

The memblock_phys_alloc_nid() function is completely dropped as it is not
used anywhere outside memblock and its only usage can be replaced by a
call to memblock_alloc_range_nid().

[rppt@linux.ibm.com: fix parameter order in memblock_phys_alloc_try_nid()]
  Link: http://lkml.kernel.org/r/20190203113915.GC8620@rapoport-lnx
Link: http://lkml.kernel.org/r/1548057848-15136-11-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Guo Ren <guoren@kernel.org>
Cc: Guo Ren <ren_guo@c-sky.com>				[c-sky]
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Juergen Gross <jgross@suse.com>			[Xen]
Cc: Mark Salter <msalter@redhat.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |   1 -
 mm/memblock.c            | 171 +++++++++++++++++++++--------------------------
 2 files changed, 77 insertions(+), 95 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 0c8375120322..c1315c331a8e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -327,7 +327,6 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
 
 phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
 				      phys_addr_t start, phys_addr_t end);
-phys_addr_t memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
 phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
 
 static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
diff --git a/mm/memblock.c b/mm/memblock.c
index 42fe65447d8b..31e89dac9a23 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1255,30 +1255,84 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 }
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
+/**
+ * memblock_alloc_range_nid - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @start: the lower bound of the memory region to allocate (phys address)
+ * @end: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * The allocation is performed from memory region limited by
+ * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE.
+ *
+ * If the specified node can not hold the requested memory the
+ * allocation falls back to any node in the system
+ *
+ * For systems with memory mirroring, the allocation is attempted first
+ * from the regions with mirroring enabled and then retried from any
+ * memory region.
+ *
+ * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
+ * allocated boot memory block, so that it is never reported as leaks.
+ *
+ * Return:
+ * Physical address of allocated memory block on success, %0 on failure.
+ */
 static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
-					phys_addr_t end, int nid,
-					enum memblock_flags flags)
+					phys_addr_t end, int nid)
 {
+	enum memblock_flags flags = choose_memblock_flags();
 	phys_addr_t found;
 
+	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+		nid = NUMA_NO_NODE;
+
 	if (!align) {
 		/* Can't use WARNs this early in boot on powerpc */
 		dump_stack();
 		align = SMP_CACHE_BYTES;
 	}
 
+	if (end > memblock.current_limit)
+		end = memblock.current_limit;
+
+again:
 	found = memblock_find_in_range_node(size, align, start, end, nid,
 					    flags);
-	if (found && !memblock_reserve(found, size)) {
+	if (found && !memblock_reserve(found, size))
+		goto done;
+
+	if (nid != NUMA_NO_NODE) {
+		found = memblock_find_in_range_node(size, align, start,
+						    end, NUMA_NO_NODE,
+						    flags);
+		if (found && !memblock_reserve(found, size))
+			goto done;
+	}
+
+	if (flags & MEMBLOCK_MIRROR) {
+		flags &= ~MEMBLOCK_MIRROR;
+		pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+			&size);
+		goto again;
+	}
+
+	return 0;
+
+done:
+	/* Skip kmemleak for kasan_init() due to high volume. */
+	if (end != MEMBLOCK_ALLOC_KASAN)
 		/*
-		 * The min_count is set to 0 so that memblock allocations are
-		 * never reported as leaks.
+		 * The min_count is set to 0 so that memblock allocated
+		 * blocks are never reported as leaks. This is because many
+		 * of these blocks are only referred via the physical
+		 * address which is not looked up by kmemleak.
 		 */
 		kmemleak_alloc_phys(found, size, 0, 0);
-		return found;
-	}
-	return 0;
+
+	return found;
 }
 
 phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
@@ -1286,35 +1340,13 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
 					     phys_addr_t start,
 					     phys_addr_t end)
 {
-	return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
-					MEMBLOCK_NONE);
-}
-
-phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
-{
-	enum memblock_flags flags = choose_memblock_flags();
-	phys_addr_t ret;
-
-again:
-	ret = memblock_alloc_range_nid(size, align, 0,
-				       MEMBLOCK_ALLOC_ACCESSIBLE, nid, flags);
-
-	if (!ret && (flags & MEMBLOCK_MIRROR)) {
-		flags &= ~MEMBLOCK_MIRROR;
-		goto again;
-	}
-	return ret;
+	return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
 }
 
 phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
-	phys_addr_t res = memblock_phys_alloc_nid(size, align, nid);
-
-	if (res)
-		return res;
 	return memblock_alloc_range_nid(size, align, 0,
-					MEMBLOCK_ALLOC_ACCESSIBLE,
-					NUMA_NO_NODE, MEMBLOCK_NONE);
+					MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
 /**
@@ -1325,19 +1357,13 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
  * @max_addr: the upper bound of the memory region to allocate (phys address)
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
  *
- * The @min_addr limit is dropped if it can not be satisfied and the allocation
- * will fall back to memory below @min_addr. Also, allocation may fall back
- * to any node in the system if the specified node can not
- * hold the requested memory.
- *
- * The allocation is performed from memory region limited by
- * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE.
- *
- * The phys address of allocated boot memory block is converted to virtual and
- * allocated memory is reset to 0.
+ * Allocates memory block using memblock_alloc_range_nid() and
+ * converts the returned physical address to virtual.
  *
- * In addition, function sets the min_count to 0 using kmemleak_alloc for
- * allocated boot memory block, so that it is never reported as leaks.
+ * The @min_addr limit is dropped if it can not be satisfied and the allocation
+ * will fall back to memory below @min_addr. Other constraints, such
+ * as node and mirrored memory will be handled again in
+ * memblock_alloc_range_nid().
  *
  * Return:
  * Virtual address of allocated memory block on success, NULL on failure.
@@ -1348,11 +1374,6 @@ static void * __init memblock_alloc_internal(
 				int nid)
 {
 	phys_addr_t alloc;
-	void *ptr;
-	enum memblock_flags flags = choose_memblock_flags();
-
-	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
-		nid = NUMA_NO_NODE;
 
 	/*
 	 * Detect any accidental use of these APIs after slab is ready, as at
@@ -1362,54 +1383,16 @@ static void * __init memblock_alloc_internal(
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, nid);
 
-	if (!align) {
-		dump_stack();
-		align = SMP_CACHE_BYTES;
-	}
-
-	if (max_addr > memblock.current_limit)
-		max_addr = memblock.current_limit;
-again:
-	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
-					    nid, flags);
-	if (alloc && !memblock_reserve(alloc, size))
-		goto done;
-
-	if (nid != NUMA_NO_NODE) {
-		alloc = memblock_find_in_range_node(size, align, min_addr,
-						    max_addr, NUMA_NO_NODE,
-						    flags);
-		if (alloc && !memblock_reserve(alloc, size))
-			goto done;
-	}
-
-	if (min_addr) {
-		min_addr = 0;
-		goto again;
-	}
-
-	if (flags & MEMBLOCK_MIRROR) {
-		flags &= ~MEMBLOCK_MIRROR;
-		pr_warn("Could not allocate %pap bytes of mirrored memory\n",
-			&size);
-		goto again;
-	}
+	alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
 
-	return NULL;
-done:
-	ptr = phys_to_virt(alloc);
+	/* retry allocation without lower limit */
+	if (!alloc && min_addr)
+		alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid);
 
-	/* Skip kmemleak for kasan_init() due to high volume. */
-	if (max_addr != MEMBLOCK_ALLOC_KASAN)
-		/*
-		 * The min_count is set to 0 so that bootmem allocated
-		 * blocks are never reported as leaks. This is because many
-		 * of these blocks are only referred via the physical
-		 * address which is not looked up by kmemleak.
-		 */
-		kmemleak_alloc(ptr, size, 0, 0);
+	if (!alloc)
+		return NULL;
 
-	return ptr;
+	return phys_to_virt(alloc);
 }
 
 /**
-- 
cgit v1.2.3


From c366ea89fa40f244d1210e74485fce110835b71b Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:29:46 -0700
Subject: memblock: make memblock_find_in_range_node() and
 choose_memblock_flags() static

These functions are not used outside memblock.  Make them static.

Link: http://lkml.kernel.org/r/1548057848-15136-12-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Guo Ren <guoren@kernel.org>
Cc: Guo Ren <ren_guo@c-sky.com>				[c-sky]
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Juergen Gross <jgross@suse.com>			[Xen]
Cc: Mark Salter <msalter@redhat.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 4 ----
 mm/memblock.c            | 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index c1315c331a8e..c077227e6d53 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -108,9 +108,6 @@ void memblock_discard(void);
 #define memblock_dbg(fmt, ...) \
 	if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 
-phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
-					phys_addr_t start, phys_addr_t end,
-					int nid, enum memblock_flags flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
 void memblock_allow_resize(void);
@@ -127,7 +124,6 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
 int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
-enum memblock_flags choose_memblock_flags(void);
 
 unsigned long memblock_free_all(void);
 void reset_node_managed_pages(pg_data_t *pgdat);
diff --git a/mm/memblock.c b/mm/memblock.c
index 31e89dac9a23..618f94a1eedb 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -132,7 +132,7 @@ static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
 static int memblock_reserved_in_slab __initdata_memblock = 0;
 
-enum memblock_flags __init_memblock choose_memblock_flags(void)
+static enum memblock_flags __init_memblock choose_memblock_flags(void)
 {
 	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
 }
@@ -261,7 +261,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  * Return:
  * Found address on success, 0 on failure.
  */
-phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
 					phys_addr_t end, int nid,
 					enum memblock_flags flags)
-- 
cgit v1.2.3


From 26fb3dae0a1ec78bdde4b5b72e0e709503e8c596 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:30:42 -0700
Subject: memblock: drop memblock_alloc_*_nopanic() variants

As all the memblock allocation functions return NULL in case of error
rather than panic(), the duplicates with _nopanic suffix can be removed.

Link: http://lkml.kernel.org/r/1548057848-15136-22-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>		[printk]
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Guo Ren <guoren@kernel.org>
Cc: Guo Ren <ren_guo@c-sky.com>				[c-sky]
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Juergen Gross <jgross@suse.com>			[Xen]
Cc: Mark Salter <msalter@redhat.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arc/kernel/unwind.c       |  3 +--
 arch/sh/mm/init.c              |  2 +-
 arch/x86/kernel/setup_percpu.c | 10 +++++-----
 arch/x86/mm/kasan_init_64.c    | 14 ++++++++------
 drivers/firmware/memmap.c      |  2 +-
 drivers/usb/early/xhci-dbc.c   |  2 +-
 include/linux/memblock.h       | 35 -----------------------------------
 kernel/dma/swiotlb.c           |  2 +-
 kernel/printk/printk.c         |  9 +--------
 mm/memblock.c                  | 35 -----------------------------------
 mm/page_alloc.c                | 10 +++++-----
 mm/page_ext.c                  |  2 +-
 mm/percpu.c                    | 11 ++++-------
 mm/sparse.c                    |  6 ++----
 14 files changed, 31 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c
index d34f69eb1a95..271e9fafa479 100644
--- a/arch/arc/kernel/unwind.c
+++ b/arch/arc/kernel/unwind.c
@@ -181,8 +181,7 @@ static void init_unwind_hdr(struct unwind_table *table,
  */
 static void *__init unw_hdr_alloc_early(unsigned long sz)
 {
-	return memblock_alloc_from_nopanic(sz, sizeof(unsigned int),
-					   MAX_DMA_ADDRESS);
+	return memblock_alloc_from(sz, sizeof(unsigned int), MAX_DMA_ADDRESS);
 }
 
 static void *unw_hdr_alloc(unsigned long sz)
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index fceefd92016f..70621324db41 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -202,7 +202,7 @@ void __init allocate_pgdat(unsigned int nid)
 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-	NODE_DATA(nid) = memblock_alloc_try_nid_nopanic(
+	NODE_DATA(nid) = memblock_alloc_try_nid(
 				sizeof(struct pglist_data),
 				SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
 				MEMBLOCK_ALLOC_ACCESSIBLE, nid);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 13af08827eef..4bf46575568a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -106,22 +106,22 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 	void *ptr;
 
 	if (!node_online(node) || !NODE_DATA(node)) {
-		ptr = memblock_alloc_from_nopanic(size, align, goal);
+		ptr = memblock_alloc_from(size, align, goal);
 		pr_info("cpu %d has no node %d or node-local memory\n",
 			cpu, node);
 		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
 			 cpu, size, __pa(ptr));
 	} else {
-		ptr = memblock_alloc_try_nid_nopanic(size, align, goal,
-						     MEMBLOCK_ALLOC_ACCESSIBLE,
-						     node);
+		ptr = memblock_alloc_try_nid(size, align, goal,
+					     MEMBLOCK_ALLOC_ACCESSIBLE,
+					     node);
 
 		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
 			 cpu, size, node, __pa(ptr));
 	}
 	return ptr;
 #else
-	return memblock_alloc_from_nopanic(size, align, goal);
+	return memblock_alloc_from(size, align, goal);
 #endif
 }
 
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 462fde83b515..8dc0fc0b1382 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -24,14 +24,16 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
 static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
 
-static __init void *early_alloc(size_t size, int nid, bool panic)
+static __init void *early_alloc(size_t size, int nid, bool should_panic)
 {
-	if (panic)
-		return memblock_alloc_try_nid(size, size,
-			__pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
-	else
-		return memblock_alloc_try_nid_nopanic(size, size,
+	void *ptr = memblock_alloc_try_nid(size, size,
 			__pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+
+	if (!ptr && should_panic)
+		panic("%pS: Failed to allocate page, nid=%d from=%lx\n",
+		      (void *)_RET_IP_, nid, __pa(MAX_DMA_ADDRESS));
+
+	return ptr;
 }
 
 static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index ec4fd253a4e9..d168c87c7d30 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -333,7 +333,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
 {
 	struct firmware_map_entry *entry;
 
-	entry = memblock_alloc_nopanic(sizeof(struct firmware_map_entry),
+	entry = memblock_alloc(sizeof(struct firmware_map_entry),
 			       SMP_CACHE_BYTES);
 	if (WARN_ON(!entry))
 		return -ENOMEM;
diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c
index d2652dccc699..c9cfb100ecdc 100644
--- a/drivers/usb/early/xhci-dbc.c
+++ b/drivers/usb/early/xhci-dbc.c
@@ -94,7 +94,7 @@ static void * __init xdbc_get_page(dma_addr_t *dma_addr)
 {
 	void *virt;
 
-	virt = memblock_alloc_nopanic(PAGE_SIZE, PAGE_SIZE);
+	virt = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
 	if (!virt)
 		return NULL;
 
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index c077227e6d53..db69ad97aa2e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -335,9 +335,6 @@ static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
 void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
 				 phys_addr_t min_addr, phys_addr_t max_addr,
 				 int nid);
-void *memblock_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align,
-				     phys_addr_t min_addr, phys_addr_t max_addr,
-				     int nid);
 void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
 			     phys_addr_t min_addr, phys_addr_t max_addr,
 			     int nid);
@@ -364,36 +361,12 @@ static inline void * __init memblock_alloc_from(phys_addr_t size,
 				      MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
 }
 
-static inline void * __init memblock_alloc_nopanic(phys_addr_t size,
-						   phys_addr_t align)
-{
-	return memblock_alloc_try_nid_nopanic(size, align, MEMBLOCK_LOW_LIMIT,
-					      MEMBLOCK_ALLOC_ACCESSIBLE,
-					      NUMA_NO_NODE);
-}
-
 static inline void * __init memblock_alloc_low(phys_addr_t size,
 					       phys_addr_t align)
 {
 	return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
 				      ARCH_LOW_ADDRESS_LIMIT, NUMA_NO_NODE);
 }
-static inline void * __init memblock_alloc_low_nopanic(phys_addr_t size,
-						       phys_addr_t align)
-{
-	return memblock_alloc_try_nid_nopanic(size, align, MEMBLOCK_LOW_LIMIT,
-					      ARCH_LOW_ADDRESS_LIMIT,
-					      NUMA_NO_NODE);
-}
-
-static inline void * __init memblock_alloc_from_nopanic(phys_addr_t size,
-							phys_addr_t align,
-							phys_addr_t min_addr)
-{
-	return memblock_alloc_try_nid_nopanic(size, align, min_addr,
-					      MEMBLOCK_ALLOC_ACCESSIBLE,
-					      NUMA_NO_NODE);
-}
 
 static inline void * __init memblock_alloc_node(phys_addr_t size,
 						phys_addr_t align, int nid)
@@ -402,14 +375,6 @@ static inline void * __init memblock_alloc_node(phys_addr_t size,
 				      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
-static inline void * __init memblock_alloc_node_nopanic(phys_addr_t size,
-							int nid)
-{
-	return memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES,
-					      MEMBLOCK_LOW_LIMIT,
-					      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
-}
-
 static inline void __init memblock_free_early(phys_addr_t base,
 					      phys_addr_t size)
 {
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 56ac77a80b1f..53012db1e53c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -256,7 +256,7 @@ swiotlb_init(int verbose)
 	bytes = io_tlb_nslabs << IO_TLB_SHIFT;
 
 	/* Get IO TLB memory from the low pages */
-	vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
+	vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
 	if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
 		return;
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8eee85bb2687..6b7654b8001f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1143,14 +1143,7 @@ void __init setup_log_buf(int early)
 	if (!new_log_buf_len)
 		return;
 
-	if (early) {
-		new_log_buf =
-			memblock_alloc(new_log_buf_len, LOG_ALIGN);
-	} else {
-		new_log_buf = memblock_alloc_nopanic(new_log_buf_len,
-							  LOG_ALIGN);
-	}
-
+	new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
 	if (unlikely(!new_log_buf)) {
 		pr_err("log_buf_len: %lu bytes not available\n",
 			new_log_buf_len);
diff --git a/mm/memblock.c b/mm/memblock.c
index a838c50ca9a8..0ab30d0185bc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1433,41 +1433,6 @@ void * __init memblock_alloc_try_nid_raw(
 	return ptr;
 }
 
-/**
- * memblock_alloc_try_nid_nopanic - allocate boot memory block
- * @size: size of memory block to be allocated in bytes
- * @align: alignment of the region and block's size
- * @min_addr: the lower bound of the memory region from where the allocation
- *	  is preferred (phys address)
- * @max_addr: the upper bound of the memory region from where the allocation
- *	      is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
- *	      allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * Public function, provides additional debug information (including caller
- * info), if enabled. This function zeroes the allocated memory.
- *
- * Return:
- * Virtual address of allocated memory block on success, NULL on failure.
- */
-void * __init memblock_alloc_try_nid_nopanic(
-				phys_addr_t size, phys_addr_t align,
-				phys_addr_t min_addr, phys_addr_t max_addr,
-				int nid)
-{
-	void *ptr;
-
-	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
-		     __func__, (u64)size, (u64)align, nid, &min_addr,
-		     &max_addr, (void *)_RET_IP_);
-
-	ptr = memblock_alloc_internal(size, align,
-					   min_addr, max_addr, nid);
-	if (ptr)
-		memset(ptr, 0, size);
-	return ptr;
-}
-
 /**
  * memblock_alloc_try_nid - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3eb01dedfb50..03fcf73d47da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6445,8 +6445,8 @@ static void __ref setup_usemap(struct pglist_data *pgdat,
 	zone->pageblock_flags = NULL;
 	if (usemapsize) {
 		zone->pageblock_flags =
-			memblock_alloc_node_nopanic(usemapsize,
-							 pgdat->node_id);
+			memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+					    pgdat->node_id);
 		if (!zone->pageblock_flags)
 			panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
 			      usemapsize, zone->name, pgdat->node_id);
@@ -6679,7 +6679,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 		end = pgdat_end_pfn(pgdat);
 		end = ALIGN(end, MAX_ORDER_NR_PAGES);
 		size =  (end - start) * sizeof(struct page);
-		map = memblock_alloc_node_nopanic(size, pgdat->node_id);
+		map = memblock_alloc_node(size, SMP_CACHE_BYTES,
+					  pgdat->node_id);
 		if (!map)
 			panic("Failed to allocate %ld bytes for node %d memory map\n",
 			      size, pgdat->node_id);
@@ -7959,8 +7960,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 		size = bucketsize << log2qty;
 		if (flags & HASH_EARLY) {
 			if (flags & HASH_ZERO)
-				table = memblock_alloc_nopanic(size,
-							       SMP_CACHE_BYTES);
+				table = memblock_alloc(size, SMP_CACHE_BYTES);
 			else
 				table = memblock_alloc_raw(size,
 							   SMP_CACHE_BYTES);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index ab4244920e0f..d8f1aca4ad43 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -161,7 +161,7 @@ static int __init alloc_node_page_ext(int nid)
 
 	table_size = get_entry_size() * nr_pages;
 
-	base = memblock_alloc_try_nid_nopanic(
+	base = memblock_alloc_try_nid(
 			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
 			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 	if (!base)
diff --git a/mm/percpu.c b/mm/percpu.c
index 3f9fb3086a9b..2e6fc8d552c9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1905,7 +1905,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
 			  __alignof__(ai->groups[0].cpu_map[0]));
 	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
 
-	ptr = memblock_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE);
+	ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
 	if (!ptr)
 		return NULL;
 	ai = ptr;
@@ -2496,7 +2496,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
 	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
 
-	areas = memblock_alloc_nopanic(areas_size, SMP_CACHE_BYTES);
+	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
 	if (!areas) {
 		rc = -ENOMEM;
 		goto out_free;
@@ -2729,8 +2729,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
 static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
 				       size_t align)
 {
-	return  memblock_alloc_from_nopanic(
-			size, align, __pa(MAX_DMA_ADDRESS));
+	return  memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
 }
 
 static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
@@ -2778,9 +2777,7 @@ void __init setup_per_cpu_areas(void)
 	void *fc;
 
 	ai = pcpu_alloc_alloc_info(1, 1);
-	fc = memblock_alloc_from_nopanic(unit_size,
-					      PAGE_SIZE,
-					      __pa(MAX_DMA_ADDRESS));
+	fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 	if (!ai || !fc)
 		panic("Failed to allocate memory for percpu areas.");
 	/* kmemleak tracks the percpu allocations separately */
diff --git a/mm/sparse.c b/mm/sparse.c
index 7397fb4e78b4..69904aa6165b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -330,9 +330,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	limit = goal + (1UL << PA_SECTION_SHIFT);
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
-	p = memblock_alloc_try_nid_nopanic(size,
-						SMP_CACHE_BYTES, goal, limit,
-						nid);
+	p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
 	if (!p && limit) {
 		limit = 0;
 		goto again;
@@ -386,7 +384,7 @@ static unsigned long * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
-	return memblock_alloc_node_nopanic(size, pgdat->node_id);
+	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
 }
 
 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
-- 
cgit v1.2.3


From fe145124dbe53c86bf32b941b2f2f88f891d985d Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 11 Mar 2019 23:30:46 -0700
Subject: memblock: remove memblock_{set,clear}_region_flags

The memblock API provides dedicated helpers to set or clear a flag on a
memory region, e.g.  memblock_{mark,clear}_hotplug().

The memblock_{set,clear}_region_flags() functions are used only by the
memblock internal function that adjusts the region flags.  Drop these
functions and use open-coded implementation instead.

Link: http://lkml.kernel.org/r/1549455025-17706-2-git-send-email-rppt@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 12 ------------
 mm/memblock.c            |  9 ++++++---
 2 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index db69ad97aa2e..294d5d80e150 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -273,18 +273,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 	for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved,	\
 			       nid, flags, p_start, p_end, p_nid)
 
-static inline void memblock_set_region_flags(struct memblock_region *r,
-					     enum memblock_flags flags)
-{
-	r->flags |= flags;
-}
-
-static inline void memblock_clear_region_flags(struct memblock_region *r,
-					       enum memblock_flags flags)
-{
-	r->flags &= ~flags;
-}
-
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 int memblock_set_node(phys_addr_t base, phys_addr_t size,
 		      struct memblock_type *type, int nid);
diff --git a/mm/memblock.c b/mm/memblock.c
index 0ab30d0185bc..068e147695ee 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -858,11 +858,14 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
 	if (ret)
 		return ret;
 
-	for (i = start_rgn; i < end_rgn; i++)
+	for (i = start_rgn; i < end_rgn; i++) {
+		struct memblock_region *r = &type->regions[i];
+
 		if (set)
-			memblock_set_region_flags(&type->regions[i], flag);
+			r->flags |= flag;
 		else
-			memblock_clear_region_flags(&type->regions[i], flag);
+			r->flags &= ~flag;
+	}
 
 	memblock_merge_regions(type);
 	return 0;
-- 
cgit v1.2.3


From ba20ba2e3743bac786dff777954c11930256075e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 11 Mar 2019 23:31:14 -0700
Subject: generic radix trees

Very simple radix tree implementation that supports storing arbitrary
size entries, up to PAGE_SIZE - upcoming patches will convert existing
flex_array users to genradixes.  The new genradix code has a much
simpler API and implementation, and doesn't have a hard limit on the
number of elements like flex_array does.

Link: http://lkml.kernel.org/r/20181217131929.11727-5-kent.overstreet@gmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Eric Paris <eparis@parisplace.org>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Pravin B Shelar <pshelar@ovn.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/generic-radix-tree.rst |  12 ++
 Documentation/core-api/index.rst              |   1 +
 include/linux/generic-radix-tree.h            | 231 ++++++++++++++++++++++++++
 lib/Makefile                                  |   3 +-
 lib/generic-radix-tree.c                      | 217 ++++++++++++++++++++++++
 5 files changed, 463 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/core-api/generic-radix-tree.rst
 create mode 100644 include/linux/generic-radix-tree.h
 create mode 100644 lib/generic-radix-tree.c

(limited to 'include')

diff --git a/Documentation/core-api/generic-radix-tree.rst b/Documentation/core-api/generic-radix-tree.rst
new file mode 100644
index 000000000000..ed42839ae42f
--- /dev/null
+++ b/Documentation/core-api/generic-radix-tree.rst
@@ -0,0 +1,12 @@
+=================================
+Generic radix trees/sparse arrays
+=================================
+
+.. kernel-doc:: include/linux/generic-radix-tree.h
+   :doc: Generic radix trees/sparse arrays
+
+generic radix tree functions
+----------------------------
+
+.. kernel-doc:: include/linux/generic-radix-tree.h
+   :functions:
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 3adee82be311..6870baffef82 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -28,6 +28,7 @@ Core utilities
    errseq
    printk-formats
    circular-buffers
+   generic-radix-tree
    memory-allocation
    mm-api
    gfp_mask-from-fs-io
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
new file mode 100644
index 000000000000..3a91130a4fbd
--- /dev/null
+++ b/include/linux/generic-radix-tree.h
@@ -0,0 +1,231 @@
+#ifndef _LINUX_GENERIC_RADIX_TREE_H
+#define _LINUX_GENERIC_RADIX_TREE_H
+
+/**
+ * DOC: Generic radix trees/sparse arrays:
+ *
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * PAGE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ *
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ *
+ * - genradix_init(radix) - initialize an empty genradix
+ *
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ *   reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ *   NULL if that entry does not exist
+ *
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ *   allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one page of entries at a time, so entries may exist
+ * that were never explicitly allocated - they will be initialized to all
+ * zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
+ */
+
+#include <asm/page.h>
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+
+struct genradix_root;
+
+struct __genradix {
+	struct genradix_root __rcu	*root;
+};
+
+/*
+ * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
+ */
+
+#define __GENRADIX_INITIALIZER					\
+	{							\
+		.tree = {					\
+			.root = NULL,				\
+		}						\
+	}
+
+/*
+ * We use a 0 size array to stash the type we're storing without taking any
+ * space at runtime - then the various accessor macros can use typeof() to get
+ * to it for casts/sizeof - we also force the alignment so that storing a type
+ * with a ridiculous alignment doesn't blow up the alignment or size of the
+ * genradix.
+ */
+
+#define GENRADIX(_type)						\
+struct {							\
+	struct __genradix	tree;				\
+	_type			type[0] __aligned(1);		\
+}
+
+#define DEFINE_GENRADIX(_name, _type)				\
+	GENRADIX(_type) _name = __GENRADIX_INITIALIZER
+
+/**
+ * genradix_init - initialize a genradix
+ * @_radix:	genradix to initialize
+ *
+ * Does not fail
+ */
+#define genradix_init(_radix)					\
+do {								\
+	*(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER;	\
+} while (0)
+
+void __genradix_free(struct __genradix *);
+
+/**
+ * genradix_free: free all memory owned by a genradix
+ * @_radix: the genradix to free
+ *
+ * After freeing, @_radix will be reinitialized and empty
+ */
+#define genradix_free(_radix)	__genradix_free(&(_radix)->tree)
+
+static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
+{
+	if (__builtin_constant_p(obj_size))
+		BUILD_BUG_ON(obj_size > PAGE_SIZE);
+	else
+		BUG_ON(obj_size > PAGE_SIZE);
+
+	if (!is_power_of_2(obj_size)) {
+		size_t objs_per_page = PAGE_SIZE / obj_size;
+
+		return (idx / objs_per_page) * PAGE_SIZE +
+			(idx % objs_per_page) * obj_size;
+	} else {
+		return idx * obj_size;
+	}
+}
+
+#define __genradix_cast(_radix)		(typeof((_radix)->type[0]) *)
+#define __genradix_obj_size(_radix)	sizeof((_radix)->type[0])
+#define __genradix_idx_to_offset(_radix, _idx)			\
+	__idx_to_offset(_idx, __genradix_obj_size(_radix))
+
+void *__genradix_ptr(struct __genradix *, size_t);
+
+/**
+ * genradix_ptr - get a pointer to a genradix entry
+ * @_radix:	genradix to access
+ * @_idx:	index to fetch
+ *
+ * Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
+ */
+#define genradix_ptr(_radix, _idx)				\
+	(__genradix_cast(_radix)				\
+	 __genradix_ptr(&(_radix)->tree,			\
+			__genradix_idx_to_offset(_radix, _idx)))
+
+void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
+
+/**
+ * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
+ *			if necessary
+ * @_radix:	genradix to access
+ * @_idx:	index to fetch
+ * @_gfp:	gfp mask
+ *
+ * Returns a pointer to entry at @_idx, or NULL on allocation failure
+ */
+#define genradix_ptr_alloc(_radix, _idx, _gfp)			\
+	(__genradix_cast(_radix)				\
+	 __genradix_ptr_alloc(&(_radix)->tree,			\
+			__genradix_idx_to_offset(_radix, _idx),	\
+			_gfp))
+
+struct genradix_iter {
+	size_t			offset;
+	size_t			pos;
+};
+
+/**
+ * genradix_iter_init - initialize a genradix_iter
+ * @_radix:	genradix that will be iterated over
+ * @_idx:	index to start iterating from
+ */
+#define genradix_iter_init(_radix, _idx)			\
+	((struct genradix_iter) {				\
+		.pos	= (_idx),				\
+		.offset	= __genradix_idx_to_offset((_radix), (_idx)),\
+	})
+
+void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
+
+/**
+ * genradix_iter_peek - get first entry at or above iterator's current
+ *			position
+ * @_iter:	a genradix_iter
+ * @_radix:	genradix being iterated over
+ *
+ * If no more entries exist at or above @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek(_iter, _radix)			\
+	(__genradix_cast(_radix)				\
+	 __genradix_iter_peek(_iter, &(_radix)->tree,		\
+			      PAGE_SIZE / __genradix_obj_size(_radix)))
+
+static inline void __genradix_iter_advance(struct genradix_iter *iter,
+					   size_t obj_size)
+{
+	iter->offset += obj_size;
+
+	if (!is_power_of_2(obj_size) &&
+	    (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE)
+		iter->offset = round_up(iter->offset, PAGE_SIZE);
+
+	iter->pos++;
+}
+
+#define genradix_iter_advance(_iter, _radix)			\
+	__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
+
+#define genradix_for_each_from(_radix, _iter, _p, _start)	\
+	for (_iter = genradix_iter_init(_radix, _start);	\
+	     (_p = genradix_iter_peek(&_iter, _radix)) != NULL;	\
+	     genradix_iter_advance(&_iter, _radix))
+
+/**
+ * genradix_for_each - iterate over entry in a genradix
+ * @_radix:	genradix to iterate over
+ * @_iter:	a genradix_iter to track current position
+ * @_p:		pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each(_radix, _iter, _p)			\
+	genradix_for_each_from(_radix, _iter, _p, 0)
+
+int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
+
+/**
+ * genradix_prealloc - preallocate entries in a generic radix tree
+ * @_radix:	genradix to preallocate
+ * @_nr:	number of entries to preallocate
+ * @_gfp:	gfp mask
+ *
+ * Returns 0 on success, -ENOMEM on failure
+ */
+#define genradix_prealloc(_radix, _nr, _gfp)			\
+	 __genradix_prealloc(&(_radix)->tree,			\
+			__genradix_idx_to_offset(_radix, _nr + 1),\
+			_gfp)
+
+
+#endif /* _LINUX_GENERIC_RADIX_TREE_H */
diff --git a/lib/Makefile b/lib/Makefile
index 647517940b29..b798b41d01ae 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -38,7 +38,8 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o rhashtable.o reciprocal_div.o \
-	 once.o refcount.o usercopy.o errseq.o bucket_locks.o
+	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
+	 generic-radix-tree.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
new file mode 100644
index 000000000000..a7bafc413730
--- /dev/null
+++ b/lib/generic-radix-tree.c
@@ -0,0 +1,217 @@
+
+#include <linux/export.h>
+#include <linux/generic-radix-tree.h>
+#include <linux/gfp.h>
+
+#define GENRADIX_ARY		(PAGE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY_SHIFT	ilog2(GENRADIX_ARY)
+
+struct genradix_node {
+	union {
+		/* Interior node: */
+		struct genradix_node	*children[GENRADIX_ARY];
+
+		/* Leaf: */
+		u8			data[PAGE_SIZE];
+	};
+};
+
+static inline int genradix_depth_shift(unsigned depth)
+{
+	return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+}
+
+/*
+ * Returns size (of data, in bytes) that a tree of a given depth holds:
+ */
+static inline size_t genradix_depth_size(unsigned depth)
+{
+	return 1UL << genradix_depth_shift(depth);
+}
+
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH	\
+	DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK				\
+	((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+	return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+	return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
+/*
+ * Returns pointer to the specified byte @offset within @radix, or NULL if not
+ * allocated
+ */
+void *__genradix_ptr(struct __genradix *radix, size_t offset)
+{
+	struct genradix_root *r = READ_ONCE(radix->root);
+	struct genradix_node *n = genradix_root_to_node(r);
+	unsigned level		= genradix_root_to_depth(r);
+
+	if (ilog2(offset) >= genradix_depth_shift(level))
+		return NULL;
+
+	while (1) {
+		if (!n)
+			return NULL;
+		if (!level)
+			break;
+
+		level--;
+
+		n = n->children[offset >> genradix_depth_shift(level)];
+		offset &= genradix_depth_size(level) - 1;
+	}
+
+	return &n->data[offset];
+}
+EXPORT_SYMBOL(__genradix_ptr);
+
+/*
+ * Returns pointer to the specified byte @offset within @radix, allocating it if
+ * necessary - newly allocated slots are always zeroed out:
+ */
+void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
+			   gfp_t gfp_mask)
+{
+	struct genradix_root *v = READ_ONCE(radix->root);
+	struct genradix_node *n, *new_node = NULL;
+	unsigned level;
+
+	/* Increase tree depth if necessary: */
+	while (1) {
+		struct genradix_root *r = v, *new_root;
+
+		n	= genradix_root_to_node(r);
+		level	= genradix_root_to_depth(r);
+
+		if (n && ilog2(offset) < genradix_depth_shift(level))
+			break;
+
+		if (!new_node) {
+			new_node = (void *)
+				__get_free_page(gfp_mask|__GFP_ZERO);
+			if (!new_node)
+				return NULL;
+		}
+
+		new_node->children[0] = n;
+		new_root = ((struct genradix_root *)
+			    ((unsigned long) new_node | (n ? level + 1 : 0)));
+
+		if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
+			v = new_root;
+			new_node = NULL;
+		}
+	}
+
+	while (level--) {
+		struct genradix_node **p =
+			&n->children[offset >> genradix_depth_shift(level)];
+		offset &= genradix_depth_size(level) - 1;
+
+		n = READ_ONCE(*p);
+		if (!n) {
+			if (!new_node) {
+				new_node = (void *)
+					__get_free_page(gfp_mask|__GFP_ZERO);
+				if (!new_node)
+					return NULL;
+			}
+
+			if (!(n = cmpxchg_release(p, NULL, new_node)))
+				swap(n, new_node);
+		}
+	}
+
+	if (new_node)
+		free_page((unsigned long) new_node);
+
+	return &n->data[offset];
+}
+EXPORT_SYMBOL(__genradix_ptr_alloc);
+
+void *__genradix_iter_peek(struct genradix_iter *iter,
+			   struct __genradix *radix,
+			   size_t objs_per_page)
+{
+	struct genradix_root *r;
+	struct genradix_node *n;
+	unsigned level, i;
+restart:
+	r = READ_ONCE(radix->root);
+	if (!r)
+		return NULL;
+
+	n	= genradix_root_to_node(r);
+	level	= genradix_root_to_depth(r);
+
+	if (ilog2(iter->offset) >= genradix_depth_shift(level))
+		return NULL;
+
+	while (level) {
+		level--;
+
+		i = (iter->offset >> genradix_depth_shift(level)) &
+			(GENRADIX_ARY - 1);
+
+		while (!n->children[i]) {
+			i++;
+			iter->offset = round_down(iter->offset +
+					   genradix_depth_size(level),
+					   genradix_depth_size(level));
+			iter->pos = (iter->offset >> PAGE_SHIFT) *
+				objs_per_page;
+			if (i == GENRADIX_ARY)
+				goto restart;
+		}
+
+		n = n->children[i];
+	}
+
+	return &n->data[iter->offset & (PAGE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek);
+
+static void genradix_free_recurse(struct genradix_node *n, unsigned level)
+{
+	if (level) {
+		unsigned i;
+
+		for (i = 0; i < GENRADIX_ARY; i++)
+			if (n->children[i])
+				genradix_free_recurse(n->children[i], level - 1);
+	}
+
+	free_page((unsigned long) n);
+}
+
+int __genradix_prealloc(struct __genradix *radix, size_t size,
+			gfp_t gfp_mask)
+{
+	size_t offset;
+
+	for (offset = 0; offset < size; offset += PAGE_SIZE)
+		if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
+			return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(__genradix_prealloc);
+
+void __genradix_free(struct __genradix *radix)
+{
+	struct genradix_root *r = xchg(&radix->root, NULL);
+
+	genradix_free_recurse(genradix_root_to_node(r),
+			      genradix_root_to_depth(r));
+}
+EXPORT_SYMBOL(__genradix_free);
-- 
cgit v1.2.3


From 2075e50caf5ea28be3cba0d01b3058bb5c3b0168 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 11 Mar 2019 23:31:22 -0700
Subject: sctp: convert to genradix

This also makes sctp_stream_alloc_(out|in) saner, in that they no longer
allocate new flex_arrays/genradixes, they just preallocate more
elements.

This code does however have a suspicious lack of locking.

Link: http://lkml.kernel.org/r/20181217131929.11727-7-kent.overstreet@gmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Eric Paris <eparis@parisplace.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Pravin B Shelar <pshelar@ovn.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/sctp/structs.h   |  15 +++---
 net/sctp/stream.c            | 125 +++++++------------------------------------
 net/sctp/stream_interleave.c |   2 +-
 3 files changed, 28 insertions(+), 114 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 58e4b23cecf4..140fd836a396 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -48,6 +48,7 @@
 #define __sctp_structs_h__
 
 #include <linux/ktime.h>
+#include <linux/generic-radix-tree.h>
 #include <linux/rhashtable-types.h>
 #include <linux/socket.h>	/* linux/in.h needs this!!    */
 #include <linux/in.h>		/* We get struct sockaddr_in. */
@@ -57,7 +58,6 @@
 #include <linux/atomic.h>		/* This gets us atomic counters.  */
 #include <linux/skbuff.h>	/* We need sk_buff_head. */
 #include <linux/workqueue.h>	/* We need tq_struct.	 */
-#include <linux/flex_array.h>	/* We need flex_array.   */
 #include <linux/sctp.h>		/* We need sctp* header structs.  */
 #include <net/sctp/auth.h>	/* We need auth specific structs */
 #include <net/ip.h>		/* For inet_skb_parm */
@@ -1449,8 +1449,9 @@ struct sctp_stream_in {
 };
 
 struct sctp_stream {
-	struct flex_array *out;
-	struct flex_array *in;
+	GENRADIX(struct sctp_stream_out) out;
+	GENRADIX(struct sctp_stream_in)	in;
+
 	__u16 outcnt;
 	__u16 incnt;
 	/* Current stream being sent, if any */
@@ -1473,17 +1474,17 @@ struct sctp_stream {
 };
 
 static inline struct sctp_stream_out *sctp_stream_out(
-	const struct sctp_stream *stream,
+	struct sctp_stream *stream,
 	__u16 sid)
 {
-	return flex_array_get(stream->out, sid);
+	return genradix_ptr(&stream->out, sid);
 }
 
 static inline struct sctp_stream_in *sctp_stream_in(
-	const struct sctp_stream *stream,
+	struct sctp_stream *stream,
 	__u16 sid)
 {
-	return flex_array_get(stream->in, sid);
+	return genradix_ptr(&stream->in, sid);
 }
 
 #define SCTP_SO(s, i) sctp_stream_out((s), (i))
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 3b47457862cc..b6bb68adac6e 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -37,66 +37,6 @@
 #include <net/sctp/sm.h>
 #include <net/sctp/stream_sched.h>
 
-static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count,
-				   gfp_t gfp)
-{
-	struct flex_array *result;
-	int err;
-
-	result = flex_array_alloc(elem_size, elem_count, gfp);
-	if (result) {
-		err = flex_array_prealloc(result, 0, elem_count, gfp);
-		if (err) {
-			flex_array_free(result);
-			result = NULL;
-		}
-	}
-
-	return result;
-}
-
-static void fa_free(struct flex_array *fa)
-{
-	if (fa)
-		flex_array_free(fa);
-}
-
-static void fa_copy(struct flex_array *fa, struct flex_array *from,
-		    size_t index, size_t count)
-{
-	void *elem;
-
-	while (count--) {
-		elem = flex_array_get(from, index);
-		flex_array_put(fa, index, elem, 0);
-		index++;
-	}
-}
-
-static void fa_zero(struct flex_array *fa, size_t index, size_t count)
-{
-	void *elem;
-
-	while (count--) {
-		elem = flex_array_get(fa, index);
-		memset(elem, 0, fa->element_size);
-		index++;
-	}
-}
-
-static size_t fa_index(struct flex_array *fa, void *elem, size_t count)
-{
-	size_t index = 0;
-
-	while (count--) {
-		if (elem == flex_array_get(fa, index))
-			break;
-		index++;
-	}
-
-	return index;
-}
-
 /* Migrates chunks from stream queues to new stream queues if needed,
  * but not across associations. Also, removes those chunks to streams
  * higher than the new max.
@@ -153,53 +93,32 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream,
 static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
 				 gfp_t gfp)
 {
-	struct flex_array *out;
-	size_t elem_size = sizeof(struct sctp_stream_out);
-
-	out = fa_alloc(elem_size, outcnt, gfp);
-	if (!out)
-		return -ENOMEM;
-
-	if (stream->out) {
-		fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
-		if (stream->out_curr) {
-			size_t index = fa_index(stream->out, stream->out_curr,
-						stream->outcnt);
-
-			BUG_ON(index == stream->outcnt);
-			stream->out_curr = flex_array_get(out, index);
-		}
-		fa_free(stream->out);
-	}
+	int ret;
 
-	if (outcnt > stream->outcnt)
-		fa_zero(out, stream->outcnt, (outcnt - stream->outcnt));
+	if (outcnt <= stream->outcnt)
+		return 0;
 
-	stream->out = out;
+	ret = genradix_prealloc(&stream->out, outcnt, gfp);
+	if (ret)
+		return ret;
 
+	stream->outcnt = outcnt;
 	return 0;
 }
 
 static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
 				gfp_t gfp)
 {
-	struct flex_array *in;
-	size_t elem_size = sizeof(struct sctp_stream_in);
-
-	in = fa_alloc(elem_size, incnt, gfp);
-	if (!in)
-		return -ENOMEM;
-
-	if (stream->in) {
-		fa_copy(in, stream->in, 0, min(incnt, stream->incnt));
-		fa_free(stream->in);
-	}
+	int ret;
 
-	if (incnt > stream->incnt)
-		fa_zero(in, stream->incnt, (incnt - stream->incnt));
+	if (incnt <= stream->incnt)
+		return 0;
 
-	stream->in = in;
+	ret = genradix_prealloc(&stream->in, incnt, gfp);
+	if (ret)
+		return ret;
 
+	stream->incnt = incnt;
 	return 0;
 }
 
@@ -226,7 +145,6 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 	if (ret)
 		goto out;
 
-	stream->outcnt = outcnt;
 	for (i = 0; i < stream->outcnt; i++)
 		SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
 
@@ -238,14 +156,11 @@ in:
 	ret = sctp_stream_alloc_in(stream, incnt, gfp);
 	if (ret) {
 		sched->free(stream);
-		fa_free(stream->out);
-		stream->out = NULL;
+		genradix_free(&stream->out);
 		stream->outcnt = 0;
 		goto out;
 	}
 
-	stream->incnt = incnt;
-
 out:
 	return ret;
 }
@@ -270,8 +185,8 @@ void sctp_stream_free(struct sctp_stream *stream)
 	sched->free(stream);
 	for (i = 0; i < stream->outcnt; i++)
 		kfree(SCTP_SO(stream, i)->ext);
-	fa_free(stream->out);
-	fa_free(stream->in);
+	genradix_free(&stream->out);
+	genradix_free(&stream->in);
 }
 
 void sctp_stream_clear(struct sctp_stream *stream)
@@ -302,8 +217,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
 
 	sched->sched_all(stream);
 
-	new->out = NULL;
-	new->in  = NULL;
+	new->out.tree.root = NULL;
+	new->in.tree.root  = NULL;
 	new->outcnt = 0;
 	new->incnt  = 0;
 }
@@ -555,8 +470,6 @@ int sctp_send_add_streams(struct sctp_association *asoc,
 		goto out;
 	}
 
-	stream->outcnt = outcnt;
-
 	asoc->strreset_outstanding = !!out + !!in;
 
 out:
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index a6bf21579466..102c6fefe38c 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -101,7 +101,7 @@ static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
 
 static bool sctp_validate_data(struct sctp_chunk *chunk)
 {
-	const struct sctp_stream *stream;
+	struct sctp_stream *stream;
 	__u16 sid, ssn;
 
 	if (chunk->chunk_hdr->type != SCTP_CID_DATA)
-- 
cgit v1.2.3


From 586187d7de71b4da7956ba588ae42253b9ff6482 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 11 Mar 2019 23:31:26 -0700
Subject: Drop flex_arrays

All existing users have been converted to generic radix trees

Link: http://lkml.kernel.org/r/20181217131929.11727-8-kent.overstreet@gmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric Paris <eparis@parisplace.org>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Pravin B Shelar <pshelar@ovn.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/flexible-arrays.rst | 130 ----------
 Documentation/flexible-arrays.txt          | 123 ---------
 include/linux/flex_array.h                 | 149 -----------
 include/linux/poison.h                     |   3 -
 lib/Makefile                               |   2 +-
 lib/flex_array.c                           | 398 -----------------------------
 tools/include/linux/poison.h               |   3 -
 7 files changed, 1 insertion(+), 807 deletions(-)
 delete mode 100644 Documentation/core-api/flexible-arrays.rst
 delete mode 100644 Documentation/flexible-arrays.txt
 delete mode 100644 include/linux/flex_array.h
 delete mode 100644 lib/flex_array.c

(limited to 'include')

diff --git a/Documentation/core-api/flexible-arrays.rst b/Documentation/core-api/flexible-arrays.rst
deleted file mode 100644
index b6b85a1b518e..000000000000
--- a/Documentation/core-api/flexible-arrays.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-
-===================================
-Using flexible arrays in the kernel
-===================================
-
-Large contiguous memory allocations can be unreliable in the Linux kernel.
-Kernel programmers will sometimes respond to this problem by allocating
-pages with :c:func:`vmalloc()`.  This solution not ideal, though.  On 32-bit
-systems, memory from vmalloc() must be mapped into a relatively small address
-space; it's easy to run out.  On SMP systems, the page table changes required
-by vmalloc() allocations can require expensive cross-processor interrupts on
-all CPUs.  And, on all systems, use of space in the vmalloc() range increases
-pressure on the translation lookaside buffer (TLB), reducing the performance
-of the system.
-
-In many cases, the need for memory from vmalloc() can be eliminated by piecing
-together an array from smaller parts; the flexible array library exists to make
-this task easier.
-
-A flexible array holds an arbitrary (within limits) number of fixed-sized
-objects, accessed via an integer index.  Sparse arrays are handled
-reasonably well.  Only single-page allocations are made, so memory
-allocation failures should be relatively rare.  The down sides are that the
-arrays cannot be indexed directly, individual object size cannot exceed the
-system page size, and putting data into a flexible array requires a copy
-operation.  It's also worth noting that flexible arrays do no internal
-locking at all; if concurrent access to an array is possible, then the
-caller must arrange for appropriate mutual exclusion.
-
-The creation of a flexible array is done with :c:func:`flex_array_alloc()`::
-
-    #include <linux/flex_array.h>
-
-    struct flex_array *flex_array_alloc(int element_size,
-					unsigned int total,
-					gfp_t flags);
-
-The individual object size is provided by ``element_size``, while total is the
-maximum number of objects which can be stored in the array.  The flags
-argument is passed directly to the internal memory allocation calls.  With
-the current code, using flags to ask for high memory is likely to lead to
-notably unpleasant side effects.
-
-It is also possible to define flexible arrays at compile time with::
-
-    DEFINE_FLEX_ARRAY(name, element_size, total);
-
-This macro will result in a definition of an array with the given name; the
-element size and total will be checked for validity at compile time.
-
-Storing data into a flexible array is accomplished with a call to
-:c:func:`flex_array_put()`::
-
-    int flex_array_put(struct flex_array *array, unsigned int element_nr,
-    		       void *src, gfp_t flags);
-
-This call will copy the data from src into the array, in the position
-indicated by ``element_nr`` (which must be less than the maximum specified when
-the array was created).  If any memory allocations must be performed, flags
-will be used.  The return value is zero on success, a negative error code
-otherwise.
-
-There might possibly be a need to store data into a flexible array while
-running in some sort of atomic context; in this situation, sleeping in the
-memory allocator would be a bad thing.  That can be avoided by using
-``GFP_ATOMIC`` for the flags value, but, often, there is a better way.  The
-trick is to ensure that any needed memory allocations are done before
-entering atomic context, using :c:func:`flex_array_prealloc()`::
-
-    int flex_array_prealloc(struct flex_array *array, unsigned int start,
-			    unsigned int nr_elements, gfp_t flags);
-
-This function will ensure that memory for the elements indexed in the range
-defined by ``start`` and ``nr_elements`` has been allocated.  Thereafter, a
-``flex_array_put()`` call on an element in that range is guaranteed not to
-block.
-
-Getting data back out of the array is done with :c:func:`flex_array_get()`::
-
-    void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
-
-The return value is a pointer to the data element, or NULL if that
-particular element has never been allocated.
-
-Note that it is possible to get back a valid pointer for an element which
-has never been stored in the array.  Memory for array elements is allocated
-one page at a time; a single allocation could provide memory for several
-adjacent elements.  Flexible array elements are normally initialized to the
-value ``FLEX_ARRAY_FREE`` (defined as 0x6c in <linux/poison.h>), so errors
-involving that number probably result from use of unstored array entries.
-Note that, if array elements are allocated with ``__GFP_ZERO``, they will be
-initialized to zero and this poisoning will not happen.
-
-Individual elements in the array can be cleared with
-:c:func:`flex_array_clear()`::
-
-    int flex_array_clear(struct flex_array *array, unsigned int element_nr);
-
-This function will set the given element to ``FLEX_ARRAY_FREE`` and return
-zero.  If storage for the indicated element is not allocated for the array,
-``flex_array_clear()`` will return ``-EINVAL`` instead.  Note that clearing an
-element does not release the storage associated with it; to reduce the
-allocated size of an array, call :c:func:`flex_array_shrink()`::
-
-    int flex_array_shrink(struct flex_array *array);
-
-The return value will be the number of pages of memory actually freed.
-This function works by scanning the array for pages containing nothing but
-``FLEX_ARRAY_FREE`` bytes, so (1) it can be expensive, and (2) it will not work
-if the array's pages are allocated with ``__GFP_ZERO``.
-
-It is possible to remove all elements of an array with a call to
-:c:func:`flex_array_free_parts()`::
-
-    void flex_array_free_parts(struct flex_array *array);
-
-This call frees all elements, but leaves the array itself in place.
-Freeing the entire array is done with :c:func:`flex_array_free()`::
-
-    void flex_array_free(struct flex_array *array);
-
-As of this writing, there are no users of flexible arrays in the mainline
-kernel.  The functions described here are also not exported to modules;
-that will probably be fixed when somebody comes up with a need for it.
-
-
-Flexible array functions
-------------------------
-
-.. kernel-doc:: include/linux/flex_array.h
diff --git a/Documentation/flexible-arrays.txt b/Documentation/flexible-arrays.txt
deleted file mode 100644
index a0f2989dd804..000000000000
--- a/Documentation/flexible-arrays.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-===================================
-Using flexible arrays in the kernel
-===================================
-
-:Updated: Last updated for 2.6.32
-:Author: Jonathan Corbet <corbet@lwn.net>
-
-Large contiguous memory allocations can be unreliable in the Linux kernel.
-Kernel programmers will sometimes respond to this problem by allocating
-pages with vmalloc().  This solution not ideal, though.  On 32-bit systems,
-memory from vmalloc() must be mapped into a relatively small address space;
-it's easy to run out.  On SMP systems, the page table changes required by
-vmalloc() allocations can require expensive cross-processor interrupts on
-all CPUs.  And, on all systems, use of space in the vmalloc() range
-increases pressure on the translation lookaside buffer (TLB), reducing the
-performance of the system.
-
-In many cases, the need for memory from vmalloc() can be eliminated by
-piecing together an array from smaller parts; the flexible array library
-exists to make this task easier.
-
-A flexible array holds an arbitrary (within limits) number of fixed-sized
-objects, accessed via an integer index.  Sparse arrays are handled
-reasonably well.  Only single-page allocations are made, so memory
-allocation failures should be relatively rare.  The down sides are that the
-arrays cannot be indexed directly, individual object size cannot exceed the
-system page size, and putting data into a flexible array requires a copy
-operation.  It's also worth noting that flexible arrays do no internal
-locking at all; if concurrent access to an array is possible, then the
-caller must arrange for appropriate mutual exclusion.
-
-The creation of a flexible array is done with::
-
-    #include <linux/flex_array.h>
-
-    struct flex_array *flex_array_alloc(int element_size,
-					unsigned int total,
-					gfp_t flags);
-
-The individual object size is provided by element_size, while total is the
-maximum number of objects which can be stored in the array.  The flags
-argument is passed directly to the internal memory allocation calls.  With
-the current code, using flags to ask for high memory is likely to lead to
-notably unpleasant side effects.
-
-It is also possible to define flexible arrays at compile time with::
-
-    DEFINE_FLEX_ARRAY(name, element_size, total);
-
-This macro will result in a definition of an array with the given name; the
-element size and total will be checked for validity at compile time.
-
-Storing data into a flexible array is accomplished with a call to::
-
-    int flex_array_put(struct flex_array *array, unsigned int element_nr,
-    		       void *src, gfp_t flags);
-
-This call will copy the data from src into the array, in the position
-indicated by element_nr (which must be less than the maximum specified when
-the array was created).  If any memory allocations must be performed, flags
-will be used.  The return value is zero on success, a negative error code
-otherwise.
-
-There might possibly be a need to store data into a flexible array while
-running in some sort of atomic context; in this situation, sleeping in the
-memory allocator would be a bad thing.  That can be avoided by using
-GFP_ATOMIC for the flags value, but, often, there is a better way.  The
-trick is to ensure that any needed memory allocations are done before
-entering atomic context, using::
-
-    int flex_array_prealloc(struct flex_array *array, unsigned int start,
-			    unsigned int nr_elements, gfp_t flags);
-
-This function will ensure that memory for the elements indexed in the range
-defined by start and nr_elements has been allocated.  Thereafter, a
-flex_array_put() call on an element in that range is guaranteed not to
-block.
-
-Getting data back out of the array is done with::
-
-    void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
-
-The return value is a pointer to the data element, or NULL if that
-particular element has never been allocated.
-
-Note that it is possible to get back a valid pointer for an element which
-has never been stored in the array.  Memory for array elements is allocated
-one page at a time; a single allocation could provide memory for several
-adjacent elements.  Flexible array elements are normally initialized to the
-value FLEX_ARRAY_FREE (defined as 0x6c in <linux/poison.h>), so errors
-involving that number probably result from use of unstored array entries.
-Note that, if array elements are allocated with __GFP_ZERO, they will be
-initialized to zero and this poisoning will not happen.
-
-Individual elements in the array can be cleared with::
-
-    int flex_array_clear(struct flex_array *array, unsigned int element_nr);
-
-This function will set the given element to FLEX_ARRAY_FREE and return
-zero.  If storage for the indicated element is not allocated for the array,
-flex_array_clear() will return -EINVAL instead.  Note that clearing an
-element does not release the storage associated with it; to reduce the
-allocated size of an array, call::
-
-    int flex_array_shrink(struct flex_array *array);
-
-The return value will be the number of pages of memory actually freed.
-This function works by scanning the array for pages containing nothing but
-FLEX_ARRAY_FREE bytes, so (1) it can be expensive, and (2) it will not work
-if the array's pages are allocated with __GFP_ZERO.
-
-It is possible to remove all elements of an array with a call to::
-
-    void flex_array_free_parts(struct flex_array *array);
-
-This call frees all elements, but leaves the array itself in place.
-Freeing the entire array is done with::
-
-    void flex_array_free(struct flex_array *array);
-
-As of this writing, there are no users of flexible arrays in the mainline
-kernel.  The functions described here are also not exported to modules;
-that will probably be fixed when somebody comes up with a need for it.
diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
deleted file mode 100644
index b94fa61b51fb..000000000000
--- a/include/linux/flex_array.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _FLEX_ARRAY_H
-#define _FLEX_ARRAY_H
-
-#include <linux/types.h>
-#include <linux/reciprocal_div.h>
-#include <asm/page.h>
-
-#define FLEX_ARRAY_PART_SIZE PAGE_SIZE
-#define FLEX_ARRAY_BASE_SIZE PAGE_SIZE
-
-struct flex_array_part;
-
-/*
- * This is meant to replace cases where an array-like
- * structure has gotten too big to fit into kmalloc()
- * and the developer is getting tempted to use
- * vmalloc().
- */
-
-struct flex_array {
-	union {
-		struct {
-			int element_size;
-			int total_nr_elements;
-			int elems_per_part;
-			struct reciprocal_value reciprocal_elems;
-			struct flex_array_part *parts[];
-		};
-		/*
-		 * This little trick makes sure that
-		 * sizeof(flex_array) == PAGE_SIZE
-		 */
-		char padding[FLEX_ARRAY_BASE_SIZE];
-	};
-};
-
-/* Number of bytes left in base struct flex_array, excluding metadata */
-#define FLEX_ARRAY_BASE_BYTES_LEFT					\
-	(FLEX_ARRAY_BASE_SIZE - offsetof(struct flex_array, parts))
-
-/* Number of pointers in base to struct flex_array_part pages */
-#define FLEX_ARRAY_NR_BASE_PTRS						\
-	(FLEX_ARRAY_BASE_BYTES_LEFT / sizeof(struct flex_array_part *))
-
-/* Number of elements of size that fit in struct flex_array_part */
-#define FLEX_ARRAY_ELEMENTS_PER_PART(size)				\
-	(FLEX_ARRAY_PART_SIZE / size)
-
-/*
- * Defines a statically allocated flex array and ensures its parameters are
- * valid.
- */
-#define DEFINE_FLEX_ARRAY(__arrayname, __element_size, __total)		\
-	struct flex_array __arrayname = { { {				\
-		.element_size = (__element_size),			\
-		.total_nr_elements = (__total),				\
-	} } };								\
-	static inline void __arrayname##_invalid_parameter(void)	\
-	{								\
-		BUILD_BUG_ON((__total) > FLEX_ARRAY_NR_BASE_PTRS *	\
-			FLEX_ARRAY_ELEMENTS_PER_PART(__element_size));	\
-	}
-
-/**
- * flex_array_alloc() - Creates a flexible array.
- * @element_size:	individual object size.
- * @total:		maximum number of objects which can be stored.
- * @flags:		GFP flags
- *
- * Return:		Returns an object of structure flex_array.
- */
-struct flex_array *flex_array_alloc(int element_size, unsigned int total,
-		gfp_t flags);
-
-/**
- * flex_array_prealloc() - Ensures that memory for the elements indexed in the
- * range defined by start and nr_elements has been allocated.
- * @fa:			array to allocate memory to.
- * @start:		start address
- * @nr_elements:	number of elements to be allocated.
- * @flags:		GFP flags
- *
- */
-int flex_array_prealloc(struct flex_array *fa, unsigned int start,
-		unsigned int nr_elements, gfp_t flags);
-
-/**
- * flex_array_free() - Removes all elements of a flexible array.
- * @fa:		array to be freed.
- */
-void flex_array_free(struct flex_array *fa);
-
-/**
- * flex_array_free_parts() - Removes all elements of a flexible array, but
- * leaves the array itself in place.
- * @fa:		array to be emptied.
- */
-void flex_array_free_parts(struct flex_array *fa);
-
-/**
- * flex_array_put() - Stores data into a flexible array.
- * @fa:		array where element is to be stored.
- * @element_nr:	position to copy, must be less than the maximum specified when
- *		the array was created.
- * @src:	data source to be copied into the array.
- * @flags:	GFP flags
- *
- * Return:	Returns zero on success, a negative error code otherwise.
- */
-int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src,
-		gfp_t flags);
-
-/**
- * flex_array_clear() - Clears an individual element in the array, sets the
- * given element to FLEX_ARRAY_FREE.
- * @element_nr:	element position to clear.
- * @fa:		array to which element to be cleared belongs.
- *
- * Return:	Returns zero on success, -EINVAL otherwise.
- */
-int flex_array_clear(struct flex_array *fa, unsigned int element_nr);
-
-/**
- * flex_array_get() - Retrieves data into a flexible array.
- *
- * @element_nr:	Element position to retrieve data from.
- * @fa:		array from which data is to be retrieved.
- *
- * Return:	Returns a pointer to the data element, or NULL if that
- *		particular element has never been allocated.
- */
-void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
-
-/**
- * flex_array_shrink() - Reduces the allocated size of an array.
- * @fa:		array to shrink.
- *
- * Return:	Returns number of pages of memory actually freed.
- *
- */
-int flex_array_shrink(struct flex_array *fa);
-
-#define flex_array_put_ptr(fa, nr, src, gfp) \
-	flex_array_put(fa, nr, (void *)&(src), gfp)
-
-void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr);
-
-#endif /* _FLEX_ARRAY_H */
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 5046bad0c1c5..d6d980a681c7 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -83,9 +83,6 @@
 #define MUTEX_DEBUG_FREE	0x22
 #define MUTEX_POISON_WW_CTX	((void *) 0x500 + POISON_POINTER_DELTA)
 
-/********** lib/flex_array.c **********/
-#define FLEX_ARRAY_FREE	0x6c	/* for use-after-free poisoning */
-
 /********** security/ **********/
 #define KEY_DESTROY		0xbd
 
diff --git a/lib/Makefile b/lib/Makefile
index b798b41d01ae..4e066120a0d6 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -35,7 +35,7 @@ obj-y	+= lockref.o
 
 obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
-	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
+	 gcd.o lcm.o list_sort.o uuid.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o rhashtable.o reciprocal_div.o \
 	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
diff --git a/lib/flex_array.c b/lib/flex_array.c
deleted file mode 100644
index 2eed22fa507c..000000000000
--- a/lib/flex_array.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Flexible array managed in PAGE_SIZE parts
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2009
- *
- * Author: Dave Hansen <dave@linux.vnet.ibm.com>
- */
-
-#include <linux/flex_array.h>
-#include <linux/slab.h>
-#include <linux/stddef.h>
-#include <linux/export.h>
-#include <linux/reciprocal_div.h>
-
-struct flex_array_part {
-	char elements[FLEX_ARRAY_PART_SIZE];
-};
-
-/*
- * If a user requests an allocation which is small
- * enough, we may simply use the space in the
- * flex_array->parts[] array to store the user
- * data.
- */
-static inline int elements_fit_in_base(struct flex_array *fa)
-{
-	int data_size = fa->element_size * fa->total_nr_elements;
-	if (data_size <= FLEX_ARRAY_BASE_BYTES_LEFT)
-		return 1;
-	return 0;
-}
-
-/**
- * flex_array_alloc - allocate a new flexible array
- * @element_size:	the size of individual elements in the array
- * @total:		total number of elements that this should hold
- * @flags:		page allocation flags to use for base array
- *
- * Note: all locking must be provided by the caller.
- *
- * @total is used to size internal structures.  If the user ever
- * accesses any array indexes >=@total, it will produce errors.
- *
- * The maximum number of elements is defined as: the number of
- * elements that can be stored in a page times the number of
- * page pointers that we can fit in the base structure or (using
- * integer math):
- *
- * 	(PAGE_SIZE/element_size) * (PAGE_SIZE-8)/sizeof(void *)
- *
- * Here's a table showing example capacities.  Note that the maximum
- * index that the get/put() functions is just nr_objects-1.   This
- * basically means that you get 4MB of storage on 32-bit and 2MB on
- * 64-bit.
- *
- *
- * Element size | Objects | Objects |
- * PAGE_SIZE=4k |  32-bit |  64-bit |
- * ---------------------------------|
- *      1 bytes | 4177920 | 2088960 |
- *      2 bytes | 2088960 | 1044480 |
- *      3 bytes | 1392300 |  696150 |
- *      4 bytes | 1044480 |  522240 |
- *     32 bytes |  130560 |   65408 |
- *     33 bytes |  126480 |   63240 |
- *   2048 bytes |    2040 |    1020 |
- *   2049 bytes |    1020 |     510 |
- *       void * | 1044480 |  261120 |
- *
- * Since 64-bit pointers are twice the size, we lose half the
- * capacity in the base structure.  Also note that no effort is made
- * to efficiently pack objects across page boundaries.
- */
-struct flex_array *flex_array_alloc(int element_size, unsigned int total,
-					gfp_t flags)
-{
-	struct flex_array *ret;
-	int elems_per_part = 0;
-	int max_size = 0;
-	struct reciprocal_value reciprocal_elems = { 0 };
-
-	if (element_size) {
-		elems_per_part = FLEX_ARRAY_ELEMENTS_PER_PART(element_size);
-		reciprocal_elems = reciprocal_value(elems_per_part);
-		max_size = FLEX_ARRAY_NR_BASE_PTRS * elems_per_part;
-	}
-
-	/* max_size will end up 0 if element_size > PAGE_SIZE */
-	if (total > max_size)
-		return NULL;
-	ret = kzalloc(sizeof(struct flex_array), flags);
-	if (!ret)
-		return NULL;
-	ret->element_size = element_size;
-	ret->total_nr_elements = total;
-	ret->elems_per_part = elems_per_part;
-	ret->reciprocal_elems = reciprocal_elems;
-	if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO))
-		memset(&ret->parts[0], FLEX_ARRAY_FREE,
-						FLEX_ARRAY_BASE_BYTES_LEFT);
-	return ret;
-}
-EXPORT_SYMBOL(flex_array_alloc);
-
-static int fa_element_to_part_nr(struct flex_array *fa,
-					unsigned int element_nr)
-{
-	/*
-	 * if element_size == 0 we don't get here, so we never touch
-	 * the zeroed fa->reciprocal_elems, which would yield invalid
-	 * results
-	 */
-	return reciprocal_divide(element_nr, fa->reciprocal_elems);
-}
-
-/**
- * flex_array_free_parts - just free the second-level pages
- * @fa:		the flex array from which to free parts
- *
- * This is to be used in cases where the base 'struct flex_array'
- * has been statically allocated and should not be free.
- */
-void flex_array_free_parts(struct flex_array *fa)
-{
-	int part_nr;
-
-	if (elements_fit_in_base(fa))
-		return;
-	for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++)
-		kfree(fa->parts[part_nr]);
-}
-EXPORT_SYMBOL(flex_array_free_parts);
-
-void flex_array_free(struct flex_array *fa)
-{
-	flex_array_free_parts(fa);
-	kfree(fa);
-}
-EXPORT_SYMBOL(flex_array_free);
-
-static unsigned int index_inside_part(struct flex_array *fa,
-					unsigned int element_nr,
-					unsigned int part_nr)
-{
-	unsigned int part_offset;
-
-	part_offset = element_nr - part_nr * fa->elems_per_part;
-	return part_offset * fa->element_size;
-}
-
-static struct flex_array_part *
-__fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags)
-{
-	struct flex_array_part *part = fa->parts[part_nr];
-	if (!part) {
-		part = kmalloc(sizeof(struct flex_array_part), flags);
-		if (!part)
-			return NULL;
-		if (!(flags & __GFP_ZERO))
-			memset(part, FLEX_ARRAY_FREE,
-				sizeof(struct flex_array_part));
-		fa->parts[part_nr] = part;
-	}
-	return part;
-}
-
-/**
- * flex_array_put - copy data into the array at @element_nr
- * @fa:		the flex array to copy data into
- * @element_nr:	index of the position in which to insert
- * 		the new element.
- * @src:	address of data to copy into the array
- * @flags:	page allocation flags to use for array expansion
- *
- *
- * Note that this *copies* the contents of @src into
- * the array.  If you are trying to store an array of
- * pointers, make sure to pass in &ptr instead of ptr.
- * You may instead wish to use the flex_array_put_ptr()
- * helper function.
- *
- * Locking must be provided by the caller.
- */
-int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src,
-			gfp_t flags)
-{
-	int part_nr = 0;
-	struct flex_array_part *part;
-	void *dst;
-
-	if (element_nr >= fa->total_nr_elements)
-		return -ENOSPC;
-	if (!fa->element_size)
-		return 0;
-	if (elements_fit_in_base(fa))
-		part = (struct flex_array_part *)&fa->parts[0];
-	else {
-		part_nr = fa_element_to_part_nr(fa, element_nr);
-		part = __fa_get_part(fa, part_nr, flags);
-		if (!part)
-			return -ENOMEM;
-	}
-	dst = &part->elements[index_inside_part(fa, element_nr, part_nr)];
-	memcpy(dst, src, fa->element_size);
-	return 0;
-}
-EXPORT_SYMBOL(flex_array_put);
-
-/**
- * flex_array_clear - clear element in array at @element_nr
- * @fa:		the flex array of the element.
- * @element_nr:	index of the position to clear.
- *
- * Locking must be provided by the caller.
- */
-int flex_array_clear(struct flex_array *fa, unsigned int element_nr)
-{
-	int part_nr = 0;
-	struct flex_array_part *part;
-	void *dst;
-
-	if (element_nr >= fa->total_nr_elements)
-		return -ENOSPC;
-	if (!fa->element_size)
-		return 0;
-	if (elements_fit_in_base(fa))
-		part = (struct flex_array_part *)&fa->parts[0];
-	else {
-		part_nr = fa_element_to_part_nr(fa, element_nr);
-		part = fa->parts[part_nr];
-		if (!part)
-			return -EINVAL;
-	}
-	dst = &part->elements[index_inside_part(fa, element_nr, part_nr)];
-	memset(dst, FLEX_ARRAY_FREE, fa->element_size);
-	return 0;
-}
-EXPORT_SYMBOL(flex_array_clear);
-
-/**
- * flex_array_prealloc - guarantee that array space exists
- * @fa:			the flex array for which to preallocate parts
- * @start:		index of first array element for which space is allocated
- * @nr_elements:	number of elements for which space is allocated
- * @flags:		page allocation flags
- *
- * This will guarantee that no future calls to flex_array_put()
- * will allocate memory.  It can be used if you are expecting to
- * be holding a lock or in some atomic context while writing
- * data into the array.
- *
- * Locking must be provided by the caller.
- */
-int flex_array_prealloc(struct flex_array *fa, unsigned int start,
-			unsigned int nr_elements, gfp_t flags)
-{
-	int start_part;
-	int end_part;
-	int part_nr;
-	unsigned int end;
-	struct flex_array_part *part;
-
-	if (!start && !nr_elements)
-		return 0;
-	if (start >= fa->total_nr_elements)
-		return -ENOSPC;
-	if (!nr_elements)
-		return 0;
-
-	end = start + nr_elements - 1;
-
-	if (end >= fa->total_nr_elements)
-		return -ENOSPC;
-	if (!fa->element_size)
-		return 0;
-	if (elements_fit_in_base(fa))
-		return 0;
-	start_part = fa_element_to_part_nr(fa, start);
-	end_part = fa_element_to_part_nr(fa, end);
-	for (part_nr = start_part; part_nr <= end_part; part_nr++) {
-		part = __fa_get_part(fa, part_nr, flags);
-		if (!part)
-			return -ENOMEM;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(flex_array_prealloc);
-
-/**
- * flex_array_get - pull data back out of the array
- * @fa:		the flex array from which to extract data
- * @element_nr:	index of the element to fetch from the array
- *
- * Returns a pointer to the data at index @element_nr.  Note
- * that this is a copy of the data that was passed in.  If you
- * are using this to store pointers, you'll get back &ptr.  You
- * may instead wish to use the flex_array_get_ptr helper.
- *
- * Locking must be provided by the caller.
- */
-void *flex_array_get(struct flex_array *fa, unsigned int element_nr)
-{
-	int part_nr = 0;
-	struct flex_array_part *part;
-
-	if (!fa->element_size)
-		return NULL;
-	if (element_nr >= fa->total_nr_elements)
-		return NULL;
-	if (elements_fit_in_base(fa))
-		part = (struct flex_array_part *)&fa->parts[0];
-	else {
-		part_nr = fa_element_to_part_nr(fa, element_nr);
-		part = fa->parts[part_nr];
-		if (!part)
-			return NULL;
-	}
-	return &part->elements[index_inside_part(fa, element_nr, part_nr)];
-}
-EXPORT_SYMBOL(flex_array_get);
-
-/**
- * flex_array_get_ptr - pull a ptr back out of the array
- * @fa:		the flex array from which to extract data
- * @element_nr:	index of the element to fetch from the array
- *
- * Returns the pointer placed in the flex array at element_nr using
- * flex_array_put_ptr().  This function should not be called if the
- * element in question was not set using the _put_ptr() helper.
- */
-void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr)
-{
-	void **tmp;
-
-	tmp = flex_array_get(fa, element_nr);
-	if (!tmp)
-		return NULL;
-
-	return *tmp;
-}
-EXPORT_SYMBOL(flex_array_get_ptr);
-
-static int part_is_free(struct flex_array_part *part)
-{
-	int i;
-
-	for (i = 0; i < sizeof(struct flex_array_part); i++)
-		if (part->elements[i] != FLEX_ARRAY_FREE)
-			return 0;
-	return 1;
-}
-
-/**
- * flex_array_shrink - free unused second-level pages
- * @fa:		the flex array to shrink
- *
- * Frees all second-level pages that consist solely of unused
- * elements.  Returns the number of pages freed.
- *
- * Locking must be provided by the caller.
- */
-int flex_array_shrink(struct flex_array *fa)
-{
-	struct flex_array_part *part;
-	int part_nr;
-	int ret = 0;
-
-	if (!fa->total_nr_elements || !fa->element_size)
-		return 0;
-	if (elements_fit_in_base(fa))
-		return ret;
-	for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) {
-		part = fa->parts[part_nr];
-		if (!part)
-			continue;
-		if (part_is_free(part)) {
-			fa->parts[part_nr] = NULL;
-			kfree(part);
-			ret++;
-		}
-	}
-	return ret;
-}
-EXPORT_SYMBOL(flex_array_shrink);
diff --git a/tools/include/linux/poison.h b/tools/include/linux/poison.h
index 9fdcd3eaac3b..d29725769107 100644
--- a/tools/include/linux/poison.h
+++ b/tools/include/linux/poison.h
@@ -87,9 +87,6 @@
 #define MUTEX_DEBUG_INIT	0x11
 #define MUTEX_DEBUG_FREE	0x22
 
-/********** lib/flex_array.c **********/
-#define FLEX_ARRAY_FREE	0x6c	/* for use-after-free poisoning */
-
 /********** security/ **********/
 #define KEY_DESTROY		0xbd
 
-- 
cgit v1.2.3


From f87b543af45ea217fa1d000483f9a40944c1ff73 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 12 Mar 2019 12:06:35 -0400
Subject: fix null pointer deref in tracepoints in back channel

Backchannel doesn't have the rq_task->tk_clientid pointer set.

Otherwise can lead to the following oops:
ocalhost login: [  111.385319] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
[  111.388073] #PF error: [normal kernel read fault]
[  111.389452] PGD 80000000290d8067 P4D 80000000290d8067 PUD 75f25067 PMD 0
[  111.391224] Oops: 0000 [#1] SMP PTI
[  111.392151] CPU: 0 PID: 3533 Comm: NFSv4 callback Not tainted 5.0.0-rc7+ #1
[  111.393787] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
[  111.396340] RIP: 0010:trace_event_raw_event_xprt_enq_xmit+0x6f/0xf0 [sunrpc]
[  111.397974] Code: 00 00 00 48 89 ee 48 89 e7 e8 bd 0a 85 d7 48 85 c0 74 4a 41 0f b7 94 24 e0 00 00 00 48 89 e7 89 50 08 49 8b 94 24 a8 00 00 00 <8b> 52 04 89 50 0c 49 8b 94 24 c0 00 00 00 8b 92 a8 00 00 00 0f ca
[  111.402215] RSP: 0018:ffffb98743263cf8 EFLAGS: 00010286
[  111.403406] RAX: ffffa0890fc3bc88 RBX: 0000000000000003 RCX: 0000000000000000
[  111.405057] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffb98743263cf8
[  111.406656] RBP: ffffa0896f5368f0 R08: 0000000000000246 R09: 0000000000000000
[  111.408437] R10: ffffe19b01c01500 R11: 0000000000000000 R12: ffffa08977d28a00
[  111.410210] R13: 0000000000000004 R14: ffffa089315303f0 R15: ffffa08931530000
[  111.411856] FS:  0000000000000000(0000) GS:ffffa0897bc00000(0000) knlGS:0000000000000000
[  111.413699] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  111.415068] CR2: 0000000000000004 CR3: 000000002ac90004 CR4: 00000000001606f0
[  111.416745] Call Trace:
[  111.417339]  xprt_request_enqueue_transmit+0x2b6/0x4a0 [sunrpc]
[  111.418709]  ? rpc_task_need_encode+0x40/0x40 [sunrpc]
[  111.419957]  call_bc_transmit+0xd5/0x170 [sunrpc]
[  111.421067]  __rpc_execute+0x7e/0x3f0 [sunrpc]
[  111.422177]  rpc_run_bc_task+0x78/0xd0 [sunrpc]
[  111.423212]  bc_svc_process+0x281/0x340 [sunrpc]
[  111.424325]  nfs41_callback_svc+0x130/0x1c0 [nfsv4]
[  111.425430]  ? remove_wait_queue+0x60/0x60
[  111.426398]  kthread+0xf5/0x130
[  111.427155]  ? nfs_callback_authenticate+0x50/0x50 [nfsv4]
[  111.428388]  ? kthread_bind+0x10/0x10
[  111.429270]  ret_from_fork+0x1f/0x30

localhost login: [  467.462259] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
[  467.464411] #PF error: [normal kernel read fault]
[  467.465445] PGD 80000000728c1067 P4D 80000000728c1067 PUD 728c0067 PMD 0
[  467.466980] Oops: 0000 [#1] SMP PTI
[  467.467759] CPU: 0 PID: 3517 Comm: NFSv4 callback Not tainted 5.0.0-rc7+ #1
[  467.469393] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
[  467.471840] RIP: 0010:trace_event_raw_event_xprt_transmit+0x7c/0xf0 [sunrpc]
[  467.473392] Code: f6 48 85 c0 74 4b 49 8b 94 24 98 00 00 00 48 89 e7 0f b7 92 e0 00 00 00 89 50 08 49 8b 94 24 98 00 00 00 48 8b 92 a8 00 00 00 <8b> 52 04 89 50 0c 41 8b 94 24 a8 00 00 00 0f ca 89 50 10 41 8b 94
[  467.477605] RSP: 0018:ffffabe7434fbcd0 EFLAGS: 00010282
[  467.478793] RAX: ffff99720fc3bce0 RBX: 0000000000000003 RCX: 0000000000000000
[  467.480409] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffabe7434fbcd0
[  467.482011] RBP: ffff99726f631948 R08: 0000000000000246 R09: 0000000000000000
[  467.483591] R10: 0000000070000000 R11: 0000000000000000 R12: ffff997277dfcc00
[  467.485226] R13: 0000000000000000 R14: 0000000000000000 R15: ffff99722fecdca8
[  467.486830] FS:  0000000000000000(0000) GS:ffff99727bc00000(0000) knlGS:0000000000000000
[  467.488596] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  467.489931] CR2: 0000000000000004 CR3: 00000000270e6006 CR4: 00000000001606f0
[  467.491559] Call Trace:
[  467.492128]  xprt_transmit+0x303/0x3f0 [sunrpc]
[  467.493143]  ? rpc_task_need_encode+0x40/0x40 [sunrpc]
[  467.494328]  call_bc_transmit+0x49/0x170 [sunrpc]
[  467.495379]  __rpc_execute+0x7e/0x3f0 [sunrpc]
[  467.496451]  rpc_run_bc_task+0x78/0xd0 [sunrpc]
[  467.497467]  bc_svc_process+0x281/0x340 [sunrpc]
[  467.498507]  nfs41_callback_svc+0x130/0x1c0 [nfsv4]
[  467.499751]  ? remove_wait_queue+0x60/0x60
[  467.500686]  kthread+0xf5/0x130
[  467.501438]  ? nfs_callback_authenticate+0x50/0x50 [nfsv4]
[  467.502640]  ? kthread_bind+0x10/0x10
[  467.503454]  ret_from_fork+0x1f/0x30

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/trace/events/sunrpc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 8451f30c6a0f..7e899e635d33 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -712,7 +712,8 @@ TRACE_EVENT(xprt_transmit,
 
 	TP_fast_assign(
 		__entry->task_id = rqst->rq_task->tk_pid;
-		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->client_id = rqst->rq_task->tk_client ?
+			rqst->rq_task->tk_client->cl_clid : -1;
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
 		__entry->seqno = rqst->rq_seqno;
 		__entry->status = status;
@@ -742,7 +743,8 @@ TRACE_EVENT(xprt_enq_xmit,
 
 	TP_fast_assign(
 		__entry->task_id = task->tk_pid;
-		__entry->client_id = task->tk_client->cl_clid;
+		__entry->client_id = task->tk_client ?
+			task->tk_client->cl_clid : -1;
 		__entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid);
 		__entry->seqno = task->tk_rqstp->rq_seqno;
 		__entry->stage = stage;
-- 
cgit v1.2.3


From 010d5166bbe95523e8584f3caca9f1bbeac9ea6e Mon Sep 17 00:00:00 2001
From: Abel Vesa <abel.vesa@nxp.com>
Date: Tue, 5 Mar 2019 09:49:16 +0000
Subject: dt-bindings: clock: imx8mq: Fix numbering overlaps and gaps

IMX8MQ_CLK_USB_PHY_REF changes from 163 to 153, this way removing the gap.
All the following clock ids are now decreased by 10 to keep the numbering
right. Doing this, the IMX8MQ_CLK_CSI2_CORE is not overlapped with
IMX8MQ_CLK_GPT1 anymore. IMX8MQ_CLK_GPT1_ROOT changes from 193 to 183 and
all the following ids are updated accordingly.

Reported-by: Patrick Wildt <patrick@blueri.se>
Fixes: 1cf3817b ("dt-bindings: Add binding for i.MX8MQ CCM")
Signed-off-by: Abel Vesa <abel.vesa@nxp.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/imx8mq-clock.h | 220 +++++++++++++++----------------
 1 file changed, 110 insertions(+), 110 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/imx8mq-clock.h b/include/dt-bindings/clock/imx8mq-clock.h
index b58cc643c9c9..6677e920dc2d 100644
--- a/include/dt-bindings/clock/imx8mq-clock.h
+++ b/include/dt-bindings/clock/imx8mq-clock.h
@@ -245,160 +245,160 @@
 /* USB_CORE_REF */
 #define IMX8MQ_CLK_USB_CORE_REF		152
 /* USB_PHY_REF */
-#define IMX8MQ_CLK_USB_PHY_REF		163
+#define IMX8MQ_CLK_USB_PHY_REF		153
 /* ECSPI1 */
-#define IMX8MQ_CLK_ECSPI1		164
+#define IMX8MQ_CLK_ECSPI1		154
 /* ECSPI2 */
-#define IMX8MQ_CLK_ECSPI2		165
+#define IMX8MQ_CLK_ECSPI2		155
 /* PWM1 */
-#define IMX8MQ_CLK_PWM1			166
+#define IMX8MQ_CLK_PWM1			156
 /* PWM2 */
-#define IMX8MQ_CLK_PWM2			167
+#define IMX8MQ_CLK_PWM2			157
 /* PWM3 */
-#define IMX8MQ_CLK_PWM3			168
+#define IMX8MQ_CLK_PWM3			158
 /* PWM4 */
-#define IMX8MQ_CLK_PWM4			169
+#define IMX8MQ_CLK_PWM4			159
 /* GPT1 */
-#define IMX8MQ_CLK_GPT1			170
+#define IMX8MQ_CLK_GPT1			160
 /* WDOG */
-#define IMX8MQ_CLK_WDOG			171
+#define IMX8MQ_CLK_WDOG			161
 /* WRCLK */
-#define IMX8MQ_CLK_WRCLK		172
+#define IMX8MQ_CLK_WRCLK		162
 /* DSI_CORE */
-#define IMX8MQ_CLK_DSI_CORE		173
+#define IMX8MQ_CLK_DSI_CORE		163
 /* DSI_PHY */
-#define IMX8MQ_CLK_DSI_PHY_REF		174
+#define IMX8MQ_CLK_DSI_PHY_REF		164
 /* DSI_DBI */
-#define IMX8MQ_CLK_DSI_DBI		175
+#define IMX8MQ_CLK_DSI_DBI		165
 /*DSI_ESC */
-#define IMX8MQ_CLK_DSI_ESC		176
+#define IMX8MQ_CLK_DSI_ESC		166
 /* CSI1_CORE */
-#define IMX8MQ_CLK_CSI1_CORE		177
+#define IMX8MQ_CLK_CSI1_CORE		167
 /* CSI1_PHY */
-#define IMX8MQ_CLK_CSI1_PHY_REF		178
+#define IMX8MQ_CLK_CSI1_PHY_REF		168
 /* CSI_ESC */
-#define IMX8MQ_CLK_CSI1_ESC		179
+#define IMX8MQ_CLK_CSI1_ESC		169
 /* CSI2_CORE */
 #define IMX8MQ_CLK_CSI2_CORE		170
 /* CSI2_PHY */
-#define IMX8MQ_CLK_CSI2_PHY_REF		181
+#define IMX8MQ_CLK_CSI2_PHY_REF		171
 /* CSI2_ESC */
-#define IMX8MQ_CLK_CSI2_ESC		182
+#define IMX8MQ_CLK_CSI2_ESC		172
 /* PCIE2_CTRL */
-#define IMX8MQ_CLK_PCIE2_CTRL		183
+#define IMX8MQ_CLK_PCIE2_CTRL		173
 /* PCIE2_PHY */
-#define IMX8MQ_CLK_PCIE2_PHY		184
+#define IMX8MQ_CLK_PCIE2_PHY		174
 /* PCIE2_AUX */
-#define IMX8MQ_CLK_PCIE2_AUX		185
+#define IMX8MQ_CLK_PCIE2_AUX		175
 /* ECSPI3 */
-#define IMX8MQ_CLK_ECSPI3		186
+#define IMX8MQ_CLK_ECSPI3		176
 
 /* CCGR clocks */
-#define IMX8MQ_CLK_A53_ROOT			187
-#define IMX8MQ_CLK_DRAM_ROOT			188
-#define IMX8MQ_CLK_ECSPI1_ROOT			189
+#define IMX8MQ_CLK_A53_ROOT			177
+#define IMX8MQ_CLK_DRAM_ROOT			178
+#define IMX8MQ_CLK_ECSPI1_ROOT			179
 #define IMX8MQ_CLK_ECSPI2_ROOT			180
 #define IMX8MQ_CLK_ECSPI3_ROOT			181
 #define IMX8MQ_CLK_ENET1_ROOT			182
-#define IMX8MQ_CLK_GPT1_ROOT			193
-#define IMX8MQ_CLK_I2C1_ROOT			194
-#define IMX8MQ_CLK_I2C2_ROOT			195
-#define IMX8MQ_CLK_I2C3_ROOT			196
-#define IMX8MQ_CLK_I2C4_ROOT			197
-#define IMX8MQ_CLK_M4_ROOT			198
-#define IMX8MQ_CLK_PCIE1_ROOT			199
-#define IMX8MQ_CLK_PCIE2_ROOT			200
-#define IMX8MQ_CLK_PWM1_ROOT			201
-#define IMX8MQ_CLK_PWM2_ROOT			202
-#define IMX8MQ_CLK_PWM3_ROOT			203
-#define IMX8MQ_CLK_PWM4_ROOT			204
-#define IMX8MQ_CLK_QSPI_ROOT			205
-#define IMX8MQ_CLK_SAI1_ROOT			206
-#define IMX8MQ_CLK_SAI2_ROOT			207
-#define IMX8MQ_CLK_SAI3_ROOT			208
-#define IMX8MQ_CLK_SAI4_ROOT			209
-#define IMX8MQ_CLK_SAI5_ROOT			210
-#define IMX8MQ_CLK_SAI6_ROOT			212
-#define IMX8MQ_CLK_UART1_ROOT			213
-#define IMX8MQ_CLK_UART2_ROOT			214
-#define IMX8MQ_CLK_UART3_ROOT			215
-#define IMX8MQ_CLK_UART4_ROOT			216
-#define IMX8MQ_CLK_USB1_CTRL_ROOT		217
-#define IMX8MQ_CLK_USB2_CTRL_ROOT		218
-#define IMX8MQ_CLK_USB1_PHY_ROOT		219
-#define IMX8MQ_CLK_USB2_PHY_ROOT		220
-#define IMX8MQ_CLK_USDHC1_ROOT			221
-#define IMX8MQ_CLK_USDHC2_ROOT			222
-#define IMX8MQ_CLK_WDOG1_ROOT			223
-#define IMX8MQ_CLK_WDOG2_ROOT			224
-#define IMX8MQ_CLK_WDOG3_ROOT			225
-#define IMX8MQ_CLK_GPU_ROOT			226
-#define IMX8MQ_CLK_HEVC_ROOT			227
-#define IMX8MQ_CLK_AVC_ROOT			228
-#define IMX8MQ_CLK_VP9_ROOT			229
-#define IMX8MQ_CLK_HEVC_INTER_ROOT		230
-#define IMX8MQ_CLK_DISP_ROOT			231
-#define IMX8MQ_CLK_HDMI_ROOT			232
-#define IMX8MQ_CLK_HDMI_PHY_ROOT		233
-#define IMX8MQ_CLK_VPU_DEC_ROOT			234
-#define IMX8MQ_CLK_CSI1_ROOT			235
-#define IMX8MQ_CLK_CSI2_ROOT			236
-#define IMX8MQ_CLK_RAWNAND_ROOT			237
-#define IMX8MQ_CLK_SDMA1_ROOT			238
-#define IMX8MQ_CLK_SDMA2_ROOT			239
-#define IMX8MQ_CLK_VPU_G1_ROOT			240
-#define IMX8MQ_CLK_VPU_G2_ROOT			241
+#define IMX8MQ_CLK_GPT1_ROOT			183
+#define IMX8MQ_CLK_I2C1_ROOT			184
+#define IMX8MQ_CLK_I2C2_ROOT			185
+#define IMX8MQ_CLK_I2C3_ROOT			186
+#define IMX8MQ_CLK_I2C4_ROOT			187
+#define IMX8MQ_CLK_M4_ROOT			188
+#define IMX8MQ_CLK_PCIE1_ROOT			189
+#define IMX8MQ_CLK_PCIE2_ROOT			190
+#define IMX8MQ_CLK_PWM1_ROOT			191
+#define IMX8MQ_CLK_PWM2_ROOT			192
+#define IMX8MQ_CLK_PWM3_ROOT			193
+#define IMX8MQ_CLK_PWM4_ROOT			194
+#define IMX8MQ_CLK_QSPI_ROOT			195
+#define IMX8MQ_CLK_SAI1_ROOT			196
+#define IMX8MQ_CLK_SAI2_ROOT			197
+#define IMX8MQ_CLK_SAI3_ROOT			198
+#define IMX8MQ_CLK_SAI4_ROOT			199
+#define IMX8MQ_CLK_SAI5_ROOT			200
+#define IMX8MQ_CLK_SAI6_ROOT			201
+#define IMX8MQ_CLK_UART1_ROOT			202
+#define IMX8MQ_CLK_UART2_ROOT			203
+#define IMX8MQ_CLK_UART3_ROOT			204
+#define IMX8MQ_CLK_UART4_ROOT			205
+#define IMX8MQ_CLK_USB1_CTRL_ROOT		206
+#define IMX8MQ_CLK_USB2_CTRL_ROOT		207
+#define IMX8MQ_CLK_USB1_PHY_ROOT		208
+#define IMX8MQ_CLK_USB2_PHY_ROOT		209
+#define IMX8MQ_CLK_USDHC1_ROOT			210
+#define IMX8MQ_CLK_USDHC2_ROOT			211
+#define IMX8MQ_CLK_WDOG1_ROOT			212
+#define IMX8MQ_CLK_WDOG2_ROOT			213
+#define IMX8MQ_CLK_WDOG3_ROOT			214
+#define IMX8MQ_CLK_GPU_ROOT			215
+#define IMX8MQ_CLK_HEVC_ROOT			216
+#define IMX8MQ_CLK_AVC_ROOT			217
+#define IMX8MQ_CLK_VP9_ROOT			218
+#define IMX8MQ_CLK_HEVC_INTER_ROOT		219
+#define IMX8MQ_CLK_DISP_ROOT			220
+#define IMX8MQ_CLK_HDMI_ROOT			221
+#define IMX8MQ_CLK_HDMI_PHY_ROOT		222
+#define IMX8MQ_CLK_VPU_DEC_ROOT			223
+#define IMX8MQ_CLK_CSI1_ROOT			224
+#define IMX8MQ_CLK_CSI2_ROOT			225
+#define IMX8MQ_CLK_RAWNAND_ROOT			226
+#define IMX8MQ_CLK_SDMA1_ROOT			227
+#define IMX8MQ_CLK_SDMA2_ROOT			228
+#define IMX8MQ_CLK_VPU_G1_ROOT			229
+#define IMX8MQ_CLK_VPU_G2_ROOT			230
 
 /* SCCG PLL GATE */
-#define IMX8MQ_SYS1_PLL_OUT			242
-#define IMX8MQ_SYS2_PLL_OUT			243
-#define IMX8MQ_SYS3_PLL_OUT			244
-#define IMX8MQ_DRAM_PLL_OUT			245
-
-#define IMX8MQ_GPT_3M_CLK			246
-
-#define IMX8MQ_CLK_IPG_ROOT			247
-#define IMX8MQ_CLK_IPG_AUDIO_ROOT		248
-#define IMX8MQ_CLK_SAI1_IPG			249
-#define IMX8MQ_CLK_SAI2_IPG			250
-#define IMX8MQ_CLK_SAI3_IPG			251
-#define IMX8MQ_CLK_SAI4_IPG			252
-#define IMX8MQ_CLK_SAI5_IPG			253
-#define IMX8MQ_CLK_SAI6_IPG			254
+#define IMX8MQ_SYS1_PLL_OUT			231
+#define IMX8MQ_SYS2_PLL_OUT			232
+#define IMX8MQ_SYS3_PLL_OUT			233
+#define IMX8MQ_DRAM_PLL_OUT			234
+
+#define IMX8MQ_GPT_3M_CLK			235
+
+#define IMX8MQ_CLK_IPG_ROOT			236
+#define IMX8MQ_CLK_IPG_AUDIO_ROOT		237
+#define IMX8MQ_CLK_SAI1_IPG			238
+#define IMX8MQ_CLK_SAI2_IPG			239
+#define IMX8MQ_CLK_SAI3_IPG			240
+#define IMX8MQ_CLK_SAI4_IPG			241
+#define IMX8MQ_CLK_SAI5_IPG			242
+#define IMX8MQ_CLK_SAI6_IPG			243
 
 /* DSI AHB/IPG clocks */
 /* rxesc clock */
-#define IMX8MQ_CLK_DSI_AHB			255
+#define IMX8MQ_CLK_DSI_AHB			244
 /* txesc clock */
-#define IMX8MQ_CLK_DSI_IPG_DIV                  256
+#define IMX8MQ_CLK_DSI_IPG_DIV                  245
 
-#define IMX8MQ_CLK_TMU_ROOT			257
+#define IMX8MQ_CLK_TMU_ROOT			246
 
 /* Display root clocks */
-#define IMX8MQ_CLK_DISP_AXI_ROOT		258
-#define IMX8MQ_CLK_DISP_APB_ROOT		259
-#define IMX8MQ_CLK_DISP_RTRM_ROOT		260
+#define IMX8MQ_CLK_DISP_AXI_ROOT		247
+#define IMX8MQ_CLK_DISP_APB_ROOT		248
+#define IMX8MQ_CLK_DISP_RTRM_ROOT		249
 
-#define IMX8MQ_CLK_OCOTP_ROOT			261
+#define IMX8MQ_CLK_OCOTP_ROOT			250
 
-#define IMX8MQ_CLK_DRAM_ALT_ROOT		262
-#define IMX8MQ_CLK_DRAM_CORE			263
+#define IMX8MQ_CLK_DRAM_ALT_ROOT		251
+#define IMX8MQ_CLK_DRAM_CORE			252
 
-#define IMX8MQ_CLK_MU_ROOT			264
-#define IMX8MQ_VIDEO2_PLL_OUT			265
+#define IMX8MQ_CLK_MU_ROOT			253
+#define IMX8MQ_VIDEO2_PLL_OUT			254
 
-#define IMX8MQ_CLK_CLKO2			266
+#define IMX8MQ_CLK_CLKO2			255
 
-#define IMX8MQ_CLK_NAND_USDHC_BUS_RAWNAND_CLK	267
+#define IMX8MQ_CLK_NAND_USDHC_BUS_RAWNAND_CLK	256
 
-#define IMX8MQ_CLK_CLKO1			268
-#define IMX8MQ_CLK_ARM				269
+#define IMX8MQ_CLK_CLKO1			257
+#define IMX8MQ_CLK_ARM				258
 
-#define IMX8MQ_CLK_GPIO1_ROOT			270
-#define IMX8MQ_CLK_GPIO2_ROOT			271
-#define IMX8MQ_CLK_GPIO3_ROOT			272
-#define IMX8MQ_CLK_GPIO4_ROOT			273
-#define IMX8MQ_CLK_GPIO5_ROOT			274
+#define IMX8MQ_CLK_GPIO1_ROOT			259
+#define IMX8MQ_CLK_GPIO2_ROOT			260
+#define IMX8MQ_CLK_GPIO3_ROOT			261
+#define IMX8MQ_CLK_GPIO4_ROOT			262
+#define IMX8MQ_CLK_GPIO5_ROOT			263
 
-#define IMX8MQ_CLK_END				275
+#define IMX8MQ_CLK_END				264
 #endif /* __DT_BINDINGS_CLOCK_IMX8MQ_H */
-- 
cgit v1.2.3


From 68b79cdc6de97fe270ceb40082a4aa6ad3e41ea7 Mon Sep 17 00:00:00 2001
From: Zeng Guangyue <zengguangyue@hisilicon.com>
Date: Mon, 18 Feb 2019 14:26:41 +0800
Subject: f2fs: correct spelling mistake

correct spelling mistake for "nunmber"

Signed-off-by: Zeng Guangyue <zengguangyue@hisilicon.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 666db8eb71e0..f5740423b002 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -285,7 +285,7 @@ enum {
 
 struct node_footer {
 	__le32 nid;		/* node id */
-	__le32 ino;		/* inode nunmber */
+	__le32 ino;		/* inode number */
 	__le32 flag;		/* include cold/fsync/dentry marks and offset */
 	__le64 cp_ver;		/* checkpoint version */
 	__le32 next_blkaddr;	/* next node page block address */
-- 
cgit v1.2.3


From 559e87c497a820e132995ad56c8361509e5410da Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Tue, 26 Feb 2019 19:01:15 +0800
Subject: f2fs: trace f2fs_ioc_shutdown

This patch supports to trace f2fs_ioc_shutdown.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c              |  3 +++
 include/trace/events/f2fs.h | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3a8c8eb0f549..807a97ad2430 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1987,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
 out:
 	if (in != F2FS_GOING_DOWN_FULLSYNC)
 		mnt_drop_write_file(filp);
+
+	trace_f2fs_shutdown(sbi, in, ret);
+
 	return ret;
 }
 
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 8a28a2b1be74..0cb746fe6ae1 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -149,6 +149,14 @@ TRACE_DEFINE_ENUM(CP_TRIMMED);
 		{ CP_SPEC_LOG_NUM,	"log type is 2" },		\
 		{ CP_RECOVER_DIR,	"dir needs recovery" })
 
+#define show_shutdown_mode(type)					\
+	__print_symbolic(type,						\
+		{ F2FS_GOING_DOWN_FULLSYNC,	"full sync" },		\
+		{ F2FS_GOING_DOWN_METASYNC,	"meta sync" },		\
+		{ F2FS_GOING_DOWN_NOSYNC,	"no sync" },		\
+		{ F2FS_GOING_DOWN_METAFLUSH,	"meta flush" },		\
+		{ F2FS_GOING_DOWN_NEED_FSCK,	"need fsck" })
+
 struct f2fs_sb_info;
 struct f2fs_io_info;
 struct extent_info;
@@ -1619,6 +1627,30 @@ DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit,
 	TP_ARGS(sb, type, count)
 );
 
+TRACE_EVENT(f2fs_shutdown,
+
+	TP_PROTO(struct f2fs_sb_info *sbi, unsigned int mode, int ret),
+
+	TP_ARGS(sbi, mode, ret),
+
+	TP_STRUCT__entry(
+		__field(dev_t,	dev)
+		__field(unsigned int, mode)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->dev = sbi->sb->s_dev;
+		__entry->mode = mode;
+		__entry->ret = ret;
+	),
+
+	TP_printk("dev = (%d,%d), mode: %s, ret:%d",
+		show_dev(__entry->dev),
+		show_shutdown_mode(__entry->mode),
+		__entry->ret)
+);
+
 #endif /* _TRACE_F2FS_H */
 
  /* This part must be outside protection */
-- 
cgit v1.2.3


From 76630f20059245372a78ccc0845db9d698098c34 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Tue, 26 Feb 2019 19:01:16 +0800
Subject: f2fs: print more parameters in trace_f2fs_map_blocks

for better map_blocks trace.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/trace/events/f2fs.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 0cb746fe6ae1..a3916b4dd57e 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -544,6 +544,9 @@ TRACE_EVENT(f2fs_map_blocks,
 		__field(block_t,	m_lblk)
 		__field(block_t,	m_pblk)
 		__field(unsigned int,	m_len)
+		__field(unsigned int,	m_flags)
+		__field(int,	m_seg_type)
+		__field(bool,	m_may_create)
 		__field(int,	ret)
 	),
 
@@ -553,15 +556,22 @@ TRACE_EVENT(f2fs_map_blocks,
 		__entry->m_lblk		= map->m_lblk;
 		__entry->m_pblk		= map->m_pblk;
 		__entry->m_len		= map->m_len;
+		__entry->m_flags	= map->m_flags;
+		__entry->m_seg_type	= map->m_seg_type;
+		__entry->m_may_create	= map->m_may_create;
 		__entry->ret		= ret;
 	),
 
 	TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, "
-		"start blkaddr = 0x%llx, len = 0x%llx, err = %d",
+		"start blkaddr = 0x%llx, len = 0x%llx, flags = %u,"
+		"seg_type = %d, may_create = %d, err = %d",
 		show_dev_ino(__entry),
 		(unsigned long long)__entry->m_lblk,
 		(unsigned long long)__entry->m_pblk,
 		(unsigned long long)__entry->m_len,
+		__entry->m_flags,
+		__entry->m_seg_type,
+		__entry->m_may_create,
 		__entry->ret)
 );
 
-- 
cgit v1.2.3


From 31b265b3baaf55f209229888b7ffea523ddab366 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Fri, 8 Mar 2019 11:32:04 -0800
Subject: tracing: kdb: Fix ftdump to not sleep

As reported back in 2016-11 [1], the "ftdump" kdb command triggers a
BUG for "sleeping function called from invalid context".

kdb's "ftdump" command wants to call ring_buffer_read_prepare() in
atomic context.  A very simple solution for this is to add allocation
flags to ring_buffer_read_prepare() so kdb can call it without
triggering the allocation error.  This patch does that.

Note that in the original email thread about this, it was suggested
that perhaps the solution for kdb was to either preallocate the buffer
ahead of time or create our own iterator.  I'm hoping that this
alternative of adding allocation flags to ring_buffer_read_prepare()
can be considered since it means I don't need to duplicate more of the
core trace code into "trace_kdb.c" (for either creating my own
iterator or re-preparing a ring allocator whose memory was already
allocated).

NOTE: another option for kdb is to actually figure out how to make it
reuse the existing ftrace_dump() function and totally eliminate the
duplication.  This sounds very appealing and actually works (the "sr
z" command can be seen to properly dump the ftrace buffer).  The
downside here is that ftrace_dump() fully consumes the trace buffer.
Unless that is changed I'd rather not use it because it means "ftdump
| grep xyz" won't be very useful to search the ftrace buffer since it
will throw away the whole trace on the first grep.  A future patch to
dump only the last few lines of the buffer will also be hard to
implement.

[1] https://lkml.kernel.org/r/20161117191605.GA21459@google.com

Link: http://lkml.kernel.org/r/20190308193205.213659-1-dianders@chromium.org

Reported-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 2 +-
 kernel/trace/ring_buffer.c  | 5 +++--
 kernel/trace/trace.c        | 6 ++++--
 kernel/trace/trace_kdb.c    | 6 ++++--
 4 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index f1429675f252..1a40277b512c 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -128,7 +128,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu);
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_prepare_sync(void);
 void ring_buffer_read_start(struct ring_buffer_iter *iter);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9a91479bbbfe..41b6f96e5366 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4191,6 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
  * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
+ * @flags: gfp flags to use for memory allocation
  *
  * This performs the initial preparations necessary to iterate
  * through the buffer.  Memory is allocated, buffer recording
@@ -4208,7 +4209,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
  * This overall must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -4216,7 +4217,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
-	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+	iter = kmalloc(sizeof(*iter), flags);
 	if (!iter)
 		return NULL;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e9cc47e59d25..ccd759eaad79 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4077,7 +4077,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+				ring_buffer_read_prepare(iter->trace_buffer->buffer,
+							 cpu, GFP_KERNEL);
 		}
 		ring_buffer_read_prepare_sync();
 		for_each_tracing_cpu(cpu) {
@@ -4087,7 +4088,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+			ring_buffer_read_prepare(iter->trace_buffer->buffer,
+						 cpu, GFP_KERNEL);
 		ring_buffer_read_prepare_sync();
 		ring_buffer_read_start(iter->buffer_iter[cpu]);
 		tracing_iter_reset(iter, cpu);
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index d953c163a079..810d78a8d14c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -51,14 +51,16 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
+			ring_buffer_read_prepare(iter.trace_buffer->buffer,
+						 cpu, GFP_ATOMIC);
 			ring_buffer_read_start(iter.buffer_iter[cpu]);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
+			ring_buffer_read_prepare(iter.trace_buffer->buffer,
+						 cpu_file, GFP_ATOMIC);
 		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
-- 
cgit v1.2.3


From 1b986589680a2a5b6fc1ac196ea69925a93d9dd9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 12 Mar 2019 10:23:02 -0700
Subject: bpf: Fix bpf_tcp_sock and bpf_sk_fullsock issue related to
 bpf_sk_release

Lorenz Bauer [thanks!] reported that a ptr returned by bpf_tcp_sock(sk)
can still be accessed after bpf_sk_release(sk).
Both bpf_tcp_sock() and bpf_sk_fullsock() have the same issue.
This patch addresses them together.

A simple reproducer looks like this:

	sk = bpf_sk_lookup_tcp();
	/* if (!sk) ... */
	tp = bpf_tcp_sock(sk);
	/* if (!tp) ... */
	bpf_sk_release(sk);
	snd_cwnd = tp->snd_cwnd; /* oops! The verifier does not complain. */

The problem is the verifier did not scrub the register's states of
the tcp_sock ptr (tp) after bpf_sk_release(sk).

[ Note that when calling bpf_tcp_sock(sk), the sk is not always
  refcount-acquired. e.g. bpf_tcp_sock(skb->sk). The verifier works
  fine for this case. ]

Currently, the verifier does not track if a helper's return ptr (in REG_0)
is "carry"-ing one of its argument's refcount status. To carry this info,
the reg1->id needs to be stored in reg0.

One approach was tried, like "reg0->id = reg1->id", when calling
"bpf_tcp_sock()".  The main idea was to avoid adding another "ref_obj_id"
for the same reg.  However, overlapping the NULL marking and ref
tracking purpose in one "id" does not work well:

	ref_sk = bpf_sk_lookup_tcp();
	fullsock = bpf_sk_fullsock(ref_sk);
	tp = bpf_tcp_sock(ref_sk);
	if (!fullsock) {
	     bpf_sk_release(ref_sk);
	     return 0;
	}
	/* fullsock_reg->id is marked for NOT-NULL.
	 * Same for tp_reg->id because they have the same id.
	 */

	/* oops. verifier did not complain about the missing !tp check */
	snd_cwnd = tp->snd_cwnd;

Hence, a new "ref_obj_id" is needed in "struct bpf_reg_state".
With a new ref_obj_id, when bpf_sk_release(sk) is called, the verifier can
scrub all reg states which has a ref_obj_id match.  It is done with the
changes in release_reg_references() in this patch.

While fixing it, sk_to_full_sk() is removed from bpf_tcp_sock() and
bpf_sk_fullsock() to avoid these helpers from returning
another ptr. It will make bpf_sk_release(tp) possible:

	sk = bpf_sk_lookup_tcp();
	/* if (!sk) ... */
	tp = bpf_tcp_sock(sk);
	/* if (!tp) ... */
	bpf_sk_release(tp);

A separate helper "bpf_get_listener_sock()" will be added in a later
patch to do sk_to_full_sk().

Misc change notes:
- To allow bpf_sk_release(tp), the arg of bpf_sk_release() is changed
  from ARG_PTR_TO_SOCKET to ARG_PTR_TO_SOCK_COMMON.  ARG_PTR_TO_SOCKET
  is removed from bpf.h since no helper is using it.

- arg_type_is_refcounted() is renamed to arg_type_may_be_refcounted()
  because ARG_PTR_TO_SOCK_COMMON is the only one and skb->sk is not
  refcounted.  All bpf_sk_release(), bpf_sk_fullsock() and bpf_tcp_sock()
  take ARG_PTR_TO_SOCK_COMMON.

- check_refcount_ok() ensures is_acquire_function() cannot take
  arg_type_may_be_refcounted() as its argument.

- The check_func_arg() can only allow one refcount-ed arg.  It is
  guaranteed by check_refcount_ok() which ensures at most one arg can be
  refcounted.  Hence, it is a verifier internal error if >1 refcount arg
  found in check_func_arg().

- In release_reference(), release_reference_state() is called
  first to ensure a match on "reg->ref_obj_id" can be found before
  scrubbing the reg states with release_reg_references().

- reg_is_refcounted() is no longer needed.
  1. In mark_ptr_or_null_regs(), its usage is replaced by
     "ref_obj_id && ref_obj_id == id" because,
     when is_null == true, release_reference_state() should only be
     called on the ref_obj_id obtained by a acquire helper (i.e.
     is_acquire_function() == true).  Otherwise, the following
     would happen:

	sk = bpf_sk_lookup_tcp();
	/* if (!sk) { ... } */
	fullsock = bpf_sk_fullsock(sk);
	if (!fullsock) {
		/*
		 * release_reference_state(fullsock_reg->ref_obj_id)
		 * where fullsock_reg->ref_obj_id == sk_reg->ref_obj_id.
		 *
		 * Hence, the following bpf_sk_release(sk) will fail
		 * because the ref state has already been released in the
		 * earlier release_reference_state(fullsock_reg->ref_obj_id).
		 */
		bpf_sk_release(sk);
	}

  2. In release_reg_references(), the current reg_is_refcounted() call
     is unnecessary because the id check is enough.

- The type_is_refcounted() and type_is_refcounted_or_null()
  are no longer needed also because reg_is_refcounted() is removed.

Fixes: 655a51e536c0 ("bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock")
Reported-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |   1 -
 include/linux/bpf_verifier.h |  40 +++++++++++++
 kernel/bpf/verifier.c        | 131 ++++++++++++++++++++++++-------------------
 net/core/filter.c            |   6 +-
 4 files changed, 115 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a2132e09dc1c..f02367faa58d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -193,7 +193,6 @@ enum bpf_arg_type {
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
-	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
 	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 	ARG_PTR_TO_SOCK_COMMON,	/* pointer to sock_common */
 };
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 69f7a3449eda..7d8228d1c898 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -66,6 +66,46 @@ struct bpf_reg_state {
 	 * same reference to the socket, to determine proper reference freeing.
 	 */
 	u32 id;
+	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
+	 * from a pointer-cast helper, bpf_sk_fullsock() and
+	 * bpf_tcp_sock().
+	 *
+	 * Consider the following where "sk" is a reference counted
+	 * pointer returned from "sk = bpf_sk_lookup_tcp();":
+	 *
+	 * 1: sk = bpf_sk_lookup_tcp();
+	 * 2: if (!sk) { return 0; }
+	 * 3: fullsock = bpf_sk_fullsock(sk);
+	 * 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
+	 * 5: tp = bpf_tcp_sock(fullsock);
+	 * 6: if (!tp) { bpf_sk_release(sk); return 0; }
+	 * 7: bpf_sk_release(sk);
+	 * 8: snd_cwnd = tp->snd_cwnd;  // verifier will complain
+	 *
+	 * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
+	 * "tp" ptr should be invalidated also.  In order to do that,
+	 * the reg holding "fullsock" and "sk" need to remember
+	 * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
+	 * such that the verifier can reset all regs which have
+	 * ref_obj_id matching the sk_reg->id.
+	 *
+	 * sk_reg->ref_obj_id is set to sk_reg->id at line 1.
+	 * sk_reg->id will stay as NULL-marking purpose only.
+	 * After NULL-marking is done, sk_reg->id can be reset to 0.
+	 *
+	 * After "fullsock = bpf_sk_fullsock(sk);" at line 3,
+	 * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
+	 *
+	 * After "tp = bpf_tcp_sock(fullsock);" at line 5,
+	 * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
+	 * which is the same as sk_reg->ref_obj_id.
+	 *
+	 * From the verifier perspective, if sk, fullsock and tp
+	 * are not NULL, they are the same ptr with different
+	 * reg->type.  In particular, bpf_sk_release(tp) is also
+	 * allowed and has the same effect as bpf_sk_release(sk).
+	 */
+	u32 ref_obj_id;
 	/* For scalar types (SCALAR_VALUE), this represents our knowledge of
 	 * the actual value.
 	 * For pointer types, this represents the variable part of the offset
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce166a002d16..86f9cd5d1c4e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -212,7 +212,7 @@ struct bpf_call_arg_meta {
 	int access_size;
 	s64 msize_smax_value;
 	u64 msize_umax_value;
-	int ptr_id;
+	int ref_obj_id;
 	int func_id;
 };
 
@@ -346,35 +346,15 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
 	       type == PTR_TO_TCP_SOCK_OR_NULL;
 }
 
-static bool type_is_refcounted(enum bpf_reg_type type)
-{
-	return type == PTR_TO_SOCKET;
-}
-
-static bool type_is_refcounted_or_null(enum bpf_reg_type type)
-{
-	return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
-}
-
-static bool reg_is_refcounted(const struct bpf_reg_state *reg)
-{
-	return type_is_refcounted(reg->type);
-}
-
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
 {
 	return reg->type == PTR_TO_MAP_VALUE &&
 		map_value_has_spin_lock(reg->map_ptr);
 }
 
-static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
+static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
 {
-	return type_is_refcounted_or_null(reg->type);
-}
-
-static bool arg_type_is_refcounted(enum bpf_arg_type type)
-{
-	return type == ARG_PTR_TO_SOCKET;
+	return type == ARG_PTR_TO_SOCK_COMMON;
 }
 
 /* Determine whether the function releases some resources allocated by another
@@ -392,6 +372,12 @@ static bool is_acquire_function(enum bpf_func_id func_id)
 		func_id == BPF_FUNC_sk_lookup_udp;
 }
 
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
+{
+	return func_id == BPF_FUNC_tcp_sock ||
+		func_id == BPF_FUNC_sk_fullsock;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -465,7 +451,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			if (t == PTR_TO_STACK)
 				verbose(env, ",call_%d", func(env, reg)->callsite);
 		} else {
-			verbose(env, "(id=%d", reg->id);
+			verbose(env, "(id=%d ref_obj_id=%d", reg->id,
+				reg->ref_obj_id);
 			if (t != SCALAR_VALUE)
 				verbose(env, ",off=%d", reg->off);
 			if (type_is_pkt_pointer(t))
@@ -2414,16 +2401,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		/* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
 		if (!type_is_sk_pointer(type))
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_SOCKET) {
-		expected_type = PTR_TO_SOCKET;
-		if (type != expected_type)
-			goto err_type;
-		if (meta->ptr_id || !reg->id) {
-			verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n",
-				meta->ptr_id, reg->id);
-			return -EFAULT;
+		if (reg->ref_obj_id) {
+			if (meta->ref_obj_id) {
+				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+					regno, reg->ref_obj_id,
+					meta->ref_obj_id);
+				return -EFAULT;
+			}
+			meta->ref_obj_id = reg->ref_obj_id;
 		}
-		meta->ptr_id = reg->id;
 	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
 		if (meta->func_id == BPF_FUNC_spin_lock) {
 			if (process_spin_lock(env, regno, true))
@@ -2740,32 +2726,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
 	return true;
 }
 
-static bool check_refcount_ok(const struct bpf_func_proto *fn)
+static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
 {
 	int count = 0;
 
-	if (arg_type_is_refcounted(fn->arg1_type))
+	if (arg_type_may_be_refcounted(fn->arg1_type))
 		count++;
-	if (arg_type_is_refcounted(fn->arg2_type))
+	if (arg_type_may_be_refcounted(fn->arg2_type))
 		count++;
-	if (arg_type_is_refcounted(fn->arg3_type))
+	if (arg_type_may_be_refcounted(fn->arg3_type))
 		count++;
-	if (arg_type_is_refcounted(fn->arg4_type))
+	if (arg_type_may_be_refcounted(fn->arg4_type))
 		count++;
-	if (arg_type_is_refcounted(fn->arg5_type))
+	if (arg_type_may_be_refcounted(fn->arg5_type))
 		count++;
 
+	/* A reference acquiring function cannot acquire
+	 * another refcounted ptr.
+	 */
+	if (is_acquire_function(func_id) && count)
+		return false;
+
 	/* We only support one arg being unreferenced at the moment,
 	 * which is sufficient for the helper functions we have right now.
 	 */
 	return count <= 1;
 }
 
-static int check_func_proto(const struct bpf_func_proto *fn)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
 {
 	return check_raw_mode_ok(fn) &&
 	       check_arg_pair_ok(fn) &&
-	       check_refcount_ok(fn) ? 0 : -EINVAL;
+	       check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
 }
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2799,19 +2791,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 }
 
 static void release_reg_references(struct bpf_verifier_env *env,
-				   struct bpf_func_state *state, int id)
+				   struct bpf_func_state *state,
+				   int ref_obj_id)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].id == id)
+		if (regs[i].ref_obj_id == ref_obj_id)
 			mark_reg_unknown(env, regs, i);
 
 	bpf_for_each_spilled_reg(i, state, reg) {
 		if (!reg)
 			continue;
-		if (reg_is_refcounted(reg) && reg->id == id)
+		if (reg->ref_obj_id == ref_obj_id)
 			__mark_reg_unknown(reg);
 	}
 }
@@ -2820,15 +2813,20 @@ static void release_reg_references(struct bpf_verifier_env *env,
  * resources. Identify all copies of the same pointer and clear the reference.
  */
 static int release_reference(struct bpf_verifier_env *env,
-			     struct bpf_call_arg_meta *meta)
+			     int ref_obj_id)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
+	int err;
 	int i;
 
+	err = release_reference_state(cur_func(env), ref_obj_id);
+	if (err)
+		return err;
+
 	for (i = 0; i <= vstate->curframe; i++)
-		release_reg_references(env, vstate->frame[i], meta->ptr_id);
+		release_reg_references(env, vstate->frame[i], ref_obj_id);
 
-	return release_reference_state(cur_func(env), meta->ptr_id);
+	return 0;
 }
 
 static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -3047,7 +3045,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;
 
-	err = check_func_proto(fn);
+	err = check_func_proto(fn, func_id);
 	if (err) {
 		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
 			func_id_name(func_id), func_id);
@@ -3093,7 +3091,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 			return err;
 		}
 	} else if (is_release_function(func_id)) {
-		err = release_reference(env, &meta);
+		err = release_reference(env, meta.ref_obj_id);
 		if (err) {
 			verbose(env, "func %s#%d reference has not been acquired before\n",
 				func_id_name(func_id), func_id);
@@ -3154,8 +3152,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 			if (id < 0)
 				return id;
-			/* For release_reference() */
+			/* For mark_ptr_or_null_reg() */
 			regs[BPF_REG_0].id = id;
+			/* For release_reference() */
+			regs[BPF_REG_0].ref_obj_id = id;
 		} else {
 			/* For mark_ptr_or_null_reg() */
 			regs[BPF_REG_0].id = ++env->id_gen;
@@ -3170,6 +3170,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	if (is_ptr_cast_function(func_id))
+		/* For release_reference() */
+		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+
 	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
 
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
@@ -4665,11 +4669,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 		} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
 			reg->type = PTR_TO_TCP_SOCK;
 		}
-		if (is_null || !(reg_is_refcounted(reg) ||
-				 reg_may_point_to_spin_lock(reg))) {
-			/* We don't need id from this point onwards anymore,
-			 * thus we should better reset it, so that state
-			 * pruning has chances to take effect.
+		if (is_null) {
+			/* We don't need id and ref_obj_id from this point
+			 * onwards anymore, thus we should better reset it,
+			 * so that state pruning has chances to take effect.
+			 */
+			reg->id = 0;
+			reg->ref_obj_id = 0;
+		} else if (!reg_may_point_to_spin_lock(reg)) {
+			/* For not-NULL ptr, reg->ref_obj_id will be reset
+			 * in release_reg_references().
+			 *
+			 * reg->id is still used by spin_lock ptr. Other
+			 * than spin_lock ptr type, reg->id can be reset.
 			 */
 			reg->id = 0;
 		}
@@ -4684,11 +4696,16 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *reg, *regs = state->regs;
+	u32 ref_obj_id = regs[regno].ref_obj_id;
 	u32 id = regs[regno].id;
 	int i, j;
 
-	if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
-		release_reference_state(state, id);
+	if (ref_obj_id && ref_obj_id == id && is_null)
+		/* regs[regno] is in the " == NULL" branch.
+		 * No one could have freed the reference state before
+		 * doing the NULL check.
+		 */
+		WARN_ON_ONCE(release_reference_state(state, id));
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		mark_ptr_or_null_reg(state, &regs[i], id, is_null);
diff --git a/net/core/filter.c b/net/core/filter.c
index f274620945ff..36b6afacf83c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1796,8 +1796,6 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
 
 BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
 {
-	sk = sk_to_full_sk(sk);
-
 	return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
 }
 
@@ -5266,7 +5264,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = {
 	.func		= bpf_sk_release,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_SOCKET,
+	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
 };
 
 BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
@@ -5407,8 +5405,6 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 
 BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
 {
-	sk = sk_to_full_sk(sk);
-
 	if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
 		return (unsigned long)sk;
 
-- 
cgit v1.2.3


From dbafd7ddd62369b2f3926ab847cbf8fc40e800b7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 12 Mar 2019 10:23:04 -0700
Subject: bpf: Add bpf_get_listener_sock(struct bpf_sock *sk) helper

Add a new helper "struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)"
which returns a bpf_sock in TCP_LISTEN state.  It will trace back to
the listener sk from a request_sock if possible.  It returns NULL
for all other cases.

No reference is taken because the helper ensures the sk is
in SOCK_RCU_FREE (where the TCP_LISTEN sock should be in).
Hence, bpf_sk_release() is unnecessary and the verifier does not
allow bpf_sk_release(listen_sk) to be called either.

The following is also allowed because the bpf_prog is run under
rcu_read_lock():

	sk = bpf_sk_lookup_tcp();
	/* if (!sk) { ... } */
	listen_sk = bpf_get_listener_sock(sk);
	/* if (!listen_sk) { ... } */
	bpf_sk_release(sk);
	src_port = listen_sk->src_port; /* Allowed */

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 11 ++++++++++-
 net/core/filter.c        | 21 +++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3c38ac9a92a7..983b25cb608d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2366,6 +2366,14 @@ union bpf_attr {
  *             current value is ect (ECN capable). Works with IPv6 and IPv4.
  *     Return
  *             1 if set, 0 if not set.
+ *
+ * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)
+ *	Description
+ *		Return a **struct bpf_sock** pointer in TCP_LISTEN state.
+ *		bpf_sk_release() is unnecessary and not allowed.
+ *	Return
+ *		A **struct bpf_sock** pointer on success, or NULL in
+ *		case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2465,7 +2473,8 @@ union bpf_attr {
 	FN(spin_unlock),		\
 	FN(sk_fullsock),		\
 	FN(tcp_sock),			\
-	FN(skb_ecn_set_ce),
+	FN(skb_ecn_set_ce),		\
+	FN(get_listener_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 36b6afacf83c..647c63a7b25b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5418,6 +5418,23 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = {
 	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
 };
 
+BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
+{
+	sk = sk_to_full_sk(sk);
+
+	if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+		return (unsigned long)sk;
+
+	return (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_get_listener_sock_proto = {
+	.func		= bpf_get_listener_sock,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
+};
+
 BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
 {
 	unsigned int iphdr_len;
@@ -5603,6 +5620,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #ifdef CONFIG_INET
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
+	case BPF_FUNC_get_listener_sock:
+		return &bpf_get_listener_sock_proto;
 	case BPF_FUNC_skb_ecn_set_ce:
 		return &bpf_skb_ecn_set_ce_proto;
 #endif
@@ -5698,6 +5717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_release_proto;
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
+	case BPF_FUNC_get_listener_sock:
+		return &bpf_get_listener_sock_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);
-- 
cgit v1.2.3


From c24a1269652006b401bda29fea15a0e42b7870d1 Mon Sep 17 00:00:00 2001
From: Ricardo Biehl Pasquali <pasqualirb@gmail.com>
Date: Wed, 13 Mar 2019 16:06:48 -0300
Subject: ALSA: pcm: Fix function name in kernel-doc comment

Signed-off-by: Ricardo Biehl Pasquali <pasqualirb@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/pcm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 465d7d033c4c..18bd8c3ea605 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -750,7 +750,7 @@ static inline snd_pcm_uframes_t snd_pcm_playback_avail(struct snd_pcm_runtime *r
 }
 
 /**
- * snd_pcm_playback_avail - Get the available (readable) space for capture
+ * snd_pcm_capture_avail - Get the available (readable) space for capture
  * @runtime: PCM runtime instance
  *
  * Result is between 0 ... (boundary - 1)
-- 
cgit v1.2.3


From 62369db2df8d1edfa040878203b446e023a16802 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Thu, 14 Mar 2019 12:38:39 +0000
Subject: bpf: fix documentation for eBPF helpers

Another round of minor fixes for the documentation of the BPF helpers
located in the UAPI bpf.h header file. Changes include:

- Moving around description of some helpers, to keep the descriptions in
  the same order as helpers are declared (bpf_map_push_elem(), leftover
  from commit 90b1023f68c7 ("bpf: fix documentation for eBPF helpers"),
  bpf_rc_keydown(), and bpf_skb_ancestor_cgroup_id()).
- Fixing typos ("contex" -> "context").
- Harmonising return types ("void* " -> "void *", "uint64_t" -> "u64").
- Addition of the "bpf_" prefix to bpf_get_storage().
- Light additions of RST markup on some keywords.
- Empty line deletion between description and return value for
  bpf_tcp_sock().
- Edit for the description for bpf_skb_ecn_set_ce() (capital letters,
  acronym expansion, no effect if ECT not set, more details on return
  value).

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 128 ++++++++++++++++++++++++-----------------------
 1 file changed, 65 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 983b25cb608d..4465d00d3493 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -502,16 +502,6 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
- * 	Description
- * 		Push an element *value* in *map*. *flags* is one of:
- *
- * 		**BPF_EXIST**
- * 		If the queue/stack is full, the oldest element is removed to
- * 		make room for this.
- * 	Return
- * 		0 on success, or a negative error in case of failure.
- *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -1435,14 +1425,14 @@ union bpf_attr {
  * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
  * 	Description
  * 		Equivalent to bpf_get_socket_cookie() helper that accepts
- * 		*skb*, but gets socket from **struct bpf_sock_addr** contex.
+ * 		*skb*, but gets socket from **struct bpf_sock_addr** context.
  * 	Return
  * 		A 8-byte long non-decreasing number.
  *
  * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
  * 	Description
  * 		Equivalent to bpf_get_socket_cookie() helper that accepts
- * 		*skb*, but gets socket from **struct bpf_sock_ops** contex.
+ * 		*skb*, but gets socket from **struct bpf_sock_ops** context.
  * 	Return
  * 		A 8-byte long non-decreasing number.
  *
@@ -2098,52 +2088,52 @@ union bpf_attr {
  *	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ * int bpf_rc_repeat(void *ctx)
  *	Description
  *		This helper is used in programs implementing IR decoding, to
- *		report a successfully decoded key press with *scancode*,
- *		*toggle* value in the given *protocol*. The scancode will be
- *		translated to a keycode using the rc keymap, and reported as
- *		an input key down event. After a period a key up event is
- *		generated. This period can be extended by calling either
- *		**bpf_rc_keydown**\ () again with the same values, or calling
- *		**bpf_rc_repeat**\ ().
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
  *
- *		Some protocols include a toggle bit, in case the button	was
- *		released and pressed again between consecutive scancodes.
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
  *
  *		The *ctx* should point to the lirc sample as passed into
  *		the program.
  *
- *		The *protocol* is the decoded protocol number (see
- *		**enum rc_proto** for some predefined values).
- *
  *		This helper is only available is the kernel was compiled with
  *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
  *		"**y**".
  *	Return
  *		0
  *
- * int bpf_rc_repeat(void *ctx)
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
  *	Description
  *		This helper is used in programs implementing IR decoding, to
- *		report a successfully decoded repeat key message. This delays
- *		the generation of a key up event for previously generated
- *		key down event.
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown**\ () again with the same values, or calling
+ *		**bpf_rc_repeat**\ ().
  *
- *		Some IR protocols like NEC have a special IR message for
- *		repeating last button, for when a button is held down.
+ *		Some protocols include a toggle bit, in case the button	was
+ *		released and pressed again between consecutive scancodes.
  *
  *		The *ctx* should point to the lirc sample as passed into
  *		the program.
  *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
  *		This helper is only available is the kernel was compiled with
  *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
  *		"**y**".
  *	Return
  *		0
  *
- * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * u64 bpf_skb_cgroup_id(struct sk_buff *skb)
  * 	Description
  * 		Return the cgroup v2 id of the socket associated with the *skb*.
  * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
@@ -2159,30 +2149,12 @@ union bpf_attr {
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
  *
- * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
- *	Description
- *		Return id of cgroup v2 that is ancestor of cgroup associated
- *		with the *skb* at the *ancestor_level*.  The root cgroup is at
- *		*ancestor_level* zero and each step down the hierarchy
- *		increments the level. If *ancestor_level* == level of cgroup
- *		associated with *skb*, then return value will be same as that
- *		of **bpf_skb_cgroup_id**\ ().
- *
- *		The helper is useful to implement policies based on cgroups
- *		that are upper in hierarchy than immediate cgroup associated
- *		with *skb*.
- *
- *		The format of returned id and helper limitations are same as in
- *		**bpf_skb_cgroup_id**\ ().
- *	Return
- *		The id is returned or 0 in case the id could not be retrieved.
- *
  * u64 bpf_get_current_cgroup_id(void)
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
  * 		on the cgroup within which the current task is running.
  *
- * void* get_local_storage(void *map, u64 flags)
+ * void *bpf_get_local_storage(void *map, u64 flags)
  *	Description
  *		Get the pointer to the local storage area.
  *		The type and the size of the local storage is defined
@@ -2209,6 +2181,24 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *skb* at the *ancestor_level*.  The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *skb*, then return value will be same as that
+ *		of **bpf_skb_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *skb*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_skb_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
  * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
  *	Description
  *		Look for TCP socket matching *tuple*, optionally in a child
@@ -2289,6 +2279,16 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ * 	Description
+ * 		Push an element *value* in *map*. *flags* is one of:
+ *
+ * 		**BPF_EXIST**
+ * 			If the queue/stack is full, the oldest element is
+ * 			removed to make room for this.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_map_pop_elem(struct bpf_map *map, void *value)
  * 	Description
  * 		Pop an element from *map*.
@@ -2346,33 +2346,35 @@ union bpf_attr {
  * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
  *	Description
  *		This helper gets a **struct bpf_sock** pointer such
- *		that all the fields in bpf_sock can be accessed.
+ *		that all the fields in this **bpf_sock** can be accessed.
  *	Return
- *		A **struct bpf_sock** pointer on success, or NULL in
+ *		A **struct bpf_sock** pointer on success, or **NULL** in
  *		case of failure.
  *
  * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
  *	Description
  *		This helper gets a **struct bpf_tcp_sock** pointer from a
  *		**struct bpf_sock** pointer.
- *
  *	Return
- *		A **struct bpf_tcp_sock** pointer on success, or NULL in
+ *		A **struct bpf_tcp_sock** pointer on success, or **NULL** in
  *		case of failure.
  *
  * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
- *     Description
- *             Sets ECN of IP header to ce (congestion encountered) if
- *             current value is ect (ECN capable). Works with IPv6 and IPv4.
- *     Return
- *             1 if set, 0 if not set.
+ *	Description
+ *		Set ECN (Explicit Congestion Notification) field of IP header
+ *		to **CE** (Congestion Encountered) if current value is **ECT**
+ *		(ECN Capable Transport). Otherwise, do nothing. Works with IPv6
+ *		and IPv4.
+ *	Return
+ *		1 if the **CE** flag is set (either by the current helper call
+ *		or because it was already present), 0 if it is not set.
  *
  * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)
  *	Description
- *		Return a **struct bpf_sock** pointer in TCP_LISTEN state.
- *		bpf_sk_release() is unnecessary and not allowed.
+ *		Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
+ *		**bpf_sk_release**\ () is unnecessary and not allowed.
  *	Return
- *		A **struct bpf_sock** pointer on success, or NULL in
+ *		A **struct bpf_sock** pointer on success, or **NULL** in
  *		case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
-- 
cgit v1.2.3


From 0eb0978528d47699edd091dc2c337952ad8da436 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Thu, 14 Mar 2019 12:38:40 +0000
Subject: bpf: add documentation for helpers bpf_spin_lock(), bpf_spin_unlock()

Add documentation for the BPF spinlock-related helpers to the doc in
bpf.h. I added the constraints and restrictions coming with the use of
spinlocks for BPF: not all of it is directly related to the use of the
helper, but I thought it would be nice for users to find them in the man
page.

This list of restrictions is nearly a verbatim copy of the list in
Alexei's commit log for those helpers.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4465d00d3493..929c8e537a14 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2343,6 +2343,61 @@ union bpf_attr {
  *	Return
  *		0
  *
+ * int bpf_spin_lock(struct bpf_spin_lock *lock)
+ *	Description
+ *		Acquire a spinlock represented by the pointer *lock*, which is
+ *		stored as part of a value of a map. Taking the lock allows to
+ *		safely update the rest of the fields in that value. The
+ *		spinlock can (and must) later be released with a call to
+ *		**bpf_spin_unlock**\ (\ *lock*\ ).
+ *
+ *		Spinlocks in BPF programs come with a number of restrictions
+ *		and constraints:
+ *
+ *		* **bpf_spin_lock** objects are only allowed inside maps of
+ *		  types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this
+ *		  list could be extended in the future).
+ *		* BTF description of the map is mandatory.
+ *		* The BPF program can take ONE lock at a time, since taking two
+ *		  or more could cause dead locks.
+ *		* Only one **struct bpf_spin_lock** is allowed per map element.
+ *		* When the lock is taken, calls (either BPF to BPF or helpers)
+ *		  are not allowed.
+ *		* The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not
+ *		  allowed inside a spinlock-ed region.
+ *		* The BPF program MUST call **bpf_spin_unlock**\ () to release
+ *		  the lock, on all execution paths, before it returns.
+ *		* The BPF program can access **struct bpf_spin_lock** only via
+ *		  the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ ()
+ *		  helpers. Loading or storing data into the **struct
+ *		  bpf_spin_lock** *lock*\ **;** field of a map is not allowed.
+ *		* To use the **bpf_spin_lock**\ () helper, the BTF description
+ *		  of the map value must be a struct and have **struct
+ *		  bpf_spin_lock** *anyname*\ **;** field at the top level.
+ *		  Nested lock inside another struct is not allowed.
+ *		* The **struct bpf_spin_lock** *lock* field in a map value must
+ *		  be aligned on a multiple of 4 bytes in that value.
+ *		* Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy
+ *		  the **bpf_spin_lock** field to user space.
+ *		* Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from
+ *		  a BPF program, do not update the **bpf_spin_lock** field.
+ *		* **bpf_spin_lock** cannot be on the stack or inside a
+ *		  networking packet (it can only be inside of a map values).
+ *		* **bpf_spin_lock** is available to root only.
+ *		* Tracing programs and socket filter programs cannot use
+ *		  **bpf_spin_lock**\ () due to insufficient preemption checks
+ *		  (but this may change in the future).
+ *		* **bpf_spin_lock** is not allowed in inner maps of map-in-map.
+ *	Return
+ *		0
+ *
+ * int bpf_spin_unlock(struct bpf_spin_lock *lock)
+ *	Description
+ *		Release the *lock* previously locked by a call to
+ *		**bpf_spin_lock**\ (\ *lock*\ ).
+ *	Return
+ *		0
+ *
  * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
  *	Description
  *		This helper gets a **struct bpf_sock** pointer such
-- 
cgit v1.2.3


From a4046c06be50a4f01d435aa7fe57514818e6cc82 Mon Sep 17 00:00:00 2001
From: Pi-Hsun Shih <pihsun@chromium.org>
Date: Wed, 13 Mar 2019 11:44:33 -0700
Subject: include/linux/swap.h: use offsetof() instead of custom __swapoffset
 macro

Use offsetof() to calculate offset of a field to take advantage of
compiler built-in version when possible, and avoid UBSAN warning when
compiling with Clang:

  UBSAN: Undefined behaviour in mm/swapfile.c:3010:38
  member access within null pointer of type 'union swap_header'
  CPU: 6 PID: 1833 Comm: swapon Tainted: G S                4.19.23 #43
  Call trace:
   dump_backtrace+0x0/0x194
   show_stack+0x20/0x2c
   __dump_stack+0x20/0x28
   dump_stack+0x70/0x94
   ubsan_epilogue+0x14/0x44
   ubsan_type_mismatch_common+0xf4/0xfc
   __ubsan_handle_type_mismatch_v1+0x34/0x54
   __se_sys_swapon+0x654/0x1084
   __arm64_sys_swapon+0x1c/0x24
   el0_svc_common+0xa8/0x150
   el0_svc_compat_handler+0x2c/0x38
   el0_svc_compat+0x8/0x18

Link: http://lkml.kernel.org/r/20190312081902.223764-1-pihsun@chromium.org
Signed-off-by: Pi-Hsun Shih <pihsun@chromium.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index fc50e21b3b88..4bfb5c4ac108 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -157,9 +157,9 @@ struct swap_extent {
 /*
  * Max bad pages in the new format..
  */
-#define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x)
 #define MAX_SWAP_BADPAGES \
-	((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int))
+	((offsetof(union swap_header, magic.magic) - \
+	  offsetof(union swap_header, info.badpages)) / sizeof(int))
 
 enum {
 	SWP_USED	= (1 << 0),	/* is slot in swap_info[] used? */
-- 
cgit v1.2.3


From a75d4c33377277b6034dd1e2663bce444f952c14 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 13 Mar 2019 11:44:14 -0700
Subject: filemap: kill page_cache_read usage in filemap_fault

Patch series "drop the mmap_sem when doing IO in the fault path", v6.

Now that we have proper isolation in place with cgroups2 we have started
going through and fixing the various priority inversions.  Most are all
gone now, but this one is sort of weird since it's not necessarily a
priority inversion that happens within the kernel, but rather because of
something userspace does.

We have giant applications that we want to protect, and parts of these
giant applications do things like watch the system state to determine how
healthy the box is for load balancing and such.  This involves running
'ps' or other such utilities.  These utilities will often walk
/proc/<pid>/whatever, and these files can sometimes need to
down_read(&task->mmap_sem).  Not usually a big deal, but we noticed when
we are stress testing that sometimes our protected application has latency
spikes trying to get the mmap_sem for tasks that are in lower priority
cgroups.

This is because any down_write() on a semaphore essentially turns it into
a mutex, so even if we currently have it held for reading, any new readers
will not be allowed on to keep from starving the writer.  This is fine,
except a lower priority task could be stuck doing IO because it has been
throttled to the point that its IO is taking much longer than normal.  But
because a higher priority group depends on this completing it is now stuck
behind lower priority work.

In order to avoid this particular priority inversion we want to use the
existing retry mechanism to stop from holding the mmap_sem at all if we
are going to do IO.  This already exists in the read case sort of, but
needed to be extended for more than just grabbing the page lock.  With
io.latency we throttle at submit_bio() time, so the readahead stuff can
block and even page_cache_read can block, so all these paths need to have
the mmap_sem dropped.

The other big thing is ->page_mkwrite.  btrfs is particularly shitty here
because we have to reserve space for the dirty page, which can be a very
expensive operation.  We use the same retry method as the read path, and
simply cache the page and verify the page is still setup properly the next
pass through ->page_mkwrite().

I've tested these patches with xfstests and there are no regressions.

This patch (of 3):

If we do not have a page at filemap_fault time we'll do this weird forced
page_cache_read thing to populate the page, and then drop it again and
loop around and find it.  This makes for 2 ways we can read a page in
filemap_fault, and it's not really needed.  Instead add a FGP_FOR_MMAP
flag so that pagecache_get_page() will return a unlocked page that's in
pagecache.  Then use the normal page locking and readpage logic already in
filemap_fault.  This simplifies the no page in page cache case
significantly.

[akpm@linux-foundation.org: fix comment text]
[josef@toxicpanda.com: don't unlock null page in FGP_FOR_MMAP case]
  Link: http://lkml.kernel.org/r/20190312201742.22935-1-josef@toxicpanda.com
Link: http://lkml.kernel.org/r/20181211173801.29535-2-josef@toxicpanda.com
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  1 +
 mm/filemap.c            | 75 ++++++++++---------------------------------------
 2 files changed, 16 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b477a70cc2e4..bcf909d0de5f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -239,6 +239,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_WRITE		0x00000008
 #define FGP_NOFS		0x00000010
 #define FGP_NOWAIT		0x00000020
+#define FGP_FOR_MMAP		0x00000040
 
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 		int fgp_flags, gfp_t cache_gfp_mask);
diff --git a/mm/filemap.c b/mm/filemap.c
index ec6566ffbd90..64d014f940e9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1587,6 +1587,9 @@ EXPORT_SYMBOL(find_lock_entry);
  *   @gfp_mask and added to the page cache and the VM's LRU
  *   list. The page is returned locked and with an increased
  *   refcount.
+ * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
+ *   its own locking dance if the page is already in cache, or unlock the page
+ *   before returning if we had to add the page to pagecache.
  *
  * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
  * if the GFP flags specified for FGP_CREAT are atomic.
@@ -1641,7 +1644,7 @@ no_page:
 		if (!page)
 			return NULL;
 
-		if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
 
 		/* Init accessed so avoid atomic mark_page_accessed later */
@@ -1655,6 +1658,13 @@ no_page:
 			if (err == -EEXIST)
 				goto repeat;
 		}
+
+		/*
+		 * add_to_page_cache_lru locks the page, and for mmap we expect
+		 * an unlocked page.
+		 */
+		if (page && (fgp_flags & FGP_FOR_MMAP))
+			unlock_page(page);
 	}
 
 	return page;
@@ -2379,41 +2389,6 @@ out:
 EXPORT_SYMBOL(generic_file_read_iter);
 
 #ifdef CONFIG_MMU
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file:	file to read
- * @offset:	page index
- * @gfp_mask:	memory allocation flags
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct page *page;
-	int ret;
-
-	do {
-		page = __page_cache_alloc(gfp_mask);
-		if (!page)
-			return -ENOMEM;
-
-		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
-		if (ret == 0)
-			ret = mapping->a_ops->readpage(file, page);
-		else if (ret == -EEXIST)
-			ret = 0; /* losing race to add is OK */
-
-		put_page(page);
-
-	} while (ret == AOP_TRUNCATED_PAGE);
-
-	return ret;
-}
-
 #define MMAP_LOTSAMISS  (100)
 
 /*
@@ -2539,9 +2514,11 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
 retry_find:
-		page = find_get_page(mapping, offset);
+		page = pagecache_get_page(mapping, offset,
+					  FGP_CREAT|FGP_FOR_MMAP,
+					  vmf->gfp_mask);
 		if (!page)
-			goto no_cached_page;
+			return vmf_error(-ENOMEM);
 	}
 
 	if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
@@ -2578,28 +2555,6 @@ retry_find:
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
 
-no_cached_page:
-	/*
-	 * We're only likely to ever get here if MADV_RANDOM is in
-	 * effect.
-	 */
-	error = page_cache_read(file, offset, vmf->gfp_mask);
-
-	/*
-	 * The page we want has now been added to the page cache.
-	 * In the unlikely event that someone removed it in the
-	 * meantime, we'll just come back here and read it again.
-	 */
-	if (error >= 0)
-		goto retry_find;
-
-	/*
-	 * An error return from page_cache_read can result if the
-	 * system is low on memory, or a problem occurs while trying
-	 * to schedule I/O.
-	 */
-	return vmf_error(error);
-
 page_not_uptodate:
 	/*
 	 * Umm, take care of errors if the page isn't up-to-date.
-- 
cgit v1.2.3


From 9804501fa1228048857910a6bf23e085aade37cc Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 14 Mar 2019 13:47:59 +0800
Subject: appletalk: Fix potential NULL pointer dereference in
 unregister_snap_client

register_snap_client may return NULL, all the callers
check it, but only print a warning. This will result in
NULL pointer dereference in unregister_snap_client and other
places.

It has always been used like this since v2.6

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/atalk.h |  2 +-
 net/appletalk/aarp.c  | 15 ++++++++++++---
 net/appletalk/ddp.c   | 20 ++++++++++++--------
 3 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/atalk.h b/include/linux/atalk.h
index d5cfc0b15b76..f6034ba774be 100644
--- a/include/linux/atalk.h
+++ b/include/linux/atalk.h
@@ -108,7 +108,7 @@ static __inline__ struct elapaarp *aarp_hdr(struct sk_buff *skb)
 #define AARP_RESOLVE_TIME	(10 * HZ)
 
 extern struct datalink_proto *ddp_dl, *aarp_dl;
-extern void aarp_proto_init(void);
+extern int aarp_proto_init(void);
 
 /* Inter module exports */
 
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 49a16cee2aae..420a98bf79b5 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -879,15 +879,24 @@ static struct notifier_block aarp_notifier = {
 
 static unsigned char aarp_snap_id[] = { 0x00, 0x00, 0x00, 0x80, 0xF3 };
 
-void __init aarp_proto_init(void)
+int __init aarp_proto_init(void)
 {
+	int rc;
+
 	aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv);
-	if (!aarp_dl)
+	if (!aarp_dl) {
 		printk(KERN_CRIT "Unable to register AARP with SNAP.\n");
+		return -ENOMEM;
+	}
 	timer_setup(&aarp_timer, aarp_expire_timeout, 0);
 	aarp_timer.expires  = jiffies + sysctl_aarp_expiry_time;
 	add_timer(&aarp_timer);
-	register_netdevice_notifier(&aarp_notifier);
+	rc = register_netdevice_notifier(&aarp_notifier);
+	if (rc) {
+		del_timer_sync(&aarp_timer);
+		unregister_snap_client(aarp_dl);
+	}
+	return rc;
 }
 
 /* Remove the AARP entries associated with a device. */
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 795fbc6c06aa..709d2542f729 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1904,9 +1904,6 @@ static unsigned char ddp_snap_id[] = { 0x08, 0x00, 0x07, 0x80, 0x9B };
 EXPORT_SYMBOL(atrtr_get_dev);
 EXPORT_SYMBOL(atalk_find_dev_addr);
 
-static const char atalk_err_snap[] __initconst =
-	KERN_CRIT "Unable to register DDP with SNAP.\n";
-
 /* Called by proto.c on kernel start up */
 static int __init atalk_init(void)
 {
@@ -1921,17 +1918,22 @@ static int __init atalk_init(void)
 		goto out_proto;
 
 	ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv);
-	if (!ddp_dl)
-		printk(atalk_err_snap);
+	if (!ddp_dl) {
+		pr_crit("Unable to register DDP with SNAP.\n");
+		goto out_sock;
+	}
 
 	dev_add_pack(&ltalk_packet_type);
 	dev_add_pack(&ppptalk_packet_type);
 
 	rc = register_netdevice_notifier(&ddp_notifier);
 	if (rc)
-		goto out_sock;
+		goto out_snap;
+
+	rc = aarp_proto_init();
+	if (rc)
+		goto out_dev;
 
-	aarp_proto_init();
 	rc = atalk_proc_init();
 	if (rc)
 		goto out_aarp;
@@ -1945,11 +1947,13 @@ out_proc:
 	atalk_proc_exit();
 out_aarp:
 	aarp_cleanup_module();
+out_dev:
 	unregister_netdevice_notifier(&ddp_notifier);
-out_sock:
+out_snap:
 	dev_remove_pack(&ppptalk_packet_type);
 	dev_remove_pack(&ltalk_packet_type);
 	unregister_snap_client(ddp_dl);
+out_sock:
 	sock_unregister(PF_APPLETALK);
 out_proto:
 	proto_unregister(&ddp_proto);
-- 
cgit v1.2.3


From 8a3c245c031944f2176118270e7bc5d4fd4a1075 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@gmail.com>
Date: Thu, 14 Mar 2019 10:45:23 -0300
Subject: net: add documentation to socket.c

Adds missing sphinx documentation to the
socket.c's functions. Also fixes some whitespaces.

I also changed the style of older documentation as an
effort to have an uniform documentation style.

Signed-off-by: Pedro Tammela <pctammela@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h    |   6 ++
 include/linux/socket.h |  12 +--
 net/socket.c           | 277 +++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 271 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 651fca72286c..c606c72311d0 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -83,6 +83,12 @@ enum sock_type {
 
 #endif /* ARCH_HAS_SOCKET_TYPES */
 
+/**
+ * enum sock_shutdown_cmd - Shutdown types
+ * @SHUT_RD: shutdown receptions
+ * @SHUT_WR: shutdown transmissions
+ * @SHUT_RDWR: shutdown receptions/transmissions
+ */
 enum sock_shutdown_cmd {
 	SHUT_RD,
 	SHUT_WR,
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 6016daeecee4..b57cd8bf96e2 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -26,7 +26,7 @@ typedef __kernel_sa_family_t	sa_family_t;
 /*
  *	1003.1g requires sa_family_t and that sa_data is char.
  */
- 
+
 struct sockaddr {
 	sa_family_t	sa_family;	/* address family, AF_xxx	*/
 	char		sa_data[14];	/* 14 bytes of protocol address	*/
@@ -44,7 +44,7 @@ struct linger {
  *	system, not 4.3. Thus msg_accrights(len) are now missing. They
  *	belong in an obscure libc emulation or the bin.
  */
- 
+
 struct msghdr {
 	void		*msg_name;	/* ptr to socket address structure */
 	int		msg_namelen;	/* size of socket address structure */
@@ -54,7 +54,7 @@ struct msghdr {
 	unsigned int	msg_flags;	/* flags on received message */
 	struct kiocb	*msg_iocb;	/* ptr to iocb for async requests */
 };
- 
+
 struct user_msghdr {
 	void		__user *msg_name;	/* ptr to socket address structure */
 	int		msg_namelen;		/* size of socket address structure */
@@ -122,7 +122,7 @@ struct cmsghdr {
  *	inside range, given by msg->msg_controllen before using
  *	ancillary object DATA.				--ANK (980731)
  */
- 
+
 static inline struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size,
 					       struct cmsghdr *__cmsg)
 {
@@ -264,10 +264,10 @@ struct ucred {
 /* Maximum queue length specifiable by listen.  */
 #define SOMAXCONN	128
 
-/* Flags we can use with send/ and recv. 
+/* Flags we can use with send/ and recv.
    Added those for 1003.1g not all are supported yet
  */
- 
+
 #define MSG_OOB		1
 #define MSG_PEEK	2
 #define MSG_DONTROUTE	4
diff --git a/net/socket.c b/net/socket.c
index 3c176a12fe48..8255f5bda0aa 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -384,6 +384,18 @@ static struct file_system_type sock_fs_type = {
  *	but we take care of internal coherence yet.
  */
 
+/**
+ *	sock_alloc_file - Bind a &socket to a &file
+ *	@sock: socket
+ *	@flags: file status flags
+ *	@dname: protocol name
+ *
+ *	Returns the &file bound with @sock, implicitly storing it
+ *	in sock->file. If dname is %NULL, sets to "".
+ *	On failure the return is a ERR pointer (see linux/err.h).
+ *	This function uses GFP_KERNEL internally.
+ */
+
 struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
 {
 	struct file *file;
@@ -424,6 +436,14 @@ static int sock_map_fd(struct socket *sock, int flags)
 	return PTR_ERR(newfile);
 }
 
+/**
+ *	sock_from_file - Return the &socket bounded to @file.
+ *	@file: file
+ *	@err: pointer to an error code return
+ *
+ *	On failure returns %NULL and assigns -ENOTSOCK to @err.
+ */
+
 struct socket *sock_from_file(struct file *file, int *err)
 {
 	if (file->f_op == &socket_file_ops)
@@ -532,11 +552,11 @@ static const struct inode_operations sockfs_inode_ops = {
 };
 
 /**
- *	sock_alloc	-	allocate a socket
+ *	sock_alloc - allocate a socket
  *
  *	Allocate a new inode and socket object. The two are bound together
  *	and initialised. The socket is then returned. If we are out of inodes
- *	NULL is returned.
+ *	NULL is returned. This functions uses GFP_KERNEL internally.
  */
 
 struct socket *sock_alloc(void)
@@ -561,7 +581,7 @@ struct socket *sock_alloc(void)
 EXPORT_SYMBOL(sock_alloc);
 
 /**
- *	sock_release	-	close a socket
+ *	sock_release - close a socket
  *	@sock: socket to close
  *
  *	The socket is released from the protocol stack if it has a release
@@ -617,6 +637,15 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
 }
 EXPORT_SYMBOL(__sock_tx_timestamp);
 
+/**
+ *	sock_sendmsg - send a message through @sock
+ *	@sock: socket
+ *	@msg: message to send
+ *
+ *	Sends @msg through @sock, passing through LSM.
+ *	Returns the number of bytes sent, or an error code.
+ */
+
 static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
 {
 	int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg));
@@ -633,6 +662,18 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg)
 }
 EXPORT_SYMBOL(sock_sendmsg);
 
+/**
+ *	kernel_sendmsg - send a message through @sock (kernel-space)
+ *	@sock: socket
+ *	@msg: message header
+ *	@vec: kernel vec
+ *	@num: vec array length
+ *	@size: total message data size
+ *
+ *	Builds the message data with @vec and sends it through @sock.
+ *	Returns the number of bytes sent, or an error code.
+ */
+
 int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 		   struct kvec *vec, size_t num, size_t size)
 {
@@ -641,6 +682,19 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 }
 EXPORT_SYMBOL(kernel_sendmsg);
 
+/**
+ *	kernel_sendmsg_locked - send a message through @sock (kernel-space)
+ *	@sk: sock
+ *	@msg: message header
+ *	@vec: output s/g array
+ *	@num: output s/g array length
+ *	@size: total message data size
+ *
+ *	Builds the message data with @vec and sends it through @sock.
+ *	Returns the number of bytes sent, or an error code.
+ *	Caller must hold @sk.
+ */
+
 int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
 			  struct kvec *vec, size_t num, size_t size)
 {
@@ -811,6 +865,16 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
 
+/**
+ *	sock_recvmsg - receive a message from @sock
+ *	@sock: socket
+ *	@msg: message to receive
+ *	@flags: message flags
+ *
+ *	Receives @msg from @sock, passing through LSM. Returns the total number
+ *	of bytes received, or an error.
+ */
+
 static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 				     int flags)
 {
@@ -826,20 +890,21 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
 EXPORT_SYMBOL(sock_recvmsg);
 
 /**
- * kernel_recvmsg - Receive a message from a socket (kernel space)
- * @sock:       The socket to receive the message from
- * @msg:        Received message
- * @vec:        Input s/g array for message data
- * @num:        Size of input s/g array
- * @size:       Number of bytes to read
- * @flags:      Message flags (MSG_DONTWAIT, etc...)
+ *	kernel_recvmsg - Receive a message from a socket (kernel space)
+ *	@sock: The socket to receive the message from
+ *	@msg: Received message
+ *	@vec: Input s/g array for message data
+ *	@num: Size of input s/g array
+ *	@size: Number of bytes to read
+ *	@flags: Message flags (MSG_DONTWAIT, etc...)
  *
- * On return the msg structure contains the scatter/gather array passed in the
- * vec argument. The array is modified so that it consists of the unfilled
- * portion of the original array.
+ *	On return the msg structure contains the scatter/gather array passed in the
+ *	vec argument. The array is modified so that it consists of the unfilled
+ *	portion of the original array.
  *
- * The returned value is the total number of bytes received, or an error.
+ *	The returned value is the total number of bytes received, or an error.
  */
+
 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 		   struct kvec *vec, size_t num, size_t size, int flags)
 {
@@ -1005,6 +1070,13 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
  *	what to do with it - that's up to the protocol still.
  */
 
+/**
+ *	get_net_ns - increment the refcount of the network namespace
+ *	@ns: common namespace (net)
+ *
+ *	Returns the net's common namespace.
+ */
+
 struct ns_common *get_net_ns(struct ns_common *ns)
 {
 	return &get_net(container_of(ns, struct net, ns))->ns;
@@ -1099,6 +1171,19 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return err;
 }
 
+/**
+ *	sock_create_lite - creates a socket
+ *	@family: protocol family (AF_INET, ...)
+ *	@type: communication type (SOCK_STREAM, ...)
+ *	@protocol: protocol (0, ...)
+ *	@res: new socket
+ *
+ *	Creates a new socket and assigns it to @res, passing through LSM.
+ *	The new socket initialization is not complete, see kernel_accept().
+ *	Returns 0 or an error. On failure @res is set to %NULL.
+ *	This function internally uses GFP_KERNEL.
+ */
+
 int sock_create_lite(int family, int type, int protocol, struct socket **res)
 {
 	int err;
@@ -1224,6 +1309,21 @@ call_kill:
 }
 EXPORT_SYMBOL(sock_wake_async);
 
+/**
+ *	__sock_create - creates a socket
+ *	@net: net namespace
+ *	@family: protocol family (AF_INET, ...)
+ *	@type: communication type (SOCK_STREAM, ...)
+ *	@protocol: protocol (0, ...)
+ *	@res: new socket
+ *	@kern: boolean for kernel space sockets
+ *
+ *	Creates a new socket and assigns it to @res, passing through LSM.
+ *	Returns 0 or an error. On failure @res is set to %NULL. @kern must
+ *	be set to true if the socket resides in kernel space.
+ *	This function internally uses GFP_KERNEL.
+ */
+
 int __sock_create(struct net *net, int family, int type, int protocol,
 			 struct socket **res, int kern)
 {
@@ -1333,12 +1433,35 @@ out_release:
 }
 EXPORT_SYMBOL(__sock_create);
 
+/**
+ *	sock_create - creates a socket
+ *	@family: protocol family (AF_INET, ...)
+ *	@type: communication type (SOCK_STREAM, ...)
+ *	@protocol: protocol (0, ...)
+ *	@res: new socket
+ *
+ *	A wrapper around __sock_create().
+ *	Returns 0 or an error. This function internally uses GFP_KERNEL.
+ */
+
 int sock_create(int family, int type, int protocol, struct socket **res)
 {
 	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
 }
 EXPORT_SYMBOL(sock_create);
 
+/**
+ *	sock_create_kern - creates a socket (kernel space)
+ *	@net: net namespace
+ *	@family: protocol family (AF_INET, ...)
+ *	@type: communication type (SOCK_STREAM, ...)
+ *	@protocol: protocol (0, ...)
+ *	@res: new socket
+ *
+ *	A wrapper around __sock_create().
+ *	Returns 0 or an error. This function internally uses GFP_KERNEL.
+ */
+
 int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)
 {
 	return __sock_create(net, family, type, protocol, res, 1);
@@ -3322,18 +3445,46 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd,
 }
 #endif
 
+/**
+ *	kernel_bind - bind an address to a socket (kernel space)
+ *	@sock: socket
+ *	@addr: address
+ *	@addrlen: length of address
+ *
+ *	Returns 0 or an error.
+ */
+
 int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
 {
 	return sock->ops->bind(sock, addr, addrlen);
 }
 EXPORT_SYMBOL(kernel_bind);
 
+/**
+ *	kernel_listen - move socket to listening state (kernel space)
+ *	@sock: socket
+ *	@backlog: pending connections queue size
+ *
+ *	Returns 0 or an error.
+ */
+
 int kernel_listen(struct socket *sock, int backlog)
 {
 	return sock->ops->listen(sock, backlog);
 }
 EXPORT_SYMBOL(kernel_listen);
 
+/**
+ *	kernel_accept - accept a connection (kernel space)
+ *	@sock: listening socket
+ *	@newsock: new connected socket
+ *	@flags: flags
+ *
+ *	@flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0.
+ *	If it fails, @newsock is guaranteed to be %NULL.
+ *	Returns 0 or an error.
+ */
+
 int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
 {
 	struct sock *sk = sock->sk;
@@ -3359,6 +3510,19 @@ done:
 }
 EXPORT_SYMBOL(kernel_accept);
 
+/**
+ *	kernel_connect - connect a socket (kernel space)
+ *	@sock: socket
+ *	@addr: address
+ *	@addrlen: address length
+ *	@flags: flags (O_NONBLOCK, ...)
+ *
+ *	For datagram sockets, @addr is the addres to which datagrams are sent
+ *	by default, and the only address from which datagrams are received.
+ *	For stream sockets, attempts to connect to @addr.
+ *	Returns 0 or an error code.
+ */
+
 int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
 		   int flags)
 {
@@ -3366,18 +3530,48 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
 }
 EXPORT_SYMBOL(kernel_connect);
 
+/**
+ *	kernel_getsockname - get the address which the socket is bound (kernel space)
+ *	@sock: socket
+ *	@addr: address holder
+ *
+ * 	Fills the @addr pointer with the address which the socket is bound.
+ *	Returns 0 or an error code.
+ */
+
 int kernel_getsockname(struct socket *sock, struct sockaddr *addr)
 {
 	return sock->ops->getname(sock, addr, 0);
 }
 EXPORT_SYMBOL(kernel_getsockname);
 
+/**
+ *	kernel_peername - get the address which the socket is connected (kernel space)
+ *	@sock: socket
+ *	@addr: address holder
+ *
+ * 	Fills the @addr pointer with the address which the socket is connected.
+ *	Returns 0 or an error code.
+ */
+
 int kernel_getpeername(struct socket *sock, struct sockaddr *addr)
 {
 	return sock->ops->getname(sock, addr, 1);
 }
 EXPORT_SYMBOL(kernel_getpeername);
 
+/**
+ *	kernel_getsockopt - get a socket option (kernel space)
+ *	@sock: socket
+ *	@level: API level (SOL_SOCKET, ...)
+ *	@optname: option tag
+ *	@optval: option value
+ *	@optlen: option length
+ *
+ *	Assigns the option length to @optlen.
+ *	Returns 0 or an error.
+ */
+
 int kernel_getsockopt(struct socket *sock, int level, int optname,
 			char *optval, int *optlen)
 {
@@ -3400,6 +3594,17 @@ int kernel_getsockopt(struct socket *sock, int level, int optname,
 }
 EXPORT_SYMBOL(kernel_getsockopt);
 
+/**
+ *	kernel_setsockopt - set a socket option (kernel space)
+ *	@sock: socket
+ *	@level: API level (SOL_SOCKET, ...)
+ *	@optname: option tag
+ *	@optval: option value
+ *	@optlen: option length
+ *
+ *	Returns 0 or an error.
+ */
+
 int kernel_setsockopt(struct socket *sock, int level, int optname,
 			char *optval, unsigned int optlen)
 {
@@ -3420,6 +3625,17 @@ int kernel_setsockopt(struct socket *sock, int level, int optname,
 }
 EXPORT_SYMBOL(kernel_setsockopt);
 
+/**
+ *	kernel_sendpage - send a &page through a socket (kernel space)
+ *	@sock: socket
+ *	@page: page
+ *	@offset: page offset
+ *	@size: total size in bytes
+ *	@flags: flags (MSG_DONTWAIT, ...)
+ *
+ *	Returns the total amount sent in bytes or an error.
+ */
+
 int kernel_sendpage(struct socket *sock, struct page *page, int offset,
 		    size_t size, int flags)
 {
@@ -3430,6 +3646,18 @@ int kernel_sendpage(struct socket *sock, struct page *page, int offset,
 }
 EXPORT_SYMBOL(kernel_sendpage);
 
+/**
+ *	kernel_sendpage_locked - send a &page through the locked sock (kernel space)
+ *	@sk: sock
+ *	@page: page
+ *	@offset: page offset
+ *	@size: total size in bytes
+ *	@flags: flags (MSG_DONTWAIT, ...)
+ *
+ *	Returns the total amount sent in bytes or an error.
+ *	Caller must hold @sk.
+ */
+
 int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
 			   size_t size, int flags)
 {
@@ -3443,17 +3671,30 @@ int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
 }
 EXPORT_SYMBOL(kernel_sendpage_locked);
 
+/**
+ *	kernel_shutdown - shut down part of a full-duplex connection (kernel space)
+ *	@sock: socket
+ *	@how: connection part
+ *
+ *	Returns 0 or an error.
+ */
+
 int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
 {
 	return sock->ops->shutdown(sock, how);
 }
 EXPORT_SYMBOL(kernel_sock_shutdown);
 
-/* This routine returns the IP overhead imposed by a socket i.e.
- * the length of the underlying IP header, depending on whether
- * this is an IPv4 or IPv6 socket and the length from IP options turned
- * on at the socket. Assumes that the caller has a lock on the socket.
+/**
+ *	kernel_sock_ip_overhead - returns the IP overhead imposed by a socket
+ *	@sk: socket
+ *
+ *	This routine returns the IP overhead imposed by a socket i.e.
+ *	the length of the underlying IP header, depending on whether
+ *	this is an IPv4 or IPv6 socket and the length from IP options turned
+ *	on at the socket. Assumes that the caller has a lock on the socket.
  */
+
 u32 kernel_sock_ip_overhead(struct sock *sk)
 {
 	struct inet_sock *inet;
-- 
cgit v1.2.3


From 044175a06706d516aa42874bb44dbbfc3c4d20eb Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Wed, 13 Mar 2019 15:15:49 +0100
Subject: xsk: fix umem memory leak on cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the umem is cleaned up, the task that created it might already be
gone. If the task was gone, the xdp_umem_release function did not free
the pages member of struct xdp_umem.

It turned out that the task lookup was not needed at all; The code was
a left-over when we moved from task accounting to user accounting [1].

This patch fixes the memory leak by removing the task lookup logic
completely.

[1] https://lore.kernel.org/netdev/20180131135356.19134-3-bjorn.topel@gmail.com/

Link: https://lore.kernel.org/netdev/c1cb2ca8-6a14-3980-8672-f3de0bb38dfd@suse.cz/
Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
Reported-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  1 -
 net/xdp/xdp_umem.c     | 19 +------------------
 2 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 61cf7dbb6782..d074b6d60f8a 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -36,7 +36,6 @@ struct xdp_umem {
 	u32 headroom;
 	u32 chunk_size_nohr;
 	struct user_struct *user;
-	struct pid *pid;
 	unsigned long address;
 	refcount_t users;
 	struct work_struct work;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 77520eacee8f..989e52386c35 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -193,9 +193,6 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
 
 static void xdp_umem_release(struct xdp_umem *umem)
 {
-	struct task_struct *task;
-	struct mm_struct *mm;
-
 	xdp_umem_clear_dev(umem);
 
 	ida_simple_remove(&umem_ida, umem->id);
@@ -214,21 +211,10 @@ static void xdp_umem_release(struct xdp_umem *umem)
 
 	xdp_umem_unpin_pages(umem);
 
-	task = get_pid_task(umem->pid, PIDTYPE_PID);
-	put_pid(umem->pid);
-	if (!task)
-		goto out;
-	mm = get_task_mm(task);
-	put_task_struct(task);
-	if (!mm)
-		goto out;
-
-	mmput(mm);
 	kfree(umem->pages);
 	umem->pages = NULL;
 
 	xdp_umem_unaccount_pages(umem);
-out:
 	kfree(umem);
 }
 
@@ -357,7 +343,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if (size_chk < 0)
 		return -EINVAL;
 
-	umem->pid = get_task_pid(current, PIDTYPE_PID);
 	umem->address = (unsigned long)addr;
 	umem->chunk_mask = ~((u64)chunk_size - 1);
 	umem->size = size;
@@ -373,7 +358,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 
 	err = xdp_umem_account_pages(umem);
 	if (err)
-		goto out;
+		return err;
 
 	err = xdp_umem_pin_pages(umem);
 	if (err)
@@ -392,8 +377,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 
 out_account:
 	xdp_umem_unaccount_pages(umem);
-out:
-	put_pid(umem->pid);
 	return err;
 }
 
-- 
cgit v1.2.3


From 037fc3368be46dc1a2a90f6e50c8cbce49d75fd6 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Sun, 17 Mar 2019 11:01:09 +0900
Subject: kbuild: force all architectures except um to include mandatory-y

Currently, every arch/*/include/uapi/asm/Kbuild explicitly includes
the common Kbuild.asm file. Factor out the duplicated include directives
to scripts/Makefile.asm-generic so that no architecture would opt out
of the mandatory-y mechanism.

um is not forced to include mandatory-y since it is a very exceptional
case which does not support UAPI.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 Documentation/kbuild/makefiles.txt      |  2 +-
 Makefile                                |  6 ++++--
 arch/alpha/include/uapi/asm/Kbuild      |  2 --
 arch/arc/include/uapi/asm/Kbuild        |  2 --
 arch/arm/include/uapi/asm/Kbuild        |  1 -
 arch/arm64/include/uapi/asm/Kbuild      |  1 -
 arch/c6x/include/uapi/asm/Kbuild        |  2 --
 arch/csky/include/uapi/asm/Kbuild       |  2 --
 arch/h8300/include/uapi/asm/Kbuild      |  2 --
 arch/hexagon/include/uapi/asm/Kbuild    |  2 --
 arch/ia64/include/uapi/asm/Kbuild       |  2 --
 arch/m68k/include/uapi/asm/Kbuild       |  2 --
 arch/microblaze/include/uapi/asm/Kbuild |  2 --
 arch/mips/include/uapi/asm/Kbuild       |  2 --
 arch/nds32/include/uapi/asm/Kbuild      |  2 --
 arch/nios2/include/uapi/asm/Kbuild      |  2 --
 arch/openrisc/include/uapi/asm/Kbuild   |  2 --
 arch/parisc/include/uapi/asm/Kbuild     |  2 --
 arch/powerpc/include/uapi/asm/Kbuild    |  2 --
 arch/riscv/include/uapi/asm/Kbuild      |  1 -
 arch/s390/include/uapi/asm/Kbuild       |  1 -
 arch/sh/include/uapi/asm/Kbuild         |  1 -
 arch/sparc/include/uapi/asm/Kbuild      |  2 --
 arch/unicore32/include/uapi/asm/Kbuild  |  2 --
 arch/x86/include/uapi/asm/Kbuild        |  2 --
 arch/xtensa/include/uapi/asm/Kbuild     |  2 --
 include/asm-generic/Kbuild              |  5 +++++
 include/uapi/asm-generic/Kbuild         | 36 +++++++++++++++++++++++++++++++++
 include/uapi/asm-generic/Kbuild.asm     | 34 -------------------------------
 scripts/Makefile.asm-generic            |  5 +++++
 30 files changed, 51 insertions(+), 80 deletions(-)
 create mode 100644 include/asm-generic/Kbuild
 create mode 100644 include/uapi/asm-generic/Kbuild
 delete mode 100644 include/uapi/asm-generic/Kbuild.asm

(limited to 'include')

diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index f124be6e4c3a..03c065855eaf 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -1274,7 +1274,7 @@ See subsequent chapter for the syntax of the Kbuild file.
 
 --- 7.4 mandatory-y
 
-	mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild.asm
+	mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild
 	to define the minimum set of ASM headers that all architectures must have.
 
 	This works like optional generic-y. If a mandatory header is missing
diff --git a/Makefile b/Makefile
index 08a01add09a6..4f45ae628f54 100644
--- a/Makefile
+++ b/Makefile
@@ -1098,9 +1098,11 @@ asm-generic := -f $(srctree)/scripts/Makefile.asm-generic obj
 
 PHONY += asm-generic uapi-asm-generic
 asm-generic: uapi-asm-generic
-	$(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/asm
+	$(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/asm \
+	generic=include/asm-generic
 uapi-asm-generic:
-	$(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/uapi/asm
+	$(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/uapi/asm \
+	generic=include/uapi/asm-generic
 
 PHONY += prepare-objtool
 prepare-objtool: $(objtool_target)
diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild
index 439f5157aa35..7417847dc438 100644
--- a/arch/alpha/include/uapi/asm/Kbuild
+++ b/arch/alpha/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/arc/include/uapi/asm/Kbuild
+++ b/arch/arc/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index eee8f7d23899..23b4464c0995 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd-common.h
 generated-y += unistd-oabi.h
diff --git a/arch/arm64/include/uapi/asm/Kbuild b/arch/arm64/include/uapi/asm/Kbuild
index 87eea29b24ab..602d137932dc 100644
--- a/arch/arm64/include/uapi/asm/Kbuild
+++ b/arch/arm64/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += kvm_para.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/csky/include/uapi/asm/Kbuild b/arch/csky/include/uapi/asm/Kbuild
index c1b06dcf6cf8..1c72f04ff75d 100644
--- a/arch/csky/include/uapi/asm/Kbuild
+++ b/arch/csky/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += ucontext.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild
index c1b06dcf6cf8..1c72f04ff75d 100644
--- a/arch/hexagon/include/uapi/asm/Kbuild
+++ b/arch/hexagon/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += ucontext.h
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 5b819e53c397..20018cb883a9 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_64.h
 generic-y += kvm_para.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index 960bf1e4be53..8a7ad40be463 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generic-y += kvm_para.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index 97823ec46e97..3ce84fbb2678 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -1,5 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/mips/include/uapi/asm/Kbuild b/arch/mips/include/uapi/asm/Kbuild
index 0851c103a8ce..c3798bfe0486 100644
--- a/arch/mips/include/uapi/asm/Kbuild
+++ b/arch/mips/include/uapi/asm/Kbuild
@@ -1,5 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_n32.h
 generated-y += unistd_n64.h
 generated-y += unistd_o32.h
diff --git a/arch/nds32/include/uapi/asm/Kbuild b/arch/nds32/include/uapi/asm/Kbuild
index c1b06dcf6cf8..1c72f04ff75d 100644
--- a/arch/nds32/include/uapi/asm/Kbuild
+++ b/arch/nds32/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += ucontext.h
diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/nios2/include/uapi/asm/Kbuild
+++ b/arch/nios2/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild
index c54353d390ff..22fdbd08cdc8 100644
--- a/arch/parisc/include/uapi/asm/Kbuild
+++ b/arch/parisc/include/uapi/asm/Kbuild
@@ -1,5 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generated-y += unistd_64.h
 generic-y += kvm_para.h
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 214a39acdf25..2bd5b392277c 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generated-y += unistd_64.h
diff --git a/arch/riscv/include/uapi/asm/Kbuild b/arch/riscv/include/uapi/asm/Kbuild
index d2ee86b4c091..e69de29bb2d1 100644
--- a/arch/riscv/include/uapi/asm/Kbuild
+++ b/arch/riscv/include/uapi/asm/Kbuild
@@ -1 +0,0 @@
-include include/uapi/asm-generic/Kbuild.asm
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index da3e0d48abbc..46c1ff0b842a 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
 generated-y += unistd_64.h
diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild
index eaa30bcd93bf..ecfbd40924dd 100644
--- a/arch/sh/include/uapi/asm/Kbuild
+++ b/arch/sh/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
 generic-y += kvm_para.h
diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild
index 214a39acdf25..2bd5b392277c 100644
--- a/arch/sparc/include/uapi/asm/Kbuild
+++ b/arch/sparc/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generated-y += unistd_64.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index f6648e9928b3..59b5ad310f78 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -1,5 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generated-y += unistd_64.h
 generated-y += unistd_x32.h
diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild
index 960bf1e4be53..8a7ad40be463 100644
--- a/arch/xtensa/include/uapi/asm/Kbuild
+++ b/arch/xtensa/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generic-y += kvm_para.h
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
new file mode 100644
index 000000000000..6f4536d70b8e
--- /dev/null
+++ b/include/asm-generic/Kbuild
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# asm headers that all architectures except um should have
+# (This file is not included when SRCARCH=um since UML borrows several
+# asm headers from the host architecutre.)
diff --git a/include/uapi/asm-generic/Kbuild b/include/uapi/asm-generic/Kbuild
new file mode 100644
index 000000000000..ebb180aac74e
--- /dev/null
+++ b/include/uapi/asm-generic/Kbuild
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Headers that are mandatory in usr/include/asm/
+# (This file is not included when SRCARCH=um since UML does not support UAPI.)
+
+mandatory-y += auxvec.h
+mandatory-y += bitsperlong.h
+mandatory-y += bpf_perf_event.h
+mandatory-y += byteorder.h
+mandatory-y += errno.h
+mandatory-y += fcntl.h
+mandatory-y += ioctl.h
+mandatory-y += ioctls.h
+mandatory-y += ipcbuf.h
+mandatory-y += mman.h
+mandatory-y += msgbuf.h
+mandatory-y += param.h
+mandatory-y += poll.h
+mandatory-y += posix_types.h
+mandatory-y += ptrace.h
+mandatory-y += resource.h
+mandatory-y += sembuf.h
+mandatory-y += setup.h
+mandatory-y += shmbuf.h
+mandatory-y += sigcontext.h
+mandatory-y += siginfo.h
+mandatory-y += signal.h
+mandatory-y += socket.h
+mandatory-y += sockios.h
+mandatory-y += stat.h
+mandatory-y += statfs.h
+mandatory-y += swab.h
+mandatory-y += termbits.h
+mandatory-y += termios.h
+mandatory-y += types.h
+mandatory-y += unistd.h
diff --git a/include/uapi/asm-generic/Kbuild.asm b/include/uapi/asm-generic/Kbuild.asm
deleted file mode 100644
index 355c4ac2c0b0..000000000000
--- a/include/uapi/asm-generic/Kbuild.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Headers that are mandatory in usr/include/asm/
-#
-mandatory-y += auxvec.h
-mandatory-y += bitsperlong.h
-mandatory-y += bpf_perf_event.h
-mandatory-y += byteorder.h
-mandatory-y += errno.h
-mandatory-y += fcntl.h
-mandatory-y += ioctl.h
-mandatory-y += ioctls.h
-mandatory-y += ipcbuf.h
-mandatory-y += mman.h
-mandatory-y += msgbuf.h
-mandatory-y += param.h
-mandatory-y += poll.h
-mandatory-y += posix_types.h
-mandatory-y += ptrace.h
-mandatory-y += resource.h
-mandatory-y += sembuf.h
-mandatory-y += setup.h
-mandatory-y += shmbuf.h
-mandatory-y += sigcontext.h
-mandatory-y += siginfo.h
-mandatory-y += signal.h
-mandatory-y += socket.h
-mandatory-y += sockios.h
-mandatory-y += stat.h
-mandatory-y += statfs.h
-mandatory-y += swab.h
-mandatory-y += termbits.h
-mandatory-y += termios.h
-mandatory-y += types.h
-mandatory-y += unistd.h
diff --git a/scripts/Makefile.asm-generic b/scripts/Makefile.asm-generic
index c7d2b7acad26..82ad63dcd62b 100644
--- a/scripts/Makefile.asm-generic
+++ b/scripts/Makefile.asm-generic
@@ -12,6 +12,11 @@ all:
 src := $(subst /generated,,$(obj))
 -include $(src)/Kbuild
 
+# $(generic)/Kbuild lists mandatory-y. Exclude um since it is a special case.
+ifneq ($(SRCARCH),um)
+include $(generic)/Kbuild
+endif
+
 include scripts/Kbuild.include
 
 redundant := $(filter $(mandatory-y) $(generated-y), $(generic-y))
-- 
cgit v1.2.3


From 8e44fc85060ec997e9c6f3c49a04274db6621d26 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 12 Mar 2019 16:44:30 +0200
Subject: auxdisplay: charlcd: Introduce charlcd_free() helper

The charlcd_free() is a counterpart to charlcd_alloc()
and should be called symmetrically on tear down.

Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
---
 drivers/auxdisplay/charlcd.c | 6 ++++++
 include/misc/charlcd.h       | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c
index 407acd22efa8..a351a9400054 100644
--- a/drivers/auxdisplay/charlcd.c
+++ b/drivers/auxdisplay/charlcd.c
@@ -818,6 +818,12 @@ struct charlcd *charlcd_alloc(unsigned int drvdata_size)
 }
 EXPORT_SYMBOL_GPL(charlcd_alloc);
 
+void charlcd_free(struct charlcd *lcd)
+{
+	kfree(charlcd_to_priv(lcd));
+}
+EXPORT_SYMBOL_GPL(charlcd_free);
+
 static int panel_notify_sys(struct notifier_block *this, unsigned long code,
 			    void *unused)
 {
diff --git a/include/misc/charlcd.h b/include/misc/charlcd.h
index 23f61850f363..1832402324ce 100644
--- a/include/misc/charlcd.h
+++ b/include/misc/charlcd.h
@@ -35,6 +35,7 @@ struct charlcd_ops {
 };
 
 struct charlcd *charlcd_alloc(unsigned int drvdata_size);
+void charlcd_free(struct charlcd *lcd);
 
 int charlcd_register(struct charlcd *lcd);
 int charlcd_unregister(struct charlcd *lcd);
-- 
cgit v1.2.3


From c5ae1954c47d3fd8815bd5a592aba18702c93f33 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Wed, 6 Mar 2019 19:21:42 +0200
Subject: IB/mlx5: Use mlx5 core to create/destroy a DEVX DCT

To prevent a hardware memory leak when a DEVX DCT object is destroyed
without calling DRAIN DCT before, (e.g. under cleanup flow), need to
manage its creation and destruction via mlx5 core.

In that case the DRAIN DCT command will be called and only once that it
will be completed the DESTROY DCT command will be called.  Otherwise, the
DESTROY DCT may fail and a hardware leak may occur.

As of that change the DRAIN DCT command should not be exposed any more
from DEVX, it's managed internally by the driver to work as expected by
the device specification.

Fixes: 7efce3691d33 ("IB/mlx5: Add obj create and destroy functionality")
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c            | 34 +++++++++++++++++++++-------
 drivers/infiniband/hw/mlx5/qp.c              |  4 +++-
 drivers/net/ethernet/mellanox/mlx5/core/qp.c |  6 ++---
 include/linux/mlx5/qp.h                      |  3 ++-
 4 files changed, 34 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index eaa055007f28..9e08df7914aa 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -20,6 +20,7 @@
 
 enum devx_obj_flags {
 	DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0,
+	DEVX_OBJ_FLAGS_DCT = 1 << 1,
 };
 
 struct devx_async_data {
@@ -39,7 +40,10 @@ struct devx_obj {
 	u32			dinlen; /* destroy inbox length */
 	u32			dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
 	u32			flags;
-	struct mlx5_ib_devx_mr	devx_mr;
+	union {
+		struct mlx5_ib_devx_mr	devx_mr;
+		struct mlx5_core_dct	core_dct;
+	};
 };
 
 struct devx_umem {
@@ -347,7 +351,6 @@ static u64 devx_get_obj_id(const void *in)
 		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
 					MLX5_GET(arm_rq_in, in, srq_number));
 		break;
-	case MLX5_CMD_OP_DRAIN_DCT:
 	case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
 		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT,
 					MLX5_GET(drain_dct_in, in, dctn));
@@ -618,7 +621,6 @@ static bool devx_is_obj_modify_cmd(const void *in)
 	case MLX5_CMD_OP_2RST_QP:
 	case MLX5_CMD_OP_ARM_XRC_SRQ:
 	case MLX5_CMD_OP_ARM_RQ:
-	case MLX5_CMD_OP_DRAIN_DCT:
 	case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
 	case MLX5_CMD_OP_ARM_XRQ:
 	case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY:
@@ -1124,7 +1126,11 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
 	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
 		devx_cleanup_mkey(obj);
 
-	ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
+	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
+		ret = mlx5_core_destroy_dct(obj->mdev, &obj->core_dct);
+	else
+		ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out,
+				    sizeof(out));
 	if (ib_is_destroy_retryable(ret, why, uobject))
 		return ret;
 
@@ -1185,9 +1191,17 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 		devx_set_umem_valid(cmd_in);
 	}
 
-	err = mlx5_cmd_exec(dev->mdev, cmd_in,
-			    cmd_in_len,
-			    cmd_out, cmd_out_len);
+	if (opcode == MLX5_CMD_OP_CREATE_DCT) {
+		obj->flags |= DEVX_OBJ_FLAGS_DCT;
+		err = mlx5_core_create_dct(dev->mdev, &obj->core_dct,
+					   cmd_in, cmd_in_len,
+					   cmd_out, cmd_out_len);
+	} else {
+		err = mlx5_cmd_exec(dev->mdev, cmd_in,
+				    cmd_in_len,
+				    cmd_out, cmd_out_len);
+	}
+
 	if (err)
 		goto obj_free;
 
@@ -1214,7 +1228,11 @@ err_copy:
 	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
 		devx_cleanup_mkey(obj);
 obj_destroy:
-	mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
+	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
+		mlx5_core_destroy_dct(obj->mdev, &obj->core_dct);
+	else
+		mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out,
+			      sizeof(out));
 obj_free:
 	kfree(obj);
 	return err;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6b1f0e76900b..7cd006da1dae 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3729,6 +3729,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 
 	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
 		struct mlx5_ib_modify_qp_resp resp = {};
+		u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0};
 		u32 min_resp_len = offsetof(typeof(resp), dctn) +
 				   sizeof(resp.dctn);
 
@@ -3747,7 +3748,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
 
 		err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in,
-					   MLX5_ST_SZ_BYTES(create_dct_in));
+					   MLX5_ST_SZ_BYTES(create_dct_in), out,
+					   sizeof(out));
 		if (err)
 			return err;
 		resp.dctn = qp->dct.mdct.mqp.qpn;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index c7c2920c05c4..b8ba74de9555 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -263,16 +263,16 @@ destroy:
 
 int mlx5_core_create_dct(struct mlx5_core_dev *dev,
 			 struct mlx5_core_dct *dct,
-			 u32 *in, int inlen)
+			 u32 *in, int inlen,
+			 u32 *out, int outlen)
 {
-	u32 out[MLX5_ST_SZ_DW(create_dct_out)]   = {0};
 	struct mlx5_core_qp *qp = &dct->mqp;
 	int err;
 
 	init_completion(&dct->drained);
 	MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT);
 
-	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	err = mlx5_cmd_exec(dev, in, inlen, out, outlen);
 	if (err) {
 		mlx5_core_warn(dev, "create DCT failed, ret %d\n", err);
 		return err;
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index b26ea9077384..0343c81d4c5f 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -557,7 +557,8 @@ static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev,
 
 int mlx5_core_create_dct(struct mlx5_core_dev *dev,
 			 struct mlx5_core_dct *qp,
-			 u32 *in, int inlen);
+			 u32 *in, int inlen,
+			 u32 *out, int outlen);
 int mlx5_core_create_qp(struct mlx5_core_dev *dev,
 			struct mlx5_core_qp *qp,
 			u32 *in,
-- 
cgit v1.2.3


From cd1b772d4881d1cd15b90ec17aab9ac7950e8850 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 29 Oct 2018 16:32:31 +0100
Subject: driver core: remove BUS_ATTR()

There are now no in-kernel users of BUS_ATTR() so drop it from device.h

Everyone should use BUS_ATTR_RO/RW/WO() from now on.

Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index b425a7ee04ce..4e6987e11f68 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -49,8 +49,6 @@ struct bus_attribute {
 	ssize_t (*store)(struct bus_type *bus, const char *buf, size_t count);
 };
 
-#define BUS_ATTR(_name, _mode, _show, _store)	\
-	struct bus_attribute bus_attr_##_name = __ATTR(_name, _mode, _show, _store)
 #define BUS_ATTR_RW(_name) \
 	struct bus_attribute bus_attr_##_name = __ATTR_RW(_name)
 #define BUS_ATTR_RO(_name) \
-- 
cgit v1.2.3


From 875f1d0769cdcfe1596ff0ca609b453359e42ec9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 27 Feb 2019 13:05:25 -0700
Subject: iov_iter: add ITER_BVEC_FLAG_NO_REF flag

For ITER_BVEC, if we're holding on to kernel pages, the caller
doesn't need to grab a reference to the bvec pages, and drop that
same reference on IO completion. This is essentially safe for any
ITER_BVEC, but some use cases end up reusing pages and uncondtionally
dropping a page reference on completion. And example of that is
sendfile(2), that ends up being a splice_in + splice_out on the
pipe pages.

Add a flag that tells us it's fine to not grab a page reference
to the bvec pages, since that caller knows not to drop a reference
when it's done with the pages.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c       |  3 +++
 include/linux/uio.h | 24 +++++++++++++++++++-----
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4c6a5e60ddbe..c592a0933b0d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -855,6 +855,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 	if (offset)
 		iov_iter_advance(iter, offset);
+
+	/* don't drop a reference to these pages */
+	iter->type |= ITER_BVEC_FLAG_NO_REF;
 	return 0;
 }
 
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ecf584f6b82d..4e926641fa80 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -23,14 +23,23 @@ struct kvec {
 };
 
 enum iter_type {
-	ITER_IOVEC = 0,
-	ITER_KVEC = 2,
-	ITER_BVEC = 4,
-	ITER_PIPE = 8,
-	ITER_DISCARD = 16,
+	/* set if ITER_BVEC doesn't hold a bv_page ref */
+	ITER_BVEC_FLAG_NO_REF = 2,
+
+	/* iter types */
+	ITER_IOVEC = 4,
+	ITER_KVEC = 8,
+	ITER_BVEC = 16,
+	ITER_PIPE = 32,
+	ITER_DISCARD = 64,
 };
 
 struct iov_iter {
+	/*
+	 * Bit 0 is the read/write bit, set if we're writing.
+	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
+	 * the caller isn't expecting to drop a page reference when done.
+	 */
 	unsigned int type;
 	size_t iov_offset;
 	size_t count;
@@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->type & (READ | WRITE);
 }
 
+static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i)
+{
+	return (i->type & ITER_BVEC_FLAG_NO_REF) != 0;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *
-- 
cgit v1.2.3


From 399254aaf4892113c806816f7e64cf40c804d46d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 27 Feb 2019 13:13:23 -0700
Subject: block: add BIO_NO_PAGE_REF flag

If bio_iov_iter_get_pages() is called on an iov_iter that is flagged
with NO_REF, then we don't need to add a page reference for the pages
that we add.

Add BIO_NO_PAGE_REF to track this in the bio, so IO completion knows
not to drop a reference to these pages.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 43 ++++++++++++++++++++++++-------------------
 fs/block_dev.c            | 12 +++++++-----
 fs/iomap.c                | 12 +++++++-----
 include/linux/blk_types.h |  1 +
 4 files changed, 39 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index 71a78d9fb8b7..b64cedc7f87c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
 	size = bio_add_page(bio, bv->bv_page, len,
 				bv->bv_offset + iter->iov_offset);
 	if (size == len) {
-		struct page *page;
-		int i;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct page *page;
+			int i;
+
+			mp_bvec_for_each_page(page, bv, i)
+				get_page(page);
+		}
 
-		/*
-		 * For the normal O_DIRECT case, we could skip grabbing this
-		 * reference and then not have to put them again when IO
-		 * completes. But this breaks some in-kernel users, like
-		 * splicing to/from a loop device, where we release the pipe
-		 * pages unconditionally. If we can fix that case, we can
-		 * get rid of the get here and the need to call
-		 * bio_release_pages() at IO completion time.
-		 */
-		mp_bvec_for_each_page(page, bv, i)
-			get_page(page);
 		iov_iter_advance(iter, size);
 		return 0;
 	}
@@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  * This takes either an iterator pointing to user memory, or one pointing to
  * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  * map them into the kernel. On IO completion, the caller should put those
- * pages. For now, when adding kernel pages, we still grab a reference to the
- * page. This isn't strictly needed for the common case, but some call paths
- * end up releasing pages from eg a pipe and we can't easily control these.
- * See comment in __bio_iov_bvec_add_pages().
+ * pages. If we're adding kernel pages, and the caller told us it's safe to
+ * do so, we just have to add the pages to the bio directly. We don't grab an
+ * extra reference to those pages (the user should already have that), and we
+ * don't put the page on IO completion. The caller needs to check if the bio is
+ * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
+ * released.
  *
  * The function tries, but does not guarantee, to pin as many pages as
  * fit into the bio, or are requested in *iter, whatever is smaller. If
@@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	const bool is_bvec = iov_iter_is_bvec(iter);
 	unsigned short orig_vcnt = bio->bi_vcnt;
 
+	/*
+	 * If this is a BVEC iter, then the pages are kernel pages. Don't
+	 * release them on IO completion, if the caller asked us to.
+	 */
+	if (is_bvec && iov_iter_bvec_no_ref(iter))
+		bio_set_flag(bio, BIO_NO_PAGE_REF);
+
 	do {
 		int ret;
 
@@ -1696,7 +1699,8 @@ static void bio_dirty_fn(struct work_struct *work)
 		next = bio->bi_private;
 
 		bio_set_pages_dirty(bio);
-		bio_release_pages(bio);
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+			bio_release_pages(bio);
 		bio_put(bio);
 	}
 }
@@ -1713,7 +1717,8 @@ void bio_check_pages_dirty(struct bio *bio)
 			goto defer;
 	}
 
-	bio_release_pages(bio);
+	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+		bio_release_pages(bio);
 	bio_put(bio);
 	return;
 defer:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e9faa52bb489..78d3257435c0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
-		int i;
-		struct bvec_iter_all iter_all;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct bvec_iter_all iter_all;
+			struct bio_vec *bvec;
+			int i;
 
-		bio_for_each_segment_all(bvec, bio, i, iter_all)
-			put_page(bvec->bv_page);
+			bio_for_each_segment_all(bvec, bio, i, iter_all)
+				put_page(bvec->bv_page);
+		}
 		bio_put(bio);
 	}
 }
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7d..abdd18e404f8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
-		int i;
-		struct bvec_iter_all iter_all;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct bvec_iter_all iter_all;
+			struct bio_vec *bvec;
+			int i;
 
-		bio_for_each_segment_all(bvec, bio, i, iter_all)
-			put_page(bvec->bv_page);
+			bio_for_each_segment_all(bvec, bio, i, iter_all)
+				put_page(bvec->bv_page);
+		}
 		bio_put(bio);
 	}
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d66bf5f32610..791fee35df88 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -215,6 +215,7 @@ struct bio {
 /*
  * bio flags
  */
+#define BIO_NO_PAGE_REF	0	/* don't put release vec pages */
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */
-- 
cgit v1.2.3


From a4dc6a49156b1f8d6e17251ffda17c9e6a5db78a Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Sat, 16 Mar 2019 14:41:30 +0100
Subject: packets: Always register packet sk in the same order

When using fanouts with AF_PACKET, the demux functions such as
fanout_demux_cpu will return an index in the fanout socket array, which
corresponds to the selected socket.

The ordering of this array depends on the order the sockets were added
to a given fanout group, so for FANOUT_CPU this means sockets are bound
to cpus in the order they are configured, which is OK.

However, when stopping then restarting the interface these sockets are
bound to, the sockets are reassigned to the fanout group in the reverse
order, due to the fact that they were inserted at the head of the
interface's AF_PACKET socket list.

This means that traffic that was directed to the first socket in the
fanout group is now directed to the last one after an interface restart.

In the case of FANOUT_CPU, traffic from CPU0 will be directed to the
socket that used to receive traffic from the last CPU after an interface
restart.

This commit introduces a helper to add a socket at the tail of a list,
then uses it to register AF_PACKET sockets.

Note that this changes the order in which sockets are listed in /proc and
with sock_diag.

Fixes: dc99f600698d ("packet: Add fanout support")
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     | 6 ++++++
 net/packet/af_packet.c | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 328cb7cb7b0b..8de5ee258b93 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -710,6 +710,12 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
 		hlist_add_head_rcu(&sk->sk_node, list);
 }
 
+static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list)
+{
+	sock_hold(sk);
+	hlist_add_tail_rcu(&sk->sk_node, list);
+}
+
 static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
 	hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8376bc1c1508..8754d7c93b84 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3243,7 +3243,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	}
 
 	mutex_lock(&net->packet.sklist_lock);
-	sk_add_node_rcu(sk, &net->packet.sklist);
+	sk_add_node_tail_rcu(sk, &net->packet.sklist);
 	mutex_unlock(&net->packet.sklist_lock);
 
 	preempt_disable();
-- 
cgit v1.2.3


From 273160ffc6b993c7c91627f5a84799c66dfe4dee Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 18 Mar 2019 19:47:00 +0800
Subject: sctp: get sctphdr by offset in sctp_compute_cksum

sctp_hdr(skb) only works when skb->transport_header is set properly.

But in Netfilter, skb->transport_header for ipv6 is not guaranteed
to be right value for sctphdr. It would cause to fail to check the
checksum for sctp packets.

So fix it by using offset, which is always right in all places.

v1->v2:
  - Fix the changelog.

Fixes: e6d8b64b34aa ("net: sctp: fix and consolidate SCTP checksumming code")
Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/checksum.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index 32ee65a30aff..1c6e6c0766ca 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -61,7 +61,7 @@ static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2,
 static inline __le32 sctp_compute_cksum(const struct sk_buff *skb,
 					unsigned int offset)
 {
-	struct sctphdr *sh = sctp_hdr(skb);
+	struct sctphdr *sh = (struct sctphdr *)(skb->data + offset);
 	const struct skb_checksum_ops ops = {
 		.update  = sctp_csum_update,
 		.combine = sctp_csum_combine,
-- 
cgit v1.2.3


From 7ab2184246bd0b32e427ff60bfa07a2435011ce2 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 18 Mar 2019 12:14:20 -0700
Subject: HID: intel-ish-hid: Hide members of struct ishtp_cl_device

ISH clients don't need to access any field of struct ishtp_cl_device. To
avoid this create an interface functions instead where it is required.
In the case of ishtp_cl_allocate(), modify the parameters so that the
clients don't have to dereference.
Clients can also use tracing, here a new interface is added to get the
common trace function pointer, instead of direct call.
The new interface functions defined in one external header file, named
intel-ish-client-if.h. This is the only header files all ISHTP clients
must include.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/intel-ish-hid/ishtp-hid-client.c | 56 ++++++++++++++++------------
 drivers/hid/intel-ish-hid/ishtp-hid.c        |  4 +-
 drivers/hid/intel-ish-hid/ishtp-hid.h        |  2 +-
 drivers/hid/intel-ish-hid/ishtp/bus.c        | 27 ++++++++++++++
 drivers/hid/intel-ish-hid/ishtp/client.c     |  4 +-
 drivers/hid/intel-ish-hid/ishtp/client.h     |  2 +-
 include/linux/intel-ish-client-if.h          | 18 +++++++++
 7 files changed, 84 insertions(+), 29 deletions(-)
 create mode 100644 include/linux/intel-ish-client-if.h

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
index b86290611a5f..17d5bd4f52c1 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c
+++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
@@ -15,6 +15,7 @@
 
 #include <linux/module.h>
 #include <linux/hid.h>
+#include <linux/intel-ish-client-if.h>
 #include <linux/sched.h>
 #include "ishtp/ishtp-dev.h"
 #include "ishtp/client.h"
@@ -24,6 +25,8 @@
 #define HID_CL_RX_RING_SIZE	32
 #define HID_CL_TX_RING_SIZE	16
 
+#define cl_data_to_dev(client_data) ishtp_device(client_data->cl_device)
+
 /**
  * report_bad_packets() - Report bad packets
  * @hid_ishtp_cl:	Client instance to get stats
@@ -39,7 +42,7 @@ static void report_bad_packet(struct ishtp_cl *hid_ishtp_cl, void *recv_buf,
 	struct hostif_msg *recv_msg = recv_buf;
 	struct ishtp_cl_data *client_data = hid_ishtp_cl->client_data;
 
-	dev_err(&client_data->cl_device->dev, "[hid-ish]: BAD packet %02X\n"
+	dev_err(cl_data_to_dev(client_data), "[hid-ish]: BAD packet %02X\n"
 		"total_bad=%u cur_pos=%u\n"
 		"[%02X %02X %02X %02X]\n"
 		"payload_len=%u\n"
@@ -85,7 +88,7 @@ static void process_recv(struct ishtp_cl *hid_ishtp_cl, void *recv_buf,
 
 	do {
 		if (cur_pos + sizeof(struct hostif_msg) > total_len) {
-			dev_err(&client_data->cl_device->dev,
+			dev_err(cl_data_to_dev(client_data),
 				"[hid-ish]: error, received %u which is less than data header %u\n",
 				(unsigned int)data_len,
 				(unsigned int)sizeof(struct hostif_msg_hdr));
@@ -124,12 +127,12 @@ static void process_recv(struct ishtp_cl *hid_ishtp_cl, void *recv_buf,
 			client_data->hid_dev_count = (unsigned int)*payload;
 			if (!client_data->hid_devices)
 				client_data->hid_devices = devm_kcalloc(
-						&client_data->cl_device->dev,
+						cl_data_to_dev(client_data),
 						client_data->hid_dev_count,
 						sizeof(struct device_info),
 						GFP_KERNEL);
 			if (!client_data->hid_devices) {
-				dev_err(&client_data->cl_device->dev,
+				dev_err(cl_data_to_dev(client_data),
 				"Mem alloc failed for hid device info\n");
 				wake_up_interruptible(&client_data->init_wait);
 				break;
@@ -137,7 +140,7 @@ static void process_recv(struct ishtp_cl *hid_ishtp_cl, void *recv_buf,
 			for (i = 0; i < client_data->hid_dev_count; ++i) {
 				if (1 + sizeof(struct device_info) * i >=
 						payload_len) {
-					dev_err(&client_data->cl_device->dev,
+					dev_err(cl_data_to_dev(client_data),
 						"[hid-ish]: [ENUM_DEVICES]: content size %zu is bigger than payload_len %zu\n",
 						1 + sizeof(struct device_info)
 						* i, payload_len);
@@ -172,7 +175,7 @@ static void process_recv(struct ishtp_cl *hid_ishtp_cl, void *recv_buf,
 			}
 			if (!client_data->hid_descr[curr_hid_dev])
 				client_data->hid_descr[curr_hid_dev] =
-				devm_kmalloc(&client_data->cl_device->dev,
+				devm_kmalloc(cl_data_to_dev(client_data),
 					     payload_len, GFP_KERNEL);
 			if (client_data->hid_descr[curr_hid_dev]) {
 				memcpy(client_data->hid_descr[curr_hid_dev],
@@ -197,7 +200,7 @@ static void process_recv(struct ishtp_cl *hid_ishtp_cl, void *recv_buf,
 			}
 			if (!client_data->report_descr[curr_hid_dev])
 				client_data->report_descr[curr_hid_dev] =
-				devm_kmalloc(&client_data->cl_device->dev,
+				devm_kmalloc(cl_data_to_dev(client_data),
 					     payload_len, GFP_KERNEL);
 			if (client_data->report_descr[curr_hid_dev])  {
 				memcpy(client_data->report_descr[curr_hid_dev],
@@ -516,12 +519,12 @@ static int ishtp_enum_enum_devices(struct ishtp_cl *hid_ishtp_cl)
 					   sizeof(struct hostif_msg));
 	}
 	if (!client_data->enum_devices_done) {
-		dev_err(&client_data->cl_device->dev,
+		dev_err(cl_data_to_dev(client_data),
 			"[hid-ish]: timed out waiting for enum_devices\n");
 		return -ETIMEDOUT;
 	}
 	if (!client_data->hid_devices) {
-		dev_err(&client_data->cl_device->dev,
+		dev_err(cl_data_to_dev(client_data),
 			"[hid-ish]: failed to allocate HID dev structures\n");
 		return -ENOMEM;
 	}
@@ -564,13 +567,13 @@ static int ishtp_get_hid_descriptor(struct ishtp_cl *hid_ishtp_cl, int index)
 						 client_data->hid_descr_done,
 						 3 * HZ);
 		if (!client_data->hid_descr_done) {
-			dev_err(&client_data->cl_device->dev,
+			dev_err(cl_data_to_dev(client_data),
 				"[hid-ish]: timed out for hid_descr_done\n");
 			return -EIO;
 		}
 
 		if (!client_data->hid_descr[index]) {
-			dev_err(&client_data->cl_device->dev,
+			dev_err(cl_data_to_dev(client_data),
 				"[hid-ish]: allocation HID desc fail\n");
 			return -ENOMEM;
 		}
@@ -611,12 +614,12 @@ static int ishtp_get_report_descriptor(struct ishtp_cl *hid_ishtp_cl,
 					 client_data->report_descr_done,
 					 3 * HZ);
 	if (!client_data->report_descr_done) {
-		dev_err(&client_data->cl_device->dev,
+		dev_err(cl_data_to_dev(client_data),
 				"[hid-ish]: timed out for report descr\n");
 		return -EIO;
 	}
 	if (!client_data->report_descr[index]) {
-		dev_err(&client_data->cl_device->dev,
+		dev_err(cl_data_to_dev(client_data),
 			"[hid-ish]: failed to alloc report descr\n");
 		return -ENOMEM;
 	}
@@ -646,12 +649,12 @@ static int hid_ishtp_cl_init(struct ishtp_cl *hid_ishtp_cl, int reset)
 	int i;
 	int rv;
 
-	dev_dbg(&client_data->cl_device->dev, "%s\n", __func__);
+	dev_dbg(cl_data_to_dev(client_data), "%s\n", __func__);
 	hid_ishtp_trace(client_data,  "%s reset flag: %d\n", __func__, reset);
 
 	rv = ishtp_cl_link(hid_ishtp_cl, ISHTP_HOST_CLIENT_ID_ANY);
 	if (rv) {
-		dev_err(&client_data->cl_device->dev,
+		dev_err(cl_data_to_dev(client_data),
 			"ishtp_cl_link failed\n");
 		return	-ENOMEM;
 	}
@@ -666,7 +669,7 @@ static int hid_ishtp_cl_init(struct ishtp_cl *hid_ishtp_cl, int reset)
 
 	fw_client = ishtp_fw_cl_get_client(dev, &hid_ishtp_guid);
 	if (!fw_client) {
-		dev_err(&client_data->cl_device->dev,
+		dev_err(cl_data_to_dev(client_data),
 			"ish client uuid not found\n");
 		return -ENOENT;
 	}
@@ -676,7 +679,7 @@ static int hid_ishtp_cl_init(struct ishtp_cl *hid_ishtp_cl, int reset)
 
 	rv = ishtp_cl_connect(hid_ishtp_cl);
 	if (rv) {
-		dev_err(&client_data->cl_device->dev,
+		dev_err(cl_data_to_dev(client_data),
 			"client connect fail\n");
 		goto err_cl_unlink;
 	}
@@ -707,7 +710,7 @@ static int hid_ishtp_cl_init(struct ishtp_cl *hid_ishtp_cl, int reset)
 		if (!reset) {
 			rv = ishtp_hid_probe(i, client_data);
 			if (rv) {
-				dev_err(&client_data->cl_device->dev,
+				dev_err(cl_data_to_dev(client_data),
 				"[hid-ish]: HID probe for #%u failed: %d\n",
 				i, rv);
 				goto err_cl_disconnect;
@@ -763,7 +766,7 @@ static void hid_ishtp_cl_reset_handler(struct work_struct *work)
 
 	hid_ishtp_cl_deinit(hid_ishtp_cl);
 
-	hid_ishtp_cl = ishtp_cl_allocate(cl_device->ishtp_dev);
+	hid_ishtp_cl = ishtp_cl_allocate(cl_device);
 	if (!hid_ishtp_cl)
 		return;
 
@@ -777,15 +780,17 @@ static void hid_ishtp_cl_reset_handler(struct work_struct *work)
 		rv = hid_ishtp_cl_init(hid_ishtp_cl, 1);
 		if (!rv)
 			break;
-		dev_err(&client_data->cl_device->dev, "Retry reset init\n");
+		dev_err(cl_data_to_dev(client_data), "Retry reset init\n");
 	}
 	if (rv) {
-		dev_err(&client_data->cl_device->dev, "Reset Failed\n");
+		dev_err(cl_data_to_dev(client_data), "Reset Failed\n");
 		hid_ishtp_trace(client_data, "%s Failed hid_ishtp_cl %p\n",
 				__func__, hid_ishtp_cl);
 	}
 }
 
+void (*hid_print_trace)(void *dev, const char *format, ...);
+
 /**
  * hid_ishtp_cl_probe() - ISHTP client driver probe
  * @cl_device:		ISHTP client device instance
@@ -803,12 +808,13 @@ static int hid_ishtp_cl_probe(struct ishtp_cl_device *cl_device)
 	if (!cl_device)
 		return	-ENODEV;
 
-	client_data = devm_kzalloc(&cl_device->dev, sizeof(*client_data),
+	client_data = devm_kzalloc(ishtp_device(cl_device),
+				   sizeof(*client_data),
 				   GFP_KERNEL);
 	if (!client_data)
 		return -ENOMEM;
 
-	hid_ishtp_cl = ishtp_cl_allocate(cl_device->ishtp_dev);
+	hid_ishtp_cl = ishtp_cl_allocate(cl_device);
 	if (!hid_ishtp_cl)
 		return -ENOMEM;
 
@@ -822,6 +828,8 @@ static int hid_ishtp_cl_probe(struct ishtp_cl_device *cl_device)
 
 	INIT_WORK(&client_data->work, hid_ishtp_cl_reset_handler);
 
+	hid_print_trace = ishtp_trace_callback(cl_device);
+
 	rv = hid_ishtp_cl_init(hid_ishtp_cl, 0);
 	if (rv) {
 		ishtp_cl_free(hid_ishtp_cl);
@@ -848,7 +856,7 @@ static int hid_ishtp_cl_remove(struct ishtp_cl_device *cl_device)
 	hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__,
 			hid_ishtp_cl);
 
-	dev_dbg(&cl_device->dev, "%s\n", __func__);
+	dev_dbg(ishtp_device(cl_device), "%s\n", __func__);
 	hid_ishtp_cl->state = ISHTP_CL_DISCONNECTING;
 	ishtp_cl_disconnect(hid_ishtp_cl);
 	ishtp_put_device(cl_device);
diff --git a/drivers/hid/intel-ish-hid/ishtp-hid.c b/drivers/hid/intel-ish-hid/ishtp-hid.c
index 5c7e127b0483..95a5b87d9105 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid.c
+++ b/drivers/hid/intel-ish-hid/ishtp-hid.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/hid.h>
+#include <linux/intel-ish-client-if.h>
 #include <uapi/linux/input.h>
 #include "ishtp/client.h"
 #include "ishtp-hid.h"
@@ -241,7 +242,8 @@ int ishtp_hid_probe(unsigned int cur_hid_dev,
 
 	hid->ll_driver = &ishtp_hid_ll_driver;
 	hid->bus = BUS_INTEL_ISHTP;
-	hid->dev.parent = &client_data->cl_device->dev;
+	hid->dev.parent = ishtp_device(client_data->cl_device);
+
 	hid->version = le16_to_cpu(ISH_HID_VERSION);
 	hid->vendor = le16_to_cpu(client_data->hid_devices[cur_hid_dev].vid);
 	hid->product = le16_to_cpu(client_data->hid_devices[cur_hid_dev].pid);
diff --git a/drivers/hid/intel-ish-hid/ishtp-hid.h b/drivers/hid/intel-ish-hid/ishtp-hid.h
index 40663639e43b..8af44e855a49 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid.h
+++ b/drivers/hid/intel-ish-hid/ishtp-hid.h
@@ -24,7 +24,7 @@
 #define	IS_RESPONSE	0x80
 
 /* Used to dump to Linux trace buffer, if enabled */
-#define hid_ishtp_trace(client, ...)	\
+#define hid_ishtp_trace(client, ...)   \
 	client->cl_device->ishtp_dev->print_log(\
 		client->cl_device->ishtp_dev, __VA_ARGS__)
 
diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c
index 6348fee8aadc..308853eab91c 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
@@ -827,6 +827,33 @@ int ishtp_use_dma_transfer(void)
 	return ishtp_use_dma;
 }
 
+/**
+ * ishtp_device() - Return device pointer
+ *
+ * This interface is used to return device pointer from ishtp_cl_device
+ * instance.
+ *
+ * Return: device *.
+ */
+struct device *ishtp_device(struct ishtp_cl_device *device)
+{
+	return &device->dev;
+}
+EXPORT_SYMBOL(ishtp_device);
+
+/**
+ * ishtp_trace_callback() - Return trace callback
+ *
+ * This interface is used to return trace callback function pointer.
+ *
+ * Return: void *.
+ */
+void *ishtp_trace_callback(struct ishtp_cl_device *cl_device)
+{
+	return cl_device->ishtp_dev->print_log;
+}
+EXPORT_SYMBOL(ishtp_trace_callback);
+
 /**
  * ishtp_bus_register() - Function to register bus
  *
diff --git a/drivers/hid/intel-ish-hid/ishtp/client.c b/drivers/hid/intel-ish-hid/ishtp/client.c
index faeccdb1475b..760e48a368a8 100644
--- a/drivers/hid/intel-ish-hid/ishtp/client.c
+++ b/drivers/hid/intel-ish-hid/ishtp/client.c
@@ -126,7 +126,7 @@ static void ishtp_cl_init(struct ishtp_cl *cl, struct ishtp_device *dev)
  *
  * Return: The allocated client instance or NULL on failure
  */
-struct ishtp_cl *ishtp_cl_allocate(struct ishtp_device *dev)
+struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device)
 {
 	struct ishtp_cl *cl;
 
@@ -134,7 +134,7 @@ struct ishtp_cl *ishtp_cl_allocate(struct ishtp_device *dev)
 	if (!cl)
 		return NULL;
 
-	ishtp_cl_init(cl, dev);
+	ishtp_cl_init(cl, cl_device->ishtp_dev);
 	return cl;
 }
 EXPORT_SYMBOL(ishtp_cl_allocate);
diff --git a/drivers/hid/intel-ish-hid/ishtp/client.h b/drivers/hid/intel-ish-hid/ishtp/client.h
index e0df3eb611e6..992625891a6c 100644
--- a/drivers/hid/intel-ish-hid/ishtp/client.h
+++ b/drivers/hid/intel-ish-hid/ishtp/client.h
@@ -170,7 +170,7 @@ static inline bool ishtp_cl_cmp_id(const struct ishtp_cl *cl1,
 }
 
 /* exported functions from ISHTP under client management scope */
-struct ishtp_cl	*ishtp_cl_allocate(struct ishtp_device *dev);
+struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device);
 void ishtp_cl_free(struct ishtp_cl *cl);
 int ishtp_cl_link(struct ishtp_cl *cl, int id);
 void ishtp_cl_unlink(struct ishtp_cl *cl);
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
new file mode 100644
index 000000000000..11e285172735
--- /dev/null
+++ b/include/linux/intel-ish-client-if.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Intel ISH client Interface definitions
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ */
+
+#ifndef _INTEL_ISH_CLIENT_IF_H_
+#define _INTEL_ISH_CLIENT_IF_H_
+
+struct ishtp_cl_device;
+
+/* Get the device * from ishtp device instance */
+struct device *ishtp_device(struct ishtp_cl_device *cl_device);
+/* Trace interface for clients */
+void *ishtp_trace_callback(struct ishtp_cl_device *cl_device);
+
+#endif /* _INTEL_ISH_CLIENT_IF_H_ */
-- 
cgit v1.2.3


From e00a864f976ab2646929ae82aa65ccd05a4f7539 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 18 Mar 2019 12:14:22 -0700
Subject: HID: intel-ish-hid: Move driver registry functions

Move the driver registry with the ishtp bus to the common interface
file, which clients can include.
Also rename __ishtp_cl_driver_register() to ishtp_cl_driver_register()
and removed define for ishtp_cl_driver_register.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/intel-ish-hid/ishtp-hid-client.c |  2 +-
 drivers/hid/intel-ish-hid/ishtp/bus.c        |  8 ++++----
 drivers/hid/intel-ish-hid/ishtp/bus.h        | 27 +--------------------------
 include/linux/intel-ish-client-if.h          | 25 +++++++++++++++++++++++++
 4 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
index 94a00ffeb09b..4afb39713213 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c
+++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
@@ -953,7 +953,7 @@ static int __init ish_hid_init(void)
 	int	rv;
 
 	/* Register ISHTP client device driver with ISHTP Bus */
-	rv = ishtp_cl_driver_register(&hid_ishtp_cl_driver);
+	rv = ishtp_cl_driver_register(&hid_ishtp_cl_driver, THIS_MODULE);
 
 	return rv;
 
diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c
index 308853eab91c..95a97534fcda 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
@@ -485,7 +485,7 @@ static void ishtp_bus_remove_device(struct ishtp_cl_device *device)
 }
 
 /**
- * __ishtp_cl_driver_register() - Client driver register
+ * ishtp_cl_driver_register() - Client driver register
  * @driver:	the client driver instance
  * @owner:	Owner of this driver module
  *
@@ -494,8 +494,8 @@ static void ishtp_bus_remove_device(struct ishtp_cl_device *device)
  *
  * Return: Return value of driver_register or -ENODEV if not ready
  */
-int __ishtp_cl_driver_register(struct ishtp_cl_driver *driver,
-			       struct module *owner)
+int ishtp_cl_driver_register(struct ishtp_cl_driver *driver,
+			     struct module *owner)
 {
 	int err;
 
@@ -512,7 +512,7 @@ int __ishtp_cl_driver_register(struct ishtp_cl_driver *driver,
 
 	return 0;
 }
-EXPORT_SYMBOL(__ishtp_cl_driver_register);
+EXPORT_SYMBOL(ishtp_cl_driver_register);
 
 /**
  * ishtp_cl_driver_unregister() - Client driver unregister
diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.h b/drivers/hid/intel-ish-hid/ishtp/bus.h
index c96e7fb42f01..4aed195719de 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.h
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.h
@@ -17,6 +17,7 @@
 
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
+#include <linux/intel-ish-client-if.h>
 
 struct ishtp_cl;
 struct ishtp_cl_device;
@@ -52,26 +53,6 @@ struct ishtp_cl_device {
 	void (*event_cb)(struct ishtp_cl_device *device);
 };
 
-/**
- * struct ishtp_cl_device - ISHTP device handle
- * @driver:	driver instance on a bus
- * @name:	Name of the device for probe
- * @probe:	driver callback for device probe
- * @remove:	driver callback on device removal
- *
- * Client drivers defines to get probed/removed for ISHTP client device.
- */
-struct ishtp_cl_driver {
-	struct device_driver driver;
-	const char *name;
-	const guid_t *guid;
-	int (*probe)(struct ishtp_cl_device *dev);
-	int (*remove)(struct ishtp_cl_device *dev);
-	int (*reset)(struct ishtp_cl_device *dev);
-	const struct dev_pm_ops *pm;
-};
-
-
 int	ishtp_bus_new_client(struct ishtp_device *dev);
 void	ishtp_remove_all_clients(struct ishtp_device *dev);
 int	ishtp_cl_device_bind(struct ishtp_cl *cl);
@@ -105,12 +86,6 @@ void	ishtp_get_device(struct ishtp_cl_device *);
 void	ishtp_set_drvdata(struct ishtp_cl_device *cl_device, void *data);
 void	*ishtp_get_drvdata(struct ishtp_cl_device *cl_device);
 
-int	__ishtp_cl_driver_register(struct ishtp_cl_driver *driver,
-				   struct module *owner);
-#define ishtp_cl_driver_register(driver)		\
-	__ishtp_cl_driver_register(driver, THIS_MODULE)
-void	ishtp_cl_driver_unregister(struct ishtp_cl_driver *driver);
-
 int	ishtp_register_event_cb(struct ishtp_cl_device *device,
 				void (*read_cb)(struct ishtp_cl_device *));
 int	ishtp_fw_cl_by_uuid(struct ishtp_device *dev, const guid_t *cuuid);
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index 11e285172735..abc0b8122f07 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -10,6 +10,31 @@
 
 struct ishtp_cl_device;
 
+/**
+ * struct ishtp_cl_device - ISHTP device handle
+ * @driver:	driver instance on a bus
+ * @name:	Name of the device for probe
+ * @probe:	driver callback for device probe
+ * @remove:	driver callback on device removal
+ *
+ * Client drivers defines to get probed/removed for ISHTP client device.
+ */
+struct ishtp_cl_driver {
+	struct device_driver driver;
+	const char *name;
+	const guid_t *guid;
+	int (*probe)(struct ishtp_cl_device *dev);
+	int (*remove)(struct ishtp_cl_device *dev);
+	int (*reset)(struct ishtp_cl_device *dev);
+	const struct dev_pm_ops *pm;
+};
+
+int ishtp_cl_driver_register(struct ishtp_cl_driver *driver,
+			     struct module *owner);
+void ishtp_cl_driver_unregister(struct ishtp_cl_driver *driver);
+int ishtp_register_event_cb(struct ishtp_cl_device *device,
+			    void (*read_cb)(struct ishtp_cl_device *));
+
 /* Get the device * from ishtp device instance */
 struct device *ishtp_device(struct ishtp_cl_device *cl_device);
 /* Trace interface for clients */
-- 
cgit v1.2.3


From 8991eb309e1faa04ce1ca950d89dd36e3c8584cd Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 18 Mar 2019 12:14:24 -0700
Subject: HID: intel-ish-hid: Move the common functions from client.h

Move the interface functions in client.h to common include. These are
already abstracted well to use as is. Also move any associated structures
used by these functions.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/intel-ish-hid/ishtp/client.h    | 24 ---------------
 drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h | 26 ----------------
 include/linux/intel-ish-client-if.h         | 48 +++++++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ishtp/client.h b/drivers/hid/intel-ish-hid/ishtp/client.h
index afa8b1f521d0..6ed00947d6bc 100644
--- a/drivers/hid/intel-ish-hid/ishtp/client.h
+++ b/drivers/hid/intel-ish-hid/ishtp/client.h
@@ -19,15 +19,6 @@
 #include <linux/types.h>
 #include "ishtp-dev.h"
 
-/* Client state */
-enum cl_state {
-	ISHTP_CL_INITIALIZING = 0,
-	ISHTP_CL_CONNECTING,
-	ISHTP_CL_CONNECTED,
-	ISHTP_CL_DISCONNECTING,
-	ISHTP_CL_DISCONNECTED
-};
-
 /* Tx and Rx ring size */
 #define	CL_DEF_RX_RING_SIZE	2
 #define	CL_DEF_TX_RING_SIZE	2
@@ -169,19 +160,4 @@ static inline bool ishtp_cl_cmp_id(const struct ishtp_cl *cl1,
 		(cl1->fw_client_id == cl2->fw_client_id);
 }
 
-/* exported functions from ISHTP under client management scope */
-struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device);
-void ishtp_cl_free(struct ishtp_cl *cl);
-int ishtp_cl_link(struct ishtp_cl *cl);
-void ishtp_cl_unlink(struct ishtp_cl *cl);
-int ishtp_cl_disconnect(struct ishtp_cl *cl);
-int ishtp_cl_connect(struct ishtp_cl *cl);
-int ishtp_cl_send(struct ishtp_cl *cl, uint8_t *buf, size_t length);
-int ishtp_cl_flush_queues(struct ishtp_cl *cl);
-
-/* exported functions from ISHTP client buffer management scope */
-int ishtp_cl_io_rb_recycle(struct ishtp_cl_rb *rb);
-bool ishtp_cl_tx_empty(struct ishtp_cl *cl);
-struct ishtp_cl_rb *ishtp_cl_rx_get_rb(struct ishtp_cl *cl);
-
 #endif /* _ISHTP_CLIENT_H_ */
diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
index e54ce1ef27dd..e0a320e67a41 100644
--- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
+++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
@@ -79,32 +79,6 @@ struct ishtp_fw_client {
 	uint8_t client_id;
 };
 
-/**
- * struct ishtp_msg_data - ISHTP message data struct
- * @size:	Size of data in the *data
- * @data:	Pointer to data
- */
-struct ishtp_msg_data {
-	uint32_t size;
-	unsigned char *data;
-};
-
-/*
- * struct ishtp_cl_rb - request block structure
- * @list:	Link to list members
- * @cl:		ISHTP client instance
- * @buffer:	message header
- * @buf_idx:	Index into buffer
- * @read_time:	 unused at this time
- */
-struct ishtp_cl_rb {
-	struct list_head list;
-	struct ishtp_cl *cl;
-	struct ishtp_msg_data buffer;
-	unsigned long buf_idx;
-	unsigned long read_time;
-};
-
 /*
  * Control info for IPC messages ISHTP/IPC sending FIFO -
  * list with inline data buffer
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index abc0b8122f07..7ce172f656f8 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -9,6 +9,16 @@
 #define _INTEL_ISH_CLIENT_IF_H_
 
 struct ishtp_cl_device;
+struct ishtp_cl;
+
+/* Client state */
+enum cl_state {
+	ISHTP_CL_INITIALIZING = 0,
+	ISHTP_CL_CONNECTING,
+	ISHTP_CL_CONNECTED,
+	ISHTP_CL_DISCONNECTING,
+	ISHTP_CL_DISCONNECTED
+};
 
 /**
  * struct ishtp_cl_device - ISHTP device handle
@@ -29,6 +39,32 @@ struct ishtp_cl_driver {
 	const struct dev_pm_ops *pm;
 };
 
+/**
+ * struct ishtp_msg_data - ISHTP message data struct
+ * @size:	Size of data in the *data
+ * @data:	Pointer to data
+ */
+struct ishtp_msg_data {
+	uint32_t size;
+	unsigned char *data;
+};
+
+/*
+ * struct ishtp_cl_rb - request block structure
+ * @list:	Link to list members
+ * @cl:		ISHTP client instance
+ * @buffer:	message header
+ * @buf_idx:	Index into buffer
+ * @read_time:	 unused at this time
+ */
+struct ishtp_cl_rb {
+	struct list_head list;
+	struct ishtp_cl *cl;
+	struct ishtp_msg_data buffer;
+	unsigned long buf_idx;
+	unsigned long read_time;
+};
+
 int ishtp_cl_driver_register(struct ishtp_cl_driver *driver,
 			     struct module *owner);
 void ishtp_cl_driver_unregister(struct ishtp_cl_driver *driver);
@@ -40,4 +76,16 @@ struct device *ishtp_device(struct ishtp_cl_device *cl_device);
 /* Trace interface for clients */
 void *ishtp_trace_callback(struct ishtp_cl_device *cl_device);
 
+struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device);
+void ishtp_cl_free(struct ishtp_cl *cl);
+int ishtp_cl_link(struct ishtp_cl *cl);
+void ishtp_cl_unlink(struct ishtp_cl *cl);
+int ishtp_cl_disconnect(struct ishtp_cl *cl);
+int ishtp_cl_connect(struct ishtp_cl *cl);
+int ishtp_cl_send(struct ishtp_cl *cl, uint8_t *buf, size_t length);
+int ishtp_cl_flush_queues(struct ishtp_cl *cl);
+int ishtp_cl_io_rb_recycle(struct ishtp_cl_rb *rb);
+bool ishtp_cl_tx_empty(struct ishtp_cl *cl);
+struct ishtp_cl_rb *ishtp_cl_rx_get_rb(struct ishtp_cl *cl);
+
 #endif /* _INTEL_ISH_CLIENT_IF_H_ */
-- 
cgit v1.2.3


From 51cbc7079ecae545be93c137edeb7453d28b9ec9 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 18 Mar 2019 12:14:25 -0700
Subject: HID: intel-ish-hid: Add interface functions for struct ishtp_cl

Instead of directly accessing members of struct ishtp_cl, create interface
functions to access them.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/intel-ish-hid/ishtp/client.c | 42 ++++++++++++++++++++++++++++++++
 include/linux/intel-ish-client-if.h      |  7 ++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ishtp/client.c b/drivers/hid/intel-ish-hid/ishtp/client.c
index 657b46dcefa6..b7ac5e3b1e82 100644
--- a/drivers/hid/intel-ish-hid/ishtp/client.c
+++ b/drivers/hid/intel-ish-hid/ishtp/client.c
@@ -1063,3 +1063,45 @@ void recv_ishtp_cl_msg_dma(struct ishtp_device *dev, void *msg,
 eoi:
 	return;
 }
+
+void *ishtp_get_client_data(struct ishtp_cl *cl)
+{
+	return cl->client_data;
+}
+EXPORT_SYMBOL(ishtp_get_client_data);
+
+void ishtp_set_client_data(struct ishtp_cl *cl, void *data)
+{
+	cl->client_data = data;
+}
+EXPORT_SYMBOL(ishtp_set_client_data);
+
+struct ishtp_device *ishtp_get_ishtp_device(struct ishtp_cl *cl)
+{
+	return cl->dev;
+}
+EXPORT_SYMBOL(ishtp_get_ishtp_device);
+
+void ishtp_set_tx_ring_size(struct ishtp_cl *cl, int size)
+{
+	cl->tx_ring_size = size;
+}
+EXPORT_SYMBOL(ishtp_set_tx_ring_size);
+
+void ishtp_set_rx_ring_size(struct ishtp_cl *cl, int size)
+{
+	cl->rx_ring_size = size;
+}
+EXPORT_SYMBOL(ishtp_set_rx_ring_size);
+
+void ishtp_set_connection_state(struct ishtp_cl *cl, int state)
+{
+	cl->state = state;
+}
+EXPORT_SYMBOL(ishtp_set_connection_state);
+
+void ishtp_cl_set_fw_client_id(struct ishtp_cl *cl, int fw_client_id)
+{
+	cl->fw_client_id = fw_client_id;
+}
+EXPORT_SYMBOL(ishtp_cl_set_fw_client_id);
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index 7ce172f656f8..526e3048e09f 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -87,5 +87,12 @@ int ishtp_cl_flush_queues(struct ishtp_cl *cl);
 int ishtp_cl_io_rb_recycle(struct ishtp_cl_rb *rb);
 bool ishtp_cl_tx_empty(struct ishtp_cl *cl);
 struct ishtp_cl_rb *ishtp_cl_rx_get_rb(struct ishtp_cl *cl);
+void *ishtp_get_client_data(struct ishtp_cl *cl);
+void ishtp_set_client_data(struct ishtp_cl *cl, void *data);
+struct ishtp_device *ishtp_get_ishtp_device(struct ishtp_cl *cl);
+void ishtp_set_tx_ring_size(struct ishtp_cl *cl, int size);
+void ishtp_set_rx_ring_size(struct ishtp_cl *cl, int size);
+void ishtp_set_connection_state(struct ishtp_cl *cl, int state);
+void ishtp_cl_set_fw_client_id(struct ishtp_cl *cl, int fw_client_id);
 
 #endif /* _INTEL_ISH_CLIENT_IF_H_ */
-- 
cgit v1.2.3


From 5f7224cf418511b9d3c40d35b098d15f9e4e7404 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 18 Mar 2019 12:14:26 -0700
Subject: HID: intel-ish-hid: Move functions related to bus and device

Move function idefinitions related to bus and device to common header file.
Also create new function to get fw client id and move ish_hw_reset() from
inline to exported function.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/intel-ish-hid/ishtp/bus.c       | 26 ++++++++++++++++++++++++++
 drivers/hid/intel-ish-hid/ishtp/bus.h       | 11 -----------
 drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h |  5 -----
 include/linux/intel-ish-client-if.h         | 12 ++++++++++++
 4 files changed, 38 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c
index 2ca65864192f..66a485a41481 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
@@ -170,6 +170,19 @@ struct ishtp_fw_client *ishtp_fw_cl_get_client(struct ishtp_device *dev,
 }
 EXPORT_SYMBOL(ishtp_fw_cl_get_client);
 
+/**
+ * ishtp_get_fw_client_id() - Get fw client id
+ *
+ * This interface is used to reset HW get FW client id.
+ *
+ * Return: firmware client id.
+ */
+int ishtp_get_fw_client_id(struct ishtp_fw_client *fw_client)
+{
+	return fw_client->client_id;
+}
+EXPORT_SYMBOL(ishtp_get_fw_client_id);
+
 /**
  * ishtp_fw_cl_by_id() - return index to fw_clients for client_id
  * @dev: the ishtp device structure
@@ -855,6 +868,19 @@ void *ishtp_trace_callback(struct ishtp_cl_device *cl_device)
 }
 EXPORT_SYMBOL(ishtp_trace_callback);
 
+/**
+ * ish_hw_reset() - Call HW reset IPC callback
+ *
+ * This interface is used to reset HW in case of error.
+ *
+ * Return: value from IPC hw_reset callback
+ */
+int ish_hw_reset(struct ishtp_device *dev)
+{
+	return dev->ops->hw_reset(dev);
+}
+EXPORT_SYMBOL(ish_hw_reset);
+
 /**
  * ishtp_bus_register() - Function to register bus
  *
diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.h b/drivers/hid/intel-ish-hid/ishtp/bus.h
index 4aed195719de..93d516f5a853 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.h
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.h
@@ -80,16 +80,5 @@ void	ishtp_recv(struct ishtp_device *dev);
 void	ishtp_reset_handler(struct ishtp_device *dev);
 void	ishtp_reset_compl_handler(struct ishtp_device *dev);
 
-void	ishtp_put_device(struct ishtp_cl_device *);
-void	ishtp_get_device(struct ishtp_cl_device *);
-
-void	ishtp_set_drvdata(struct ishtp_cl_device *cl_device, void *data);
-void	*ishtp_get_drvdata(struct ishtp_cl_device *cl_device);
-
-int	ishtp_register_event_cb(struct ishtp_cl_device *device,
-				void (*read_cb)(struct ishtp_cl_device *));
 int	ishtp_fw_cl_by_uuid(struct ishtp_device *dev, const guid_t *cuuid);
-struct	ishtp_fw_client *ishtp_fw_cl_get_client(struct ishtp_device *dev,
-						const guid_t *uuid);
-
 #endif /* _LINUX_ISHTP_CL_BUS_H */
diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
index e0a320e67a41..3cfef084b9fc 100644
--- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
+++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
@@ -238,11 +238,6 @@ static inline int ish_ipc_reset(struct ishtp_device *dev)
 	return dev->ops->ipc_reset(dev);
 }
 
-static inline int ish_hw_reset(struct ishtp_device *dev)
-{
-	return dev->ops->hw_reset(dev);
-}
-
 /* Exported function */
 void	ishtp_device_init(struct ishtp_device *dev);
 int	ishtp_start(struct ishtp_device *dev);
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index 526e3048e09f..e98bfbb1e07e 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -9,7 +9,9 @@
 #define _INTEL_ISH_CLIENT_IF_H_
 
 struct ishtp_cl_device;
+struct ishtp_device;
 struct ishtp_cl;
+struct ishtp_fw_client;
 
 /* Client state */
 enum cl_state {
@@ -95,4 +97,14 @@ void ishtp_set_rx_ring_size(struct ishtp_cl *cl, int size);
 void ishtp_set_connection_state(struct ishtp_cl *cl, int state);
 void ishtp_cl_set_fw_client_id(struct ishtp_cl *cl, int fw_client_id);
 
+void ishtp_put_device(struct ishtp_cl_device *cl_dev);
+void ishtp_get_device(struct ishtp_cl_device *cl_dev);
+void ishtp_set_drvdata(struct ishtp_cl_device *cl_device, void *data);
+void *ishtp_get_drvdata(struct ishtp_cl_device *cl_device);
+int ishtp_register_event_cb(struct ishtp_cl_device *device,
+				void (*read_cb)(struct ishtp_cl_device *));
+struct	ishtp_fw_client *ishtp_fw_cl_get_client(struct ishtp_device *dev,
+						const guid_t *uuid);
+int ishtp_get_fw_client_id(struct ishtp_fw_client *fw_client);
+int ish_hw_reset(struct ishtp_device *dev);
 #endif /* _INTEL_ISH_CLIENT_IF_H_ */
-- 
cgit v1.2.3


From 0e568a16af403263f8e421f1f10b91f9f15b52c3 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 18 Mar 2019 12:14:28 -0700
Subject: HID: intel-ish-hid: Add interface function for PCI device pointer

Instead of directly accessing PCI device poitner via struct ishtp_cl,
create interface function for same. This is required for DMA transfer.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/intel-ish-hid/ishtp/bus.c | 13 +++++++++++++
 include/linux/intel-ish-client-if.h   |  2 ++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c
index 66a485a41481..fb8ca12955b4 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
@@ -855,6 +855,19 @@ struct device *ishtp_device(struct ishtp_cl_device *device)
 }
 EXPORT_SYMBOL(ishtp_device);
 
+/**
+ * ishtp_get_pci_device() - Return PCI device dev pointer
+ * This interface is used to return PCI device pointer
+ * from ishtp_cl_device instance.
+ *
+ * Return: device *.
+ */
+struct device *ishtp_get_pci_device(struct ishtp_cl_device *device)
+{
+	return device->ishtp_dev->devc;
+}
+EXPORT_SYMBOL(ishtp_get_pci_device);
+
 /**
  * ishtp_trace_callback() - Return trace callback
  *
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index e98bfbb1e07e..16255c2ca2f4 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -77,6 +77,8 @@ int ishtp_register_event_cb(struct ishtp_cl_device *device,
 struct device *ishtp_device(struct ishtp_cl_device *cl_device);
 /* Trace interface for clients */
 void *ishtp_trace_callback(struct ishtp_cl_device *cl_device);
+/* Get device pointer of PCI device for DMA acces */
+struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device);
 
 struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device);
 void ishtp_cl_free(struct ishtp_cl *cl);
-- 
cgit v1.2.3


From 9496c015ed39ddfce971d63a1442e6d258504a7d Mon Sep 17 00:00:00 2001
From: Dongli Zhang <dongli.zhang@oracle.com>
Date: Tue, 19 Mar 2019 23:05:18 +0800
Subject: blk-mq: remove unused 'nr_expired' from blk_mq_hw_ctx

There is no usage of 'nr_expired'.

The 'nr_expired' was introduced by commit 1d9bd5161ba3 ("blk-mq: replace
timeout synchronization with a RCU and generation based scheme"). Its usage
was removed since commit 12f5b9314545 ("blk-mq: Remove generation
seqeunce").

Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b0c814bcc7e3..35359697318b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -57,7 +57,6 @@ struct blk_mq_hw_ctx {
 	unsigned int		queue_num;
 
 	atomic_t		nr_active;
-	unsigned int		nr_expired;
 
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
-- 
cgit v1.2.3


From bb229bbb3bf63d23128e851a1f3b85c083178fa1 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 20 Mar 2019 09:46:58 +0100
Subject: libceph: wait for latest osdmap in ceph_monc_blacklist_add()

Because map updates are distributed lazily, an OSD may not know about
the new blacklist for quite some time after "osd blacklist add" command
is completed.  This makes it possible for a blacklisted but still alive
client to overwrite a post-blacklist update, resulting in data
corruption.

Waiting for latest osdmap in ceph_monc_blacklist_add() and thus using
the post-blacklist epoch for all post-blacklist requests ensures that
all such requests "wait" for the blacklist to come into force on their
respective OSDs.

Cc: stable@vger.kernel.org
Fixes: 6305a3b41515 ("libceph: support for blacklisting clients")
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jason Dillaman <dillaman@redhat.com>
---
 include/linux/ceph/libceph.h |  2 ++
 net/ceph/ceph_common.c       | 18 +++++++++++++++++-
 net/ceph/mon_client.c        |  9 +++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index a420c07904bc..337d5049ff93 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -294,6 +294,8 @@ extern void ceph_destroy_client(struct ceph_client *client);
 extern int __ceph_open_session(struct ceph_client *client,
 			       unsigned long started);
 extern int ceph_open_session(struct ceph_client *client);
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+				unsigned long timeout);
 
 /* pagevec.c */
 extern void ceph_release_page_vector(struct page **pages, int num_pages);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 9cab80207ced..79eac465ec65 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -738,7 +738,6 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
 }
 EXPORT_SYMBOL(__ceph_open_session);
 
-
 int ceph_open_session(struct ceph_client *client)
 {
 	int ret;
@@ -754,6 +753,23 @@ int ceph_open_session(struct ceph_client *client)
 }
 EXPORT_SYMBOL(ceph_open_session);
 
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+				unsigned long timeout)
+{
+	u64 newest_epoch;
+	int ret;
+
+	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
+	if (ret)
+		return ret;
+
+	if (client->osdc.osdmap->epoch >= newest_epoch)
+		return 0;
+
+	ceph_osdc_maybe_request_map(&client->osdc);
+	return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout);
+}
+EXPORT_SYMBOL(ceph_wait_for_latest_osdmap);
 
 static int __init init_ceph_lib(void)
 {
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 18deb3d889c4..a53e4fbb6319 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -922,6 +922,15 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
 	mutex_unlock(&monc->mutex);
 
 	ret = wait_generic_request(req);
+	if (!ret)
+		/*
+		 * Make sure we have the osdmap that includes the blacklist
+		 * entry.  This is needed to ensure that the OSDs pick up the
+		 * new blacklist before processing any future requests from
+		 * this client.
+		 */
+		ret = ceph_wait_for_latest_osdmap(monc->client, 0);
+
 out:
 	put_generic_request(req);
 	return ret;
-- 
cgit v1.2.3


From 29ece8b4354f8c5eaee798a3d8a1b356efee426f Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Mon, 18 Mar 2019 22:44:41 +0800
Subject: block: add BLK_MQ_POLL_CLASSIC for hybrid poll and return EINVAL for
 unexpected value

For q->poll_nsec == -1, means doing classic poll, not hybrid poll.
We introduce a new flag BLK_MQ_POLL_CLASSIC to replace -1, which
may make code much easier to read.

Additionally, since val is an int obtained with kstrtoint(), val can be
a negative value other than -1, so return -EINVAL for that case.

Thanks to Damien Le Moal for some good suggestion.

Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         |  4 ++--
 block/blk-sysfs.c      | 12 +++++++-----
 include/linux/blkdev.h |  3 +++
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ea01c23b58a3..76a3f78c566a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2856,7 +2856,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/*
 	 * Default to classic polling
 	 */
-	q->poll_nsec = -1;
+	q->poll_nsec = BLK_MQ_POLL_CLASSIC;
 
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_add_queue_tag_set(set, q);
@@ -3391,7 +3391,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
 {
 	struct request *rq;
 
-	if (q->poll_nsec == -1)
+	if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
 		return false;
 
 	if (!blk_qc_t_is_internal(cookie))
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 59685918167e..422327089e0f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -360,8 +360,8 @@ static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
 {
 	int val;
 
-	if (q->poll_nsec == -1)
-		val = -1;
+	if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
+		val = BLK_MQ_POLL_CLASSIC;
 	else
 		val = q->poll_nsec / 1000;
 
@@ -380,10 +380,12 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
 	if (err < 0)
 		return err;
 
-	if (val == -1)
-		q->poll_nsec = -1;
-	else
+	if (val == BLK_MQ_POLL_CLASSIC)
+		q->poll_nsec = BLK_MQ_POLL_CLASSIC;
+	else if (val >= 0)
 		q->poll_nsec = val * 1000;
+	else
+		return -EINVAL;
 
 	return count;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0de92b29f589..5c58a3b2bf00 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -50,6 +50,9 @@ struct blk_stat_callback;
 /* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
 
+/* Doing classic polling */
+#define BLK_MQ_POLL_CLASSIC -1
+
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
-- 
cgit v1.2.3


From e6c987120e24cb913cb7bd4e675129a30fa49e0d Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 20 Mar 2019 13:14:37 -0700
Subject: block: Unexport blk_mq_add_to_requeue_list()

This function is not used outside the block layer core. Hence unexport it.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 1 -
 block/blk-mq.h         | 2 ++
 include/linux/blk-mq.h | 2 --
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 76a3f78c566a..70b210a308c4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -782,7 +782,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 	if (kick_requeue_list)
 		blk_mq_kick_requeue_list(q);
 }
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c11353a3749d..0ed8e5a8729f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -41,6 +41,8 @@ void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
+				bool kick_requeue_list);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_get_driver_tag(struct request *rq);
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 35359697318b..cb2aa7ecafff 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -299,8 +299,6 @@ void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
 
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
-				bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 bool blk_mq_complete_request(struct request *rq);
-- 
cgit v1.2.3


From 551417af91b163bd697eb50b3601adae2177c28a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 18 Mar 2019 14:51:23 +0800
Subject: genirq: Fix typo in comment of IRQD_MOVE_PCNTXT

Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Dou Liyang <douliyangs@gmail.com>
Cc: Julien Thierry <julien.thierry@arm.com>
Link: https://lkml.kernel.org/r/20190318065123.11862-1-peterx@redhat.com
---
 include/linux/irq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d6160d479b14..7ae8de5ad0f2 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -195,7 +195,7 @@ struct irq_data {
  * IRQD_LEVEL			- Interrupt is level triggered
  * IRQD_WAKEUP_STATE		- Interrupt is configured for wakeup
  *				  from suspend
- * IRDQ_MOVE_PCNTXT		- Interrupt can be moved in process
+ * IRQD_MOVE_PCNTXT		- Interrupt can be moved in process
  *				  context
  * IRQD_IRQ_DISABLED		- Disabled state of the interrupt
  * IRQD_IRQ_MASKED		- Masked state of the interrupt
-- 
cgit v1.2.3


From 85d0966fa57e0ef2d30d913c98ca93674f7a03c9 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 20 Mar 2019 14:59:59 +0100
Subject: net/sched: prepare TC actions to properly validate the control action

- pass a pointer to struct tcf_proto in each actions's init() handler,
  to allow validating the control action, checking whether the chain
  exists and (eventually) refcounting it.
- remove code that validates the control action after a successful call
  to the action's init() handler, and replace it with a test that forbids
  addition of actions having 'goto_chain' and NULL goto_chain pointer at
  the same time.
- add tcf_action_check_ctrlact(), that will validate the control action
  and eventually allocate the action 'goto_chain' within the init()
  handler.
- add tcf_action_set_ctrlact(), that will assign the control action and
  swap the current 'goto_chain' pointer with the new given one.

This disallows 'goto_chain' on actions that don't initialize it properly
in their init() handler, i.e. calling tcf_action_check_ctrlact() after
successful IDR reservation and then calling tcf_action_set_ctrlact()
to assign 'goto_chain' and 'tcf_action' consistently.

By doing this, the kernel does not leak anymore refcounts when a valid
'goto chain' handle is replaced in TC actions, causing kmemleak splats
like the following one:

 # tc chain add dev dd0 chain 42 ingress protocol ip flower \
 > ip_proto tcp action drop
 # tc chain add dev dd0 chain 43 ingress protocol ip flower \
 > ip_proto udp action drop
 # tc filter add dev dd0 ingress matchall \
 > action gact goto chain 42 index 66
 # tc filter replace dev dd0 ingress matchall \
 > action gact goto chain 43 index 66
 # echo scan >/sys/kernel/debug/kmemleak
 <...>
 unreferenced object 0xffff93c0ee09f000 (size 1024):
 comm "tc", pid 2565, jiffies 4295339808 (age 65.426s)
 hex dump (first 32 bytes):
   00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
   00 00 00 00 08 00 06 00 00 00 00 00 00 00 00 00  ................
 backtrace:
   [<000000009b63f92d>] tc_ctl_chain+0x3d2/0x4c0
   [<00000000683a8d72>] rtnetlink_rcv_msg+0x263/0x2d0
   [<00000000ddd88f8e>] netlink_rcv_skb+0x4a/0x110
   [<000000006126a348>] netlink_unicast+0x1a0/0x250
   [<00000000b3340877>] netlink_sendmsg+0x2c1/0x3c0
   [<00000000a25a2171>] sock_sendmsg+0x36/0x40
   [<00000000f19ee1ec>] ___sys_sendmsg+0x280/0x2f0
   [<00000000d0422042>] __sys_sendmsg+0x5e/0xa0
   [<000000007a6c61f9>] do_syscall_64+0x5b/0x180
   [<00000000ccd07542>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
   [<0000000013eaa334>] 0xffffffffffffffff

Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain")
Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h      |  7 +++-
 net/sched/act_api.c        | 97 ++++++++++++++++++++++++++--------------------
 net/sched/act_bpf.c        |  2 +-
 net/sched/act_connmark.c   |  1 +
 net/sched/act_csum.c       |  2 +-
 net/sched/act_gact.c       |  2 +-
 net/sched/act_ife.c        |  2 +-
 net/sched/act_ipt.c        | 11 +++---
 net/sched/act_mirred.c     |  1 +
 net/sched/act_nat.c        |  3 +-
 net/sched/act_pedit.c      |  2 +-
 net/sched/act_police.c     |  1 +
 net/sched/act_sample.c     |  2 +-
 net/sched/act_simple.c     |  2 +-
 net/sched/act_skbedit.c    |  1 +
 net/sched/act_skbmod.c     |  1 +
 net/sched/act_tunnel_key.c |  1 +
 net/sched/act_vlan.c       |  2 +-
 18 files changed, 84 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index c745e9ccfab2..54fbb49bd08a 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -90,7 +90,7 @@ struct tc_action_ops {
 	int     (*lookup)(struct net *net, struct tc_action **a, u32 index);
 	int     (*init)(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **act, int ovr,
-			int bind, bool rtnl_held,
+			int bind, bool rtnl_held, struct tcf_proto *tp,
 			struct netlink_ext_ack *extack);
 	int     (*walk)(struct net *, struct sk_buff *,
 			struct netlink_callback *, int,
@@ -181,6 +181,11 @@ int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
 
+int tcf_action_check_ctrlact(int action, struct tcf_proto *tp,
+			     struct tcf_chain **handle,
+			     struct netlink_ext_ack *newchain);
+struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
+					 struct tcf_chain *newchain);
 #endif /* CONFIG_NET_CLS_ACT */
 
 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index aecf1bf233c8..fe67b98ac641 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -28,23 +28,6 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
-static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
-{
-	u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK;
-
-	if (!tp)
-		return -EINVAL;
-	a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index);
-	if (!a->goto_chain)
-		return -ENOMEM;
-	return 0;
-}
-
-static void tcf_action_goto_chain_fini(struct tc_action *a)
-{
-	tcf_chain_put_by_act(a->goto_chain);
-}
-
 static void tcf_action_goto_chain_exec(const struct tc_action *a,
 				       struct tcf_result *res)
 {
@@ -71,6 +54,53 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
 		call_rcu(&old->rcu, tcf_free_cookie_rcu);
 }
 
+int tcf_action_check_ctrlact(int action, struct tcf_proto *tp,
+			     struct tcf_chain **newchain,
+			     struct netlink_ext_ack *extack)
+{
+	int opcode = TC_ACT_EXT_OPCODE(action), ret = -EINVAL;
+	u32 chain_index;
+
+	if (!opcode)
+		ret = action > TC_ACT_VALUE_MAX ? -EINVAL : 0;
+	else if (opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC)
+		ret = 0;
+	if (ret) {
+		NL_SET_ERR_MSG(extack, "invalid control action");
+		goto end;
+	}
+
+	if (TC_ACT_EXT_CMP(action, TC_ACT_GOTO_CHAIN)) {
+		chain_index = action & TC_ACT_EXT_VAL_MASK;
+		if (!tp || !newchain) {
+			ret = -EINVAL;
+			NL_SET_ERR_MSG(extack,
+				       "can't goto NULL proto/chain");
+			goto end;
+		}
+		*newchain = tcf_chain_get_by_act(tp->chain->block, chain_index);
+		if (!*newchain) {
+			ret = -ENOMEM;
+			NL_SET_ERR_MSG(extack,
+				       "can't allocate goto_chain");
+		}
+	}
+end:
+	return ret;
+}
+EXPORT_SYMBOL(tcf_action_check_ctrlact);
+
+struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
+					 struct tcf_chain *newchain)
+{
+	struct tcf_chain *oldchain = a->goto_chain;
+
+	a->tcfa_action = action;
+	a->goto_chain = newchain;
+	return oldchain;
+}
+EXPORT_SYMBOL(tcf_action_set_ctrlact);
+
 /* XXX: For standalone actions, we don't need a RCU grace period either, because
  * actions are always connected to filters and filters are already destroyed in
  * RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -78,13 +108,15 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
  */
 static void free_tcf(struct tc_action *p)
 {
+	struct tcf_chain *chain = p->goto_chain;
+
 	free_percpu(p->cpu_bstats);
 	free_percpu(p->cpu_bstats_hw);
 	free_percpu(p->cpu_qstats);
 
 	tcf_set_action_cookie(&p->act_cookie, NULL);
-	if (p->goto_chain)
-		tcf_action_goto_chain_fini(p);
+	if (chain)
+		tcf_chain_put_by_act(chain);
 
 	kfree(p);
 }
@@ -800,15 +832,6 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
 	return c;
 }
 
-static bool tcf_action_valid(int action)
-{
-	int opcode = TC_ACT_EXT_OPCODE(action);
-
-	if (!opcode)
-		return action <= TC_ACT_VALUE_MAX;
-	return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC;
-}
-
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
 				    char *name, int ovr, int bind,
@@ -890,10 +913,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 	/* backward compatibility for policer */
 	if (name == NULL)
 		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
-				rtnl_held, extack);
+				rtnl_held, tp, extack);
 	else
 		err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
-				extack);
+				tp, extack);
 	if (err < 0)
 		goto err_mod;
 
@@ -907,18 +930,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 	if (err != ACT_P_CREATED)
 		module_put(a_o->owner);
 
-	if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
-		err = tcf_action_goto_chain_init(a, tp);
-		if (err) {
-			tcf_action_destroy_1(a, bind);
-			NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
-			return ERR_PTR(err);
-		}
-	}
-
-	if (!tcf_action_valid(a->tcfa_action)) {
+	if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) &&
+	    !a->goto_chain) {
 		tcf_action_destroy_1(a, bind);
-		NL_SET_ERR_MSG(extack, "Invalid control action value");
+		NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain");
 		return ERR_PTR(-EINVAL);
 	}
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index aa5c38d11a30..3c0468f2aae6 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -278,7 +278,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
 static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **act,
 			int replace, int bind, bool rtnl_held,
-			struct netlink_ext_ack *extack)
+			struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 5d24993cccfe..44aa046a92ea 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -97,6 +97,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
 static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 			     struct nlattr *est, struct tc_action **a,
 			     int ovr, int bind, bool rtnl_held,
+			     struct tcf_proto *tp,
 			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index c79aca29505e..9ba0f61a1e82 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -46,7 +46,7 @@ static struct tc_action_ops act_csum_ops;
 
 static int tcf_csum_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a, int ovr,
-			 int bind, bool rtnl_held,
+			 int bind, bool rtnl_held, struct tcf_proto *tp,
 			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 93da0004e9f4..b8ad311bd8cc 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -57,7 +57,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
 static int tcf_gact_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind, bool rtnl_held,
-			 struct netlink_ext_ack *extack)
+			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 	struct nlattr *tb[TCA_GACT_MAX + 1];
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 9b1f2b3990ee..c1ba74d5c1e3 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -469,7 +469,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
 static int tcf_ife_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **a,
 			int ovr, int bind, bool rtnl_held,
-			struct netlink_ext_ack *extack)
+			struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 	struct nlattr *tb[TCA_IFE_MAX + 1];
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 98f5b6ea77b4..04a0b5c61194 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -97,7 +97,8 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 
 static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
-			  const struct tc_action_ops *ops, int ovr, int bind)
+			  const struct tc_action_ops *ops, int ovr, int bind,
+			  struct tcf_proto *tp)
 {
 	struct tc_action_net *tn = net_generic(net, id);
 	struct nlattr *tb[TCA_IPT_MAX + 1];
@@ -205,20 +206,20 @@ err1:
 
 static int tcf_ipt_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **a, int ovr,
-			int bind, bool rtnl_held,
+			int bind, bool rtnl_held, struct tcf_proto *tp,
 			struct netlink_ext_ack *extack)
 {
 	return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
-			      bind);
+			      bind, tp);
 }
 
 static int tcf_xt_init(struct net *net, struct nlattr *nla,
 		       struct nlattr *est, struct tc_action **a, int ovr,
-		       int bind, bool unlocked,
+		       int bind, bool unlocked, struct tcf_proto *tp,
 		       struct netlink_ext_ack *extack)
 {
 	return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
-			      bind);
+			      bind, tp);
 }
 
 static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6692fd054617..383f4024452c 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -94,6 +94,7 @@ static struct tc_action_ops act_mirred_ops;
 static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
 			   int ovr, int bind, bool rtnl_held,
+			   struct tcf_proto *tp,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 543eab9193f1..de4b493e26d2 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -38,7 +38,8 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 
 static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 			struct tc_action **a, int ovr, int bind,
-			bool rtnl_held, struct netlink_ext_ack *extack)
+			bool rtnl_held,	struct tcf_proto *tp,
+			struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 	struct nlattr *tb[TCA_NAT_MAX + 1];
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index a80373878df7..8ca82aefa11a 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -138,7 +138,7 @@ nla_failure:
 static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
 			  int ovr, int bind, bool rtnl_held,
-			  struct netlink_ext_ack *extack)
+			  struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8271a6263824..229eba7925e5 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -83,6 +83,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 static int tcf_police_init(struct net *net, struct nlattr *nla,
 			       struct nlattr *est, struct tc_action **a,
 			       int ovr, int bind, bool rtnl_held,
+			       struct tcf_proto *tp,
 			       struct netlink_ext_ack *extack)
 {
 	int ret = 0, tcfp_result = TC_ACT_OK, err, size;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 203e399e5c85..36b8adbe935d 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -37,7 +37,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
 
 static int tcf_sample_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a, int ovr,
-			   int bind, bool rtnl_held,
+			   int bind, bool rtnl_held, struct tcf_proto *tp,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index d54cb608dbaf..4916dc3e3668 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -78,7 +78,7 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
 static int tcf_simp_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind, bool rtnl_held,
-			 struct netlink_ext_ack *extack)
+			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 	struct nlattr *tb[TCA_DEF_MAX + 1];
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 65879500b688..4566eff3d027 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -96,6 +96,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 			    struct nlattr *est, struct tc_action **a,
 			    int ovr, int bind, bool rtnl_held,
+			    struct tcf_proto *tp,
 			    struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 7bac1d78e7a3..b9ab2c8f07f1 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -82,6 +82,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
 static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
 			   int ovr, int bind, bool rtnl_held,
+			   struct tcf_proto *tp,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 7c6591b991d5..fc295d91559a 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -210,6 +210,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
 static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
 			   int ovr, int bind, bool rtnl_held,
+			   struct tcf_proto *tp,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index ac0061599225..4651ee15e35d 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -105,7 +105,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
 static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind, bool rtnl_held,
-			 struct netlink_ext_ack *extack)
+			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 	struct nlattr *tb[TCA_VLAN_MAX + 1];
-- 
cgit v1.2.3


From fe384e2fa36ca084a456fd30558cccc75b4b3fbd Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 20 Mar 2019 15:00:15 +0100
Subject: net/sched: don't dereference a->goto_chain to read the chain index

callers of tcf_gact_goto_chain_index() can potentially read an old value
of the chain index, or even dereference a NULL 'goto_chain' pointer,
because 'goto_chain' and 'tcfa_action' are read in the traffic path
without caring of concurrent write in the control path. The most recent
value of chain index can be read also from a->tcfa_action (it's encoded
there together with TC_ACT_GOTO_CHAIN bits), so we don't really need to
dereference 'goto_chain': just read the chain id from the control action.

Fixes: e457d86ada27 ("net: sched: add couple of goto_chain helpers")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_gact.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index ee8d005f56fc..eb8f01c819e6 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -56,7 +56,7 @@ static inline bool is_tcf_gact_goto_chain(const struct tc_action *a)
 
 static inline u32 tcf_gact_goto_chain_index(const struct tc_action *a)
 {
-	return a->goto_chain->index;
+	return READ_ONCE(a->tcfa_action) & TC_ACT_EXT_VAL_MASK;
 }
 
 #endif /* __NET_TC_GACT_H */
-- 
cgit v1.2.3


From ee3bbfe806cdb46b02cda63626cb50a7a7b19fc5 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 20 Mar 2019 15:00:16 +0100
Subject: net/sched: let actions use RCU to access 'goto_chain'

use RCU when accessing the action chain, to avoid use after free in the
traffic path when 'goto chain' is replaced on existing TC actions (see
script below). Since the control action is read in the traffic path
without holding the action spinlock, we need to explicitly ensure that
a->goto_chain is not NULL before dereferencing (i.e it's not sufficient
to rely on the value of TC_ACT_GOTO_CHAIN bits). Not doing so caused NULL
dereferences in tcf_action_goto_chain_exec() when the following script:

 # tc chain add dev dd0 chain 42 ingress protocol ip flower \
 > ip_proto udp action pass index 4
 # tc filter add dev dd0 ingress protocol ip flower \
 > ip_proto udp action csum udp goto chain 42 index 66
 # tc chain del dev dd0 chain 42 ingress
 (start UDP traffic towards dd0)
 # tc action replace action csum udp pass index 66

was run repeatedly for several hours.

Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Suggested-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h     |  2 +-
 include/net/sch_generic.h |  1 +
 net/sched/act_api.c       | 18 ++++++++++--------
 net/sched/cls_api.c       |  2 +-
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 54fbb49bd08a..c61a1bf4e3de 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -39,7 +39,7 @@ struct tc_action {
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw;
 	struct gnet_stats_queue __percpu *cpu_qstats;
 	struct tc_cookie	__rcu *act_cookie;
-	struct tcf_chain	*goto_chain;
+	struct tcf_chain	__rcu *goto_chain;
 };
 #define tcf_index	common.tcfa_index
 #define tcf_refcnt	common.tcfa_refcnt
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 31284c078d06..7d1a0483a17b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -378,6 +378,7 @@ struct tcf_chain {
 	bool flushing;
 	const struct tcf_proto_ops *tmplt_ops;
 	void *tmplt_priv;
+	struct rcu_head rcu;
 };
 
 struct tcf_block {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index fe67b98ac641..5a87e271d35a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -31,7 +31,7 @@
 static void tcf_action_goto_chain_exec(const struct tc_action *a,
 				       struct tcf_result *res)
 {
-	const struct tcf_chain *chain = a->goto_chain;
+	const struct tcf_chain *chain = rcu_dereference_bh(a->goto_chain);
 
 	res->goto_tp = rcu_dereference_bh(chain->filter_chain);
 }
@@ -91,13 +91,11 @@ end:
 EXPORT_SYMBOL(tcf_action_check_ctrlact);
 
 struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
-					 struct tcf_chain *newchain)
+					 struct tcf_chain *goto_chain)
 {
-	struct tcf_chain *oldchain = a->goto_chain;
-
 	a->tcfa_action = action;
-	a->goto_chain = newchain;
-	return oldchain;
+	rcu_swap_protected(a->goto_chain, goto_chain, 1);
+	return goto_chain;
 }
 EXPORT_SYMBOL(tcf_action_set_ctrlact);
 
@@ -108,7 +106,7 @@ EXPORT_SYMBOL(tcf_action_set_ctrlact);
  */
 static void free_tcf(struct tc_action *p)
 {
-	struct tcf_chain *chain = p->goto_chain;
+	struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1);
 
 	free_percpu(p->cpu_bstats);
 	free_percpu(p->cpu_bstats_hw);
@@ -686,6 +684,10 @@ repeat:
 					return TC_ACT_OK;
 			}
 		} else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) {
+			if (unlikely(!rcu_access_pointer(a->goto_chain))) {
+				net_warn_ratelimited("can't go to NULL chain!\n");
+				return TC_ACT_SHOT;
+			}
 			tcf_action_goto_chain_exec(a, res);
 		}
 
@@ -931,7 +933,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 		module_put(a_o->owner);
 
 	if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) &&
-	    !a->goto_chain) {
+	    !rcu_access_pointer(a->goto_chain)) {
 		tcf_action_destroy_1(a, bind);
 		NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain");
 		return ERR_PTR(-EINVAL);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index dc10525e90e7..99ae30c177c7 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -367,7 +367,7 @@ static void tcf_chain_destroy(struct tcf_chain *chain, bool free_block)
 	struct tcf_block *block = chain->block;
 
 	mutex_destroy(&chain->filter_chain_lock);
-	kfree(chain);
+	kfree_rcu(chain, rcu);
 	if (free_block)
 		tcf_block_destroy(block);
 }
-- 
cgit v1.2.3


From b45a02e13ee74b6fde56df4d76786058821a3aba Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 19 Mar 2019 15:54:16 +0100
Subject: gpio: amd-fch: Fix bogus SPDX identifier

spdxcheck.py complains:

 include/linux/platform_data/gpio/gpio-amd-fch.h: 1:28 Invalid License ID: GPL+

which is correct because GPL+ is not a valid identifier. Of course this
could have been caught by checkpatch.pl _before_ submitting or merging the
patch.

 WARNING: 'SPDX-License-Identifier: GPL+ */' is not supported in LICENSES/...
 #271: FILE: include/linux/platform_data/gpio/gpio-amd-fch.h:1:
 +/* SPDX-License-Identifier: GPL+ */

Fix it under the assumption that the author meant GPL-2.0+, which makes
sense as the corresponding C file is using that identifier.

Fixes: e09d168f13f0 ("gpio: AMD G-Series PCH gpio driver")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
 include/linux/platform_data/gpio/gpio-amd-fch.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/platform_data/gpio/gpio-amd-fch.h b/include/linux/platform_data/gpio/gpio-amd-fch.h
index a867637e172d..9e46678edb2a 100644
--- a/include/linux/platform_data/gpio/gpio-amd-fch.h
+++ b/include/linux/platform_data/gpio/gpio-amd-fch.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL+ */
+/* SPDX-License-Identifier: GPL-2.0+ */
 
 /*
  * AMD FCH gpio driver platform-data
-- 
cgit v1.2.3


From 1e4471e74c75acb3f89959ffa02a241227937ae2 Mon Sep 17 00:00:00 2001
From: Shenghui Wang <shhuiw@foxmail.com>
Date: Sat, 16 Mar 2019 16:24:37 +0800
Subject: sbitmap: trivial - update comment for sbitmap_deferred_clear_bit

"sbitmap_batch_clear" should be "sbitmap_deferred_clear"

Acked-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 14d558146aea..20f3e3f029b9 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -330,7 +330,7 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 /*
  * This one is special, since it doesn't actually clear the bit, rather it
  * sets the corresponding bit in the ->cleared mask instead. Paired with
- * the caller doing sbitmap_batch_clear() if a given index is full, which
+ * the caller doing sbitmap_deferred_clear() if a given index is full, which
  * will clear the previously freed entries in the corresponding ->word.
  */
 static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
-- 
cgit v1.2.3


From ffc8599aa9763f39f6736a79da4d1575e7006f9a Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@redhat.com>
Date: Fri, 8 Mar 2019 11:05:08 +0800
Subject: x86/gart: Exclude GART aperture from kcore

On machines where the GART aperture is mapped over physical RAM,
/proc/kcore contains the GART aperture range. Accessing the GART range via
/proc/kcore results in a kernel crash.

vmcore used to have the same issue, until it was fixed with commit
2a3e83c6f96c ("x86/gart: Exclude GART aperture from vmcore")', leveraging
existing hook infrastructure in vmcore to let /proc/vmcore return zeroes
when attempting to read the aperture region, and so it won't read from the
actual memory.

Apply the same workaround for kcore. First implement the same hook
infrastructure for kcore, then reuse the hook functions introduced in the
previous vmcore fix. Just with some minor adjustment, rename some functions
for more general usage, and simplify the hook infrastructure a bit as there
is no module usage yet.

Suggested-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Kairui Song <kasong@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Jiri Bohac <jbohac@suse.cz>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Dave Young <dyoung@redhat.com>
Link: https://lkml.kernel.org/r/20190308030508.13548-1-kasong@redhat.com
---
 arch/x86/kernel/aperture_64.c | 20 +++++++++++++-------
 fs/proc/kcore.c               | 27 +++++++++++++++++++++++++++
 include/linux/kcore.h         |  2 ++
 3 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 58176b56354e..294ed4392a0e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -14,6 +14,7 @@
 #define pr_fmt(fmt) "AGP: " fmt
 
 #include <linux/kernel.h>
+#include <linux/kcore.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/memblock.h>
@@ -57,7 +58,7 @@ int fallback_aper_force __initdata;
 
 int fix_aperture __initdata = 1;
 
-#ifdef CONFIG_PROC_VMCORE
+#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE)
 /*
  * If the first kernel maps the aperture over e820 RAM, the kdump kernel will
  * use the same range because it will remain configured in the northbridge.
@@ -66,20 +67,25 @@ int fix_aperture __initdata = 1;
  */
 static unsigned long aperture_pfn_start, aperture_page_count;
 
-static int gart_oldmem_pfn_is_ram(unsigned long pfn)
+static int gart_mem_pfn_is_ram(unsigned long pfn)
 {
 	return likely((pfn < aperture_pfn_start) ||
 		      (pfn >= aperture_pfn_start + aperture_page_count));
 }
 
-static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
+static void __init exclude_from_core(u64 aper_base, u32 aper_order)
 {
 	aperture_pfn_start = aper_base >> PAGE_SHIFT;
 	aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
-	WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram));
+#ifdef CONFIG_PROC_VMCORE
+	WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram));
+#endif
+#ifdef CONFIG_PROC_KCORE
+	WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
+#endif
 }
 #else
-static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
+static void exclude_from_core(u64 aper_base, u32 aper_order)
 {
 }
 #endif
@@ -474,7 +480,7 @@ out:
 			 * may have allocated the range over its e820 RAM
 			 * and fixed up the northbridge
 			 */
-			exclude_from_vmcore(last_aper_base, last_aper_order);
+			exclude_from_core(last_aper_base, last_aper_order);
 
 			return 1;
 		}
@@ -520,7 +526,7 @@ out:
 	 * overlap with the first kernel's memory. We can't access the
 	 * range through vmcore even though it should be part of the dump.
 	 */
-	exclude_from_vmcore(aper_alloc, aper_order);
+	exclude_from_core(aper_alloc, aper_order);
 
 	/* Fix up the north bridges */
 	for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index bbcc185062bb..d29d869abec1 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head);
 static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
 
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * Same as oldmem_pfn_is_ram in vmcore
+ */
+static int (*mem_pfn_is_ram)(unsigned long pfn);
+
+int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+	if (mem_pfn_is_ram)
+		return -EBUSY;
+	mem_pfn_is_ram = fn;
+	return 0;
+}
+
+static int pfn_is_ram(unsigned long pfn)
+{
+	if (mem_pfn_is_ram)
+		return mem_pfn_is_ram(pfn);
+	else
+		return 1;
+}
+
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
 void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
 		       int type)
@@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				goto out;
 			}
 			m = NULL;	/* skip the list anchor */
+		} else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
+			if (clear_user(buffer, tsz)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else if (m->type == KCORE_VMALLOC) {
 			vread(buf, (char *)start, tsz);
 			/* we have to zero-fill user buffer even if no read */
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 8c3f8c14eeaa..c843f4a9c512 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -44,6 +44,8 @@ void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz)
 	m->vaddr = (unsigned long)vaddr;
 	kclist_add(m, addr, sz, KCORE_REMAP);
 }
+
+extern int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn));
 #else
 static inline
 void kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
-- 
cgit v1.2.3


From aa9aaa4d61c0048d3faad056893cd7860bbc084c Mon Sep 17 00:00:00 2001
From: Erik Schmauss <erik.schmauss@intel.com>
Date: Thu, 21 Mar 2019 18:20:21 -0700
Subject: ACPI: use different default debug value than ACPICA

Rather than setting debug output flags during early init, its makes
more sense to simply re-define ACPI_DEBUG_DEFAULT specifically for
Linux.

ACPICA commit 60903715711f4b00ca1831779a8a23279a66497d

Link: https://github.com/acpica/acpica/commit/60903715
Fixes: ce5cbf53496b ("ACPI: Set debug output flags independent of ACPICA")
Reported-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
Tested-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
Signed-off-by: Erik Schmauss <erik.schmauss@intel.com>
Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c              | 3 ---
 include/acpi/acoutput.h         | 3 +++
 include/acpi/platform/aclinux.h | 5 +++++
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 6ecbbabf1233..eec263c9019e 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -1043,9 +1043,6 @@ void __init acpi_early_init(void)
 
 	acpi_permanent_mmap = true;
 
-	/* Initialize debug output. Linux does not use ACPICA defaults */
-	acpi_dbg_level = ACPI_LV_INFO | ACPI_LV_REPAIR;
-
 #ifdef CONFIG_X86
 	/*
 	 * If the machine falls into the DMI check table,
diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h
index 30b1ae53689f..c50542dc71e0 100644
--- a/include/acpi/acoutput.h
+++ b/include/acpi/acoutput.h
@@ -150,7 +150,10 @@
 
 /* Defaults for debug_level, debug and normal */
 
+#ifndef ACPI_DEBUG_DEFAULT
 #define ACPI_DEBUG_DEFAULT          (ACPI_LV_INIT | ACPI_LV_DEBUG_OBJECT | ACPI_LV_EVALUATION | ACPI_LV_REPAIR)
+#endif
+
 #define ACPI_NORMAL_DEFAULT         (ACPI_LV_INIT | ACPI_LV_DEBUG_OBJECT | ACPI_LV_REPAIR)
 #define ACPI_DEBUG_ALL              (ACPI_LV_AML_DISASSEMBLE | ACPI_LV_ALL_EXCEPTIONS | ACPI_LV_ALL)
 
diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 9ff328fd946a..624b90b34085 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -82,6 +82,11 @@
 #define ACPI_NO_ERROR_MESSAGES
 #undef ACPI_DEBUG_OUTPUT
 
+/* Use a specific bugging default separate from ACPICA */
+
+#undef ACPI_DEBUG_DEFAULT
+#define ACPI_DEBUG_DEFAULT          (ACPI_LV_INFO | ACPI_LV_REPAIR)
+
 /* External interface for __KERNEL__, stub is needed */
 
 #define ACPI_EXTERNAL_RETURN_STATUS(prototype) \
-- 
cgit v1.2.3


From a3ac7917b73070010c05b4485b8582a6c9cd69b6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 25 Mar 2019 14:49:00 -0700
Subject: Revert "parport: daisy: use new parport device model"

This reverts commit 1aec4211204d9463d1fd209eb50453de16254599.

Steven Rostedt reports that it causes a hang at bootup and bisected it
to this commit.

The troigger is apparently a module alias for "parport_lowlevel" that
points to "parport_pc", which causes a hang with

    modprobe -q -- parport_lowlevel

blocking forever with a backtrace like this:

    wait_for_completion_killable+0x1c/0x28
    call_usermodehelper_exec+0xa7/0x108
    __request_module+0x351/0x3d8
    get_lowlevel_driver+0x28/0x41 [parport]
    __parport_register_driver+0x39/0x1f4 [parport]
    daisy_drv_init+0x31/0x4f [parport]
    parport_bus_init+0x5d/0x7b [parport]
    parport_default_proc_register+0x26/0x1000 [parport]
    do_one_initcall+0xc2/0x1e0
    do_init_module+0x50/0x1d4
    load_module+0x1c2e/0x21b3
    sys_init_module+0xef/0x117

Supid says:
 "Due to the new device model daisy driver will now try to find the
  parallel ports while trying to register its driver so that it can bind
  with them. Now, since daisy driver is loaded while parport bus is
  initialising the list of parport is still empty and it tries to load
  the lowlevel driver, which has an alias set to parport_pc, now causes
  a deadlock"

But I don't think the daisy driver should be loaded by the parport
initialization in the first place, so let's revert the whole change.

If the daisy driver can just initialize separately on its own (like a
driver should), instead of hooking into the parport init sequence
directly, this issue probably would go away.

Reported-and-bisected-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/daisy.c | 32 +-------------------------------
 drivers/parport/probe.c |  2 +-
 drivers/parport/share.c | 10 +---------
 include/linux/parport.h | 13 -------------
 4 files changed, 3 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c
index 56dd83a45e55..5484a46dafda 100644
--- a/drivers/parport/daisy.c
+++ b/drivers/parport/daisy.c
@@ -213,12 +213,10 @@ void parport_daisy_fini(struct parport *port)
 struct pardevice *parport_open(int devnum, const char *name)
 {
 	struct daisydev *p = topology;
-	struct pardev_cb par_cb;
 	struct parport *port;
 	struct pardevice *dev;
 	int daisy;
 
-	memset(&par_cb, 0, sizeof(par_cb));
 	spin_lock(&topology_lock);
 	while (p && p->devnum != devnum)
 		p = p->next;
@@ -232,7 +230,7 @@ struct pardevice *parport_open(int devnum, const char *name)
 	port = parport_get_port(p->port);
 	spin_unlock(&topology_lock);
 
-	dev = parport_register_dev_model(port, name, &par_cb, devnum);
+	dev = parport_register_device(port, name, NULL, NULL, NULL, 0, NULL);
 	parport_put_port(port);
 	if (!dev)
 		return NULL;
@@ -482,31 +480,3 @@ static int assign_addrs(struct parport *port)
 	kfree(deviceid);
 	return detected;
 }
-
-static int daisy_drv_probe(struct pardevice *par_dev)
-{
-	struct device_driver *drv = par_dev->dev.driver;
-
-	if (strcmp(drv->name, "daisy_drv"))
-		return -ENODEV;
-	if (strcmp(par_dev->name, daisy_dev_name))
-		return -ENODEV;
-
-	return 0;
-}
-
-static struct parport_driver daisy_driver = {
-	.name = "daisy_drv",
-	.probe = daisy_drv_probe,
-	.devmodel = true,
-};
-
-int daisy_drv_init(void)
-{
-	return parport_register_driver(&daisy_driver);
-}
-
-void daisy_drv_exit(void)
-{
-	parport_unregister_driver(&daisy_driver);
-}
diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c
index e5e6a463a941..e035174ba205 100644
--- a/drivers/parport/probe.c
+++ b/drivers/parport/probe.c
@@ -257,7 +257,7 @@ static ssize_t parport_read_device_id (struct parport *port, char *buffer,
 ssize_t parport_device_id (int devnum, char *buffer, size_t count)
 {
 	ssize_t retval = -ENXIO;
-	struct pardevice *dev = parport_open(devnum, daisy_dev_name);
+	struct pardevice *dev = parport_open (devnum, "Device ID probe");
 	if (!dev)
 		return -ENXIO;
 
diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index 0171b8dbcdcd..5dc53d420ca8 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -137,19 +137,11 @@ static struct bus_type parport_bus_type = {
 
 int parport_bus_init(void)
 {
-	int retval;
-
-	retval = bus_register(&parport_bus_type);
-	if (retval)
-		return retval;
-	daisy_drv_init();
-
-	return 0;
+	return bus_register(&parport_bus_type);
 }
 
 void parport_bus_exit(void)
 {
-	daisy_drv_exit();
 	bus_unregister(&parport_bus_type);
 }
 
diff --git a/include/linux/parport.h b/include/linux/parport.h
index f41f1d041e2c..397607a0c0eb 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -460,7 +460,6 @@ extern size_t parport_ieee1284_epp_read_addr (struct parport *,
 					      void *, size_t, int);
 
 /* IEEE1284.3 functions */
-#define daisy_dev_name "Device ID probe"
 extern int parport_daisy_init (struct parport *port);
 extern void parport_daisy_fini (struct parport *port);
 extern struct pardevice *parport_open (int devnum, const char *name);
@@ -469,18 +468,6 @@ extern ssize_t parport_device_id (int devnum, char *buffer, size_t len);
 extern void parport_daisy_deselect_all (struct parport *port);
 extern int parport_daisy_select (struct parport *port, int daisy, int mode);
 
-#ifdef CONFIG_PARPORT_1284
-extern int daisy_drv_init(void);
-extern void daisy_drv_exit(void);
-#else
-static inline int daisy_drv_init(void)
-{
-	return 0;
-}
-
-static inline void daisy_drv_exit(void) {}
-#endif
-
 /* Lowlevel drivers _can_ call this support function to handle irqs.  */
 static inline void parport_generic_irq(struct parport *port)
 {
-- 
cgit v1.2.3


From db779ef67ffeadbb44e9e818eb64dbe528e2f48f Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhsharma@redhat.com>
Date: Tue, 26 Mar 2019 12:20:28 +0530
Subject: proc/kcore: Remove unused kclist_add_remap()

Commit

  bf904d2762ee ("x86/pti/64: Remove the SYSCALL64 entry trampoline")

removed the sole usage of kclist_add_remap() but it did not remove the
left-over definition from the include file.

Fix the same.

Signed-off-by: Bhupesh Sharma <bhsharma@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kairui Song <kasong@redhat.com>
Cc: kexec@lists.infradead.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Omar Sandoval <osandov@fb.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/1553583028-17804-1-git-send-email-bhsharma@redhat.com
---
 include/linux/kcore.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 8c3f8c14eeaa..94b561df3877 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -38,22 +38,11 @@ struct vmcoredd_node {
 
 #ifdef CONFIG_PROC_KCORE
 void __init kclist_add(struct kcore_list *, void *, size_t, int type);
-static inline
-void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz)
-{
-	m->vaddr = (unsigned long)vaddr;
-	kclist_add(m, addr, sz, KCORE_REMAP);
-}
 #else
 static inline
 void kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
 {
 }
-
-static inline
-void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz)
-{
-}
 #endif
 
 #endif /* _LINUX_KCORE_H */
-- 
cgit v1.2.3


From 450895d04ba13a96886eddfeddb11556ae8624f1 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 24 Mar 2019 00:18:46 +0200
Subject: net: phy: bcm54xx: Encode link speed and activity into LEDs

Previously the green and amber LEDs on this quad PHY were solid, to
indicate an encoding of the link speed (10/100/1000).

This keeps the LEDs always on just as before, but now they flash on
Rx/Tx activity.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 13 +++++++++++++
 include/linux/brcmphy.h    | 16 ++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 9605d4fe540b..cb86a3e90c7d 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -323,6 +323,19 @@ static int bcm54xx_config_init(struct phy_device *phydev)
 
 	bcm54xx_phydsp_config(phydev);
 
+	/* Encode link speed into LED1 and LED3 pair (green/amber).
+	 * Also flash these two LEDs on activity. This means configuring
+	 * them for MULTICOLOR and encoding link/activity into them.
+	 */
+	val = BCM5482_SHD_LEDS1_LED1(BCM_LED_SRC_MULTICOLOR1) |
+		BCM5482_SHD_LEDS1_LED3(BCM_LED_SRC_MULTICOLOR1);
+	bcm_phy_write_shadow(phydev, BCM5482_SHD_LEDS1, val);
+
+	val = BCM_LED_MULTICOLOR_IN_PHASE |
+		BCM5482_SHD_LEDS1_LED1(BCM_LED_MULTICOLOR_LINK_ACT) |
+		BCM5482_SHD_LEDS1_LED3(BCM_LED_MULTICOLOR_LINK_ACT);
+	bcm_phy_write_exp(phydev, BCM_EXP_MULTICOLOR, val);
+
 	return 0;
 }
 
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 9cd00a37b8d3..6db2d9a6e503 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -148,6 +148,22 @@
 #define BCM_LED_SRC_OFF		0xe	/* Tied high */
 #define BCM_LED_SRC_ON		0xf	/* Tied low */
 
+/*
+ * Broadcom Multicolor LED configurations (expansion register 4)
+ */
+#define BCM_EXP_MULTICOLOR		(MII_BCM54XX_EXP_SEL_ER + 0x04)
+#define BCM_LED_MULTICOLOR_IN_PHASE	BIT(8)
+#define BCM_LED_MULTICOLOR_LINK_ACT	0x0
+#define BCM_LED_MULTICOLOR_SPEED	0x1
+#define BCM_LED_MULTICOLOR_ACT_FLASH	0x2
+#define BCM_LED_MULTICOLOR_FDX		0x3
+#define BCM_LED_MULTICOLOR_OFF		0x4
+#define BCM_LED_MULTICOLOR_ON		0x5
+#define BCM_LED_MULTICOLOR_ALT		0x6
+#define BCM_LED_MULTICOLOR_FLASH	0x7
+#define BCM_LED_MULTICOLOR_LINK		0x8
+#define BCM_LED_MULTICOLOR_ACT		0x9
+#define BCM_LED_MULTICOLOR_PROGRAM	0xa
 
 /*
  * BCM5482: Shadow registers
-- 
cgit v1.2.3


From 0532a1b0d045115521a93acf28f1270df89ad806 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 22 Mar 2019 09:19:34 +0100
Subject: virt: vbox: Implement passing requestor info to the host for
 VirtualBox 6.0.x

VirtualBox 6.0.x has a new feature where the guest kernel driver passes
info about the origin of the request (e.g. userspace or kernelspace) to
the hypervisor.

If we do not pass this information then when running the 6.0.x userspace
guest-additions tools on a 6.0.x host, some requests will get denied
with a VERR_VERSION_MISMATCH error, breaking vboxservice.service and
the mounting of shared folders marked to be auto-mounted.

This commit implements passing the requestor info to the host, fixing this.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/vboxguest/vboxguest_core.c    | 106 ++++++++++++++++++++---------
 drivers/virt/vboxguest/vboxguest_core.h    |  15 ++--
 drivers/virt/vboxguest/vboxguest_linux.c   |  26 ++++++-
 drivers/virt/vboxguest/vboxguest_utils.c   |  32 +++++----
 drivers/virt/vboxguest/vboxguest_version.h |   9 ++-
 drivers/virt/vboxguest/vmmdev.h            |   8 ++-
 include/linux/vbox_utils.h                 |  12 ++--
 include/uapi/linux/vbox_vmmdev_types.h     |  60 ++++++++++++++++
 8 files changed, 197 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c
index df7d09409efe..8ca333f21292 100644
--- a/drivers/virt/vboxguest/vboxguest_core.c
+++ b/drivers/virt/vboxguest/vboxguest_core.c
@@ -27,6 +27,10 @@
 
 #define GUEST_MAPPINGS_TRIES	5
 
+#define VBG_KERNEL_REQUEST \
+	(VMMDEV_REQUESTOR_KERNEL | VMMDEV_REQUESTOR_USR_DRV | \
+	 VMMDEV_REQUESTOR_CON_DONT_KNOW | VMMDEV_REQUESTOR_TRUST_NOT_GIVEN)
+
 /**
  * Reserves memory in which the VMM can relocate any guest mappings
  * that are floating around.
@@ -48,7 +52,8 @@ static void vbg_guest_mappings_init(struct vbg_dev *gdev)
 	int i, rc;
 
 	/* Query the required space. */
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HYPERVISOR_INFO);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HYPERVISOR_INFO,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return;
 
@@ -135,7 +140,8 @@ static void vbg_guest_mappings_exit(struct vbg_dev *gdev)
 	 * Tell the host that we're going to free the memory we reserved for
 	 * it, the free it up. (Leak the memory if anything goes wrong here.)
 	 */
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_HYPERVISOR_INFO);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_HYPERVISOR_INFO,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return;
 
@@ -172,8 +178,10 @@ static int vbg_report_guest_info(struct vbg_dev *gdev)
 	struct vmmdev_guest_info2 *req2 = NULL;
 	int rc, ret = -ENOMEM;
 
-	req1 = vbg_req_alloc(sizeof(*req1), VMMDEVREQ_REPORT_GUEST_INFO);
-	req2 = vbg_req_alloc(sizeof(*req2), VMMDEVREQ_REPORT_GUEST_INFO2);
+	req1 = vbg_req_alloc(sizeof(*req1), VMMDEVREQ_REPORT_GUEST_INFO,
+			     VBG_KERNEL_REQUEST);
+	req2 = vbg_req_alloc(sizeof(*req2), VMMDEVREQ_REPORT_GUEST_INFO2,
+			     VBG_KERNEL_REQUEST);
 	if (!req1 || !req2)
 		goto out_free;
 
@@ -187,8 +195,8 @@ static int vbg_report_guest_info(struct vbg_dev *gdev)
 	req2->additions_minor = VBG_VERSION_MINOR;
 	req2->additions_build = VBG_VERSION_BUILD;
 	req2->additions_revision = VBG_SVN_REV;
-	/* (no features defined yet) */
-	req2->additions_features = 0;
+	req2->additions_features =
+		VMMDEV_GUEST_INFO2_ADDITIONS_FEATURES_REQUESTOR_INFO;
 	strlcpy(req2->name, VBG_VERSION_STRING,
 		sizeof(req2->name));
 
@@ -230,7 +238,8 @@ static int vbg_report_driver_status(struct vbg_dev *gdev, bool active)
 	struct vmmdev_guest_status *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_REPORT_GUEST_STATUS);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_REPORT_GUEST_STATUS,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -423,7 +432,8 @@ static int vbg_heartbeat_host_config(struct vbg_dev *gdev, bool enabled)
 	struct vmmdev_heartbeat *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_HEARTBEAT_CONFIGURE);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_HEARTBEAT_CONFIGURE,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -457,7 +467,8 @@ static int vbg_heartbeat_init(struct vbg_dev *gdev)
 
 	gdev->guest_heartbeat_req = vbg_req_alloc(
 					sizeof(*gdev->guest_heartbeat_req),
-					VMMDEVREQ_GUEST_HEARTBEAT);
+					VMMDEVREQ_GUEST_HEARTBEAT,
+					VBG_KERNEL_REQUEST);
 	if (!gdev->guest_heartbeat_req)
 		return -ENOMEM;
 
@@ -528,7 +539,8 @@ static int vbg_reset_host_event_filter(struct vbg_dev *gdev,
 	struct vmmdev_mask *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -567,8 +579,14 @@ static int vbg_set_session_event_filter(struct vbg_dev *gdev,
 	u32 changed, previous;
 	int rc, ret = 0;
 
-	/* Allocate a request buffer before taking the spinlock */
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK);
+	/*
+	 * Allocate a request buffer before taking the spinlock, when
+	 * the session is being terminated the requestor is the kernel,
+	 * as we're cleaning up.
+	 */
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK,
+			    session_termination ? VBG_KERNEL_REQUEST :
+						  session->requestor);
 	if (!req) {
 		if (!session_termination)
 			return -ENOMEM;
@@ -627,7 +645,8 @@ static int vbg_reset_host_capabilities(struct vbg_dev *gdev)
 	struct vmmdev_mask *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -662,8 +681,14 @@ static int vbg_set_session_capabilities(struct vbg_dev *gdev,
 	u32 changed, previous;
 	int rc, ret = 0;
 
-	/* Allocate a request buffer before taking the spinlock */
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES);
+	/*
+	 * Allocate a request buffer before taking the spinlock, when
+	 * the session is being terminated the requestor is the kernel,
+	 * as we're cleaning up.
+	 */
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES,
+			    session_termination ? VBG_KERNEL_REQUEST :
+						  session->requestor);
 	if (!req) {
 		if (!session_termination)
 			return -ENOMEM;
@@ -722,7 +747,8 @@ static int vbg_query_host_version(struct vbg_dev *gdev)
 	struct vmmdev_host_version *req;
 	int rc, ret;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HOST_VERSION);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HOST_VERSION,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -783,19 +809,24 @@ int vbg_core_init(struct vbg_dev *gdev, u32 fixed_events)
 
 	gdev->mem_balloon.get_req =
 		vbg_req_alloc(sizeof(*gdev->mem_balloon.get_req),
-			      VMMDEVREQ_GET_MEMBALLOON_CHANGE_REQ);
+			      VMMDEVREQ_GET_MEMBALLOON_CHANGE_REQ,
+			      VBG_KERNEL_REQUEST);
 	gdev->mem_balloon.change_req =
 		vbg_req_alloc(sizeof(*gdev->mem_balloon.change_req),
-			      VMMDEVREQ_CHANGE_MEMBALLOON);
+			      VMMDEVREQ_CHANGE_MEMBALLOON,
+			      VBG_KERNEL_REQUEST);
 	gdev->cancel_req =
 		vbg_req_alloc(sizeof(*(gdev->cancel_req)),
-			      VMMDEVREQ_HGCM_CANCEL2);
+			      VMMDEVREQ_HGCM_CANCEL2,
+			      VBG_KERNEL_REQUEST);
 	gdev->ack_events_req =
 		vbg_req_alloc(sizeof(*gdev->ack_events_req),
-			      VMMDEVREQ_ACKNOWLEDGE_EVENTS);
+			      VMMDEVREQ_ACKNOWLEDGE_EVENTS,
+			      VBG_KERNEL_REQUEST);
 	gdev->mouse_status_req =
 		vbg_req_alloc(sizeof(*gdev->mouse_status_req),
-			      VMMDEVREQ_GET_MOUSE_STATUS);
+			      VMMDEVREQ_GET_MOUSE_STATUS,
+			      VBG_KERNEL_REQUEST);
 
 	if (!gdev->mem_balloon.get_req || !gdev->mem_balloon.change_req ||
 	    !gdev->cancel_req || !gdev->ack_events_req ||
@@ -892,9 +923,9 @@ void vbg_core_exit(struct vbg_dev *gdev)
  * vboxguest_linux.c calls this when userspace opens the char-device.
  * Return: A pointer to the new session or an ERR_PTR on error.
  * @gdev:		The Guest extension device.
- * @user:		Set if this is a session for the vboxuser device.
+ * @requestor:		VMMDEV_REQUESTOR_* flags
  */
-struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, bool user)
+struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, u32 requestor)
 {
 	struct vbg_session *session;
 
@@ -903,7 +934,7 @@ struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, bool user)
 		return ERR_PTR(-ENOMEM);
 
 	session->gdev = gdev;
-	session->user_session = user;
+	session->requestor = requestor;
 
 	return session;
 }
@@ -924,7 +955,9 @@ void vbg_core_close_session(struct vbg_session *session)
 		if (!session->hgcm_client_ids[i])
 			continue;
 
-		vbg_hgcm_disconnect(gdev, session->hgcm_client_ids[i], &rc);
+		/* requestor is kernel here, as we're cleaning up. */
+		vbg_hgcm_disconnect(gdev, VBG_KERNEL_REQUEST,
+				    session->hgcm_client_ids[i], &rc);
 	}
 
 	kfree(session);
@@ -1152,7 +1185,8 @@ static int vbg_req_allowed(struct vbg_dev *gdev, struct vbg_session *session,
 		return -EPERM;
 	}
 
-	if (trusted_apps_only && session->user_session) {
+	if (trusted_apps_only &&
+	    (session->requestor & VMMDEV_REQUESTOR_USER_DEVICE)) {
 		vbg_err("Denying userspace vmm call type %#08x through vboxuser device node\n",
 			req->request_type);
 		return -EPERM;
@@ -1209,8 +1243,8 @@ static int vbg_ioctl_hgcm_connect(struct vbg_dev *gdev,
 	if (i >= ARRAY_SIZE(session->hgcm_client_ids))
 		return -EMFILE;
 
-	ret = vbg_hgcm_connect(gdev, &conn->u.in.loc, &client_id,
-			       &conn->hdr.rc);
+	ret = vbg_hgcm_connect(gdev, session->requestor, &conn->u.in.loc,
+			       &client_id, &conn->hdr.rc);
 
 	mutex_lock(&gdev->session_mutex);
 	if (ret == 0 && conn->hdr.rc >= 0) {
@@ -1251,7 +1285,8 @@ static int vbg_ioctl_hgcm_disconnect(struct vbg_dev *gdev,
 	if (i >= ARRAY_SIZE(session->hgcm_client_ids))
 		return -EINVAL;
 
-	ret = vbg_hgcm_disconnect(gdev, client_id, &disconn->hdr.rc);
+	ret = vbg_hgcm_disconnect(gdev, session->requestor, client_id,
+				  &disconn->hdr.rc);
 
 	mutex_lock(&gdev->session_mutex);
 	if (ret == 0 && disconn->hdr.rc >= 0)
@@ -1313,12 +1348,12 @@ static int vbg_ioctl_hgcm_call(struct vbg_dev *gdev,
 	}
 
 	if (IS_ENABLED(CONFIG_COMPAT) && f32bit)
-		ret = vbg_hgcm_call32(gdev, client_id,
+		ret = vbg_hgcm_call32(gdev, session->requestor, client_id,
 				      call->function, call->timeout_ms,
 				      VBG_IOCTL_HGCM_CALL_PARMS32(call),
 				      call->parm_count, &call->hdr.rc);
 	else
-		ret = vbg_hgcm_call(gdev, client_id,
+		ret = vbg_hgcm_call(gdev, session->requestor, client_id,
 				    call->function, call->timeout_ms,
 				    VBG_IOCTL_HGCM_CALL_PARMS(call),
 				    call->parm_count, &call->hdr.rc);
@@ -1408,6 +1443,7 @@ static int vbg_ioctl_check_balloon(struct vbg_dev *gdev,
 }
 
 static int vbg_ioctl_write_core_dump(struct vbg_dev *gdev,
+				     struct vbg_session *session,
 				     struct vbg_ioctl_write_coredump *dump)
 {
 	struct vmmdev_write_core_dump *req;
@@ -1415,7 +1451,8 @@ static int vbg_ioctl_write_core_dump(struct vbg_dev *gdev,
 	if (vbg_ioctl_chk(&dump->hdr, sizeof(dump->u.in), 0))
 		return -EINVAL;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_WRITE_COREDUMP);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_WRITE_COREDUMP,
+			    session->requestor);
 	if (!req)
 		return -ENOMEM;
 
@@ -1476,7 +1513,7 @@ int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data)
 	case VBG_IOCTL_CHECK_BALLOON:
 		return vbg_ioctl_check_balloon(gdev, data);
 	case VBG_IOCTL_WRITE_CORE_DUMP:
-		return vbg_ioctl_write_core_dump(gdev, data);
+		return vbg_ioctl_write_core_dump(gdev, session, data);
 	}
 
 	/* Variable sized requests. */
@@ -1508,7 +1545,8 @@ int vbg_core_set_mouse_status(struct vbg_dev *gdev, u32 features)
 	struct vmmdev_mouse_status *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_MOUSE_STATUS);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_MOUSE_STATUS,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
diff --git a/drivers/virt/vboxguest/vboxguest_core.h b/drivers/virt/vboxguest/vboxguest_core.h
index 7ad9ec45bfa9..4188c12b839f 100644
--- a/drivers/virt/vboxguest/vboxguest_core.h
+++ b/drivers/virt/vboxguest/vboxguest_core.h
@@ -154,15 +154,15 @@ struct vbg_session {
 	 * host. Protected by vbg_gdev.session_mutex.
 	 */
 	u32 guest_caps;
-	/** Does this session belong to a root process or a user one? */
-	bool user_session;
+	/** VMMDEV_REQUESTOR_* flags */
+	u32 requestor;
 	/** Set on CANCEL_ALL_WAITEVENTS, protected by vbg_devevent_spinlock. */
 	bool cancel_waiters;
 };
 
 int  vbg_core_init(struct vbg_dev *gdev, u32 fixed_events);
 void vbg_core_exit(struct vbg_dev *gdev);
-struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, bool user);
+struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, u32 requestor);
 void vbg_core_close_session(struct vbg_session *session);
 int  vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data);
 int  vbg_core_set_mouse_status(struct vbg_dev *gdev, u32 features);
@@ -172,12 +172,13 @@ irqreturn_t vbg_core_isr(int irq, void *dev_id);
 void vbg_linux_mouse_event(struct vbg_dev *gdev);
 
 /* Private (non exported) functions form vboxguest_utils.c */
-void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type);
+void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type,
+		    u32 requestor);
 void vbg_req_free(void *req, size_t len);
 int vbg_req_perform(struct vbg_dev *gdev, void *req);
 int vbg_hgcm_call32(
-	struct vbg_dev *gdev, u32 client_id, u32 function, u32 timeout_ms,
-	struct vmmdev_hgcm_function_parameter32 *parm32, u32 parm_count,
-	int *vbox_status);
+	struct vbg_dev *gdev, u32 requestor, u32 client_id, u32 function,
+	u32 timeout_ms, struct vmmdev_hgcm_function_parameter32 *parm32,
+	u32 parm_count, int *vbox_status);
 
 #endif
diff --git a/drivers/virt/vboxguest/vboxguest_linux.c b/drivers/virt/vboxguest/vboxguest_linux.c
index 6e2a9619192d..6e8c0f1c1056 100644
--- a/drivers/virt/vboxguest/vboxguest_linux.c
+++ b/drivers/virt/vboxguest/vboxguest_linux.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2006-2016 Oracle Corporation
  */
 
+#include <linux/cred.h>
 #include <linux/input.h>
 #include <linux/kernel.h>
 #include <linux/miscdevice.h>
@@ -28,6 +29,23 @@ static DEFINE_MUTEX(vbg_gdev_mutex);
 /** Global vbg_gdev pointer used by vbg_get/put_gdev. */
 static struct vbg_dev *vbg_gdev;
 
+static u32 vbg_misc_device_requestor(struct inode *inode)
+{
+	u32 requestor = VMMDEV_REQUESTOR_USERMODE |
+			VMMDEV_REQUESTOR_CON_DONT_KNOW |
+			VMMDEV_REQUESTOR_TRUST_NOT_GIVEN;
+
+	if (from_kuid(current_user_ns(), current->cred->uid) == 0)
+		requestor |= VMMDEV_REQUESTOR_USR_ROOT;
+	else
+		requestor |= VMMDEV_REQUESTOR_USR_USER;
+
+	if (in_egroup_p(inode->i_gid))
+		requestor |= VMMDEV_REQUESTOR_GRP_VBOX;
+
+	return requestor;
+}
+
 static int vbg_misc_device_open(struct inode *inode, struct file *filp)
 {
 	struct vbg_session *session;
@@ -36,7 +54,7 @@ static int vbg_misc_device_open(struct inode *inode, struct file *filp)
 	/* misc_open sets filp->private_data to our misc device */
 	gdev = container_of(filp->private_data, struct vbg_dev, misc_device);
 
-	session = vbg_core_open_session(gdev, false);
+	session = vbg_core_open_session(gdev, vbg_misc_device_requestor(inode));
 	if (IS_ERR(session))
 		return PTR_ERR(session);
 
@@ -53,7 +71,8 @@ static int vbg_misc_device_user_open(struct inode *inode, struct file *filp)
 	gdev = container_of(filp->private_data, struct vbg_dev,
 			    misc_device_user);
 
-	session = vbg_core_open_session(gdev, false);
+	session = vbg_core_open_session(gdev, vbg_misc_device_requestor(inode) |
+					      VMMDEV_REQUESTOR_USER_DEVICE);
 	if (IS_ERR(session))
 		return PTR_ERR(session);
 
@@ -115,7 +134,8 @@ static long vbg_misc_device_ioctl(struct file *filp, unsigned int req,
 			 req == VBG_IOCTL_VMMDEV_REQUEST_BIG;
 
 	if (is_vmmdev_req)
-		buf = vbg_req_alloc(size, VBG_IOCTL_HDR_TYPE_DEFAULT);
+		buf = vbg_req_alloc(size, VBG_IOCTL_HDR_TYPE_DEFAULT,
+				    session->requestor);
 	else
 		buf = kmalloc(size, GFP_KERNEL);
 	if (!buf)
diff --git a/drivers/virt/vboxguest/vboxguest_utils.c b/drivers/virt/vboxguest/vboxguest_utils.c
index bf4474214b4d..75fd140b02ff 100644
--- a/drivers/virt/vboxguest/vboxguest_utils.c
+++ b/drivers/virt/vboxguest/vboxguest_utils.c
@@ -62,7 +62,8 @@ VBG_LOG(vbg_err, pr_err);
 VBG_LOG(vbg_debug, pr_debug);
 #endif
 
-void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type)
+void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type,
+		    u32 requestor)
 {
 	struct vmmdev_request_header *req;
 	int order = get_order(PAGE_ALIGN(len));
@@ -78,7 +79,7 @@ void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type)
 	req->request_type = req_type;
 	req->rc = VERR_GENERAL_FAILURE;
 	req->reserved1 = 0;
-	req->reserved2 = 0;
+	req->requestor = requestor;
 
 	return req;
 }
@@ -119,7 +120,7 @@ static bool hgcm_req_done(struct vbg_dev *gdev,
 	return done;
 }
 
-int vbg_hgcm_connect(struct vbg_dev *gdev,
+int vbg_hgcm_connect(struct vbg_dev *gdev, u32 requestor,
 		     struct vmmdev_hgcm_service_location *loc,
 		     u32 *client_id, int *vbox_status)
 {
@@ -127,7 +128,7 @@ int vbg_hgcm_connect(struct vbg_dev *gdev,
 	int rc;
 
 	hgcm_connect = vbg_req_alloc(sizeof(*hgcm_connect),
-				     VMMDEVREQ_HGCM_CONNECT);
+				     VMMDEVREQ_HGCM_CONNECT, requestor);
 	if (!hgcm_connect)
 		return -ENOMEM;
 
@@ -153,13 +154,15 @@ int vbg_hgcm_connect(struct vbg_dev *gdev,
 }
 EXPORT_SYMBOL(vbg_hgcm_connect);
 
-int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 client_id, int *vbox_status)
+int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 requestor,
+			u32 client_id, int *vbox_status)
 {
 	struct vmmdev_hgcm_disconnect *hgcm_disconnect = NULL;
 	int rc;
 
 	hgcm_disconnect = vbg_req_alloc(sizeof(*hgcm_disconnect),
-					VMMDEVREQ_HGCM_DISCONNECT);
+					VMMDEVREQ_HGCM_DISCONNECT,
+					requestor);
 	if (!hgcm_disconnect)
 		return -ENOMEM;
 
@@ -593,9 +596,10 @@ static int hgcm_call_copy_back_result(
 	return 0;
 }
 
-int vbg_hgcm_call(struct vbg_dev *gdev, u32 client_id, u32 function,
-		  u32 timeout_ms, struct vmmdev_hgcm_function_parameter *parms,
-		  u32 parm_count, int *vbox_status)
+int vbg_hgcm_call(struct vbg_dev *gdev, u32 requestor, u32 client_id,
+		  u32 function, u32 timeout_ms,
+		  struct vmmdev_hgcm_function_parameter *parms, u32 parm_count,
+		  int *vbox_status)
 {
 	struct vmmdev_hgcm_call *call;
 	void **bounce_bufs = NULL;
@@ -615,7 +619,7 @@ int vbg_hgcm_call(struct vbg_dev *gdev, u32 client_id, u32 function,
 		goto free_bounce_bufs;
 	}
 
-	call = vbg_req_alloc(size, VMMDEVREQ_HGCM_CALL);
+	call = vbg_req_alloc(size, VMMDEVREQ_HGCM_CALL, requestor);
 	if (!call) {
 		ret = -ENOMEM;
 		goto free_bounce_bufs;
@@ -647,9 +651,9 @@ EXPORT_SYMBOL(vbg_hgcm_call);
 
 #ifdef CONFIG_COMPAT
 int vbg_hgcm_call32(
-	struct vbg_dev *gdev, u32 client_id, u32 function, u32 timeout_ms,
-	struct vmmdev_hgcm_function_parameter32 *parm32, u32 parm_count,
-	int *vbox_status)
+	struct vbg_dev *gdev, u32 requestor, u32 client_id, u32 function,
+	u32 timeout_ms, struct vmmdev_hgcm_function_parameter32 *parm32,
+	u32 parm_count, int *vbox_status)
 {
 	struct vmmdev_hgcm_function_parameter *parm64 = NULL;
 	u32 i, size;
@@ -689,7 +693,7 @@ int vbg_hgcm_call32(
 			goto out_free;
 	}
 
-	ret = vbg_hgcm_call(gdev, client_id, function, timeout_ms,
+	ret = vbg_hgcm_call(gdev, requestor, client_id, function, timeout_ms,
 			    parm64, parm_count, vbox_status);
 	if (ret < 0)
 		goto out_free;
diff --git a/drivers/virt/vboxguest/vboxguest_version.h b/drivers/virt/vboxguest/vboxguest_version.h
index 77f0c8f8a231..84834dad38d5 100644
--- a/drivers/virt/vboxguest/vboxguest_version.h
+++ b/drivers/virt/vboxguest/vboxguest_version.h
@@ -9,11 +9,10 @@
 #ifndef __VBOX_VERSION_H__
 #define __VBOX_VERSION_H__
 
-/* Last synced October 4th 2017 */
-#define VBG_VERSION_MAJOR 5
-#define VBG_VERSION_MINOR 2
+#define VBG_VERSION_MAJOR 6
+#define VBG_VERSION_MINOR 0
 #define VBG_VERSION_BUILD 0
-#define VBG_SVN_REV 68940
-#define VBG_VERSION_STRING "5.2.0"
+#define VBG_SVN_REV 127566
+#define VBG_VERSION_STRING "6.0.0"
 
 #endif
diff --git a/drivers/virt/vboxguest/vmmdev.h b/drivers/virt/vboxguest/vmmdev.h
index 5e2ae978935d..6337b8d75d96 100644
--- a/drivers/virt/vboxguest/vmmdev.h
+++ b/drivers/virt/vboxguest/vmmdev.h
@@ -98,8 +98,8 @@ struct vmmdev_request_header {
 	s32 rc;
 	/** Reserved field no.1. MBZ. */
 	u32 reserved1;
-	/** Reserved field no.2. MBZ. */
-	u32 reserved2;
+	/** IN: Requestor information (VMMDEV_REQUESTOR_*) */
+	u32 requestor;
 };
 VMMDEV_ASSERT_SIZE(vmmdev_request_header, 24);
 
@@ -247,6 +247,8 @@ struct vmmdev_guest_info {
 };
 VMMDEV_ASSERT_SIZE(vmmdev_guest_info, 24 + 8);
 
+#define VMMDEV_GUEST_INFO2_ADDITIONS_FEATURES_REQUESTOR_INFO	BIT(0)
+
 /** struct vmmdev_guestinfo2 - Guest information report, version 2. */
 struct vmmdev_guest_info2 {
 	/** Header. */
@@ -259,7 +261,7 @@ struct vmmdev_guest_info2 {
 	u32 additions_build;
 	/** SVN revision. */
 	u32 additions_revision;
-	/** Feature mask, currently unused. */
+	/** Feature mask. */
 	u32 additions_features;
 	/**
 	 * The intentional meaning of this field was:
diff --git a/include/linux/vbox_utils.h b/include/linux/vbox_utils.h
index a240ed2a0372..ff56c443180c 100644
--- a/include/linux/vbox_utils.h
+++ b/include/linux/vbox_utils.h
@@ -24,15 +24,17 @@ __printf(1, 2) void vbg_debug(const char *fmt, ...);
 #define vbg_debug pr_debug
 #endif
 
-int vbg_hgcm_connect(struct vbg_dev *gdev,
+int vbg_hgcm_connect(struct vbg_dev *gdev, u32 requestor,
 		     struct vmmdev_hgcm_service_location *loc,
 		     u32 *client_id, int *vbox_status);
 
-int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 client_id, int *vbox_status);
+int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 requestor,
+			u32 client_id, int *vbox_status);
 
-int vbg_hgcm_call(struct vbg_dev *gdev, u32 client_id, u32 function,
-		  u32 timeout_ms, struct vmmdev_hgcm_function_parameter *parms,
-		  u32 parm_count, int *vbox_status);
+int vbg_hgcm_call(struct vbg_dev *gdev, u32 requestor, u32 client_id,
+		  u32 function, u32 timeout_ms,
+		  struct vmmdev_hgcm_function_parameter *parms, u32 parm_count,
+		  int *vbox_status);
 
 /**
  * Convert a VirtualBox status code to a standard Linux kernel return value.
diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h
index 0e68024f36c7..26f39816af14 100644
--- a/include/uapi/linux/vbox_vmmdev_types.h
+++ b/include/uapi/linux/vbox_vmmdev_types.h
@@ -102,6 +102,66 @@ enum vmmdev_request_type {
 #define VMMDEVREQ_HGCM_CALL VMMDEVREQ_HGCM_CALL32
 #endif
 
+/* vmmdev_request_header.requestor defines */
+
+/* Requestor user not given. */
+#define VMMDEV_REQUESTOR_USR_NOT_GIVEN                      0x00000000
+/* The kernel driver (vboxguest) is the requestor. */
+#define VMMDEV_REQUESTOR_USR_DRV                            0x00000001
+/* Some other kernel driver is the requestor. */
+#define VMMDEV_REQUESTOR_USR_DRV_OTHER                      0x00000002
+/* The root or a admin user is the requestor. */
+#define VMMDEV_REQUESTOR_USR_ROOT                           0x00000003
+/* Regular joe user is making the request. */
+#define VMMDEV_REQUESTOR_USR_USER                           0x00000006
+/* User classification mask. */
+#define VMMDEV_REQUESTOR_USR_MASK                           0x00000007
+
+/* Kernel mode request. Note this is 0, check for !USERMODE instead. */
+#define VMMDEV_REQUESTOR_KERNEL                             0x00000000
+/* User mode request. */
+#define VMMDEV_REQUESTOR_USERMODE                           0x00000008
+/* User or kernel mode classification mask. */
+#define VMMDEV_REQUESTOR_MODE_MASK                          0x00000008
+
+/* Don't know the physical console association of the requestor. */
+#define VMMDEV_REQUESTOR_CON_DONT_KNOW                      0x00000000
+/*
+ * The request originates with a process that is NOT associated with the
+ * physical console.
+ */
+#define VMMDEV_REQUESTOR_CON_NO                             0x00000010
+/* Requestor process is associated with the physical console. */
+#define VMMDEV_REQUESTOR_CON_YES                            0x00000020
+/* Console classification mask. */
+#define VMMDEV_REQUESTOR_CON_MASK                           0x00000030
+
+/* Requestor is member of special VirtualBox user group. */
+#define VMMDEV_REQUESTOR_GRP_VBOX                           0x00000080
+
+/* Note: trust level is for windows guests only, linux always uses not-given */
+/* Requestor trust level: Unspecified */
+#define VMMDEV_REQUESTOR_TRUST_NOT_GIVEN                    0x00000000
+/* Requestor trust level: Untrusted (SID S-1-16-0) */
+#define VMMDEV_REQUESTOR_TRUST_UNTRUSTED                    0x00001000
+/* Requestor trust level: Untrusted (SID S-1-16-4096) */
+#define VMMDEV_REQUESTOR_TRUST_LOW                          0x00002000
+/* Requestor trust level: Medium (SID S-1-16-8192) */
+#define VMMDEV_REQUESTOR_TRUST_MEDIUM                       0x00003000
+/* Requestor trust level: Medium plus (SID S-1-16-8448) */
+#define VMMDEV_REQUESTOR_TRUST_MEDIUM_PLUS                  0x00004000
+/* Requestor trust level: High (SID S-1-16-12288) */
+#define VMMDEV_REQUESTOR_TRUST_HIGH                         0x00005000
+/* Requestor trust level: System (SID S-1-16-16384) */
+#define VMMDEV_REQUESTOR_TRUST_SYSTEM                       0x00006000
+/* Requestor trust level >= Protected (SID S-1-16-20480, S-1-16-28672) */
+#define VMMDEV_REQUESTOR_TRUST_PROTECTED                    0x00007000
+/* Requestor trust level mask */
+#define VMMDEV_REQUESTOR_TRUST_MASK                         0x00007000
+
+/* Requestor is using the less trusted user device node (/dev/vboxuser) */
+#define VMMDEV_REQUESTOR_USER_DEVICE                        0x00008000
+
 /** HGCM service location types. */
 enum vmmdev_hgcm_service_location_type {
 	VMMDEV_HGCM_LOC_INVALID    = 0,
-- 
cgit v1.2.3


From 3d9683cf3bfb6d4e4605a153958dfca7e18b52f2 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 18 Mar 2019 18:08:12 +0900
Subject: KVM: export <linux/kvm_para.h> and <asm/kvm_para.h> iif KVM is
 supported

I do not see any consistency about headers_install of <linux/kvm_para.h>
and <asm/kvm_para.h>.

According to my analysis of Linux 5.1-rc1, there are 3 groups:

 [1] Both <linux/kvm_para.h> and <asm/kvm_para.h> are exported

    alpha, arm, hexagon, mips, powerpc, s390, sparc, x86

 [2] <asm/kvm_para.h> is exported, but <linux/kvm_para.h> is not

    arc, arm64, c6x, h8300, ia64, m68k, microblaze, nios2, openrisc,
    parisc, sh, unicore32, xtensa

 [3] Neither <linux/kvm_para.h> nor <asm/kvm_para.h> is exported

    csky, nds32, riscv

This does not match to the actual KVM support. At least, [2] is
half-baked.

Nor do arch maintainers look like they care about this. For example,
commit 0add53713b1c ("microblaze: Add missing kvm_para.h to Kbuild")
exported <asm/kvm_para.h> to user-space in order to fix an in-kernel
build error.

We have two ways to make this consistent:

 [A] export both <linux/kvm_para.h> and <asm/kvm_para.h> for all
     architectures, irrespective of the KVM support

 [B] Match the header export of <linux/kvm_para.h> and <asm/kvm_para.h>
     to the KVM support

My first attempt was [A] because the code looks cleaner, but Paolo
suggested [B].

So, this commit goes with [B].

For most architectures, <asm/kvm_para.h> was moved to the kernel-space.
I changed include/uapi/linux/Kbuild so that it checks generated
asm/kvm_para.h as well as check-in ones.

After this commit, there will be two groups:

 [1] Both <linux/kvm_para.h> and <asm/kvm_para.h> are exported

    arm, arm64, mips, powerpc, s390, x86

 [2] Neither <linux/kvm_para.h> nor <asm/kvm_para.h> is exported

    alpha, arc, c6x, csky, h8300, hexagon, ia64, m68k, microblaze,
    nds32, nios2, openrisc, parisc, riscv, sh, sparc, unicore32, xtensa

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/alpha/include/asm/Kbuild            | 1 +
 arch/alpha/include/uapi/asm/kvm_para.h   | 2 --
 arch/arc/include/asm/Kbuild              | 1 +
 arch/arc/include/uapi/asm/Kbuild         | 1 -
 arch/arm/include/uapi/asm/Kbuild         | 1 +
 arch/arm/include/uapi/asm/kvm_para.h     | 2 --
 arch/c6x/include/asm/Kbuild              | 1 +
 arch/c6x/include/uapi/asm/Kbuild         | 1 -
 arch/h8300/include/asm/Kbuild            | 1 +
 arch/h8300/include/uapi/asm/Kbuild       | 1 -
 arch/hexagon/include/asm/Kbuild          | 1 +
 arch/hexagon/include/uapi/asm/kvm_para.h | 2 --
 arch/ia64/include/asm/Kbuild             | 1 +
 arch/ia64/include/uapi/asm/Kbuild        | 1 -
 arch/m68k/include/asm/Kbuild             | 1 +
 arch/m68k/include/uapi/asm/Kbuild        | 1 -
 arch/microblaze/include/asm/Kbuild       | 1 +
 arch/microblaze/include/uapi/asm/Kbuild  | 1 -
 arch/nios2/include/asm/Kbuild            | 1 +
 arch/nios2/include/uapi/asm/Kbuild       | 1 -
 arch/openrisc/include/asm/Kbuild         | 1 +
 arch/openrisc/include/uapi/asm/Kbuild    | 1 -
 arch/parisc/include/asm/Kbuild           | 1 +
 arch/parisc/include/uapi/asm/Kbuild      | 1 -
 arch/sh/include/asm/Kbuild               | 1 +
 arch/sh/include/uapi/asm/Kbuild          | 1 -
 arch/sparc/include/asm/Kbuild            | 1 +
 arch/sparc/include/uapi/asm/kvm_para.h   | 2 --
 arch/unicore32/include/asm/Kbuild        | 1 +
 arch/unicore32/include/uapi/asm/Kbuild   | 1 -
 arch/xtensa/include/asm/Kbuild           | 1 +
 arch/xtensa/include/uapi/asm/Kbuild      | 1 -
 include/uapi/linux/Kbuild                | 2 ++
 33 files changed, 18 insertions(+), 20 deletions(-)
 delete mode 100644 arch/alpha/include/uapi/asm/kvm_para.h
 delete mode 100644 arch/arm/include/uapi/asm/kvm_para.h
 delete mode 100644 arch/hexagon/include/uapi/asm/kvm_para.h
 delete mode 100644 arch/sparc/include/uapi/asm/kvm_para.h

(limited to 'include')

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index dc0ab28baca1..70b783333965 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -6,6 +6,7 @@ generic-y += exec.h
 generic-y += export.h
 generic-y += fb.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
diff --git a/arch/alpha/include/uapi/asm/kvm_para.h b/arch/alpha/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/alpha/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index b41f8881ecc8..decc306a3b52 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -11,6 +11,7 @@ generic-y += hardirq.h
 generic-y += hw_irq.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/arc/include/uapi/asm/Kbuild
+++ b/arch/arc/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 23b4464c0995..ce8573157774 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -3,3 +3,4 @@
 generated-y += unistd-common.h
 generated-y += unistd-oabi.h
 generated-y += unistd-eabi.h
+generic-y += kvm_para.h
diff --git a/arch/arm/include/uapi/asm/kvm_para.h b/arch/arm/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/arm/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 63b4a1705182..249c9f6f26dc 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 3e7c8ecf151e..e3dead402e5f 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -23,6 +23,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index b25fd42aa0f4..d046e8ccdf78 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/hexagon/include/uapi/asm/kvm_para.h b/arch/hexagon/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/hexagon/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 43e21fe3499c..11f191689c9e 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -2,6 +2,7 @@ generated-y += syscall_table.h
 generic-y += compat.h
 generic-y += exec.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 20018cb883a9..62a9522af51e 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_64.h
-generic-y += kvm_para.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 95f8f631c4df..2c359d9e80f6 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -13,6 +13,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index 8a7ad40be463..7417847dc438 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 791cc8d54d0a..1a8285c3f693 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index 3ce84fbb2678..13f59631c576 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -1,3 +1,2 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 8fde4fa2c34f..88a667d12aaa 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -23,6 +23,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/nios2/include/uapi/asm/Kbuild
+++ b/arch/nios2/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 5a73e2956ac4..22aa97136c01 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -20,6 +20,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 6f49e77d82a2..9bcd0c903dbb 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -11,6 +11,7 @@ generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild
index 22fdbd08cdc8..2bd5b392277c 100644
--- a/arch/parisc/include/uapi/asm/Kbuild
+++ b/arch/parisc/include/uapi/asm/Kbuild
@@ -1,3 +1,2 @@
 generated-y += unistd_32.h
 generated-y += unistd_64.h
-generic-y += kvm_para.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index a6ef3fee5f85..7bf2cb680d32 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild
index ecfbd40924dd..b8812c74c1de 100644
--- a/arch/sh/include/uapi/asm/Kbuild
+++ b/arch/sh/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
 generated-y += unistd_32.h
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index b82f64e28f55..a22cfd5c0ee8 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += exec.h
 generic-y += export.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/sparc/include/uapi/asm/kvm_para.h b/arch/sparc/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/sparc/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 1d1544b6ca74..d77d953c04c1 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -18,6 +18,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 42b6cb3d16f7..3843198e03d4 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -15,6 +15,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild
index 8a7ad40be463..7417847dc438 100644
--- a/arch/xtensa/include/uapi/asm/Kbuild
+++ b/arch/xtensa/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 5f24b50c9e88..059dc2bedaf6 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -7,5 +7,7 @@ no-export-headers += kvm.h
 endif
 
 ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h),)
+ifeq ($(wildcard $(objtree)/arch/$(SRCARCH)/include/generated/uapi/asm/kvm_para.h),)
 no-export-headers += kvm_para.h
 endif
+endif
-- 
cgit v1.2.3


From 9b7ea46a82b31c74a37e6ff1c2a1df7d53e392ab Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Thu, 28 Mar 2019 20:43:34 -0700
Subject: mm/hotplug: fix offline undo_isolate_page_range()

Commit f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded
memory to zones until online") introduced move_pfn_range_to_zone() which
calls memmap_init_zone() during onlining a memory block.
memmap_init_zone() will reset pagetype flags and makes migrate type to
be MOVABLE.

However, in __offline_pages(), it also call undo_isolate_page_range()
after offline_isolated_pages() to do the same thing.  Due to commit
2ce13640b3f4 ("mm: __first_valid_page skip over offline pages") changed
__first_valid_page() to skip offline pages, undo_isolate_page_range()
here just waste CPU cycles looping around the offlining PFN range while
doing nothing, because __first_valid_page() will return NULL as
offline_isolated_pages() has already marked all memory sections within
the pfn range as offline via offline_mem_sections().

Also, after calling the "useless" undo_isolate_page_range() here, it
reaches the point of no returning by notifying MEM_OFFLINE.  Those pages
will be marked as MIGRATE_MOVABLE again once onlining.  The only thing
left to do is to decrease the number of isolated pageblocks zone counter
which would make some paths of the page allocation slower that the above
commit introduced.

Even if alloc_contig_range() can be used to isolate 16GB-hugetlb pages
on ppc64, an "int" should still be enough to represent the number of
pageblocks there.  Fix an incorrect comment along the way.

[cai@lca.pw: v4]
  Link: http://lkml.kernel.org/r/20190314150641.59358-1-cai@lca.pw
Link: http://lkml.kernel.org/r/20190313143133.46200-1-cai@lca.pw
Fixes: 2ce13640b3f4 ("mm: __first_valid_page skip over offline pages")
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>	[4.13+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-isolation.h | 10 ---------
 mm/memory_hotplug.c            | 17 +++++++++++----
 mm/page_alloc.c                |  2 +-
 mm/page_isolation.c            | 48 ++++++++++++++++++++++++++----------------
 mm/sparse.c                    |  2 +-
 5 files changed, 45 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 4eb26d278046..280ae96dc4c3 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -41,16 +41,6 @@ int move_freepages_block(struct zone *zone, struct page *page,
 
 /*
  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
- * If specified range includes migrate types other than MOVABLE or CMA,
- * this will fail with -EBUSY.
- *
- * For isolating all pages in the range finally, the caller have to
- * free all pages in the range. test_page_isolated() can be used for
- * test it.
- *
- * The following flags are allowed (they can be combined in a bit mask)
- * SKIP_HWPOISON - ignore hwpoison pages
- * REPORT_FAILURE - report details about the failure to isolate the range
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f767582af4f8..0e0a16021fd5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1576,7 +1576,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
 {
 	unsigned long pfn, nr_pages;
 	long offlined_pages;
-	int ret, node;
+	int ret, node, nr_isolate_pageblock;
 	unsigned long flags;
 	unsigned long valid_start, valid_end;
 	struct zone *zone;
@@ -1602,10 +1602,11 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	ret = start_isolate_page_range(start_pfn, end_pfn,
 				       MIGRATE_MOVABLE,
 				       SKIP_HWPOISON | REPORT_FAILURE);
-	if (ret) {
+	if (ret < 0) {
 		reason = "failure to isolate range";
 		goto failed_removal;
 	}
+	nr_isolate_pageblock = ret;
 
 	arg.start_pfn = start_pfn;
 	arg.nr_pages = nr_pages;
@@ -1657,8 +1658,16 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	/* Ok, all of our target is isolated.
 	   We cannot do rollback at this point. */
 	offline_isolated_pages(start_pfn, end_pfn);
-	/* reset pagetype flags and makes migrate type to be MOVABLE */
-	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+
+	/*
+	 * Onlining will reset pagetype flags and makes migrate type
+	 * MOVABLE, so just need to decrease the number of isolated
+	 * pageblocks zone counter here.
+	 */
+	spin_lock_irqsave(&zone->lock, flags);
+	zone->nr_isolate_pageblock -= nr_isolate_pageblock;
+	spin_unlock_irqrestore(&zone->lock, flags);
+
 	/* removal success */
 	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
 	zone->present_pages -= offlined_pages;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 03fcf73d47da..d96ca5bc555b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8233,7 +8233,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 
 	ret = start_isolate_page_range(pfn_max_align_down(start),
 				       pfn_max_align_up(end), migratetype, 0);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
 	/*
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index ce323e56b34d..bf4159d771c7 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -160,27 +160,36 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 	return NULL;
 }
 
-/*
- * start_isolate_page_range() -- make page-allocation-type of range of pages
- * to be MIGRATE_ISOLATE.
- * @start_pfn: The lower PFN of the range to be isolated.
- * @end_pfn: The upper PFN of the range to be isolated.
- * @migratetype: migrate type to set in error recovery.
+/**
+ * start_isolate_page_range() - make page-allocation-type of range of pages to
+ * be MIGRATE_ISOLATE.
+ * @start_pfn:		The lower PFN of the range to be isolated.
+ * @end_pfn:		The upper PFN of the range to be isolated.
+ *			start_pfn/end_pfn must be aligned to pageblock_order.
+ * @migratetype:	Migrate type to set in error recovery.
+ * @flags:		The following flags are allowed (they can be combined in
+ *			a bit mask)
+ *			SKIP_HWPOISON - ignore hwpoison pages
+ *			REPORT_FAILURE - report details about the failure to
+ *			isolate the range
  *
  * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
  * the range will never be allocated. Any free pages and pages freed in the
- * future will not be allocated again.
- *
- * start_pfn/end_pfn must be aligned to pageblock_order.
- * Return 0 on success and -EBUSY if any part of range cannot be isolated.
+ * future will not be allocated again. If specified range includes migrate types
+ * other than MOVABLE or CMA, this will fail with -EBUSY. For isolating all
+ * pages in the range finally, the caller have to free all pages in the range.
+ * test_page_isolated() can be used for test it.
  *
  * There is no high level synchronization mechanism that prevents two threads
- * from trying to isolate overlapping ranges.  If this happens, one thread
+ * from trying to isolate overlapping ranges. If this happens, one thread
  * will notice pageblocks in the overlapping range already set to isolate.
  * This happens in set_migratetype_isolate, and set_migratetype_isolate
- * returns an error.  We then clean up by restoring the migration type on
- * pageblocks we may have modified and return -EBUSY to caller.  This
+ * returns an error. We then clean up by restoring the migration type on
+ * pageblocks we may have modified and return -EBUSY to caller. This
  * prevents two threads from simultaneously working on overlapping ranges.
+ *
+ * Return: the number of isolated pageblocks on success and -EBUSY if any part
+ * of range cannot be isolated.
  */
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			     unsigned migratetype, int flags)
@@ -188,6 +197,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	unsigned long pfn;
 	unsigned long undo_pfn;
 	struct page *page;
+	int nr_isolate_pageblock = 0;
 
 	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
 	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
@@ -196,13 +206,15 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	     pfn < end_pfn;
 	     pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
-		if (page &&
-		    set_migratetype_isolate(page, migratetype, flags)) {
-			undo_pfn = pfn;
-			goto undo;
+		if (page) {
+			if (set_migratetype_isolate(page, migratetype, flags)) {
+				undo_pfn = pfn;
+				goto undo;
+			}
+			nr_isolate_pageblock++;
 		}
 	}
-	return 0;
+	return nr_isolate_pageblock;
 undo:
 	for (pfn = start_pfn;
 	     pfn < undo_pfn;
diff --git a/mm/sparse.c b/mm/sparse.c
index 69904aa6165b..56e057c432f9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -567,7 +567,7 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-/* Mark all memory sections within the pfn range as online */
+/* Mark all memory sections within the pfn range as offline */
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
-- 
cgit v1.2.3


From 6d6ea1e967a246f12cfe2f5fb743b70b2e608d4a Mon Sep 17 00:00:00 2001
From: Nicolas Boichat <drinkcat@chromium.org>
Date: Thu, 28 Mar 2019 20:43:42 -0700
Subject: mm: add support for kmem caches in DMA32 zone

Patch series "iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables",
v6.

This is a followup to the discussion in [1], [2].

IOMMUs using ARMv7 short-descriptor format require page tables (level 1
and 2) to be allocated within the first 4GB of RAM, even on 64-bit
systems.

For L1 tables that are bigger than a page, we can just use
__get_free_pages with GFP_DMA32 (on arm64 systems only, arm would still
use GFP_DMA).

For L2 tables that only take 1KB, it would be a waste to allocate a full
page, so we considered 3 approaches:
 1. This series, adding support for GFP_DMA32 slab caches.
 2. genalloc, which requires pre-allocating the maximum number of L2 page
    tables (4096, so 4MB of memory).
 3. page_frag, which is not very memory-efficient as it is unable to reuse
    freed fragments until the whole page is freed. [3]

This series is the most memory-efficient approach.

stable@ note:
  We confirmed that this is a regression, and IOMMU errors happen on 4.19
  and linux-next/master on MT8173 (elm, Acer Chromebook R13). The issue
  most likely starts from commit ad67f5a6545f ("arm64: replace ZONE_DMA
  with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number of Mediatek
  platforms (and maybe others?).

[1] https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html
[2] https://lists.linuxfoundation.org/pipermail/iommu/2018-December/031696.html
[3] https://patchwork.codeaurora.org/patch/671639/

This patch (of 3):

IOMMUs using ARMv7 short-descriptor format require page tables to be
allocated within the first 4GB of RAM, even on 64-bit systems.  On arm64,
this is done by passing GFP_DMA32 flag to memory allocation functions.

For IOMMU L2 tables that only take 1KB, it would be a waste to allocate
a full page using get_free_pages, so we considered 3 approaches:
 1. This patch, adding support for GFP_DMA32 slab caches.
 2. genalloc, which requires pre-allocating the maximum number of L2
    page tables (4096, so 4MB of memory).
 3. page_frag, which is not very memory-efficient as it is unable
    to reuse freed fragments until the whole page is freed.

This change makes it possible to create a custom cache in DMA32 zone using
kmem_cache_create, then allocate memory using kmem_cache_alloc.

We do not create a DMA32 kmalloc cache array, as there are currently no
users of kmalloc(..., GFP_DMA32).  These calls will continue to trigger a
warning, as we keep GFP_DMA32 in GFP_SLAB_BUG_MASK.

This implies that calls to kmem_cache_*alloc on a SLAB_CACHE_DMA32
kmem_cache must _not_ use GFP_DMA32 (it is anyway redundant and
unnecessary).

Link: http://lkml.kernel.org/r/20181210011504.122604-2-drinkcat@chromium.org
Signed-off-by: Nicolas Boichat <drinkcat@chromium.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Will Deacon <will.deacon@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Sasha Levin <Alexander.Levin@microsoft.com>
Cc: Huaisheng Ye <yehs1@lenovo.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Yong Wu <yong.wu@mediatek.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Tomasz Figa <tfiga@google.com>
Cc: Yingjoe Chen <yingjoe.chen@mediatek.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 2 ++
 mm/slab.c            | 2 ++
 mm/slab.h            | 3 ++-
 mm/slab_common.c     | 2 +-
 mm/slub.c            | 5 +++++
 5 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 11b45f7ae405..9449b19c5f10 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -32,6 +32,8 @@
 #define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)
 /* Use GFP_DMA memory */
 #define SLAB_CACHE_DMA		((slab_flags_t __force)0x00004000U)
+/* Use GFP_DMA32 memory */
+#define SLAB_CACHE_DMA32	((slab_flags_t __force)0x00008000U)
 /* DEBUG: Store the last owner for bug hunting */
 #define SLAB_STORE_USER		((slab_flags_t __force)0x00010000U)
 /* Panic if kmem_cache_create() fails */
diff --git a/mm/slab.c b/mm/slab.c
index 28652e4218e0..329bfe67f2ca 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2115,6 +2115,8 @@ done:
 	cachep->allocflags = __GFP_COMP;
 	if (flags & SLAB_CACHE_DMA)
 		cachep->allocflags |= GFP_DMA;
+	if (flags & SLAB_CACHE_DMA32)
+		cachep->allocflags |= GFP_DMA32;
 	if (flags & SLAB_RECLAIM_ACCOUNT)
 		cachep->allocflags |= __GFP_RECLAIMABLE;
 	cachep->size = size;
diff --git a/mm/slab.h b/mm/slab.h
index e5e6658eeacc..43ac818b8592 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -127,7 +127,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
 
 
 /* Legal flag mask for kmem_cache_create(), for various configurations */
-#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
+			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
 
 #if defined(CONFIG_DEBUG_SLAB)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 03eeb8b7b4b1..58251ba63e4a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -53,7 +53,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
 		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
-			 SLAB_ACCOUNT)
+			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
 
 /*
  * Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index 1b08fbcb7e61..d30ede89f4a6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3589,6 +3589,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	if (s->flags & SLAB_CACHE_DMA)
 		s->allocflags |= GFP_DMA;
 
+	if (s->flags & SLAB_CACHE_DMA32)
+		s->allocflags |= GFP_DMA32;
+
 	if (s->flags & SLAB_RECLAIM_ACCOUNT)
 		s->allocflags |= __GFP_RECLAIMABLE;
 
@@ -5679,6 +5682,8 @@ static char *create_unique_id(struct kmem_cache *s)
 	 */
 	if (s->flags & SLAB_CACHE_DMA)
 		*p++ = 'd';
+	if (s->flags & SLAB_CACHE_DMA32)
+		*p++ = 'D';
 	if (s->flags & SLAB_RECLAIM_ACCOUNT)
 		*p++ = 'a';
 	if (s->flags & SLAB_CONSISTENCY_CHECKS)
-- 
cgit v1.2.3


From a953e7721fa9999fd628885ed451e16641a23d1e Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Thu, 28 Mar 2019 20:43:51 -0700
Subject: include/linux/hugetlb.h: convert to use vm_fault_t

kbuild produces the below warning:

  tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
  head:   5453a3df2a5eb49bc24615d4cf0d66b2aae05e5f
  commit 3d3539018d2c ("mm: create the new vm_fault_t type")
  reproduce:
        # apt-get install sparse
        git checkout 3d3539018d2cbd12e5af4a132636ee7fd8d43ef0
        make ARCH=x86_64 allmodconfig
        make C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__'

  >> mm/memory.c:3968:21: sparse: incorrect type in assignment (different
  >> base types) @@    expected restricted vm_fault_t [usertype] ret @@
  >> got e] ret @@
     mm/memory.c:3968:21:    expected restricted vm_fault_t [usertype] ret
     mm/memory.c:3968:21:    got int

This patch converts to return vm_fault_t type for hugetlb_fault() when
CONFIG_HUGETLB_PAGE=n.

Regarding the sparse warning, Luc said:

: This is the expected behaviour.  The constant 0 is magic regarding bitwise
: types but ({ ...; 0; }) is not, it is just an ordinary expression of type
: 'int'.
:
: So, IMHO, Souptick's patch is the right thing to do.

Link: http://lkml.kernel.org/r/20190318162604.GA31553@jordon-HP-15-Notebook-PC
Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ea35263eb76b..11943b60f208 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -203,7 +203,6 @@ static inline void hugetlb_show_meminfo(void)
 #define pud_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
-#define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
 #define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
 				src_addr, pagep)	({ BUG(); 0; })
 #define huge_pte_offset(mm, address, sz)	0
@@ -234,6 +233,13 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
 {
 	BUG();
 }
+static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
+				struct vm_area_struct *vma, unsigned long address,
+				unsigned int flags)
+{
+	BUG();
+	return 0;
+}
 
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
-- 
cgit v1.2.3


From b736523f0759d1debeb56f8e0c4c87a2bea0fb23 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 28 Mar 2019 20:44:05 -0700
Subject: include/linux/list.h: fix list_is_first() kernel-doc

Fix typo of kernel-doc parameter notation (there should be no space
between '@' and the parameter name).

Also fixes bogus kernel-doc notation output formatting.

Link: http://lkml.kernel.org/r/ddce8b80-9a8a-d52d-3546-87b2211c089a@infradead.org
Fixes: 70b44595eafe9 ("mm, compaction: use free lists to quickly locate a migration source")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/list.h b/include/linux/list.h
index 79626b5ab36c..58aa3adf94e6 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -207,7 +207,7 @@ static inline void list_bulk_move_tail(struct list_head *head,
 }
 
 /**
- * list_is_first -- tests whether @ list is the first entry in list @head
+ * list_is_first -- tests whether @list is the first entry in list @head
  * @list: the entry to test
  * @head: the head of the list
  */
-- 
cgit v1.2.3


From fcfc2aa0185f4a731d05a21e9f359968fdfd02e7 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Thu, 28 Mar 2019 20:44:13 -0700
Subject: ptrace: take into account saved_sigmask in PTRACE{GET,SET}SIGMASK

There are a few system calls (pselect, ppoll, etc) which replace a task
sigmask while they are running in a kernel-space

When a task calls one of these syscalls, the kernel saves a current
sigmask in task->saved_sigmask and sets a syscall sigmask.

On syscall-exit-stop, ptrace traps a task before restoring the
saved_sigmask, so PTRACE_GETSIGMASK returns the syscall sigmask and
PTRACE_SETSIGMASK does nothing, because its sigmask is replaced by
saved_sigmask, when the task returns to user-space.

This patch fixes this problem.  PTRACE_GETSIGMASK returns saved_sigmask
if it's set.  PTRACE_SETSIGMASK drops the TIF_RESTORE_SIGMASK flag.

Link: http://lkml.kernel.org/r/20181120060616.6043-1-avagin@gmail.com
Fixes: 29000caecbe8 ("ptrace: add ability to get/set signal-blocked mask")
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/signal.h | 18 ++++++++++++++++++
 kernel/ptrace.c              | 15 +++++++++++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index ae5655197698..e412c092c1e8 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -418,10 +418,20 @@ static inline void set_restore_sigmask(void)
 	set_thread_flag(TIF_RESTORE_SIGMASK);
 	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
+
+static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+{
+	clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+}
+
 static inline void clear_restore_sigmask(void)
 {
 	clear_thread_flag(TIF_RESTORE_SIGMASK);
 }
+static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+{
+	return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+}
 static inline bool test_restore_sigmask(void)
 {
 	return test_thread_flag(TIF_RESTORE_SIGMASK);
@@ -439,6 +449,10 @@ static inline void set_restore_sigmask(void)
 	current->restore_sigmask = true;
 	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
+static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+{
+	tsk->restore_sigmask = false;
+}
 static inline void clear_restore_sigmask(void)
 {
 	current->restore_sigmask = false;
@@ -447,6 +461,10 @@ static inline bool test_restore_sigmask(void)
 {
 	return current->restore_sigmask;
 }
+static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+{
+	return tsk->restore_sigmask;
+}
 static inline bool test_and_clear_restore_sigmask(void)
 {
 	if (!current->restore_sigmask)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..6f357f4fc859 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/cn_proc.h>
 #include <linux/compat.h>
+#include <linux/sched/signal.h>
 
 /*
  * Access another process' address space via ptrace.
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request,
 			ret = ptrace_setsiginfo(child, &siginfo);
 		break;
 
-	case PTRACE_GETSIGMASK:
+	case PTRACE_GETSIGMASK: {
+		sigset_t *mask;
+
 		if (addr != sizeof(sigset_t)) {
 			ret = -EINVAL;
 			break;
 		}
 
-		if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+		if (test_tsk_restore_sigmask(child))
+			mask = &child->saved_sigmask;
+		else
+			mask = &child->blocked;
+
+		if (copy_to_user(datavp, mask, sizeof(sigset_t)))
 			ret = -EFAULT;
 		else
 			ret = 0;
 
 		break;
+	}
 
 	case PTRACE_SETSIGMASK: {
 		sigset_t new_set;
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request,
 		child->blocked = new_set;
 		spin_unlock_irq(&child->sighand->siglock);
 
+		clear_tsk_restore_sigmask(child);
+
 		ret = 0;
 		break;
 	}
-- 
cgit v1.2.3


From 58e75155009cc800005629955d3482f36a1e0eec Mon Sep 17 00:00:00 2001
From: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Date: Wed, 27 Mar 2019 11:18:48 +0100
Subject: HID: core: move Usage Page concatenation to Main item

As seen on some USB wireless keyboards manufactured by Primax, the HID
parser was using some assumptions that are not always true. In this case
it's s the fact that, inside the scope of a main item, an Usage Page
will always precede an Usage.

The spec is not pretty clear as 6.2.2.7 states "Any usage that follows
is interpreted as a Usage ID and concatenated with the Usage Page".
While 6.2.2.8 states "When the parser encounters a main item it
concatenates the last declared Usage Page with a Usage to form a
complete usage value." Being somewhat contradictory it was decided to
match Window's implementation, which follows 6.2.2.8.

In summary, the patch moves the Usage Page concatenation from the local
item parsing function to the main item parsing function.

Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Reviewed-by: Terry Junge <terry.junge@poly.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/hid-core.c | 36 ++++++++++++++++++++++++------------
 include/linux/hid.h    |  1 +
 2 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 9993b692598f..852bbd303d9a 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -218,13 +218,14 @@ static unsigned hid_lookup_collection(struct hid_parser *parser, unsigned type)
  * Add a usage to the temporary parser table.
  */
 
-static int hid_add_usage(struct hid_parser *parser, unsigned usage)
+static int hid_add_usage(struct hid_parser *parser, unsigned usage, u8 size)
 {
 	if (parser->local.usage_index >= HID_MAX_USAGES) {
 		hid_err(parser->device, "usage index exceeded\n");
 		return -1;
 	}
 	parser->local.usage[parser->local.usage_index] = usage;
+	parser->local.usage_size[parser->local.usage_index] = size;
 	parser->local.collection_index[parser->local.usage_index] =
 		parser->collection_stack_ptr ?
 		parser->collection_stack[parser->collection_stack_ptr - 1] : 0;
@@ -486,10 +487,7 @@ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item)
 			return 0;
 		}
 
-		if (item->size <= 2)
-			data = (parser->global.usage_page << 16) + data;
-
-		return hid_add_usage(parser, data);
+		return hid_add_usage(parser, data, item->size);
 
 	case HID_LOCAL_ITEM_TAG_USAGE_MINIMUM:
 
@@ -498,9 +496,6 @@ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item)
 			return 0;
 		}
 
-		if (item->size <= 2)
-			data = (parser->global.usage_page << 16) + data;
-
 		parser->local.usage_minimum = data;
 		return 0;
 
@@ -511,9 +506,6 @@ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item)
 			return 0;
 		}
 
-		if (item->size <= 2)
-			data = (parser->global.usage_page << 16) + data;
-
 		count = data - parser->local.usage_minimum;
 		if (count + parser->local.usage_index >= HID_MAX_USAGES) {
 			/*
@@ -533,7 +525,7 @@ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item)
 		}
 
 		for (n = parser->local.usage_minimum; n <= data; n++)
-			if (hid_add_usage(parser, n)) {
+			if (hid_add_usage(parser, n, item->size)) {
 				dbg_hid("hid_add_usage failed\n");
 				return -1;
 			}
@@ -547,6 +539,22 @@ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item)
 	return 0;
 }
 
+/*
+ * Concatenate Usage Pages into Usages where relevant:
+ * As per specification, 6.2.2.8: "When the parser encounters a main item it
+ * concatenates the last declared Usage Page with a Usage to form a complete
+ * usage value."
+ */
+
+static void hid_concatenate_usage_page(struct hid_parser *parser)
+{
+	int i;
+
+	for (i = 0; i < parser->local.usage_index; i++)
+		if (parser->local.usage_size[i] <= 2)
+			parser->local.usage[i] += parser->global.usage_page << 16;
+}
+
 /*
  * Process a main item.
  */
@@ -556,6 +564,8 @@ static int hid_parser_main(struct hid_parser *parser, struct hid_item *item)
 	__u32 data;
 	int ret;
 
+	hid_concatenate_usage_page(parser);
+
 	data = item_udata(item);
 
 	switch (item->tag) {
@@ -765,6 +775,8 @@ static int hid_scan_main(struct hid_parser *parser, struct hid_item *item)
 	__u32 data;
 	int i;
 
+	hid_concatenate_usage_page(parser);
+
 	data = item_udata(item);
 
 	switch (item->tag) {
diff --git a/include/linux/hid.h b/include/linux/hid.h
index f9707d1dcb58..ac0c70b4ce10 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -417,6 +417,7 @@ struct hid_global {
 
 struct hid_local {
 	unsigned usage[HID_MAX_USAGES]; /* usage array */
+	u8 usage_size[HID_MAX_USAGES]; /* usage size array */
 	unsigned collection_index[HID_MAX_USAGES]; /* collection index array */
 	unsigned usage_index;
 	unsigned usage_minimum;
-- 
cgit v1.2.3


From c9121cf637331b6fc07a60708c6ca0a161e7deb5 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sat, 20 Apr 2019 13:21:56 +0200
Subject: HID: logitech-dj: add support for 27 MHz receivers

Most Logitech wireless keyboard and mice using the 27 MHz are hidpp10
devices, add support to logitech-dj for their receivers.

Doing so leads to 2 improvements:

1) All these devices share the same USB product-id for their receiver,
making it impossible to properly map some special keys / buttons
which differ from device to device. Adding support to logitech-dj to
see these as hidpp10 devices allows us to get the actual device-id
from the keyboard / mouse.

2) It enables battery-monitoring of these devices

This patch uses a new HID group for 27Mhz devices, since the logitech-hidpp
code needs to be able to differentiate them from other devices instantiated
by the logitech-dj code.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/hid-lg.c          |  2 -
 drivers/hid/hid-logitech-dj.c | 94 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/hid/hid-quirks.c      |  1 -
 include/linux/hid.h           |  1 +
 4 files changed, 94 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c
index 5d419a95b6c2..36d725fdb199 100644
--- a/drivers/hid/hid-lg.c
+++ b/drivers/hid/hid-lg.c
@@ -876,8 +876,6 @@ static const struct hid_device_id lg_devices[] = {
 		.driver_data = LG_RDESC | LG_WIRELESS },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER),
 		.driver_data = LG_RDESC | LG_WIRELESS },
-	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER_2),
-		.driver_data = LG_RDESC | LG_WIRELESS },
 
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RECEIVER),
 		.driver_data = LG_BAD_RELATIVE_KEYS },
diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c
index 9ece9aca40a0..3004ca91b76b 100644
--- a/drivers/hid/hid-logitech-dj.c
+++ b/drivers/hid/hid-logitech-dj.c
@@ -106,6 +106,7 @@
 #define HIDPP_PARAM_DEVICE_INFO			0x01
 #define HIDPP_PARAM_EQUAD_LSB			0x02
 #define HIDPP_PARAM_EQUAD_MSB			0x03
+#define HIDPP_PARAM_27MHZ_DEVID			0x03
 #define HIDPP_DEVICE_TYPE_MASK			GENMASK(3, 0)
 #define HIDPP_LINK_STATUS_MASK			BIT(6)
 
@@ -120,6 +121,7 @@ enum recvr_type {
 	recvr_type_dj,
 	recvr_type_hidpp,
 	recvr_type_gaming_hidpp,
+	recvr_type_27mhz,
 };
 
 struct dj_report {
@@ -248,6 +250,44 @@ static const char mse_descriptor[] = {
 	0xC0,			/*  END_COLLECTION                      */
 };
 
+/* Mouse descriptor (2) for 27 MHz receiver, only 8 buttons */
+static const char mse_27mhz_descriptor[] = {
+	0x05, 0x01,		/*  USAGE_PAGE (Generic Desktop)        */
+	0x09, 0x02,		/*  USAGE (Mouse)                       */
+	0xA1, 0x01,		/*  COLLECTION (Application)            */
+	0x85, 0x02,		/*    REPORT_ID = 2                     */
+	0x09, 0x01,		/*    USAGE (pointer)                   */
+	0xA1, 0x00,		/*    COLLECTION (physical)             */
+	0x05, 0x09,		/*      USAGE_PAGE (buttons)            */
+	0x19, 0x01,		/*      USAGE_MIN (1)                   */
+	0x29, 0x08,		/*      USAGE_MAX (8)                   */
+	0x15, 0x00,		/*      LOGICAL_MIN (0)                 */
+	0x25, 0x01,		/*      LOGICAL_MAX (1)                 */
+	0x95, 0x08,		/*      REPORT_COUNT (8)                */
+	0x75, 0x01,		/*      REPORT_SIZE (1)                 */
+	0x81, 0x02,		/*      INPUT (data var abs)            */
+	0x05, 0x01,		/*      USAGE_PAGE (generic desktop)    */
+	0x16, 0x01, 0xF8,	/*      LOGICAL_MIN (-2047)             */
+	0x26, 0xFF, 0x07,	/*      LOGICAL_MAX (2047)              */
+	0x75, 0x0C,		/*      REPORT_SIZE (12)                */
+	0x95, 0x02,		/*      REPORT_COUNT (2)                */
+	0x09, 0x30,		/*      USAGE (X)                       */
+	0x09, 0x31,		/*      USAGE (Y)                       */
+	0x81, 0x06,		/*      INPUT                           */
+	0x15, 0x81,		/*      LOGICAL_MIN (-127)              */
+	0x25, 0x7F,		/*      LOGICAL_MAX (127)               */
+	0x75, 0x08,		/*      REPORT_SIZE (8)                 */
+	0x95, 0x01,		/*      REPORT_COUNT (1)                */
+	0x09, 0x38,		/*      USAGE (wheel)                   */
+	0x81, 0x06,		/*      INPUT                           */
+	0x05, 0x0C,		/*      USAGE_PAGE(consumer)            */
+	0x0A, 0x38, 0x02,	/*      USAGE(AC Pan)                   */
+	0x95, 0x01,		/*      REPORT_COUNT (1)                */
+	0x81, 0x06,		/*      INPUT                           */
+	0xC0,			/*    END_COLLECTION                    */
+	0xC0,			/*  END_COLLECTION                      */
+};
+
 /* Gaming Mouse descriptor (2) */
 static const char mse_high_res_descriptor[] = {
 	0x05, 0x01,		/*  USAGE_PAGE (Generic Desktop)        */
@@ -596,7 +636,10 @@ static void logi_dj_recv_add_djhid_device(struct dj_receiver_dev *djrcv_dev,
 		"Logitech Unifying Device. Wireless PID:%04x",
 		dj_hiddev->product);
 
-	dj_hiddev->group = HID_GROUP_LOGITECH_DJ_DEVICE;
+	if (djrcv_dev->type == recvr_type_27mhz)
+		dj_hiddev->group = HID_GROUP_LOGITECH_27MHZ_DEVICE;
+	else
+		dj_hiddev->group = HID_GROUP_LOGITECH_DJ_DEVICE;
 
 	memcpy(dj_hiddev->phys, djrcv_hdev->phys, sizeof(djrcv_hdev->phys));
 	snprintf(tmpstr, sizeof(tmpstr), ":%d", device_index);
@@ -782,6 +825,28 @@ static void logi_hidpp_dev_conn_notif_equad(struct hidpp_event *hidpp_report,
 	}
 }
 
+static void logi_hidpp_dev_conn_notif_27mhz(struct hid_device *hdev,
+					    struct hidpp_event *hidpp_report,
+					    struct dj_workitem *workitem)
+{
+	workitem->type = WORKITEM_TYPE_PAIRED;
+	workitem->quad_id_lsb = hidpp_report->params[HIDPP_PARAM_27MHZ_DEVID];
+	switch (hidpp_report->device_index) {
+	case 1: /* Index 1 is always a mouse */
+	case 2: /* Index 2 is always a mouse */
+		workitem->reports_supported |= STD_MOUSE;
+		break;
+	case 3: /* Index 3 is always the keyboard */
+	case 4: /* Index 4 is used for an optional separate numpad */
+		workitem->reports_supported |= STD_KEYBOARD | MULTIMEDIA |
+					       POWER_KEYS;
+		break;
+	default:
+		hid_warn(hdev, "%s: unexpected device-index %d", __func__,
+			 hidpp_report->device_index);
+	}
+}
+
 static void logi_hidpp_recv_queue_notif(struct hid_device *hdev,
 					struct hidpp_event *hidpp_report)
 {
@@ -799,6 +864,7 @@ static void logi_hidpp_recv_queue_notif(struct hid_device *hdev,
 		break;
 	case 0x02:
 		device_type = "27 Mhz";
+		logi_hidpp_dev_conn_notif_27mhz(hdev, hidpp_report, &workitem);
 		break;
 	case 0x03:
 		device_type = "QUAD or eQUAD";
@@ -1186,6 +1252,9 @@ static int logi_dj_ll_parse(struct hid_device *hid)
 		if (djdev->dj_receiver_dev->type == recvr_type_gaming_hidpp)
 			rdcat(rdesc, &rsize, mse_high_res_descriptor,
 			      sizeof(mse_high_res_descriptor));
+		else if (djdev->dj_receiver_dev->type == recvr_type_27mhz)
+			rdcat(rdesc, &rsize, mse_27mhz_descriptor,
+			      sizeof(mse_27mhz_descriptor));
 		else
 			rdcat(rdesc, &rsize, mse_descriptor,
 			      sizeof(mse_descriptor));
@@ -1357,6 +1426,25 @@ static int logi_dj_hidpp_event(struct hid_device *hdev,
 	spin_lock_irqsave(&djrcv_dev->lock, flags);
 
 	dj_dev = djrcv_dev->paired_dj_devices[device_index];
+
+	/*
+	 * With 27 MHz receivers, we do not get an explicit unpair event,
+	 * remove the old device if the user has paired a *different* device.
+	 */
+	if (djrcv_dev->type == recvr_type_27mhz && dj_dev &&
+	    hidpp_report->sub_id == REPORT_TYPE_NOTIF_DEVICE_CONNECTED &&
+	    hidpp_report->params[HIDPP_PARAM_PROTO_TYPE] == 0x02 &&
+	    hidpp_report->params[HIDPP_PARAM_27MHZ_DEVID] !=
+						dj_dev->hdev->product) {
+		struct dj_workitem workitem = {
+			.device_index = hidpp_report->device_index,
+			.type = WORKITEM_TYPE_UNPAIRED,
+		};
+		kfifo_in(&djrcv_dev->notif_fifo, &workitem, sizeof(workitem));
+		/* logi_hidpp_recv_queue_notif will queue the work */
+		dj_dev = NULL;
+	}
+
 	if (dj_dev) {
 		logi_dj_recv_forward_report(dj_dev, data, size);
 	} else {
@@ -1628,6 +1716,10 @@ static const struct hid_device_id logi_dj_receivers[] = {
 	  HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH,
 		USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_GAMING),
 	 .driver_data = recvr_type_gaming_hidpp},
+	{ /* Logitech 27 MHz HID++ 1.0 receiver (0xc517) */
+	  HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH,
+		USB_DEVICE_ID_S510_RECEIVER_2),
+	 .driver_data = recvr_type_27mhz},
 	{}
 };
 
diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c
index 426753482494..fea7f7ff5ab1 100644
--- a/drivers/hid/hid-quirks.c
+++ b/drivers/hid/hid-quirks.c
@@ -432,7 +432,6 @@ static const struct hid_device_id hid_have_special_driver[] = {
 #if IS_ENABLED(CONFIG_HID_LOGITECH)
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX3000_RECEIVER) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER) },
-	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER_2) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RECEIVER) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_DESKTOP) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_EDGE) },
diff --git a/include/linux/hid.h b/include/linux/hid.h
index f9707d1dcb58..9f161fa5cbd4 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -382,6 +382,7 @@ struct hid_item {
 #define HID_GROUP_WACOM				0x0101
 #define HID_GROUP_LOGITECH_DJ_DEVICE		0x0102
 #define HID_GROUP_STEAM				0x0103
+#define HID_GROUP_LOGITECH_27MHZ_DEVICE		0x0104
 
 /*
  * HID protocol status
-- 
cgit v1.2.3