From 780b1350d316fda28d85fcae17854c778d89cbbe Mon Sep 17 00:00:00 2001 From: Ramesh Shanmugasundaram Date: Mon, 3 Jul 2017 12:04:21 +0100 Subject: regmap: Avoid namespace collision within macro & tidy up Renamed variable "timeout" to "__timeout" & "pollret" to "__ret" to avoid namespace collision. Tidy up macro arguments with parentheses. Signed-off-by: Ramesh Shanmugasundaram Signed-off-by: Mark Brown --- include/linux/regmap.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index e88649225a60..6e1df5e721a9 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -120,23 +120,24 @@ struct reg_sequence { */ #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ - int pollret; \ + ktime_t __timeout = ktime_add_us(ktime_get(), timeout_us); \ + int __ret; \ might_sleep_if(sleep_us); \ for (;;) { \ - pollret = regmap_read((map), (addr), &(val)); \ - if (pollret) \ + __ret = regmap_read((map), (addr), &(val)); \ + if (__ret) \ break; \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ - pollret = regmap_read((map), (addr), &(val)); \ + if ((timeout_us) && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ + __ret = regmap_read((map), (addr), &(val)); \ break; \ } \ if (sleep_us) \ - usleep_range((sleep_us >> 2) + 1, sleep_us); \ + usleep_range(((sleep_us) >> 2) + 1, sleep_us); \ } \ - pollret ?: ((cond) ? 0 : -ETIMEDOUT); \ + __ret ?: ((cond) ? 0 : -ETIMEDOUT); \ }) #ifdef CONFIG_REGMAP -- cgit v1.2.3 From 300238cecede382fbd9084499e57f3b30acc75fa Mon Sep 17 00:00:00 2001 From: Gustavo Padovan Date: Mon, 31 Jul 2017 16:36:55 -0300 Subject: dma-buf/sync_file: document flags field Documentation for it was missing. Signed-off-by: Gustavo Padovan Acked-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20170731193655.6176-1-gustavo@padovan.org --- include/linux/sync_file.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sync_file.h b/include/linux/sync_file.h index 0ad87c434ae6..790ca021203a 100644 --- a/include/linux/sync_file.h +++ b/include/linux/sync_file.h @@ -25,8 +25,12 @@ * @file: file representing this fence * @sync_file_list: membership in global file list * @wq: wait queue for fence signaling + * @flags: flags for the sync_file * @fence: fence with the fences in the sync_file * @cb: fence callback information + * + * flags: + * POLL_ENABLED: whether userspace is currently poll()'ing or not */ struct sync_file { struct file *file; -- cgit v1.2.3 From 63b19547cc3d96041d7bc7ab8de6292b0ebaf2c9 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 23 Jul 2017 17:25:43 +0100 Subject: iio: Use macro magic to avoid manual assign of driver_module Starting point in boiler plate reduction similar to that done for many similar cases elsewhere in the kernel. 
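In short: the public registration call becomes a macro that captures
THIS_MODULE at each driver's call site and forwards it to a
double-underscore implementation, so drivers no longer have to set
info->driver_module by hand (see the iio.h hunk below):

    #define iio_device_register(iio_dev) \
            __iio_device_register((iio_dev), THIS_MODULE)
    int __iio_device_register(struct iio_dev *indio_dev,
                              struct module *this_mod);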
Signed-off-by: Jonathan Cameron Reviewed-by: Lars-Peter Clausen --- drivers/iio/industrialio-core.c | 35 +++++++++-------------------------- drivers/iio/industrialio-trigger.c | 6 +++--- include/linux/iio/iio.h | 31 +++++++++++++++++++++++++++++-- 3 files changed, 41 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 17ec4cee51dc..7a5aa127c52e 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1660,14 +1660,11 @@ static int iio_check_unique_scan_index(struct iio_dev *indio_dev) static const struct iio_buffer_setup_ops noop_ring_setup_ops; -/** - * iio_device_register() - register a device with the IIO subsystem - * @indio_dev: Device structure filled by the device driver - **/ -int iio_device_register(struct iio_dev *indio_dev) +int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod) { int ret; + indio_dev->driver_module = this_mod; /* If the calling driver did not initialize of_node, do it here */ if (!indio_dev->dev.of_node && indio_dev->dev.parent) indio_dev->dev.of_node = indio_dev->dev.parent->of_node; @@ -1713,7 +1710,8 @@ int iio_device_register(struct iio_dev *indio_dev) indio_dev->setup_ops = &noop_ring_setup_ops; cdev_init(&indio_dev->chrdev, &iio_buffer_fileops); - indio_dev->chrdev.owner = indio_dev->info->driver_module; + + indio_dev->chrdev.owner = this_mod; ret = cdev_device_add(&indio_dev->chrdev, &indio_dev->dev); if (ret < 0) @@ -1731,7 +1729,7 @@ error_unreg_debugfs: iio_device_unregister_debugfs(indio_dev); return ret; } -EXPORT_SYMBOL(iio_device_register); +EXPORT_SYMBOL(__iio_device_register); /** * iio_device_unregister() - unregister a device from the IIO subsystem @@ -1763,23 +1761,8 @@ static void devm_iio_device_unreg(struct device *dev, void *res) iio_device_unregister(*(struct iio_dev **)res); } -/** - * devm_iio_device_register - Resource-managed iio_device_register() - * @dev: Device to allocate iio_dev for - * @indio_dev: Device structure filled by the device driver - * - * Managed iio_device_register. The IIO device registered with this - * function is automatically unregistered on driver detach. This function - * calls iio_device_register() internally. Refer to that function for more - * information. - * - * If an iio_dev registered with this function needs to be unregistered - * separately, devm_iio_device_unregister() must be used. - * - * RETURNS: - * 0 on success, negative error number on failure. 
- */ -int devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev) +int __devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev, + struct module *this_mod) { struct iio_dev **ptr; int ret; @@ -1789,7 +1772,7 @@ int devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev) return -ENOMEM; *ptr = indio_dev; - ret = iio_device_register(indio_dev); + ret = __iio_device_register(indio_dev, this_mod); if (!ret) devres_add(dev, ptr); else @@ -1797,7 +1780,7 @@ int devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev) return ret; } -EXPORT_SYMBOL_GPL(devm_iio_device_register); +EXPORT_SYMBOL_GPL(__devm_iio_device_register); /** * devm_iio_device_unregister - Resource-managed iio_device_unregister() diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c index 4061fed93f1f..9596fedacedb 100644 --- a/drivers/iio/industrialio-trigger.c +++ b/drivers/iio/industrialio-trigger.c @@ -250,7 +250,7 @@ static int iio_trigger_attach_poll_func(struct iio_trigger *trig, = bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER); /* Prevent the module from being removed whilst attached to a trigger */ - __module_get(pf->indio_dev->info->driver_module); + __module_get(pf->indio_dev->driver_module); /* Get irq number */ pf->irq = iio_trigger_get_irq(trig); @@ -286,7 +286,7 @@ out_free_irq: out_put_irq: iio_trigger_put_irq(trig, pf->irq); out_put_module: - module_put(pf->indio_dev->info->driver_module); + module_put(pf->indio_dev->driver_module); return ret; } @@ -307,7 +307,7 @@ static int iio_trigger_detach_poll_func(struct iio_trigger *trig, trig->attached_own_device = false; iio_trigger_put_irq(trig, pf->irq); free_irq(pf->irq, pf); - module_put(pf->indio_dev->info->driver_module); + module_put(pf->indio_dev->driver_module); return ret; } diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d68bec297a45..97a014300947 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -518,6 +518,7 @@ struct iio_buffer_setup_ops { /** * struct iio_dev - industrial I/O device * @id: [INTERN] used to identify device internally + * @driver_module: [INTERN] used to make it harder to undercut users * @modes: [DRIVER] operating modes supported by device * @currentmode: [DRIVER] current operating mode * @dev: [DRIVER] device structure, should be assigned a parent @@ -558,6 +559,7 @@ struct iio_buffer_setup_ops { */ struct iio_dev { int id; + struct module *driver_module; int modes; int currentmode; @@ -604,9 +606,34 @@ struct iio_dev { const struct iio_chan_spec *iio_find_channel_from_si(struct iio_dev *indio_dev, int si); -int iio_device_register(struct iio_dev *indio_dev); +/** + * iio_device_register() - register a device with the IIO subsystem + * @indio_dev: Device structure filled by the device driver + **/ +#define iio_device_register(iio_dev) \ + __iio_device_register((iio_dev), THIS_MODULE) +int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod); void iio_device_unregister(struct iio_dev *indio_dev); -int devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev); +/** + * devm_iio_device_register - Resource-managed iio_device_register() + * @dev: Device to allocate iio_dev for + * @indio_dev: Device structure filled by the device driver + * + * Managed iio_device_register. The IIO device registered with this + * function is automatically unregistered on driver detach. This function + * calls iio_device_register() internally. 
Refer to that function for more + * information. + * + * If an iio_dev registered with this function needs to be unregistered + * separately, devm_iio_device_unregister() must be used. + * + * RETURNS: + * 0 on success, negative error number on failure. + */ +#define devm_iio_device_register(dev, indio_dev) \ + __devm_iio_device_register((dev), (indio_dev), THIS_MODULE); +int __devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev, + struct module *this_mod); void devm_iio_device_unregister(struct device *dev, struct iio_dev *indio_dev); int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); int iio_device_claim_direct_mode(struct iio_dev *indio_dev); -- cgit v1.2.3 From 035c70aeb64b861aa88a666c61fda8b2ae49aeae Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 23 Jul 2017 17:25:44 +0100 Subject: iio: triggers: Use macros to avoid boilerplate assignment of owner. This trig_ops.owner assignment occurs in all trigger drivers and can be simply automated using a macro as has been done in many other places in the kernel. Signed-off-by: Jonathan Cameron Reviewed-by: Lars-Peter Clausen --- drivers/iio/industrialio-trigger.c | 18 ++++++++++++------ include/linux/iio/trigger.h | 19 +++++++++++++------ 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c index 9596fedacedb..faf00202915a 100644 --- a/drivers/iio/industrialio-trigger.c +++ b/drivers/iio/industrialio-trigger.c @@ -66,10 +66,13 @@ ATTRIBUTE_GROUPS(iio_trig_dev); static struct iio_trigger *__iio_trigger_find_by_name(const char *name); -int iio_trigger_register(struct iio_trigger *trig_info) +int __iio_trigger_register(struct iio_trigger *trig_info, + struct module *this_mod) { int ret; + trig_info->owner = this_mod; + /* trig_info->ops is required for the module member */ if (!trig_info->ops) return -EINVAL; @@ -105,7 +108,7 @@ error_unregister_id: ida_simple_remove(&iio_trigger_ida, trig_info->id); return ret; } -EXPORT_SYMBOL(iio_trigger_register); +EXPORT_SYMBOL(__iio_trigger_register); void iio_trigger_unregister(struct iio_trigger *trig_info) { @@ -663,9 +666,10 @@ static void devm_iio_trigger_unreg(struct device *dev, void *res) } /** - * devm_iio_trigger_register - Resource-managed iio_trigger_register() + * __devm_iio_trigger_register - Resource-managed iio_trigger_register() * @dev: device this trigger was allocated for * @trig_info: trigger to register + * @this_mod: module registering the trigger * * Managed iio_trigger_register(). The IIO trigger registered with this * function is automatically unregistered on driver detach. This function @@ -678,7 +682,9 @@ static void devm_iio_trigger_unreg(struct device *dev, void *res) * RETURNS: * 0 on success, negative error number on failure. 
*/ -int devm_iio_trigger_register(struct device *dev, struct iio_trigger *trig_info) +int __devm_iio_trigger_register(struct device *dev, + struct iio_trigger *trig_info, + struct module *this_mod) { struct iio_trigger **ptr; int ret; @@ -688,7 +694,7 @@ int devm_iio_trigger_register(struct device *dev, struct iio_trigger *trig_info) return -ENOMEM; *ptr = trig_info; - ret = iio_trigger_register(trig_info); + ret = __iio_trigger_register(trig_info, this_mod); if (!ret) devres_add(dev, ptr); else @@ -696,7 +702,7 @@ int devm_iio_trigger_register(struct device *dev, struct iio_trigger *trig_info) return ret; } -EXPORT_SYMBOL_GPL(devm_iio_trigger_register); +EXPORT_SYMBOL_GPL(__devm_iio_trigger_register); /** * devm_iio_trigger_unregister - Resource-managed iio_trigger_unregister() diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index ea08302f2d7b..999793212b40 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -62,6 +62,7 @@ struct iio_trigger_ops { **/ struct iio_trigger { const struct iio_trigger_ops *ops; + struct module *owner; int id; const char *name; struct device dev; @@ -87,14 +88,14 @@ static inline struct iio_trigger *to_iio_trigger(struct device *d) static inline void iio_trigger_put(struct iio_trigger *trig) { - module_put(trig->ops->owner); + module_put(trig->owner); put_device(&trig->dev); } static inline struct iio_trigger *iio_trigger_get(struct iio_trigger *trig) { get_device(&trig->dev); - __module_get(trig->ops->owner); + __module_get(trig->owner); return trig; } @@ -127,10 +128,16 @@ static inline void *iio_trigger_get_drvdata(struct iio_trigger *trig) * iio_trigger_register() - register a trigger with the IIO core * @trig_info: trigger to be registered **/ -int iio_trigger_register(struct iio_trigger *trig_info); - -int devm_iio_trigger_register(struct device *dev, - struct iio_trigger *trig_info); +#define iio_trigger_register(trig_info) \ + __iio_trigger_register((trig_info), THIS_MODULE) +int __iio_trigger_register(struct iio_trigger *trig_info, + struct module *this_mod); + +#define devm_iio_trigger_register(dev, trig_info) \ + __devm_iio_trigger_register((dev), (trig_info), THIS_MODULE) +int __devm_iio_trigger_register(struct device *dev, + struct iio_trigger *trig_info, + struct module *this_mod); /** * iio_trigger_unregister() - unregister a trigger from the core -- cgit v1.2.3 From 97623c0a80a605ef3fae337081ed008796bf8cc2 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 23 Jul 2017 17:26:21 +0100 Subject: iio: drop iio_info.driver_module and iio_trigger_ops.owner. The equivalents are now assigned automatically in the relevant registration calls and so are not needed in these operations structures. Signed-off-by: Jonathan Cameron Reviewed-by: Lars-Peter Clausen --- include/linux/iio/iio.h | 3 --- include/linux/iio/trigger.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 97a014300947..486ffbb1a926 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -369,8 +369,6 @@ struct iio_dev; /** * struct iio_info - constant information about device - * @driver_module: module structure used to ensure correct - * ownership of chrdevs etc * @event_attrs: event control attributes * @attrs: general purpose device attributes * @read_raw: function to request a value from the device. @@ -425,7 +423,6 @@ struct iio_dev; * were flushed and there was an error. 
**/ struct iio_info { - struct module *driver_module; const struct attribute_group *event_attrs; const struct attribute_group *attrs; diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index 999793212b40..1afe349af1fa 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -23,7 +23,6 @@ struct iio_trigger; /** * struct iio_trigger_ops - operations structure for an iio_trigger. - * @owner: used to monitor usage count of the trigger. * @set_trigger_state: switch on/off the trigger on demand * @try_reenable: function to reenable the trigger when the * use count is zero (may be NULL) @@ -34,7 +33,6 @@ struct iio_trigger; * instances of a given device. **/ struct iio_trigger_ops { - struct module *owner; int (*set_trigger_state)(struct iio_trigger *trig, bool state); int (*try_reenable)(struct iio_trigger *trig); int (*validate_device)(struct iio_trigger *trig, -- cgit v1.2.3 From c4860ad60564838994b74e7ee7dd12ceeda0f520 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Mon, 31 Jul 2017 19:55:08 +0100 Subject: lib/scatterlist: Fix offset type in sg_alloc_table_from_pages Scatterlist entries have an unsigned int for the offset so correct the sg_alloc_table_from_pages function accordingly. Since these are offsets withing a page, unsigned int is wide enough. Also converts callers which were using unsigned long locally with the lower_32_bits annotation to make it explicitly clear what is happening. v2: Use offset_in_page. (Chris Wilson) Signed-off-by: Tvrtko Ursulin Cc: Masahiro Yamada Cc: Pawel Osciak Cc: Marek Szyprowski Cc: Kyungmin Park Cc: Tomasz Stanislawski Cc: Matt Porter Cc: Alexandre Bounine Cc: linux-media@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Marek Szyprowski (v1) Reviewed-by: Chris Wilson Reviewed-by: Mauro Carvalho Chehab Link: https://patchwork.freedesktop.org/patch/msgid/20170731185512.20010-1-tvrtko.ursulin@linux.intel.com --- drivers/media/v4l2-core/videobuf2-dma-contig.c | 4 ++-- drivers/rapidio/devices/rio_mport_cdev.c | 4 ++-- include/linux/scatterlist.h | 2 +- lib/scatterlist.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/v4l2-core/videobuf2-dma-contig.c b/drivers/media/v4l2-core/videobuf2-dma-contig.c index 4f246d166111..2405077fdc71 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf2-dma-contig.c @@ -479,7 +479,7 @@ static void *vb2_dc_get_userptr(struct device *dev, unsigned long vaddr, { struct vb2_dc_buf *buf; struct frame_vector *vec; - unsigned long offset; + unsigned int offset; int n_pages, i; int ret = 0; struct sg_table *sgt; @@ -507,7 +507,7 @@ static void *vb2_dc_get_userptr(struct device *dev, unsigned long vaddr, buf->dev = dev; buf->dma_dir = dma_dir; - offset = vaddr & ~PAGE_MASK; + offset = lower_32_bits(offset_in_page(vaddr)); vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE); if (IS_ERR(vec)) { ret = PTR_ERR(vec); diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 5beb0c361076..5c1b6388122a 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -876,10 +876,10 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, * offset within the internal buffer specified by handle parameter. 
*/ if (xfer->loc_addr) { - unsigned long offset; + unsigned int offset; long pinned; - offset = (unsigned long)(uintptr_t)xfer->loc_addr & ~PAGE_MASK; + offset = lower_32_bits(offset_in_page(xfer->loc_addr)); nr_pages = PAGE_ALIGN(xfer->length + offset) >> PAGE_SHIFT; page_list = kmalloc_array(nr_pages, diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 4b3286ac60c8..205aefb4ed93 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -263,7 +263,7 @@ int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, - unsigned long offset, unsigned long size, + unsigned int offset, unsigned long size, gfp_t gfp_mask); size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, diff --git a/lib/scatterlist.c b/lib/scatterlist.c index be7b4dd6b68d..dee0c5004e2f 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -391,7 +391,7 @@ EXPORT_SYMBOL(sg_alloc_table); */ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, - unsigned long offset, unsigned long size, + unsigned int offset, unsigned long size, gfp_t gfp_mask) { unsigned int chunks; -- cgit v1.2.3 From c125906b839b794c580a5de911de65bd2c63aaee Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 3 Aug 2017 10:13:12 +0100 Subject: lib/scatterlist: Avoid potential scatterlist entry overflow Since the scatterlist length field is an unsigned int, make sure that sg_alloc_table_from_pages does not overflow it while coalescing pages to a single entry. v2: Drop reference to future use. Use UINT_MAX. v3: max_segment must be page aligned. v4: Do not rely on compiler to optimise out the rounddown. (Joonas Lahtinen) v5: Simplified loops and use post-increments rather than pre-increments. Use PAGE_MASK and fix comment typo. (Andy Shevchenko) v6: Commit spelling fix. Signed-off-by: Tvrtko Ursulin Cc: Masahiro Yamada Cc: linux-kernel@vger.kernel.org Reviewed-by: Chris Wilson Cc: Joonas Lahtinen Cc: Andy Shevchenko Link: https://patchwork.freedesktop.org/patch/msgid/20170803091312.22875-1-tvrtko.ursulin@linux.intel.com --- include/linux/scatterlist.h | 6 ++++++ lib/scatterlist.c | 31 ++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 205aefb4ed93..6dd2ddbc6230 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -20,6 +20,12 @@ struct scatterlist { #endif }; +/* + * Since the above length field is an unsigned int, below we define the maximum + * length in bytes that can be stored in one scatterlist entry. + */ +#define SCATTERLIST_MAX_SEGMENT (UINT_MAX & PAGE_MASK) + /* * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. 
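As a quick illustration of the new constant (assuming 4 KiB pages; this
worked example is not part of the patch):

    SCATTERLIST_MAX_SEGMENT == UINT_MAX & PAGE_MASK
                            == 0xffffffff & ~0xfffUL
                            == 0xfffff000

i.e. UINT_MAX rounded down to a page boundary -- the largest page-aligned
byte count that still fits in the scatterlist entry's unsigned int length
field.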
diff --git a/lib/scatterlist.c b/lib/scatterlist.c index dee0c5004e2f..7b2e74da2c44 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -394,17 +394,22 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, unsigned int offset, unsigned long size, gfp_t gfp_mask) { - unsigned int chunks; - unsigned int i; - unsigned int cur_page; + const unsigned int max_segment = SCATTERLIST_MAX_SEGMENT; + unsigned int chunks, cur_page, seg_len, i; int ret; struct scatterlist *s; /* compute number of contiguous chunks */ chunks = 1; - for (i = 1; i < n_pages; ++i) - if (page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1) - ++chunks; + seg_len = 0; + for (i = 1; i < n_pages; i++) { + seg_len += PAGE_SIZE; + if (seg_len >= max_segment || + page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1) { + chunks++; + seg_len = 0; + } + } ret = sg_alloc_table(sgt, chunks, gfp_mask); if (unlikely(ret)) @@ -413,17 +418,21 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, /* merging chunks and putting them into the scatterlist */ cur_page = 0; for_each_sg(sgt->sgl, s, sgt->orig_nents, i) { - unsigned long chunk_size; - unsigned int j; + unsigned int j, chunk_size; /* look for the end of the current chunk */ - for (j = cur_page + 1; j < n_pages; ++j) - if (page_to_pfn(pages[j]) != + seg_len = 0; + for (j = cur_page + 1; j < n_pages; j++) { + seg_len += PAGE_SIZE; + if (seg_len >= max_segment || + page_to_pfn(pages[j]) != page_to_pfn(pages[j - 1]) + 1) break; + } chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset; - sg_set_page(s, pages[cur_page], min(size, chunk_size), offset); + sg_set_page(s, pages[cur_page], + min_t(unsigned long, size, chunk_size), offset); size -= chunk_size; offset = 0; cur_page = j; -- cgit v1.2.3 From 89d8589cd72c6f48b19c370517d16f3ee23909df Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 3 Aug 2017 10:13:51 +0100 Subject: lib/scatterlist: Introduce and export __sg_alloc_table_from_pages Drivers like i915 benefit from being able to control the maxium size of the sg coalesced segment while building the scatter- gather list. Introduce and export the __sg_alloc_table_from_pages function which will allow it that control. v2: Reorder parameters. (Chris Wilson) v3: Fix incomplete reordering in v2. v4: max_segment needs to be page aligned. v5: Rebase. v6: Rebase. v7: Fix spelling in commit and mention max segment size in __sg_alloc_table_from_pages kerneldoc. 
(Andrew Morton) Signed-off-by: Tvrtko Ursulin Cc: Masahiro Yamada Cc: linux-kernel@vger.kernel.org Cc: Chris Wilson Reviewed-by: Chris Wilson Cc: Joonas Lahtinen Cc: Andrew Morton Link: https://patchwork.freedesktop.org/patch/msgid/20170803091351.23594-1-tvrtko.ursulin@linux.intel.com --- include/linux/scatterlist.h | 11 +++++--- lib/scatterlist.c | 66 +++++++++++++++++++++++++++++++++------------ 2 files changed, 56 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6dd2ddbc6230..874b50c232de 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -267,10 +267,13 @@ void sg_free_table(struct sg_table *); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); -int sg_alloc_table_from_pages(struct sg_table *sgt, - struct page **pages, unsigned int n_pages, - unsigned int offset, unsigned long size, - gfp_t gfp_mask); +int __sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, + unsigned int n_pages, unsigned int offset, + unsigned long size, unsigned int max_segment, + gfp_t gfp_mask); +int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, + unsigned int n_pages, unsigned int offset, + unsigned long size, gfp_t gfp_mask); size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7b2e74da2c44..7c1c55f7daaa 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -370,35 +370,38 @@ int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask) EXPORT_SYMBOL(sg_alloc_table); /** - * sg_alloc_table_from_pages - Allocate and initialize an sg table from - * an array of pages - * @sgt: The sg table header to use - * @pages: Pointer to an array of page pointers - * @n_pages: Number of pages in the pages array - * @offset: Offset from start of the first page to the start of a buffer - * @size: Number of valid bytes in the buffer (after offset) - * @gfp_mask: GFP allocation mask + * __sg_alloc_table_from_pages - Allocate and initialize an sg table from + * an array of pages + * @sgt: The sg table header to use + * @pages: Pointer to an array of page pointers + * @n_pages: Number of pages in the pages array + * @offset: Offset from start of the first page to the start of a buffer + * @size: Number of valid bytes in the buffer (after offset) + * @max_segment: Maximum size of a scatterlist node in bytes (page aligned) + * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous - * ranges of the pages are squashed into a single scatterlist node. A user - * may provide an offset at a start and a size of valid data in a buffer - * specified by the page array. The returned sg table is released by - * sg_free_table. + * ranges of the pages are squashed into a single scatterlist node up to the + * maximum size specified in @max_segment. An user may provide an offset at a + * start and a size of valid data in a buffer specified by the page array. + * The returned sg table is released by sg_free_table. 
* * Returns: * 0 on success, negative error on failure */ -int sg_alloc_table_from_pages(struct sg_table *sgt, - struct page **pages, unsigned int n_pages, - unsigned int offset, unsigned long size, - gfp_t gfp_mask) +int __sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, + unsigned int n_pages, unsigned int offset, + unsigned long size, unsigned int max_segment, + gfp_t gfp_mask) { - const unsigned int max_segment = SCATTERLIST_MAX_SEGMENT; unsigned int chunks, cur_page, seg_len, i; int ret; struct scatterlist *s; + if (WARN_ON(!max_segment || offset_in_page(max_segment))) + return -EINVAL; + /* compute number of contiguous chunks */ chunks = 1; seg_len = 0; @@ -440,6 +443,35 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, return 0; } +EXPORT_SYMBOL(__sg_alloc_table_from_pages); + +/** + * sg_alloc_table_from_pages - Allocate and initialize an sg table from + * an array of pages + * @sgt: The sg table header to use + * @pages: Pointer to an array of page pointers + * @n_pages: Number of pages in the pages array + * @offset: Offset from start of the first page to the start of a buffer + * @size: Number of valid bytes in the buffer (after offset) + * @gfp_mask: GFP allocation mask + * + * Description: + * Allocate and initialize an sg table from a list of pages. Contiguous + * ranges of the pages are squashed into a single scatterlist node. A user + * may provide an offset at a start and a size of valid data in a buffer + * specified by the page array. The returned sg table is released by + * sg_free_table. + * + * Returns: + * 0 on success, negative error on failure + */ +int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, + unsigned int n_pages, unsigned int offset, + unsigned long size, gfp_t gfp_mask) +{ + return __sg_alloc_table_from_pages(sgt, pages, n_pages, offset, size, + SCATTERLIST_MAX_SEGMENT, gfp_mask); +} EXPORT_SYMBOL(sg_alloc_table_from_pages); void __sg_page_iter_start(struct sg_page_iter *piter, -- cgit v1.2.3 From 439e7271dc2b63de379e37971dc2f64d71e24f8a Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Thu, 31 Aug 2017 16:37:41 -0400 Subject: livepatch: introduce shadow variable API Add exported API for livepatch modules: klp_shadow_get() klp_shadow_alloc() klp_shadow_get_or_alloc() klp_shadow_free() klp_shadow_free_all() that implement "shadow" variables, which allow callers to associate new shadow fields to existing data structures. This is intended to be used by livepatch modules seeking to emulate additions to data structure definitions. See Documentation/livepatch/shadow-vars.txt for a summary of the new shadow variable API, including a few common use cases. See samples/livepatch/livepatch-shadow-* for example modules that demonstrate shadow variables. 
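A minimal usage sketch of the new API (the object pointer and id below
are illustrative, not taken from the samples):

    #define SV_COUNTER 1            /* arbitrary shadow variable id */

    int count = 0;
    int *sv;

    /* attach a new "count" field to an existing object */
    sv = klp_shadow_alloc(obj, SV_COUNTER, &count, sizeof(count),
                          GFP_KERNEL);

    /* ... later, from patched code, look it up again ... */
    sv = klp_shadow_get(obj, SV_COUNTER);
    if (sv)
            (*sv)++;

    /* detach it when the parent object goes away */
    klp_shadow_free(obj, SV_COUNTER);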
[jkosina@suse.cz: fix __klp_shadow_get_or_alloc() comment as spotted by Josh]
Signed-off-by: Joe Lawrence
Acked-by: Josh Poimboeuf
Acked-by: Miroslav Benes
Signed-off-by: Jiri Kosina
---
 Documentation/livepatch/shadow-vars.txt | 192 +++++++++++++++++++++
 include/linux/livepatch.h | 8 +
 kernel/livepatch/Makefile | 2 +-
 kernel/livepatch/shadow.c | 277 ++++++++++++++++++++++++++++++
 samples/Kconfig | 5 +-
 samples/livepatch/Makefile | 3 +
 samples/livepatch/livepatch-shadow-fix1.c | 173 +++++++++++++++++++
 samples/livepatch/livepatch-shadow-fix2.c | 168 ++++++++++++++++++
 samples/livepatch/livepatch-shadow-mod.c | 224 ++++++++++++++++++++++++
 9 files changed, 1048 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/livepatch/shadow-vars.txt
 create mode 100644 kernel/livepatch/shadow.c
 create mode 100644 samples/livepatch/livepatch-shadow-fix1.c
 create mode 100644 samples/livepatch/livepatch-shadow-fix2.c
 create mode 100644 samples/livepatch/livepatch-shadow-mod.c

(limited to 'include/linux')

diff --git a/Documentation/livepatch/shadow-vars.txt b/Documentation/livepatch/shadow-vars.txt
new file mode 100644
index 000000000000..e3ffc4301bfa
--- /dev/null
+++ b/Documentation/livepatch/shadow-vars.txt
@@ -0,0 +1,192 @@
+================
+Shadow Variables
+================
+
+Shadow variables are a simple way for livepatch modules to associate
+additional "shadow" data with existing data structures. Shadow data is
+allocated separately from parent data structures, which are left
+unmodified. The shadow variable API described in this document is used
+to allocate/attach and detach/release shadow variables to their parents.
+
+The implementation introduces a global, in-kernel hashtable that
+associates pointers to parent objects and a numeric identifier of the
+shadow data. The numeric identifier is a simple enumeration that may be
+used to describe shadow variable version, class or type, etc. More
+specifically, the parent pointer serves as the hashtable key while the
+numeric id subsequently filters hashtable queries. Multiple shadow
+variables may attach to the same parent object, but their numeric
+identifier distinguishes between them.
+
+
+1. Brief API summary
+====================
+
+(See the full API usage docbook notes in livepatch/shadow.c.)
+
+A hashtable references all shadow variables. These references are
+stored and retrieved through a <obj, id> pair.
+
+* The klp_shadow variable data structure encapsulates both tracking
+meta-data and shadow-data:
+ - meta-data
+ - obj - pointer to parent object
+ - id - data identifier
+ - data[] - storage for shadow data
+
+It is important to note that the klp_shadow_alloc() and
+klp_shadow_get_or_alloc() calls, described below, store a *copy* of the
+data that the functions are provided. Callers should provide whatever
+mutual exclusion is required of the shadow data.
+
+* klp_shadow_get() - retrieve a shadow variable data pointer
+ - search hashtable for <obj, id> pair
+
+* klp_shadow_alloc() - allocate and add a new shadow variable
+ - search hashtable for <obj, id> pair
+ - if exists
+ - WARN and return NULL
+ - if <obj, id> doesn't already exist
+ - allocate a new shadow variable
+ - copy data into the new shadow variable
+ - add <obj, id> to the global hashtable
+
+* klp_shadow_get_or_alloc() - get existing or alloc a new shadow variable
+ - search hashtable for <obj, id> pair
+ - if exists
+ - return existing shadow variable
+ - if <obj, id> doesn't already exist
+ - allocate a new shadow variable
+ - copy data into the new shadow variable
+ - add <obj, id> pair to the global hashtable
+
+* klp_shadow_free() - detach and free a shadow variable
+ - find and remove a <obj, id> reference from global hashtable
+ - if found, free shadow variable
+
+* klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ - find and remove any <*, id> references from global hashtable
+ - if found, free shadow variable
+
+
+2. Use cases
+============
+
+(See the example shadow variable livepatch modules in samples/livepatch/
+for full working demonstrations.)
+
+For the following use-case examples, consider commit 1d147bfa6429
+("mac80211: fix AP powersave TX vs. wakeup race"), which added a
+spinlock to net/mac80211/sta_info.h :: struct sta_info. Each use-case
+example can be considered a stand-alone livepatch implementation of this
+fix.
+
+
+Matching parent's lifecycle
+---------------------------
+
+If parent data structures are frequently created and destroyed, it may
+be easiest to align their shadow variables' lifetimes to the same
+allocation and release functions. In this case, the parent data
+structure is typically allocated, initialized, then registered in some
+manner. Shadow variable allocation and setup can then be considered
+part of the parent's initialization and should be completed before the
+parent "goes live" (ie, any shadow variable get-API requests are made
+for this <obj, id> pair.)
+
+For commit 1d147bfa6429, when a parent sta_info structure is allocated,
+allocate a shadow copy of the ps_lock pointer, then initialize it:
+
+#define PS_LOCK 1
+struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
+                                const u8 *addr, gfp_t gfp)
+{
+        struct sta_info *sta;
+        spinlock_t *ps_lock;
+
+        /* Parent structure is created */
+        sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp);
+
+        /* Attach a corresponding shadow variable, then initialize it */
+        ps_lock = klp_shadow_alloc(sta, PS_LOCK, NULL, sizeof(ps_lock), gfp);
+        if (!ps_lock)
+                goto shadow_fail;
+        spin_lock_init(ps_lock);
+        ...
+
+When requiring a ps_lock, query the shadow variable API to retrieve one
+for a specific struct sta_info:
+
+void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
+{
+        spinlock_t *ps_lock;
+
+        /* sync with ieee80211_tx_h_unicast_ps_buf */
+        ps_lock = klp_shadow_get(sta, PS_LOCK);
+        if (ps_lock)
+                spin_lock(ps_lock);
+        ...
+
+When the parent sta_info structure is freed, first free the shadow
+variable:
+
+void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
+{
+        klp_shadow_free(sta, PS_LOCK);
+        kfree(sta);
+        ...
+
+
+In-flight parent objects
+------------------------
+
+Sometimes it may not be convenient or possible to allocate shadow
+variables alongside their parent objects. Or a livepatch fix may
+require shadow variables for only a subset of parent object instances.
+In these cases, the klp_shadow_get_or_alloc() call can be used to attach
+shadow variables to parents already in-flight.
+ +For commit 1d147bfa6429, a good spot to allocate a shadow spinlock is +inside ieee80211_sta_ps_deliver_wakeup(): + +#define PS_LOCK 1 +void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) +{ + DEFINE_SPINLOCK(ps_lock_fallback); + spinlock_t *ps_lock; + + /* sync with ieee80211_tx_h_unicast_ps_buf */ + ps_lock = klp_shadow_get_or_alloc(sta, PS_LOCK, + &ps_lock_fallback, sizeof(ps_lock_fallback), + GFP_ATOMIC); + if (ps_lock) + spin_lock(ps_lock); + ... + +This usage will create a shadow variable, only if needed, otherwise it +will use one that was already created for this pair. + +Like the previous use-case, the shadow spinlock needs to be cleaned up. +A shadow variable can be freed just before its parent object is freed, +or even when the shadow variable itself is no longer required. + + +Other use-cases +--------------- + +Shadow variables can also be used as a flag indicating that a data +structure was allocated by new, livepatched code. In this case, it +doesn't matter what data value the shadow variable holds, its existence +suggests how to handle the parent object. + + +3. References +============= + +* https://github.com/dynup/kpatch +The livepatch implementation is based on the kpatch version of shadow +variables. + +* http://files.mkgnu.net/files/dynamos/doc/papers/dynamos_eurosys_07.pdf +Dynamic and Adaptive Updates of Non-Quiescent Subsystems in Commodity +Operating System Kernels (Kritis Makris, Kyung Dong Ryu 2007) presented +a datatype update technique called "shadow data structures". diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 194991ef9347..d08eddc00497 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -164,6 +164,14 @@ static inline bool klp_have_reliable_stack(void) IS_ENABLED(CONFIG_HAVE_RELIABLE_STACKTRACE); } +void *klp_shadow_get(void *obj, unsigned long id); +void *klp_shadow_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags); +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags); +void klp_shadow_free(void *obj, unsigned long id); +void klp_shadow_free_all(unsigned long id); + #else /* !CONFIG_LIVEPATCH */ static inline int klp_module_coming(struct module *mod) { return 0; } diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 2b8bdb1925da..b36ceda6488e 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o transition.o +livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c new file mode 100644 index 000000000000..67e4360521f3 --- /dev/null +++ b/kernel/livepatch/shadow.c @@ -0,0 +1,277 @@ +/* + * shadow.c - Shadow Variables + * + * Copyright (C) 2014 Josh Poimboeuf + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2017 Joe Lawrence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: Shadow variable API concurrency notes:
+ *
+ * The shadow variable API provides a simple relationship between an
+ * <obj, id> pair and a pointer value. It is the responsibility of the
+ * caller to provide any mutual exclusion required of the shadow data.
+ *
+ * Once a shadow variable is attached to its parent object via the
+ * klp_shadow_*alloc() API calls, it is considered live: any subsequent
+ * call to klp_shadow_get() may then return the shadow variable's data
+ * pointer. Callers of klp_shadow_*alloc() should prepare shadow data
+ * accordingly.
+ *
+ * The klp_shadow_*alloc() API calls may allocate memory for new shadow
+ * variable structures. Their implementation does not call kmalloc
+ * inside any spinlocks, but API callers should pass GFP flags according
+ * to their specific needs.
+ *
+ * The klp_shadow_hash is an RCU-enabled hashtable and is safe against
+ * concurrent klp_shadow_free() and klp_shadow_get() operations.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/livepatch.h>
+
+static DEFINE_HASHTABLE(klp_shadow_hash, 12);
+
+/*
+ * klp_shadow_lock provides exclusive access to the klp_shadow_hash and
+ * the shadow variables it references.
+ */
+static DEFINE_SPINLOCK(klp_shadow_lock);
+
+/**
+ * struct klp_shadow - shadow variable structure
+ * @node: klp_shadow_hash hash table node
+ * @rcu_head: RCU is used to safely free this structure
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: data area
+ */
+struct klp_shadow {
+        struct hlist_node node;
+        struct rcu_head rcu_head;
+        void *obj;
+        unsigned long id;
+        char data[];
+};
+
+/**
+ * klp_shadow_match() - verify a shadow variable matches given <obj, id>
+ * @shadow: shadow variable to match
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: true if the shadow variable matches.
+ */
+static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj,
+                                    unsigned long id)
+{
+        return shadow->obj == obj && shadow->id == id;
+}
+
+/**
+ * klp_shadow_get() - retrieve a shadow variable data pointer
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get(void *obj, unsigned long id)
+{
+        struct klp_shadow *shadow;
+
+        rcu_read_lock();
+
+        hash_for_each_possible_rcu(klp_shadow_hash, shadow, node,
+                                   (unsigned long)obj) {
+
+                if (klp_shadow_match(shadow, obj, id)) {
+                        rcu_read_unlock();
+                        return shadow->data;
+                }
+        }
+
+        rcu_read_unlock();
+
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get);
+
+void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+                                size_t size, gfp_t gfp_flags, bool warn_on_exist)
+{
+        struct klp_shadow *new_shadow;
+        void *shadow_data;
+        unsigned long flags;
+
+        /* Check if the shadow variable already exists */
+        shadow_data = klp_shadow_get(obj, id);
+        if (shadow_data)
+                goto exists;
+
+        /* Allocate a new shadow variable for use inside the lock below */
+        new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
+        if (!new_shadow)
+                return NULL;
+
+        new_shadow->obj = obj;
+        new_shadow->id = id;
+
+        /* Initialize the shadow variable if data provided */
+        if (data)
+                memcpy(new_shadow->data, data, size);
+
+        /* Look for <obj, id> again under the lock */
+        spin_lock_irqsave(&klp_shadow_lock, flags);
+        shadow_data = klp_shadow_get(obj, id);
+        if (unlikely(shadow_data)) {
+                /*
+                 * Shadow variable was found, throw away speculative
+                 * allocation.
+                 */
+                spin_unlock_irqrestore(&klp_shadow_lock, flags);
+                kfree(new_shadow);
+                goto exists;
+        }
+
+        /* No <obj, id> found, so attach the newly allocated one */
+        hash_add_rcu(klp_shadow_hash, &new_shadow->node,
+                     (unsigned long)new_shadow->obj);
+        spin_unlock_irqrestore(&klp_shadow_lock, flags);
+
+        return new_shadow->data;
+
+exists:
+        if (warn_on_exist) {
+                WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id);
+                return NULL;
+        }
+
+        return shadow_data;
+}
+
+/**
+ * klp_shadow_alloc() - allocate and add a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: pointer to data to attach to parent
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags
+ * and copies @size bytes from @data into the new shadow variable's own
+ * data space. If @data is NULL, @size bytes are still allocated, but
+ * no copy is performed. The new shadow variable is then added to the
+ * global hashtable.
+ *
+ * If an existing <obj, id> shadow variable can be found, this routine
+ * will issue a WARN, exit early and return NULL.
+ *
+ * Return: the shadow variable data element, NULL on duplicate or
+ * failure.
+ */
+void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
+                       size_t size, gfp_t gfp_flags)
+{
+        return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_alloc);
+
+/**
+ * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: pointer to data to attach to parent
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ *
+ * Returns a pointer to existing shadow data if an <obj, id> shadow
+ * variable is already present. Otherwise, it creates a new shadow
+ * variable like klp_shadow_alloc().
+ *
+ * This function guarantees that only one shadow variable exists with
+ * the given @id for the given @obj. It also guarantees that the shadow
+ * variable will be initialized by the given @data only when it did not
+ * exist before.
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */ +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); +} +EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); + +/** + * klp_shadow_free() - detach and free a shadow variable + * @obj: pointer to parent object + * @id: data identifier + * + * This function releases the memory for this shadow variable + * instance, callers should stop referencing it accordingly. + */ +void klp_shadow_free(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete from hash */ + hash_for_each_possible(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + break; + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free); + +/** + * klp_shadow_free_all() - detach and free all <*, id> shadow variables + * @id: data identifier + * + * This function releases the memory for all <*, id> shadow variable + * instances, callers should stop referencing them accordingly. + */ +void klp_shadow_free_all(unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + int i; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete all <*, id> from hash */ + hash_for_each(klp_shadow_hash, i, shadow, node) { + if (klp_shadow_match(shadow, shadow->obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free_all); diff --git a/samples/Kconfig b/samples/Kconfig index 9cb63188d3ef..c332a3b9de05 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -71,11 +71,10 @@ config SAMPLE_RPMSG_CLIENT the rpmsg bus. config SAMPLE_LIVEPATCH - tristate "Build live patching sample -- loadable modules only" + tristate "Build live patching samples -- loadable modules only" depends on LIVEPATCH && m help - Builds a sample live patch that replaces the procfs handler - for /proc/cmdline to print "this has been live patched". + Build sample live patch demonstrations. config SAMPLE_CONFIGFS tristate "Build configfs patching sample -- loadable modules only" diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile index 10319d7ea0b1..539e81d433cd 100644 --- a/samples/livepatch/Makefile +++ b/samples/livepatch/Makefile @@ -1 +1,4 @@ obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c new file mode 100644 index 000000000000..fbe0a1f3d99b --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix1.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2017 Joe Lawrence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +/* + * livepatch-shadow-fix1.c - Shadow variables, livepatch demo + * + * Purpose + * ------- + * + * Fixes the memory leak introduced in livepatch-shadow-mod through the + * use of a shadow variable. This fix demonstrates the "extending" of + * short-lived data structures by patching its allocation and release + * functions. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-shadow-mod.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +/* Shadow variable enums */ +#define SV_LEAK 1 + +/* Allocate new dummies every second */ +#define ALLOC_PERIOD 1 +/* Check for expired dummies after a few new ones have been allocated */ +#define CLEANUP_PERIOD (3 * ALLOC_PERIOD) +/* Dummies expire after a few cleanup instances */ +#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +struct dummy *livepatch_fix1_dummy_alloc(void) +{ + struct dummy *d; + void *leak; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return NULL; + + d->jiffies_expire = jiffies + + msecs_to_jiffies(1000 * EXPIRE_PERIOD); + + /* + * Patch: save the extra memory location into a SV_LEAK shadow + * variable. A patched dummy_free routine can later fetch this + * pointer to handle resource release. + */ + leak = kzalloc(sizeof(int), GFP_KERNEL); + klp_shadow_alloc(d, SV_LEAK, &leak, sizeof(leak), GFP_KERNEL); + + pr_info("%s: dummy @ %p, expires @ %lx\n", + __func__, d, d->jiffies_expire); + + return d; +} + +void livepatch_fix1_dummy_free(struct dummy *d) +{ + void **shadow_leak, *leak; + + /* + * Patch: fetch the saved SV_LEAK shadow variable, detach and + * free it. Note: handle cases where this shadow variable does + * not exist (ie, dummy structures allocated before this livepatch + * was loaded.) + */ + shadow_leak = klp_shadow_get(d, SV_LEAK); + if (shadow_leak) { + leak = *shadow_leak; + klp_shadow_free(d, SV_LEAK); + kfree(leak); + pr_info("%s: dummy @ %p, prevented leak @ %p\n", + __func__, d, leak); + } else { + pr_info("%s: dummy @ %p leaked!\n", __func__, d); + } + + kfree(d); +} + +static struct klp_func funcs[] = { + { + .old_name = "dummy_alloc", + .new_func = livepatch_fix1_dummy_alloc, + }, + { + .old_name = "dummy_free", + .new_func = livepatch_fix1_dummy_free, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = "livepatch_shadow_mod", + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_shadow_fix1_init(void) +{ + int ret; + + if (!klp_have_reliable_stack() && !patch.immediate) { + /* + * WARNING: Be very careful when using 'patch.immediate' in + * your patches. It's ok to use it for simple patches like + * this, but for more complex patches which change function + * semantics, locking semantics, or data structures, it may not + * be safe. Use of this option will also prevent removal of + * the patch. + * + * See Documentation/livepatch/livepatch.txt for more details. + */ + patch.immediate = true; + pr_notice("The consistency model isn't supported for your architecture. 
Bypassing safety mechanisms and applying the patch immediately.\n"); + } + + ret = klp_register_patch(&patch); + if (ret) + return ret; + ret = klp_enable_patch(&patch); + if (ret) { + WARN_ON(klp_unregister_patch(&patch)); + return ret; + } + return 0; +} + +static void livepatch_shadow_fix1_exit(void) +{ + /* Cleanup any existing SV_LEAK shadow variables */ + klp_shadow_free_all(SV_LEAK); + + WARN_ON(klp_unregister_patch(&patch)); +} + +module_init(livepatch_shadow_fix1_init); +module_exit(livepatch_shadow_fix1_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c new file mode 100644 index 000000000000..53c1794bdc5f --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix2.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2017 Joe Lawrence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +/* + * livepatch-shadow-fix2.c - Shadow variables, livepatch demo + * + * Purpose + * ------- + * + * Adds functionality to livepatch-shadow-mod's in-flight data + * structures through a shadow variable. The livepatch patches a + * routine that periodically inspects data structures, incrementing a + * per-data-structure counter, creating the counter if needed. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-shadow-mod.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +/* Shadow variable enums */ +#define SV_LEAK 1 +#define SV_COUNTER 2 + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies) +{ + int *shadow_count; + int count; + + /* + * Patch: handle in-flight dummy structures, if they do not + * already have a SV_COUNTER shadow variable, then attach a + * new one. + */ + count = 0; + shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER, + &count, sizeof(count), + GFP_NOWAIT); + if (shadow_count) + *shadow_count += 1; + + return time_after(jiffies, d->jiffies_expire); +} + +void livepatch_fix2_dummy_free(struct dummy *d) +{ + void **shadow_leak, *leak; + int *shadow_count; + + /* Patch: copy the memory leak patch from the fix1 module. */ + shadow_leak = klp_shadow_get(d, SV_LEAK); + if (shadow_leak) { + leak = *shadow_leak; + klp_shadow_free(d, SV_LEAK); + kfree(leak); + pr_info("%s: dummy @ %p, prevented leak @ %p\n", + __func__, d, leak); + } else { + pr_info("%s: dummy @ %p leaked!\n", __func__, d); + } + + /* + * Patch: fetch the SV_COUNTER shadow variable and display + * the final count. Detach the shadow variable. 
+ */ + shadow_count = klp_shadow_get(d, SV_COUNTER); + if (shadow_count) { + pr_info("%s: dummy @ %p, check counter = %d\n", + __func__, d, *shadow_count); + klp_shadow_free(d, SV_COUNTER); + } + + kfree(d); +} + +static struct klp_func funcs[] = { + { + .old_name = "dummy_check", + .new_func = livepatch_fix2_dummy_check, + }, + { + .old_name = "dummy_free", + .new_func = livepatch_fix2_dummy_free, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = "livepatch_shadow_mod", + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_shadow_fix2_init(void) +{ + int ret; + + if (!klp_have_reliable_stack() && !patch.immediate) { + /* + * WARNING: Be very careful when using 'patch.immediate' in + * your patches. It's ok to use it for simple patches like + * this, but for more complex patches which change function + * semantics, locking semantics, or data structures, it may not + * be safe. Use of this option will also prevent removal of + * the patch. + * + * See Documentation/livepatch/livepatch.txt for more details. + */ + patch.immediate = true; + pr_notice("The consistency model isn't supported for your architecture. Bypassing safety mechanisms and applying the patch immediately.\n"); + } + + ret = klp_register_patch(&patch); + if (ret) + return ret; + ret = klp_enable_patch(&patch); + if (ret) { + WARN_ON(klp_unregister_patch(&patch)); + return ret; + } + return 0; +} + +static void livepatch_shadow_fix2_exit(void) +{ + /* Cleanup any existing SV_COUNTER shadow variables */ + klp_shadow_free_all(SV_COUNTER); + + WARN_ON(klp_unregister_patch(&patch)); +} + +module_init(livepatch_shadow_fix2_init); +module_exit(livepatch_shadow_fix2_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c new file mode 100644 index 000000000000..4c54b250332d --- /dev/null +++ b/samples/livepatch/livepatch-shadow-mod.c @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2017 Joe Lawrence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +/* + * livepatch-shadow-mod.c - Shadow variables, buggy module demo + * + * Purpose + * ------- + * + * As a demonstration of livepatch shadow variable API, this module + * introduces memory leak behavior that livepatch modules + * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and + * enhance. + * + * WARNING - even though the livepatch-shadow-fix modules patch the + * memory leak, please load these modules at your own risk -- some + * amount of memory may leaked before the bug is patched. + * + * + * Usage + * ----- + * + * Step 1 - Load the buggy demonstration module: + * + * insmod samples/livepatch/livepatch-shadow-mod.ko + * + * Watch dmesg output for a few moments to see new dummy being allocated + * and a periodic cleanup check. (Note: a small amount of memory is + * being leaked.) 
+ * + * + * Step 2 - Load livepatch fix1: + * + * insmod samples/livepatch/livepatch-shadow-fix1.ko + * + * Continue watching dmesg and note that now livepatch_fix1_dummy_free() + * and livepatch_fix1_dummy_alloc() are logging messages about leaked + * memory and eventually leaks prevented. + * + * + * Step 3 - Load livepatch fix2 (on top of fix1): + * + * insmod samples/livepatch/livepatch-shadow-fix2.ko + * + * This module extends functionality through shadow variables, as a new + * "check" counter is added to the dummy structure. Periodic dmesg + * messages will log these as dummies are cleaned up. + * + * + * Step 4 - Cleanup + * + * Unwind the demonstration by disabling the livepatch fix modules, then + * removing them and the demo module: + * + * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled + * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled + * rmmod livepatch-shadow-fix2 + * rmmod livepatch-shadow-fix1 + * rmmod livepatch-shadow-mod + */ + + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joe Lawrence "); +MODULE_DESCRIPTION("Buggy module for shadow variable demo"); + +/* Allocate new dummies every second */ +#define ALLOC_PERIOD 1 +/* Check for expired dummies after a few new ones have been allocated */ +#define CLEANUP_PERIOD (3 * ALLOC_PERIOD) +/* Dummies expire after a few cleanup instances */ +#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) + +/* + * Keep a list of all the dummies so we can clean up any residual ones + * on module exit + */ +LIST_HEAD(dummy_list); +DEFINE_MUTEX(dummy_list_mutex); + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +noinline struct dummy *dummy_alloc(void) +{ + struct dummy *d; + void *leak; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return NULL; + + d->jiffies_expire = jiffies + + msecs_to_jiffies(1000 * EXPIRE_PERIOD); + + /* Oops, forgot to save leak! */ + leak = kzalloc(sizeof(int), GFP_KERNEL); + + pr_info("%s: dummy @ %p, expires @ %lx\n", + __func__, d, d->jiffies_expire); + + return d; +} + +noinline void dummy_free(struct dummy *d) +{ + pr_info("%s: dummy @ %p, expired = %lx\n", + __func__, d, d->jiffies_expire); + + kfree(d); +} + +noinline bool dummy_check(struct dummy *d, unsigned long jiffies) +{ + return time_after(jiffies, d->jiffies_expire); +} + +/* + * alloc_work_func: allocates new dummy structures, allocates additional + * memory, aptly named "leak", but doesn't keep + * permanent record of it. + */ + +static void alloc_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func); + +static void alloc_work_func(struct work_struct *work) +{ + struct dummy *d; + + d = dummy_alloc(); + if (!d) + return; + + mutex_lock(&dummy_list_mutex); + list_add(&d->list, &dummy_list); + mutex_unlock(&dummy_list_mutex); + + schedule_delayed_work(&alloc_dwork, + msecs_to_jiffies(1000 * ALLOC_PERIOD)); +} + +/* + * cleanup_work_func: frees dummy structures. Without knownledge of + * "leak", it leaks the additional memory that + * alloc_work_func created. 
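 *                    (livepatch-shadow-fix1 repairs this by looking
 *                    up the SV_LEAK shadow variable and freeing the
 *                    saved pointer before the dummy itself is
 *                    released.)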
+ */ + +static void cleanup_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func); + +static void cleanup_work_func(struct work_struct *work) +{ + struct dummy *d, *tmp; + unsigned long j; + + j = jiffies; + pr_info("%s: jiffies = %lx\n", __func__, j); + + mutex_lock(&dummy_list_mutex); + list_for_each_entry_safe(d, tmp, &dummy_list, list) { + + /* Kick out and free any expired dummies */ + if (dummy_check(d, j)) { + list_del(&d->list); + dummy_free(d); + } + } + mutex_unlock(&dummy_list_mutex); + + schedule_delayed_work(&cleanup_dwork, + msecs_to_jiffies(1000 * CLEANUP_PERIOD)); +} + +static int livepatch_shadow_mod_init(void) +{ + schedule_delayed_work(&alloc_dwork, + msecs_to_jiffies(1000 * ALLOC_PERIOD)); + schedule_delayed_work(&cleanup_dwork, + msecs_to_jiffies(1000 * CLEANUP_PERIOD)); + + return 0; +} + +static void livepatch_shadow_mod_exit(void) +{ + struct dummy *d, *tmp; + + /* Wait for any dummies at work */ + cancel_delayed_work_sync(&alloc_dwork); + cancel_delayed_work_sync(&cleanup_dwork); + + /* Cleanup residual dummies */ + list_for_each_entry_safe(d, tmp, &dummy_list, list) { + list_del(&d->list); + dummy_free(d); + } +} + +module_init(livepatch_shadow_mod_init); +module_exit(livepatch_shadow_mod_exit); -- cgit v1.2.3 From f3ae7d9155c79bb8f97ca3ff61ea979dec402952 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 5 Sep 2017 16:43:49 +0200 Subject: dmaengine: xilinx_dma: Move enum xdma_ip_type to driver file The enum xdma_ip_type is only used inside the Xilinx DMA driver and not exported to any consumers (nor should it be). So move it from the global header to driver file itself. Signed-off-by: Lars-Peter Clausen Acked-by: Michal Simek Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xilinx_dma.c | 14 ++++++++++++++ include/linux/dma/xilinx_dma.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index 8722bcba489d..5eef13380ca8 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -366,6 +366,20 @@ struct xilinx_dma_chan { u16 tdest; }; +/** + * enum xdma_ip_type: DMA IP type. + * + * XDMA_TYPE_AXIDMA: Axi dma ip. + * XDMA_TYPE_CDMA: Axi cdma ip. + * XDMA_TYPE_VDMA: Axi vdma ip. + * + */ +enum xdma_ip_type { + XDMA_TYPE_AXIDMA = 0, + XDMA_TYPE_CDMA, + XDMA_TYPE_VDMA, +}; + struct xilinx_dma_config { enum xdma_ip_type dmatype; int (*clk_init)(struct platform_device *pdev, struct clk **axi_clk, diff --git a/include/linux/dma/xilinx_dma.h b/include/linux/dma/xilinx_dma.h index 3ae300052553..34b98f276ed0 100644 --- a/include/linux/dma/xilinx_dma.h +++ b/include/linux/dma/xilinx_dma.h @@ -41,20 +41,6 @@ struct xilinx_vdma_config { int ext_fsync; }; -/** - * enum xdma_ip_type: DMA IP type. - * - * XDMA_TYPE_AXIDMA: Axi dma ip. - * XDMA_TYPE_CDMA: Axi cdma ip. - * XDMA_TYPE_VDMA: Axi vdma ip. - * - */ -enum xdma_ip_type { - XDMA_TYPE_AXIDMA = 0, - XDMA_TYPE_CDMA, - XDMA_TYPE_VDMA, -}; - int xilinx_vdma_channel_set_config(struct dma_chan *dchan, struct xilinx_vdma_config *cfg); -- cgit v1.2.3 From 4b4e02c83167dca260e6bf974809979d44694e19 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 11 Sep 2017 20:32:07 -0700 Subject: typec: tcpm: Move out of staging Move tcpm (USB Type-C Port Manager) out of staging. 
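Apart from the updated include paths in the fusb302 and tcpci
drivers, this is essentially a straight move: the USB PD protocol
definitions (pd.h, pd_bdo.h, pd_vdo.h) and the port controller
interface (tcpm.h) now live in include/linux/usb/, and the state
machine itself in drivers/usb/typec/. A low level TCPC driver built
on top of it now pulls the headers from the global include path
(sketch, assuming nothing beyond the relocation shown in the diff
below):

#include <linux/usb/pd.h>
#include <linux/usb/tcpm.h>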
Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/staging/typec/Kconfig | 8 - drivers/staging/typec/Makefile | 1 - drivers/staging/typec/TODO | 10 - drivers/staging/typec/fusb302/fusb302.c | 4 +- drivers/staging/typec/pd.h | 298 --- drivers/staging/typec/pd_bdo.h | 31 - drivers/staging/typec/pd_vdo.h | 251 --- drivers/staging/typec/tcpci.c | 4 +- drivers/staging/typec/tcpm.c | 3616 ------------------------------- drivers/staging/typec/tcpm.h | 204 -- drivers/usb/typec/Kconfig | 8 + drivers/usb/typec/Makefile | 1 + drivers/usb/typec/tcpm.c | 3615 ++++++++++++++++++++++++++++++ include/linux/usb/pd.h | 298 +++ include/linux/usb/pd_bdo.h | 31 + include/linux/usb/pd_vdo.h | 251 +++ include/linux/usb/tcpm.h | 204 ++ 17 files changed, 4412 insertions(+), 4423 deletions(-) delete mode 100644 drivers/staging/typec/pd.h delete mode 100644 drivers/staging/typec/pd_bdo.h delete mode 100644 drivers/staging/typec/pd_vdo.h delete mode 100644 drivers/staging/typec/tcpm.c delete mode 100644 drivers/staging/typec/tcpm.h create mode 100644 drivers/usb/typec/tcpm.c create mode 100644 include/linux/usb/pd.h create mode 100644 include/linux/usb/pd_bdo.h create mode 100644 include/linux/usb/pd_vdo.h create mode 100644 include/linux/usb/tcpm.h (limited to 'include/linux') diff --git a/drivers/staging/typec/Kconfig b/drivers/staging/typec/Kconfig index 37a0781b0d0c..31fad23c2553 100644 --- a/drivers/staging/typec/Kconfig +++ b/drivers/staging/typec/Kconfig @@ -1,13 +1,5 @@ menu "USB Power Delivery and Type-C drivers" -config TYPEC_TCPM - tristate "USB Type-C Port Controller Manager" - depends on USB - select TYPEC - help - The Type-C Port Controller Manager provides a USB PD and USB Type-C - state machine for use with Type-C Port Controllers. - if TYPEC_TCPM config TYPEC_TCPCI diff --git a/drivers/staging/typec/Makefile b/drivers/staging/typec/Makefile index 30a7e29cbc9e..e1df3f0fde10 100644 --- a/drivers/staging/typec/Makefile +++ b/drivers/staging/typec/Makefile @@ -1,3 +1,2 @@ -obj-$(CONFIG_TYPEC_TCPM) += tcpm.o obj-$(CONFIG_TYPEC_TCPCI) += tcpci.o obj-y += fusb302/ diff --git a/drivers/staging/typec/TODO b/drivers/staging/typec/TODO index bc1f97a2d1bf..53fe2f726c88 100644 --- a/drivers/staging/typec/TODO +++ b/drivers/staging/typec/TODO @@ -1,13 +1,3 @@ -tcpm: -- Add documentation (at the very least for the API to low level drivers) -- Split PD code into separate file -- Check if it makes sense to use tracepoints instead of debugfs for debug logs -- Implement Alternate Mode handling -- Address "#if 0" code if not addressed with the above -- Validate all comments marked with "XXX"; either address or remove comments -- Add support for USB PD 3.0. While not mandatory, at least fast role swap - as well as authentication support would be very desirable. 
- tcpci: - Test with real hardware diff --git a/drivers/staging/typec/fusb302/fusb302.c b/drivers/staging/typec/fusb302/fusb302.c index fc6a3cf74eb3..e790b67d4953 100644 --- a/drivers/staging/typec/fusb302/fusb302.c +++ b/drivers/staging/typec/fusb302/fusb302.c @@ -37,11 +37,11 @@ #include #include #include +#include +#include #include #include "fusb302_reg.h" -#include "../tcpm.h" -#include "../pd.h" /* * When the device is SNK, BC_LVL interrupt is used to monitor cc pins diff --git a/drivers/staging/typec/pd.h b/drivers/staging/typec/pd.h deleted file mode 100644 index e00051ced806..000000000000 --- a/drivers/staging/typec/pd.h +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Copyright 2015-2017 Google, Inc - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __LINUX_USB_PD_H -#define __LINUX_USB_PD_H - -#include -#include - -/* USB PD Messages */ -enum pd_ctrl_msg_type { - /* 0 Reserved */ - PD_CTRL_GOOD_CRC = 1, - PD_CTRL_GOTO_MIN = 2, - PD_CTRL_ACCEPT = 3, - PD_CTRL_REJECT = 4, - PD_CTRL_PING = 5, - PD_CTRL_PS_RDY = 6, - PD_CTRL_GET_SOURCE_CAP = 7, - PD_CTRL_GET_SINK_CAP = 8, - PD_CTRL_DR_SWAP = 9, - PD_CTRL_PR_SWAP = 10, - PD_CTRL_VCONN_SWAP = 11, - PD_CTRL_WAIT = 12, - PD_CTRL_SOFT_RESET = 13, - /* 14-15 Reserved */ -}; - -enum pd_data_msg_type { - /* 0 Reserved */ - PD_DATA_SOURCE_CAP = 1, - PD_DATA_REQUEST = 2, - PD_DATA_BIST = 3, - PD_DATA_SINK_CAP = 4, - /* 5-14 Reserved */ - PD_DATA_VENDOR_DEF = 15, -}; - -#define PD_REV10 0x0 -#define PD_REV20 0x1 - -#define PD_HEADER_CNT_SHIFT 12 -#define PD_HEADER_CNT_MASK 0x7 -#define PD_HEADER_ID_SHIFT 9 -#define PD_HEADER_ID_MASK 0x7 -#define PD_HEADER_PWR_ROLE BIT(8) -#define PD_HEADER_REV_SHIFT 6 -#define PD_HEADER_REV_MASK 0x3 -#define PD_HEADER_DATA_ROLE BIT(5) -#define PD_HEADER_TYPE_SHIFT 0 -#define PD_HEADER_TYPE_MASK 0xf - -#define PD_HEADER(type, pwr, data, id, cnt) \ - ((((type) & PD_HEADER_TYPE_MASK) << PD_HEADER_TYPE_SHIFT) | \ - ((pwr) == TYPEC_SOURCE ? PD_HEADER_PWR_ROLE : 0) | \ - ((data) == TYPEC_HOST ? 
PD_HEADER_DATA_ROLE : 0) | \ - (PD_REV20 << PD_HEADER_REV_SHIFT) | \ - (((id) & PD_HEADER_ID_MASK) << PD_HEADER_ID_SHIFT) | \ - (((cnt) & PD_HEADER_CNT_MASK) << PD_HEADER_CNT_SHIFT)) - -#define PD_HEADER_LE(type, pwr, data, id, cnt) \ - cpu_to_le16(PD_HEADER((type), (pwr), (data), (id), (cnt))) - -static inline unsigned int pd_header_cnt(u16 header) -{ - return (header >> PD_HEADER_CNT_SHIFT) & PD_HEADER_CNT_MASK; -} - -static inline unsigned int pd_header_cnt_le(__le16 header) -{ - return pd_header_cnt(le16_to_cpu(header)); -} - -static inline unsigned int pd_header_type(u16 header) -{ - return (header >> PD_HEADER_TYPE_SHIFT) & PD_HEADER_TYPE_MASK; -} - -static inline unsigned int pd_header_type_le(__le16 header) -{ - return pd_header_type(le16_to_cpu(header)); -} - -static inline unsigned int pd_header_msgid(u16 header) -{ - return (header >> PD_HEADER_ID_SHIFT) & PD_HEADER_ID_MASK; -} - -static inline unsigned int pd_header_msgid_le(__le16 header) -{ - return pd_header_msgid(le16_to_cpu(header)); -} - -#define PD_MAX_PAYLOAD 7 - -/** - * struct pd_message - PD message as seen on wire - * @header: PD message header - * @payload: PD message payload - */ -struct pd_message { - __le16 header; - __le32 payload[PD_MAX_PAYLOAD]; -} __packed; - -/* PDO: Power Data Object */ -#define PDO_MAX_OBJECTS 7 - -enum pd_pdo_type { - PDO_TYPE_FIXED = 0, - PDO_TYPE_BATT = 1, - PDO_TYPE_VAR = 2, -}; - -#define PDO_TYPE_SHIFT 30 -#define PDO_TYPE_MASK 0x3 - -#define PDO_TYPE(t) ((t) << PDO_TYPE_SHIFT) - -#define PDO_VOLT_MASK 0x3ff -#define PDO_CURR_MASK 0x3ff -#define PDO_PWR_MASK 0x3ff - -#define PDO_FIXED_DUAL_ROLE BIT(29) /* Power role swap supported */ -#define PDO_FIXED_SUSPEND BIT(28) /* USB Suspend supported (Source) */ -#define PDO_FIXED_HIGHER_CAP BIT(28) /* Requires more than vSafe5V (Sink) */ -#define PDO_FIXED_EXTPOWER BIT(27) /* Externally powered */ -#define PDO_FIXED_USB_COMM BIT(26) /* USB communications capable */ -#define PDO_FIXED_DATA_SWAP BIT(25) /* Data role swap supported */ -#define PDO_FIXED_VOLT_SHIFT 10 /* 50mV units */ -#define PDO_FIXED_CURR_SHIFT 0 /* 10mA units */ - -#define PDO_FIXED_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_FIXED_VOLT_SHIFT) -#define PDO_FIXED_CURR(ma) ((((ma) / 10) & PDO_CURR_MASK) << PDO_FIXED_CURR_SHIFT) - -#define PDO_FIXED(mv, ma, flags) \ - (PDO_TYPE(PDO_TYPE_FIXED) | (flags) | \ - PDO_FIXED_VOLT(mv) | PDO_FIXED_CURR(ma)) - -#define PDO_BATT_MAX_VOLT_SHIFT 20 /* 50mV units */ -#define PDO_BATT_MIN_VOLT_SHIFT 10 /* 50mV units */ -#define PDO_BATT_MAX_PWR_SHIFT 0 /* 250mW units */ - -#define PDO_BATT_MIN_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_BATT_MIN_VOLT_SHIFT) -#define PDO_BATT_MAX_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_BATT_MAX_VOLT_SHIFT) -#define PDO_BATT_MAX_POWER(mw) ((((mw) / 250) & PDO_PWR_MASK) << PDO_BATT_MAX_PWR_SHIFT) - -#define PDO_BATT(min_mv, max_mv, max_mw) \ - (PDO_TYPE(PDO_TYPE_BATT) | PDO_BATT_MIN_VOLT(min_mv) | \ - PDO_BATT_MAX_VOLT(max_mv) | PDO_BATT_MAX_POWER(max_mw)) - -#define PDO_VAR_MAX_VOLT_SHIFT 20 /* 50mV units */ -#define PDO_VAR_MIN_VOLT_SHIFT 10 /* 50mV units */ -#define PDO_VAR_MAX_CURR_SHIFT 0 /* 10mA units */ - -#define PDO_VAR_MIN_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_VAR_MIN_VOLT_SHIFT) -#define PDO_VAR_MAX_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_VAR_MAX_VOLT_SHIFT) -#define PDO_VAR_MAX_CURR(ma) ((((ma) / 10) & PDO_CURR_MASK) << PDO_VAR_MAX_CURR_SHIFT) - -#define PDO_VAR(min_mv, max_mv, max_ma) \ - (PDO_TYPE(PDO_TYPE_VAR) | PDO_VAR_MIN_VOLT(min_mv) | \ - 
PDO_VAR_MAX_VOLT(max_mv) | PDO_VAR_MAX_CURR(max_ma)) - -static inline enum pd_pdo_type pdo_type(u32 pdo) -{ - return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK; -} - -static inline unsigned int pdo_fixed_voltage(u32 pdo) -{ - return ((pdo >> PDO_FIXED_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; -} - -static inline unsigned int pdo_min_voltage(u32 pdo) -{ - return ((pdo >> PDO_VAR_MIN_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; -} - -static inline unsigned int pdo_max_voltage(u32 pdo) -{ - return ((pdo >> PDO_VAR_MAX_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; -} - -static inline unsigned int pdo_max_current(u32 pdo) -{ - return ((pdo >> PDO_VAR_MAX_CURR_SHIFT) & PDO_CURR_MASK) * 10; -} - -static inline unsigned int pdo_max_power(u32 pdo) -{ - return ((pdo >> PDO_BATT_MAX_PWR_SHIFT) & PDO_PWR_MASK) * 250; -} - -/* RDO: Request Data Object */ -#define RDO_OBJ_POS_SHIFT 28 -#define RDO_OBJ_POS_MASK 0x7 -#define RDO_GIVE_BACK BIT(27) /* Supports reduced operating current */ -#define RDO_CAP_MISMATCH BIT(26) /* Not satisfied by source caps */ -#define RDO_USB_COMM BIT(25) /* USB communications capable */ -#define RDO_NO_SUSPEND BIT(24) /* USB Suspend not supported */ - -#define RDO_PWR_MASK 0x3ff -#define RDO_CURR_MASK 0x3ff - -#define RDO_FIXED_OP_CURR_SHIFT 10 -#define RDO_FIXED_MAX_CURR_SHIFT 0 - -#define RDO_OBJ(idx) (((idx) & RDO_OBJ_POS_MASK) << RDO_OBJ_POS_SHIFT) - -#define PDO_FIXED_OP_CURR(ma) ((((ma) / 10) & RDO_CURR_MASK) << RDO_FIXED_OP_CURR_SHIFT) -#define PDO_FIXED_MAX_CURR(ma) ((((ma) / 10) & RDO_CURR_MASK) << RDO_FIXED_MAX_CURR_SHIFT) - -#define RDO_FIXED(idx, op_ma, max_ma, flags) \ - (RDO_OBJ(idx) | (flags) | \ - PDO_FIXED_OP_CURR(op_ma) | PDO_FIXED_MAX_CURR(max_ma)) - -#define RDO_BATT_OP_PWR_SHIFT 10 /* 250mW units */ -#define RDO_BATT_MAX_PWR_SHIFT 0 /* 250mW units */ - -#define RDO_BATT_OP_PWR(mw) ((((mw) / 250) & RDO_PWR_MASK) << RDO_BATT_OP_PWR_SHIFT) -#define RDO_BATT_MAX_PWR(mw) ((((mw) / 250) & RDO_PWR_MASK) << RDO_BATT_MAX_PWR_SHIFT) - -#define RDO_BATT(idx, op_mw, max_mw, flags) \ - (RDO_OBJ(idx) | (flags) | \ - RDO_BATT_OP_PWR(op_mw) | RDO_BATT_MAX_PWR(max_mw)) - -static inline unsigned int rdo_index(u32 rdo) -{ - return (rdo >> RDO_OBJ_POS_SHIFT) & RDO_OBJ_POS_MASK; -} - -static inline unsigned int rdo_op_current(u32 rdo) -{ - return ((rdo >> RDO_FIXED_OP_CURR_SHIFT) & RDO_CURR_MASK) * 10; -} - -static inline unsigned int rdo_max_current(u32 rdo) -{ - return ((rdo >> RDO_FIXED_MAX_CURR_SHIFT) & - RDO_CURR_MASK) * 10; -} - -static inline unsigned int rdo_op_power(u32 rdo) -{ - return ((rdo >> RDO_BATT_OP_PWR_SHIFT) & RDO_PWR_MASK) * 250; -} - -static inline unsigned int rdo_max_power(u32 rdo) -{ - return ((rdo >> RDO_BATT_MAX_PWR_SHIFT) & RDO_PWR_MASK) * 250; -} - -/* USB PD timers and counters */ -#define PD_T_NO_RESPONSE 5000 /* 4.5 - 5.5 seconds */ -#define PD_T_DB_DETECT 10000 /* 10 - 15 seconds */ -#define PD_T_SEND_SOURCE_CAP 150 /* 100 - 200 ms */ -#define PD_T_SENDER_RESPONSE 60 /* 24 - 30 ms, relaxed */ -#define PD_T_SOURCE_ACTIVITY 45 -#define PD_T_SINK_ACTIVITY 135 -#define PD_T_SINK_WAIT_CAP 240 -#define PD_T_PS_TRANSITION 500 -#define PD_T_SRC_TRANSITION 35 -#define PD_T_DRP_SNK 40 -#define PD_T_DRP_SRC 30 -#define PD_T_PS_SOURCE_OFF 920 -#define PD_T_PS_SOURCE_ON 480 -#define PD_T_PS_HARD_RESET 30 -#define PD_T_SRC_RECOVER 760 -#define PD_T_SRC_RECOVER_MAX 1000 -#define PD_T_SRC_TURN_ON 275 -#define PD_T_SAFE_0V 650 -#define PD_T_VCONN_SOURCE_ON 100 -#define PD_T_SINK_REQUEST 100 /* 100 ms minimum */ -#define PD_T_ERROR_RECOVERY 100 /* minimum 25 is insufficient */ -#define 
PD_T_SRCSWAPSTDBY 625 /* Maximum of 650ms */ -#define PD_T_NEWSRC 250 /* Maximum of 275ms */ - -#define PD_T_DRP_TRY 100 /* 75 - 150 ms */ -#define PD_T_DRP_TRYWAIT 600 /* 400 - 800 ms */ - -#define PD_T_CC_DEBOUNCE 200 /* 100 - 200 ms */ -#define PD_T_PD_DEBOUNCE 20 /* 10 - 20 ms */ - -#define PD_N_CAPS_COUNT (PD_T_NO_RESPONSE / PD_T_SEND_SOURCE_CAP) -#define PD_N_HARD_RESET_COUNT 2 - -#endif /* __LINUX_USB_PD_H */ diff --git a/drivers/staging/typec/pd_bdo.h b/drivers/staging/typec/pd_bdo.h deleted file mode 100644 index 90b94d9fea5d..000000000000 --- a/drivers/staging/typec/pd_bdo.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright 2015-2017 Google, Inc - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __LINUX_USB_PD_BDO_H -#define __LINUX_USB_PD_BDO_H - -/* BDO : BIST Data Object */ -#define BDO_MODE_RECV (0 << 28) -#define BDO_MODE_TRANSMIT (1 << 28) -#define BDO_MODE_COUNTERS (2 << 28) -#define BDO_MODE_CARRIER0 (3 << 28) -#define BDO_MODE_CARRIER1 (4 << 28) -#define BDO_MODE_CARRIER2 (5 << 28) -#define BDO_MODE_CARRIER3 (6 << 28) -#define BDO_MODE_EYE (7 << 28) -#define BDO_MODE_TESTDATA (8 << 28) - -#define BDO_MODE_MASK(mode) ((mode) & 0xf0000000) - -#endif diff --git a/drivers/staging/typec/pd_vdo.h b/drivers/staging/typec/pd_vdo.h deleted file mode 100644 index d92259f8de0a..000000000000 --- a/drivers/staging/typec/pd_vdo.h +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright 2015-2017 Google, Inc - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __LINUX_USB_PD_VDO_H -#define __LINUX_USB_PD_VDO_H - -#include "pd.h" - -/* - * VDO : Vendor Defined Message Object - * VDM object is minimum of VDM header + 6 additional data objects. - */ - -#define VDO_MAX_OBJECTS 6 -#define VDO_MAX_SIZE (VDO_MAX_OBJECTS + 1) - -/* - * VDM header - * ---------- - * <31:16> :: SVID - * <15> :: VDM type ( 1b == structured, 0b == unstructured ) - * <14:13> :: Structured VDM version (can only be 00 == 1.0 currently) - * <12:11> :: reserved - * <10:8> :: object position (1-7 valid ... used for enter/exit mode only) - * <7:6> :: command type (SVDM only?) - * <5> :: reserved (SVDM), command type (UVDM) - * <4:0> :: command - */ -#define VDO(vid, type, custom) \ - (((vid) << 16) | \ - ((type) << 15) | \ - ((custom) & 0x7FFF)) - -#define VDO_SVDM_TYPE (1 << 15) -#define VDO_SVDM_VERS(x) ((x) << 13) -#define VDO_OPOS(x) ((x) << 8) -#define VDO_CMDT(x) ((x) << 6) -#define VDO_OPOS_MASK VDO_OPOS(0x7) -#define VDO_CMDT_MASK VDO_CMDT(0x3) - -#define CMDT_INIT 0 -#define CMDT_RSP_ACK 1 -#define CMDT_RSP_NAK 2 -#define CMDT_RSP_BUSY 3 - -/* reserved for SVDM ... 
for Google UVDM */ -#define VDO_SRC_INITIATOR (0 << 5) -#define VDO_SRC_RESPONDER (1 << 5) - -#define CMD_DISCOVER_IDENT 1 -#define CMD_DISCOVER_SVID 2 -#define CMD_DISCOVER_MODES 3 -#define CMD_ENTER_MODE 4 -#define CMD_EXIT_MODE 5 -#define CMD_ATTENTION 6 - -#define VDO_CMD_VENDOR(x) (((10 + (x)) & 0x1f)) - -/* ChromeOS specific commands */ -#define VDO_CMD_VERSION VDO_CMD_VENDOR(0) -#define VDO_CMD_SEND_INFO VDO_CMD_VENDOR(1) -#define VDO_CMD_READ_INFO VDO_CMD_VENDOR(2) -#define VDO_CMD_REBOOT VDO_CMD_VENDOR(5) -#define VDO_CMD_FLASH_ERASE VDO_CMD_VENDOR(6) -#define VDO_CMD_FLASH_WRITE VDO_CMD_VENDOR(7) -#define VDO_CMD_ERASE_SIG VDO_CMD_VENDOR(8) -#define VDO_CMD_PING_ENABLE VDO_CMD_VENDOR(10) -#define VDO_CMD_CURRENT VDO_CMD_VENDOR(11) -#define VDO_CMD_FLIP VDO_CMD_VENDOR(12) -#define VDO_CMD_GET_LOG VDO_CMD_VENDOR(13) -#define VDO_CMD_CCD_EN VDO_CMD_VENDOR(14) - -#define PD_VDO_VID(vdo) ((vdo) >> 16) -#define PD_VDO_SVDM(vdo) (((vdo) >> 15) & 1) -#define PD_VDO_OPOS(vdo) (((vdo) >> 8) & 0x7) -#define PD_VDO_CMD(vdo) ((vdo) & 0x1f) -#define PD_VDO_CMDT(vdo) (((vdo) >> 6) & 0x3) - -/* - * SVDM Identity request -> response - * - * Request is simply properly formatted SVDM header - * - * Response is 4 data objects: - * [0] :: SVDM header - * [1] :: Identitiy header - * [2] :: Cert Stat VDO - * [3] :: (Product | Cable) VDO - * [4] :: AMA VDO - * - */ -#define VDO_INDEX_HDR 0 -#define VDO_INDEX_IDH 1 -#define VDO_INDEX_CSTAT 2 -#define VDO_INDEX_CABLE 3 -#define VDO_INDEX_PRODUCT 3 -#define VDO_INDEX_AMA 4 - -/* - * SVDM Identity Header - * -------------------- - * <31> :: data capable as a USB host - * <30> :: data capable as a USB device - * <29:27> :: product type - * <26> :: modal operation supported (1b == yes) - * <25:16> :: Reserved, Shall be set to zero - * <15:0> :: USB-IF assigned VID for this cable vendor - */ -#define IDH_PTYPE_UNDEF 0 -#define IDH_PTYPE_HUB 1 -#define IDH_PTYPE_PERIPH 2 -#define IDH_PTYPE_PCABLE 3 -#define IDH_PTYPE_ACABLE 4 -#define IDH_PTYPE_AMA 5 - -#define VDO_IDH(usbh, usbd, ptype, is_modal, vid) \ - ((usbh) << 31 | (usbd) << 30 | ((ptype) & 0x7) << 27 \ - | (is_modal) << 26 | ((vid) & 0xffff)) - -#define PD_IDH_PTYPE(vdo) (((vdo) >> 27) & 0x7) -#define PD_IDH_VID(vdo) ((vdo) & 0xffff) -#define PD_IDH_MODAL_SUPP(vdo) ((vdo) & (1 << 26)) - -/* - * Cert Stat VDO - * ------------- - * <31:0> : USB-IF assigned XID for this cable - */ -#define PD_CSTAT_XID(vdo) (vdo) - -/* - * Product VDO - * ----------- - * <31:16> : USB Product ID - * <15:0> : USB bcdDevice - */ -#define VDO_PRODUCT(pid, bcd) (((pid) & 0xffff) << 16 | ((bcd) & 0xffff)) -#define PD_PRODUCT_PID(vdo) (((vdo) >> 16) & 0xffff) - -/* - * Cable VDO - * --------- - * <31:28> :: Cable HW version - * <27:24> :: Cable FW version - * <23:20> :: Reserved, Shall be set to zero - * <19:18> :: type-C to Type-A/B/C (00b == A, 01 == B, 10 == C) - * <17> :: Type-C to Plug/Receptacle (0b == plug, 1b == receptacle) - * <16:13> :: cable latency (0001 == <10ns(~1m length)) - * <12:11> :: cable termination type (11b == both ends active VCONN req) - * <10> :: SSTX1 Directionality support (0b == fixed, 1b == cfgable) - * <9> :: SSTX2 Directionality support - * <8> :: SSRX1 Directionality support - * <7> :: SSRX2 Directionality support - * <6:5> :: Vbus current handling capability - * <4> :: Vbus through cable (0b == no, 1b == yes) - * <3> :: SOP" controller present? 
(0b == no, 1b == yes) - * <2:0> :: USB SS Signaling support - */ -#define CABLE_ATYPE 0 -#define CABLE_BTYPE 1 -#define CABLE_CTYPE 2 -#define CABLE_PLUG 0 -#define CABLE_RECEPTACLE 1 -#define CABLE_CURR_1A5 0 -#define CABLE_CURR_3A 1 -#define CABLE_CURR_5A 2 -#define CABLE_USBSS_U2_ONLY 0 -#define CABLE_USBSS_U31_GEN1 1 -#define CABLE_USBSS_U31_GEN2 2 -#define VDO_CABLE(hw, fw, cbl, gdr, lat, term, tx1d, tx2d, rx1d, rx2d, cur,\ - vps, sopp, usbss) \ - (((hw) & 0x7) << 28 | ((fw) & 0x7) << 24 | ((cbl) & 0x3) << 18 \ - | (gdr) << 17 | ((lat) & 0x7) << 13 | ((term) & 0x3) << 11 \ - | (tx1d) << 10 | (tx2d) << 9 | (rx1d) << 8 | (rx2d) << 7 \ - | ((cur) & 0x3) << 5 | (vps) << 4 | (sopp) << 3 \ - | ((usbss) & 0x7)) - -/* - * AMA VDO - * --------- - * <31:28> :: Cable HW version - * <27:24> :: Cable FW version - * <23:12> :: Reserved, Shall be set to zero - * <11> :: SSTX1 Directionality support (0b == fixed, 1b == cfgable) - * <10> :: SSTX2 Directionality support - * <9> :: SSRX1 Directionality support - * <8> :: SSRX2 Directionality support - * <7:5> :: Vconn power - * <4> :: Vconn power required - * <3> :: Vbus power required - * <2:0> :: USB SS Signaling support - */ -#define VDO_AMA(hw, fw, tx1d, tx2d, rx1d, rx2d, vcpwr, vcr, vbr, usbss) \ - (((hw) & 0x7) << 28 | ((fw) & 0x7) << 24 \ - | (tx1d) << 11 | (tx2d) << 10 | (rx1d) << 9 | (rx2d) << 8 \ - | ((vcpwr) & 0x7) << 5 | (vcr) << 4 | (vbr) << 3 \ - | ((usbss) & 0x7)) - -#define PD_VDO_AMA_VCONN_REQ(vdo) (((vdo) >> 4) & 1) -#define PD_VDO_AMA_VBUS_REQ(vdo) (((vdo) >> 3) & 1) - -#define AMA_VCONN_PWR_1W 0 -#define AMA_VCONN_PWR_1W5 1 -#define AMA_VCONN_PWR_2W 2 -#define AMA_VCONN_PWR_3W 3 -#define AMA_VCONN_PWR_4W 4 -#define AMA_VCONN_PWR_5W 5 -#define AMA_VCONN_PWR_6W 6 -#define AMA_USBSS_U2_ONLY 0 -#define AMA_USBSS_U31_GEN1 1 -#define AMA_USBSS_U31_GEN2 2 -#define AMA_USBSS_BBONLY 3 - -/* - * SVDM Discover SVIDs request -> response - * - * Request is properly formatted VDM Header with discover SVIDs command. - * Response is a set of SVIDs of all all supported SVIDs with all zero's to - * mark the end of SVIDs. If more than 12 SVIDs are supported command SHOULD be - * repeated. 
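 * Each response data object packs two SVIDs, high and low half,
 * as built by VDO_SVID() below.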
- */ -#define VDO_SVID(svid0, svid1) (((svid0) & 0xffff) << 16 | ((svid1) & 0xffff)) -#define PD_VDO_SVID_SVID0(vdo) ((vdo) >> 16) -#define PD_VDO_SVID_SVID1(vdo) ((vdo) & 0xffff) - -/* USB-IF SIDs */ -#define USB_SID_PD 0xff00 /* power delivery */ -#define USB_SID_DISPLAYPORT 0xff01 -#define USB_SID_MHL 0xff02 /* Mobile High-Definition Link */ - -/* VDM command timeouts (in ms) */ - -#define PD_T_VDM_UNSTRUCTURED 500 -#define PD_T_VDM_BUSY 100 -#define PD_T_VDM_WAIT_MODE_E 100 -#define PD_T_VDM_SNDR_RSP 30 -#define PD_T_VDM_E_MODE 25 -#define PD_T_VDM_RCVR_RSP 15 - -#endif /* __LINUX_USB_PD_VDO_H */ diff --git a/drivers/staging/typec/tcpci.c b/drivers/staging/typec/tcpci.c index df72d8b01e73..4636804ea1a4 100644 --- a/drivers/staging/typec/tcpci.c +++ b/drivers/staging/typec/tcpci.c @@ -20,11 +20,11 @@ #include #include #include +#include +#include #include -#include "pd.h" #include "tcpci.h" -#include "tcpm.h" #define PD_RETRY_COUNT 3 diff --git a/drivers/staging/typec/tcpm.c b/drivers/staging/typec/tcpm.c deleted file mode 100644 index cb25ec8334b0..000000000000 --- a/drivers/staging/typec/tcpm.c +++ /dev/null @@ -1,3616 +0,0 @@ -/* - * Copyright 2015-2017 Google, Inc - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * USB Power Delivery protocol stack. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pd.h" -#include "pd_vdo.h" -#include "pd_bdo.h" -#include "tcpm.h" - -#define FOREACH_STATE(S) \ - S(INVALID_STATE), \ - S(DRP_TOGGLING), \ - S(SRC_UNATTACHED), \ - S(SRC_ATTACH_WAIT), \ - S(SRC_ATTACHED), \ - S(SRC_STARTUP), \ - S(SRC_SEND_CAPABILITIES), \ - S(SRC_NEGOTIATE_CAPABILITIES), \ - S(SRC_TRANSITION_SUPPLY), \ - S(SRC_READY), \ - S(SRC_WAIT_NEW_CAPABILITIES), \ - \ - S(SNK_UNATTACHED), \ - S(SNK_ATTACH_WAIT), \ - S(SNK_DEBOUNCED), \ - S(SNK_ATTACHED), \ - S(SNK_STARTUP), \ - S(SNK_DISCOVERY), \ - S(SNK_DISCOVERY_DEBOUNCE), \ - S(SNK_DISCOVERY_DEBOUNCE_DONE), \ - S(SNK_WAIT_CAPABILITIES), \ - S(SNK_NEGOTIATE_CAPABILITIES), \ - S(SNK_TRANSITION_SINK), \ - S(SNK_TRANSITION_SINK_VBUS), \ - S(SNK_READY), \ - \ - S(ACC_UNATTACHED), \ - S(DEBUG_ACC_ATTACHED), \ - S(AUDIO_ACC_ATTACHED), \ - S(AUDIO_ACC_DEBOUNCE), \ - \ - S(HARD_RESET_SEND), \ - S(HARD_RESET_START), \ - S(SRC_HARD_RESET_VBUS_OFF), \ - S(SRC_HARD_RESET_VBUS_ON), \ - S(SNK_HARD_RESET_SINK_OFF), \ - S(SNK_HARD_RESET_WAIT_VBUS), \ - S(SNK_HARD_RESET_SINK_ON), \ - \ - S(SOFT_RESET), \ - S(SOFT_RESET_SEND), \ - \ - S(DR_SWAP_ACCEPT), \ - S(DR_SWAP_SEND), \ - S(DR_SWAP_SEND_TIMEOUT), \ - S(DR_SWAP_CANCEL), \ - S(DR_SWAP_CHANGE_DR), \ - \ - S(PR_SWAP_ACCEPT), \ - S(PR_SWAP_SEND), \ - S(PR_SWAP_SEND_TIMEOUT), \ - S(PR_SWAP_CANCEL), \ - S(PR_SWAP_START), \ - S(PR_SWAP_SRC_SNK_TRANSITION_OFF), \ - S(PR_SWAP_SRC_SNK_SOURCE_OFF), \ - S(PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED), \ - S(PR_SWAP_SRC_SNK_SINK_ON), \ - S(PR_SWAP_SNK_SRC_SINK_OFF), \ - S(PR_SWAP_SNK_SRC_SOURCE_ON), \ - S(PR_SWAP_SNK_SRC_SOURCE_ON_VBUS_RAMPED_UP), \ - \ - S(VCONN_SWAP_ACCEPT), \ - 
S(VCONN_SWAP_SEND), \ - S(VCONN_SWAP_SEND_TIMEOUT), \ - S(VCONN_SWAP_CANCEL), \ - S(VCONN_SWAP_START), \ - S(VCONN_SWAP_WAIT_FOR_VCONN), \ - S(VCONN_SWAP_TURN_ON_VCONN), \ - S(VCONN_SWAP_TURN_OFF_VCONN), \ - \ - S(SNK_TRY), \ - S(SNK_TRY_WAIT), \ - S(SNK_TRY_WAIT_DEBOUNCE), \ - S(SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS), \ - S(SRC_TRYWAIT), \ - S(SRC_TRYWAIT_DEBOUNCE), \ - S(SRC_TRYWAIT_UNATTACHED), \ - \ - S(SRC_TRY), \ - S(SRC_TRY_WAIT), \ - S(SRC_TRY_DEBOUNCE), \ - S(SNK_TRYWAIT), \ - S(SNK_TRYWAIT_DEBOUNCE), \ - S(SNK_TRYWAIT_VBUS), \ - S(BIST_RX), \ - \ - S(ERROR_RECOVERY), \ - S(PORT_RESET), \ - S(PORT_RESET_WAIT_OFF) - -#define GENERATE_ENUM(e) e -#define GENERATE_STRING(s) #s - -enum tcpm_state { - FOREACH_STATE(GENERATE_ENUM) -}; - -static const char * const tcpm_states[] = { - FOREACH_STATE(GENERATE_STRING) -}; - -enum vdm_states { - VDM_STATE_ERR_BUSY = -3, - VDM_STATE_ERR_SEND = -2, - VDM_STATE_ERR_TMOUT = -1, - VDM_STATE_DONE = 0, - /* Anything >0 represents an active state */ - VDM_STATE_READY = 1, - VDM_STATE_BUSY = 2, - VDM_STATE_WAIT_RSP_BUSY = 3, -}; - -enum pd_msg_request { - PD_MSG_NONE = 0, - PD_MSG_CTRL_REJECT, - PD_MSG_CTRL_WAIT, - PD_MSG_DATA_SINK_CAP, - PD_MSG_DATA_SOURCE_CAP, -}; - -/* Events from low level driver */ - -#define TCPM_CC_EVENT BIT(0) -#define TCPM_VBUS_EVENT BIT(1) -#define TCPM_RESET_EVENT BIT(2) - -#define LOG_BUFFER_ENTRIES 1024 -#define LOG_BUFFER_ENTRY_SIZE 128 - -/* Alternate mode support */ - -#define SVID_DISCOVERY_MAX 16 - -struct pd_mode_data { - int svid_index; /* current SVID index */ - int nsvids; - u16 svids[SVID_DISCOVERY_MAX]; - int altmodes; /* number of alternate modes */ - struct typec_altmode_desc altmode_desc[SVID_DISCOVERY_MAX]; -}; - -struct tcpm_port { - struct device *dev; - - struct mutex lock; /* tcpm state machine lock */ - struct workqueue_struct *wq; - - struct typec_capability typec_caps; - struct typec_port *typec_port; - - struct tcpc_dev *tcpc; - - enum typec_role vconn_role; - enum typec_role pwr_role; - enum typec_data_role data_role; - enum typec_pwr_opmode pwr_opmode; - - struct usb_pd_identity partner_ident; - struct typec_partner_desc partner_desc; - struct typec_partner *partner; - - enum typec_cc_status cc_req; - - enum typec_cc_status cc1; - enum typec_cc_status cc2; - enum typec_cc_polarity polarity; - - bool attached; - bool connected; - enum typec_port_type port_type; - bool vbus_present; - bool vbus_never_low; - bool vbus_source; - bool vbus_charge; - - bool send_discover; - bool op_vsafe5v; - - int try_role; - int try_snk_count; - int try_src_count; - - enum pd_msg_request queued_message; - - enum tcpm_state enter_state; - enum tcpm_state prev_state; - enum tcpm_state state; - enum tcpm_state delayed_state; - unsigned long delayed_runtime; - unsigned long delay_ms; - - spinlock_t pd_event_lock; - u32 pd_events; - - struct work_struct event_work; - struct delayed_work state_machine; - struct delayed_work vdm_state_machine; - bool state_machine_running; - - struct completion tx_complete; - enum tcpm_transmit_status tx_status; - - struct mutex swap_lock; /* swap command lock */ - bool swap_pending; - bool non_pd_role_swap; - struct completion swap_complete; - int swap_status; - - unsigned int message_id; - unsigned int caps_count; - unsigned int hard_reset_count; - bool pd_capable; - bool explicit_contract; - unsigned int rx_msgid; - - /* Partner capabilities/requests */ - u32 sink_request; - u32 source_caps[PDO_MAX_OBJECTS]; - unsigned int nr_source_caps; - u32 sink_caps[PDO_MAX_OBJECTS]; - unsigned int 
nr_sink_caps; - - /* Local capabilities */ - u32 src_pdo[PDO_MAX_OBJECTS]; - unsigned int nr_src_pdo; - u32 snk_pdo[PDO_MAX_OBJECTS]; - unsigned int nr_snk_pdo; - u32 snk_vdo[VDO_MAX_OBJECTS]; - unsigned int nr_snk_vdo; - - unsigned int max_snk_mv; - unsigned int max_snk_ma; - unsigned int max_snk_mw; - unsigned int operating_snk_mw; - - /* Requested current / voltage */ - u32 current_limit; - u32 supply_voltage; - - u32 bist_request; - - /* PD state for Vendor Defined Messages */ - enum vdm_states vdm_state; - u32 vdm_retries; - /* next Vendor Defined Message to send */ - u32 vdo_data[VDO_MAX_SIZE]; - u8 vdo_count; - /* VDO to retry if UFP responder replied busy */ - u32 vdo_retry; - - /* Alternate mode data */ - - struct pd_mode_data mode_data; - struct typec_altmode *partner_altmode[SVID_DISCOVERY_MAX]; - struct typec_altmode *port_altmode[SVID_DISCOVERY_MAX]; - - /* Deadline in jiffies to exit src_try_wait state */ - unsigned long max_wait; - -#ifdef CONFIG_DEBUG_FS - struct dentry *dentry; - struct mutex logbuffer_lock; /* log buffer access lock */ - int logbuffer_head; - int logbuffer_tail; - u8 *logbuffer[LOG_BUFFER_ENTRIES]; -#endif -}; - -struct pd_rx_event { - struct work_struct work; - struct tcpm_port *port; - struct pd_message msg; -}; - -#define tcpm_cc_is_sink(cc) \ - ((cc) == TYPEC_CC_RP_DEF || (cc) == TYPEC_CC_RP_1_5 || \ - (cc) == TYPEC_CC_RP_3_0) - -#define tcpm_port_is_sink(port) \ - ((tcpm_cc_is_sink((port)->cc1) && !tcpm_cc_is_sink((port)->cc2)) || \ - (tcpm_cc_is_sink((port)->cc2) && !tcpm_cc_is_sink((port)->cc1))) - -#define tcpm_cc_is_source(cc) ((cc) == TYPEC_CC_RD) -#define tcpm_cc_is_audio(cc) ((cc) == TYPEC_CC_RA) -#define tcpm_cc_is_open(cc) ((cc) == TYPEC_CC_OPEN) - -#define tcpm_port_is_source(port) \ - ((tcpm_cc_is_source((port)->cc1) && \ - !tcpm_cc_is_source((port)->cc2)) || \ - (tcpm_cc_is_source((port)->cc2) && \ - !tcpm_cc_is_source((port)->cc1))) - -#define tcpm_port_is_debug(port) \ - (tcpm_cc_is_source((port)->cc1) && tcpm_cc_is_source((port)->cc2)) - -#define tcpm_port_is_audio(port) \ - (tcpm_cc_is_audio((port)->cc1) && tcpm_cc_is_audio((port)->cc2)) - -#define tcpm_port_is_audio_detached(port) \ - ((tcpm_cc_is_audio((port)->cc1) && tcpm_cc_is_open((port)->cc2)) || \ - (tcpm_cc_is_audio((port)->cc2) && tcpm_cc_is_open((port)->cc1))) - -#define tcpm_try_snk(port) \ - ((port)->try_snk_count == 0 && (port)->try_role == TYPEC_SINK && \ - (port)->port_type == TYPEC_PORT_DRP) - -#define tcpm_try_src(port) \ - ((port)->try_src_count == 0 && (port)->try_role == TYPEC_SOURCE && \ - (port)->port_type == TYPEC_PORT_DRP) - -static enum tcpm_state tcpm_default_state(struct tcpm_port *port) -{ - if (port->port_type == TYPEC_PORT_DRP) { - if (port->try_role == TYPEC_SINK) - return SNK_UNATTACHED; - else if (port->try_role == TYPEC_SOURCE) - return SRC_UNATTACHED; - else if (port->tcpc->config->default_role == TYPEC_SINK) - return SNK_UNATTACHED; - /* Fall through to return SRC_UNATTACHED */ - } else if (port->port_type == TYPEC_PORT_UFP) { - return SNK_UNATTACHED; - } - return SRC_UNATTACHED; -} - -static inline -struct tcpm_port *typec_cap_to_tcpm(const struct typec_capability *cap) -{ - return container_of(cap, struct tcpm_port, typec_caps); -} - -static bool tcpm_port_is_disconnected(struct tcpm_port *port) -{ - return (!port->attached && port->cc1 == TYPEC_CC_OPEN && - port->cc2 == TYPEC_CC_OPEN) || - (port->attached && ((port->polarity == TYPEC_POLARITY_CC1 && - port->cc1 == TYPEC_CC_OPEN) || - (port->polarity == TYPEC_POLARITY_CC2 && - port->cc2 == 
TYPEC_CC_OPEN))); -} - -/* - * Logging - */ - -#ifdef CONFIG_DEBUG_FS - -static bool tcpm_log_full(struct tcpm_port *port) -{ - return port->logbuffer_tail == - (port->logbuffer_head + 1) % LOG_BUFFER_ENTRIES; -} - -__printf(2, 0) -static void _tcpm_log(struct tcpm_port *port, const char *fmt, va_list args) -{ - char tmpbuffer[LOG_BUFFER_ENTRY_SIZE]; - u64 ts_nsec = local_clock(); - unsigned long rem_nsec; - - if (!port->logbuffer[port->logbuffer_head]) { - port->logbuffer[port->logbuffer_head] = - kzalloc(LOG_BUFFER_ENTRY_SIZE, GFP_KERNEL); - if (!port->logbuffer[port->logbuffer_head]) - return; - } - - vsnprintf(tmpbuffer, sizeof(tmpbuffer), fmt, args); - - mutex_lock(&port->logbuffer_lock); - - if (tcpm_log_full(port)) { - port->logbuffer_head = max(port->logbuffer_head - 1, 0); - strcpy(tmpbuffer, "overflow"); - } - - if (port->logbuffer_head < 0 || - port->logbuffer_head >= LOG_BUFFER_ENTRIES) { - dev_warn(port->dev, - "Bad log buffer index %d\n", port->logbuffer_head); - goto abort; - } - - if (!port->logbuffer[port->logbuffer_head]) { - dev_warn(port->dev, - "Log buffer index %d is NULL\n", port->logbuffer_head); - goto abort; - } - - rem_nsec = do_div(ts_nsec, 1000000000); - scnprintf(port->logbuffer[port->logbuffer_head], - LOG_BUFFER_ENTRY_SIZE, "[%5lu.%06lu] %s", - (unsigned long)ts_nsec, rem_nsec / 1000, - tmpbuffer); - port->logbuffer_head = (port->logbuffer_head + 1) % LOG_BUFFER_ENTRIES; - -abort: - mutex_unlock(&port->logbuffer_lock); -} - -__printf(2, 3) -static void tcpm_log(struct tcpm_port *port, const char *fmt, ...) -{ - va_list args; - - /* Do not log while disconnected and unattached */ - if (tcpm_port_is_disconnected(port) && - (port->state == SRC_UNATTACHED || port->state == SNK_UNATTACHED || - port->state == DRP_TOGGLING)) - return; - - va_start(args, fmt); - _tcpm_log(port, fmt, args); - va_end(args); -} - -__printf(2, 3) -static void tcpm_log_force(struct tcpm_port *port, const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - _tcpm_log(port, fmt, args); - va_end(args); -} - -static void tcpm_log_source_caps(struct tcpm_port *port) -{ - int i; - - for (i = 0; i < port->nr_source_caps; i++) { - u32 pdo = port->source_caps[i]; - enum pd_pdo_type type = pdo_type(pdo); - char msg[64]; - - switch (type) { - case PDO_TYPE_FIXED: - scnprintf(msg, sizeof(msg), - "%u mV, %u mA [%s%s%s%s%s%s]", - pdo_fixed_voltage(pdo), - pdo_max_current(pdo), - (pdo & PDO_FIXED_DUAL_ROLE) ? - "R" : "", - (pdo & PDO_FIXED_SUSPEND) ? - "S" : "", - (pdo & PDO_FIXED_HIGHER_CAP) ? - "H" : "", - (pdo & PDO_FIXED_USB_COMM) ? - "U" : "", - (pdo & PDO_FIXED_DATA_SWAP) ? - "D" : "", - (pdo & PDO_FIXED_EXTPOWER) ? 
- "E" : ""); - break; - case PDO_TYPE_VAR: - scnprintf(msg, sizeof(msg), - "%u-%u mV, %u mA", - pdo_min_voltage(pdo), - pdo_max_voltage(pdo), - pdo_max_current(pdo)); - break; - case PDO_TYPE_BATT: - scnprintf(msg, sizeof(msg), - "%u-%u mV, %u mW", - pdo_min_voltage(pdo), - pdo_max_voltage(pdo), - pdo_max_power(pdo)); - break; - default: - strcpy(msg, "undefined"); - break; - } - tcpm_log(port, " PDO %d: type %d, %s", - i, type, msg); - } -} - -static int tcpm_seq_show(struct seq_file *s, void *v) -{ - struct tcpm_port *port = (struct tcpm_port *)s->private; - int tail; - - mutex_lock(&port->logbuffer_lock); - tail = port->logbuffer_tail; - while (tail != port->logbuffer_head) { - seq_printf(s, "%s\n", port->logbuffer[tail]); - tail = (tail + 1) % LOG_BUFFER_ENTRIES; - } - if (!seq_has_overflowed(s)) - port->logbuffer_tail = tail; - mutex_unlock(&port->logbuffer_lock); - - return 0; -} - -static int tcpm_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, tcpm_seq_show, inode->i_private); -} - -static const struct file_operations tcpm_debug_operations = { - .open = tcpm_debug_open, - .llseek = seq_lseek, - .read = seq_read, - .release = single_release, -}; - -static struct dentry *rootdir; - -static int tcpm_debugfs_init(struct tcpm_port *port) -{ - mutex_init(&port->logbuffer_lock); - /* /sys/kernel/debug/tcpm/usbcX */ - if (!rootdir) { - rootdir = debugfs_create_dir("tcpm", NULL); - if (!rootdir) - return -ENOMEM; - } - - port->dentry = debugfs_create_file(dev_name(port->dev), - S_IFREG | 0444, rootdir, - port, &tcpm_debug_operations); - - return 0; -} - -static void tcpm_debugfs_exit(struct tcpm_port *port) -{ - debugfs_remove(port->dentry); -} - -#else - -__printf(2, 3) -static void tcpm_log(const struct tcpm_port *port, const char *fmt, ...) { } -__printf(2, 3) -static void tcpm_log_force(struct tcpm_port *port, const char *fmt, ...) 
{ } -static void tcpm_log_source_caps(struct tcpm_port *port) { } -static int tcpm_debugfs_init(const struct tcpm_port *port) { return 0; } -static void tcpm_debugfs_exit(const struct tcpm_port *port) { } - -#endif - -static int tcpm_pd_transmit(struct tcpm_port *port, - enum tcpm_transmit_type type, - const struct pd_message *msg) -{ - unsigned long timeout; - int ret; - - if (msg) - tcpm_log(port, "PD TX, header: %#x", le16_to_cpu(msg->header)); - else - tcpm_log(port, "PD TX, type: %#x", type); - - reinit_completion(&port->tx_complete); - ret = port->tcpc->pd_transmit(port->tcpc, type, msg); - if (ret < 0) - return ret; - - mutex_unlock(&port->lock); - timeout = wait_for_completion_timeout(&port->tx_complete, - msecs_to_jiffies(PD_T_TCPC_TX_TIMEOUT)); - mutex_lock(&port->lock); - if (!timeout) - return -ETIMEDOUT; - - switch (port->tx_status) { - case TCPC_TX_SUCCESS: - port->message_id = (port->message_id + 1) & PD_HEADER_ID_MASK; - return 0; - case TCPC_TX_DISCARDED: - return -EAGAIN; - case TCPC_TX_FAILED: - default: - return -EIO; - } -} - -void tcpm_pd_transmit_complete(struct tcpm_port *port, - enum tcpm_transmit_status status) -{ - tcpm_log(port, "PD TX complete, status: %u", status); - port->tx_status = status; - complete(&port->tx_complete); -} -EXPORT_SYMBOL_GPL(tcpm_pd_transmit_complete); - -static int tcpm_mux_set(struct tcpm_port *port, enum tcpc_mux_mode mode, - enum tcpc_usb_switch config) -{ - int ret = 0; - - tcpm_log(port, "Requesting mux mode %d, config %d, polarity %d", - mode, config, port->polarity); - - if (port->tcpc->mux) - ret = port->tcpc->mux->set(port->tcpc->mux, mode, config, - port->polarity); - - return ret; -} - -static int tcpm_set_polarity(struct tcpm_port *port, - enum typec_cc_polarity polarity) -{ - int ret; - - tcpm_log(port, "polarity %d", polarity); - - ret = port->tcpc->set_polarity(port->tcpc, polarity); - if (ret < 0) - return ret; - - port->polarity = polarity; - - return 0; -} - -static int tcpm_set_vconn(struct tcpm_port *port, bool enable) -{ - int ret; - - tcpm_log(port, "vconn:=%d", enable); - - ret = port->tcpc->set_vconn(port->tcpc, enable); - if (!ret) { - port->vconn_role = enable ? TYPEC_SOURCE : TYPEC_SINK; - typec_set_vconn_role(port->typec_port, port->vconn_role); - } - - return ret; -} - -static u32 tcpm_get_current_limit(struct tcpm_port *port) -{ - enum typec_cc_status cc; - u32 limit; - - cc = port->polarity ? port->cc2 : port->cc1; - switch (cc) { - case TYPEC_CC_RP_1_5: - limit = 1500; - break; - case TYPEC_CC_RP_3_0: - limit = 3000; - break; - case TYPEC_CC_RP_DEF: - default: - if (port->tcpc->get_current_limit) - limit = port->tcpc->get_current_limit(port->tcpc); - else - limit = 0; - break; - } - - return limit; -} - -static int tcpm_set_current_limit(struct tcpm_port *port, u32 max_ma, u32 mv) -{ - int ret = -EOPNOTSUPP; - - tcpm_log(port, "Setting voltage/current limit %u mV %u mA", mv, max_ma); - - if (port->tcpc->set_current_limit) - ret = port->tcpc->set_current_limit(port->tcpc, max_ma, mv); - - return ret; -} - -/* - * Determine RP value to set based on maximum current supported - * by a port if configured as source. - * Returns CC value to report to link partner. - */ -static enum typec_cc_status tcpm_rp_cc(struct tcpm_port *port) -{ - const u32 *src_pdo = port->src_pdo; - int nr_pdo = port->nr_src_pdo; - int i; - - /* - * Search for first entry with matching voltage. - * It should report the maximum supported current. 
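	 * (Rp then advertises that capability to the partner:
	 * Rp 3.0 A for 3 A and up, Rp 1.5 A for 1.5 A and up,
	 * default Rp otherwise.)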
- */ - for (i = 0; i < nr_pdo; i++) { - const u32 pdo = src_pdo[i]; - - if (pdo_type(pdo) == PDO_TYPE_FIXED && - pdo_fixed_voltage(pdo) == 5000) { - unsigned int curr = pdo_max_current(pdo); - - if (curr >= 3000) - return TYPEC_CC_RP_3_0; - else if (curr >= 1500) - return TYPEC_CC_RP_1_5; - return TYPEC_CC_RP_DEF; - } - } - - return TYPEC_CC_RP_DEF; -} - -static int tcpm_set_attached_state(struct tcpm_port *port, bool attached) -{ - return port->tcpc->set_roles(port->tcpc, attached, port->pwr_role, - port->data_role); -} - -static int tcpm_set_roles(struct tcpm_port *port, bool attached, - enum typec_role role, enum typec_data_role data) -{ - int ret; - - if (data == TYPEC_HOST) - ret = tcpm_mux_set(port, TYPEC_MUX_USB, - TCPC_USB_SWITCH_CONNECT); - else - ret = tcpm_mux_set(port, TYPEC_MUX_NONE, - TCPC_USB_SWITCH_DISCONNECT); - if (ret < 0) - return ret; - - ret = port->tcpc->set_roles(port->tcpc, attached, role, data); - if (ret < 0) - return ret; - - port->pwr_role = role; - port->data_role = data; - typec_set_data_role(port->typec_port, data); - typec_set_pwr_role(port->typec_port, role); - - return 0; -} - -static int tcpm_set_pwr_role(struct tcpm_port *port, enum typec_role role) -{ - int ret; - - ret = port->tcpc->set_roles(port->tcpc, true, role, - port->data_role); - if (ret < 0) - return ret; - - port->pwr_role = role; - typec_set_pwr_role(port->typec_port, role); - - return 0; -} - -static int tcpm_pd_send_source_caps(struct tcpm_port *port) -{ - struct pd_message msg; - int i; - - memset(&msg, 0, sizeof(msg)); - if (!port->nr_src_pdo) { - /* No source capabilities defined, sink only */ - msg.header = PD_HEADER_LE(PD_CTRL_REJECT, - port->pwr_role, - port->data_role, - port->message_id, 0); - } else { - msg.header = PD_HEADER_LE(PD_DATA_SOURCE_CAP, - port->pwr_role, - port->data_role, - port->message_id, - port->nr_src_pdo); - } - for (i = 0; i < port->nr_src_pdo; i++) - msg.payload[i] = cpu_to_le32(port->src_pdo[i]); - - return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); -} - -static int tcpm_pd_send_sink_caps(struct tcpm_port *port) -{ - struct pd_message msg; - int i; - - memset(&msg, 0, sizeof(msg)); - if (!port->nr_snk_pdo) { - /* No sink capabilities defined, source only */ - msg.header = PD_HEADER_LE(PD_CTRL_REJECT, - port->pwr_role, - port->data_role, - port->message_id, 0); - } else { - msg.header = PD_HEADER_LE(PD_DATA_SINK_CAP, - port->pwr_role, - port->data_role, - port->message_id, - port->nr_snk_pdo); - } - for (i = 0; i < port->nr_snk_pdo; i++) - msg.payload[i] = cpu_to_le32(port->snk_pdo[i]); - - return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); -} - -static void tcpm_set_state(struct tcpm_port *port, enum tcpm_state state, - unsigned int delay_ms) -{ - if (delay_ms) { - tcpm_log(port, "pending state change %s -> %s @ %u ms", - tcpm_states[port->state], tcpm_states[state], - delay_ms); - port->delayed_state = state; - mod_delayed_work(port->wq, &port->state_machine, - msecs_to_jiffies(delay_ms)); - port->delayed_runtime = jiffies + msecs_to_jiffies(delay_ms); - port->delay_ms = delay_ms; - } else { - tcpm_log(port, "state change %s -> %s", - tcpm_states[port->state], tcpm_states[state]); - port->delayed_state = INVALID_STATE; - port->prev_state = port->state; - port->state = state; - /* - * Don't re-queue the state machine work item if we're currently - * in the state machine and we're immediately changing states. - * tcpm_state_machine_work() will continue running the state - * machine. 
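		 * (Queueing it again here would only schedule a
		 * redundant pass over the same transition.)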
- */ - if (!port->state_machine_running) - mod_delayed_work(port->wq, &port->state_machine, 0); - } -} - -static void tcpm_set_state_cond(struct tcpm_port *port, enum tcpm_state state, - unsigned int delay_ms) -{ - if (port->enter_state == port->state) - tcpm_set_state(port, state, delay_ms); - else - tcpm_log(port, - "skipped %sstate change %s -> %s [%u ms], context state %s", - delay_ms ? "delayed " : "", - tcpm_states[port->state], tcpm_states[state], - delay_ms, tcpm_states[port->enter_state]); -} - -static void tcpm_queue_message(struct tcpm_port *port, - enum pd_msg_request message) -{ - port->queued_message = message; - mod_delayed_work(port->wq, &port->state_machine, 0); -} - -/* - * VDM/VDO handling functions - */ -static void tcpm_queue_vdm(struct tcpm_port *port, const u32 header, - const u32 *data, int cnt) -{ - port->vdo_count = cnt + 1; - port->vdo_data[0] = header; - memcpy(&port->vdo_data[1], data, sizeof(u32) * cnt); - /* Set ready, vdm state machine will actually send */ - port->vdm_retries = 0; - port->vdm_state = VDM_STATE_READY; -} - -static void svdm_consume_identity(struct tcpm_port *port, const __le32 *payload, - int cnt) -{ - u32 vdo = le32_to_cpu(payload[VDO_INDEX_IDH]); - u32 product = le32_to_cpu(payload[VDO_INDEX_PRODUCT]); - - memset(&port->mode_data, 0, sizeof(port->mode_data)); - - port->partner_ident.id_header = vdo; - port->partner_ident.cert_stat = le32_to_cpu(payload[VDO_INDEX_CSTAT]); - port->partner_ident.product = product; - - typec_partner_set_identity(port->partner); - - tcpm_log(port, "Identity: %04x:%04x.%04x", - PD_IDH_VID(vdo), - PD_PRODUCT_PID(product), product & 0xffff); -} - -static bool svdm_consume_svids(struct tcpm_port *port, const __le32 *payload, - int cnt) -{ - struct pd_mode_data *pmdata = &port->mode_data; - int i; - - for (i = 1; i < cnt; i++) { - u32 p = le32_to_cpu(payload[i]); - u16 svid; - - svid = (p >> 16) & 0xffff; - if (!svid) - return false; - - if (pmdata->nsvids >= SVID_DISCOVERY_MAX) - goto abort; - - pmdata->svids[pmdata->nsvids++] = svid; - tcpm_log(port, "SVID %d: 0x%x", pmdata->nsvids, svid); - - svid = p & 0xffff; - if (!svid) - return false; - - if (pmdata->nsvids >= SVID_DISCOVERY_MAX) - goto abort; - - pmdata->svids[pmdata->nsvids++] = svid; - tcpm_log(port, "SVID %d: 0x%x", pmdata->nsvids, svid); - } - return true; -abort: - tcpm_log(port, "SVID_DISCOVERY_MAX(%d) too low!", SVID_DISCOVERY_MAX); - return false; -} - -static void svdm_consume_modes(struct tcpm_port *port, const __le32 *payload, - int cnt) -{ - struct pd_mode_data *pmdata = &port->mode_data; - struct typec_altmode_desc *paltmode; - struct typec_mode_desc *pmode; - int i; - - if (pmdata->altmodes >= ARRAY_SIZE(port->partner_altmode)) { - /* Already logged in svdm_consume_svids() */ - return; - } - - paltmode = &pmdata->altmode_desc[pmdata->altmodes]; - memset(paltmode, 0, sizeof(*paltmode)); - - paltmode->svid = pmdata->svids[pmdata->svid_index]; - - tcpm_log(port, " Alternate mode %d: SVID 0x%04x", - pmdata->altmodes, paltmode->svid); - - for (i = 1; i < cnt && paltmode->n_modes < ALTMODE_MAX_MODES; i++) { - pmode = &paltmode->modes[paltmode->n_modes]; - memset(pmode, 0, sizeof(*pmode)); - pmode->vdo = le32_to_cpu(payload[i]); - pmode->index = i - 1; - paltmode->n_modes++; - tcpm_log(port, " VDO %d: 0x%08x", - pmode->index, pmode->vdo); - } - port->partner_altmode[pmdata->altmodes] = - typec_partner_register_altmode(port->partner, paltmode); - if (port->partner_altmode[pmdata->altmodes] == NULL) { - tcpm_log(port, - "Failed to register alternate 
modes for SVID 0x%04x", - paltmode->svid); - return; - } - pmdata->altmodes++; -} - -#define supports_modal(port) PD_IDH_MODAL_SUPP((port)->partner_ident.id_header) - -static int tcpm_pd_svdm(struct tcpm_port *port, const __le32 *payload, int cnt, - u32 *response) -{ - u32 p0 = le32_to_cpu(payload[0]); - int cmd_type = PD_VDO_CMDT(p0); - int cmd = PD_VDO_CMD(p0); - struct pd_mode_data *modep; - int rlen = 0; - u16 svid; - int i; - - tcpm_log(port, "Rx VDM cmd 0x%x type %d cmd %d len %d", - p0, cmd_type, cmd, cnt); - - modep = &port->mode_data; - - switch (cmd_type) { - case CMDT_INIT: - switch (cmd) { - case CMD_DISCOVER_IDENT: - /* 6.4.4.3.1: Only respond as UFP (device) */ - if (port->data_role == TYPEC_DEVICE && - port->nr_snk_vdo) { - for (i = 0; i < port->nr_snk_vdo; i++) - response[i + 1] = port->snk_vdo[i]; - rlen = port->nr_snk_vdo + 1; - } - break; - case CMD_DISCOVER_SVID: - break; - case CMD_DISCOVER_MODES: - break; - case CMD_ENTER_MODE: - break; - case CMD_EXIT_MODE: - break; - case CMD_ATTENTION: - break; - default: - break; - } - if (rlen >= 1) { - response[0] = p0 | VDO_CMDT(CMDT_RSP_ACK); - } else if (rlen == 0) { - response[0] = p0 | VDO_CMDT(CMDT_RSP_NAK); - rlen = 1; - } else { - response[0] = p0 | VDO_CMDT(CMDT_RSP_BUSY); - rlen = 1; - } - break; - case CMDT_RSP_ACK: - /* silently drop message if we are not connected */ - if (!port->partner) - break; - - switch (cmd) { - case CMD_DISCOVER_IDENT: - /* 6.4.4.3.1 */ - svdm_consume_identity(port, payload, cnt); - response[0] = VDO(USB_SID_PD, 1, CMD_DISCOVER_SVID); - rlen = 1; - break; - case CMD_DISCOVER_SVID: - /* 6.4.4.3.2 */ - if (svdm_consume_svids(port, payload, cnt)) { - response[0] = VDO(USB_SID_PD, 1, - CMD_DISCOVER_SVID); - rlen = 1; - } else if (modep->nsvids && supports_modal(port)) { - response[0] = VDO(modep->svids[0], 1, - CMD_DISCOVER_MODES); - rlen = 1; - } - break; - case CMD_DISCOVER_MODES: - /* 6.4.4.3.3 */ - svdm_consume_modes(port, payload, cnt); - modep->svid_index++; - if (modep->svid_index < modep->nsvids) { - svid = modep->svids[modep->svid_index]; - response[0] = VDO(svid, 1, CMD_DISCOVER_MODES); - rlen = 1; - } else { - /* enter alternate mode if/when implemented */ - } - break; - case CMD_ENTER_MODE: - break; - default: - break; - } - break; - default: - break; - } - - return rlen; -} - -static void tcpm_handle_vdm_request(struct tcpm_port *port, - const __le32 *payload, int cnt) -{ - int rlen = 0; - u32 response[8] = { }; - u32 p0 = le32_to_cpu(payload[0]); - - if (port->vdm_state == VDM_STATE_BUSY) { - /* If UFP responded busy retry after timeout */ - if (PD_VDO_CMDT(p0) == CMDT_RSP_BUSY) { - port->vdm_state = VDM_STATE_WAIT_RSP_BUSY; - port->vdo_retry = (p0 & ~VDO_CMDT_MASK) | - CMDT_INIT; - mod_delayed_work(port->wq, &port->vdm_state_machine, - msecs_to_jiffies(PD_T_VDM_BUSY)); - return; - } - port->vdm_state = VDM_STATE_DONE; - } - - if (PD_VDO_SVDM(p0)) - rlen = tcpm_pd_svdm(port, payload, cnt, response); - - if (rlen > 0) { - tcpm_queue_vdm(port, response[0], &response[1], rlen - 1); - mod_delayed_work(port->wq, &port->vdm_state_machine, 0); - } -} - -static void tcpm_send_vdm(struct tcpm_port *port, u32 vid, int cmd, - const u32 *data, int count) -{ - u32 header; - - if (WARN_ON(count > VDO_MAX_SIZE - 1)) - count = VDO_MAX_SIZE - 1; - - /* set VDM header with VID & CMD */ - header = VDO(vid, ((vid & USB_SID_PD) == USB_SID_PD) ? 
- 1 : (PD_VDO_CMD(cmd) <= CMD_ATTENTION), cmd); - tcpm_queue_vdm(port, header, data, count); - - mod_delayed_work(port->wq, &port->vdm_state_machine, 0); -} - -static unsigned int vdm_ready_timeout(u32 vdm_hdr) -{ - unsigned int timeout; - int cmd = PD_VDO_CMD(vdm_hdr); - - /* its not a structured VDM command */ - if (!PD_VDO_SVDM(vdm_hdr)) - return PD_T_VDM_UNSTRUCTURED; - - switch (PD_VDO_CMDT(vdm_hdr)) { - case CMDT_INIT: - if (cmd == CMD_ENTER_MODE || cmd == CMD_EXIT_MODE) - timeout = PD_T_VDM_WAIT_MODE_E; - else - timeout = PD_T_VDM_SNDR_RSP; - break; - default: - if (cmd == CMD_ENTER_MODE || cmd == CMD_EXIT_MODE) - timeout = PD_T_VDM_E_MODE; - else - timeout = PD_T_VDM_RCVR_RSP; - break; - } - return timeout; -} - -static void vdm_run_state_machine(struct tcpm_port *port) -{ - struct pd_message msg; - int i, res; - - switch (port->vdm_state) { - case VDM_STATE_READY: - /* Only transmit VDM if attached */ - if (!port->attached) { - port->vdm_state = VDM_STATE_ERR_BUSY; - break; - } - - /* - * if there's traffic or we're not in PDO ready state don't send - * a VDM. - */ - if (port->state != SRC_READY && port->state != SNK_READY) - break; - - /* Prepare and send VDM */ - memset(&msg, 0, sizeof(msg)); - msg.header = PD_HEADER_LE(PD_DATA_VENDOR_DEF, - port->pwr_role, - port->data_role, - port->message_id, port->vdo_count); - for (i = 0; i < port->vdo_count; i++) - msg.payload[i] = cpu_to_le32(port->vdo_data[i]); - res = tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); - if (res < 0) { - port->vdm_state = VDM_STATE_ERR_SEND; - } else { - unsigned long timeout; - - port->vdm_retries = 0; - port->vdm_state = VDM_STATE_BUSY; - timeout = vdm_ready_timeout(port->vdo_data[0]); - mod_delayed_work(port->wq, &port->vdm_state_machine, - timeout); - } - break; - case VDM_STATE_WAIT_RSP_BUSY: - port->vdo_data[0] = port->vdo_retry; - port->vdo_count = 1; - port->vdm_state = VDM_STATE_READY; - break; - case VDM_STATE_BUSY: - port->vdm_state = VDM_STATE_ERR_TMOUT; - break; - case VDM_STATE_ERR_SEND: - /* - * A partner which does not support USB PD will not reply, - * so this is not a fatal error. At the same time, some - * devices may not return GoodCRC under some circumstances, - * so we need to retry. - */ - if (port->vdm_retries < 3) { - tcpm_log(port, "VDM Tx error, retry"); - port->vdm_retries++; - port->vdm_state = VDM_STATE_READY; - } - break; - default: - break; - } -} - -static void vdm_state_machine_work(struct work_struct *work) -{ - struct tcpm_port *port = container_of(work, struct tcpm_port, - vdm_state_machine.work); - enum vdm_states prev_state; - - mutex_lock(&port->lock); - - /* - * Continue running as long as the port is not busy and there was - * a state change. - */ - do { - prev_state = port->vdm_state; - vdm_run_state_machine(port); - } while (port->vdm_state != prev_state && - port->vdm_state != VDM_STATE_BUSY); - - mutex_unlock(&port->lock); -} - -/* - * PD (data, control) command handling functions - */ -static void tcpm_pd_data_request(struct tcpm_port *port, - const struct pd_message *msg) -{ - enum pd_data_msg_type type = pd_header_type_le(msg->header); - unsigned int cnt = pd_header_cnt_le(msg->header); - unsigned int i; - - switch (type) { - case PD_DATA_SOURCE_CAP: - if (port->pwr_role != TYPEC_SINK) - break; - - for (i = 0; i < cnt; i++) - port->source_caps[i] = le32_to_cpu(msg->payload[i]); - - port->nr_source_caps = cnt; - - tcpm_log_source_caps(port); - - /* - * This message may be received even if VBUS is not - * present. 
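vdm_run_state_machine() above wraps the queued VDOs in an ordinary PD message whose 16-bit header is built by PD_HEADER_LE(). For orientation, a sketch of the PD 2.0 header layout being encoded (bit positions per the USB PD 2.0 specification; pd_header() is an illustrative stand-in, and the kernel's _LE variant additionally converts the result with cpu_to_le16()):

    #include <stdint.h>

    /* PD 2.0 message header: [14:12] number of data objects,
     * [11:9] MessageID, [8] port power role, [7:6] spec revision,
     * [5] port data role, [3:0] message type. */
    static uint16_t pd_header(unsigned int type, int is_source, int is_dfp,
                              unsigned int msgid, unsigned int ndo)
    {
        return (type & 0xf) |
               (is_source ? 1u << 8 : 0) |
               (1u << 6) |              /* revision 2.0 */
               (is_dfp ? 1u << 5 : 0) |
               ((msgid & 0x7) << 9) |
               ((ndo & 0x7) << 12);
    }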
This is quite unexpected; see USB PD - * specification, sections 8.3.3.6.3.1 and 8.3.3.6.3.2. - * However, at the same time, we must be ready to - * receive this message and respond to it 15ms after - * receiving PS_RDY during power swap operations, no matter - * if VBUS is available or not (USB PD specification, - * section 6.5.9.2). - * So we need to accept the message either way, - * but be prepared to keep waiting for VBUS after it was - * handled. - */ - tcpm_set_state(port, SNK_NEGOTIATE_CAPABILITIES, 0); - break; - case PD_DATA_REQUEST: - if (port->pwr_role != TYPEC_SOURCE || - cnt != 1) { - tcpm_queue_message(port, PD_MSG_CTRL_REJECT); - break; - } - port->sink_request = le32_to_cpu(msg->payload[0]); - tcpm_set_state(port, SRC_NEGOTIATE_CAPABILITIES, 0); - break; - case PD_DATA_SINK_CAP: - /* We don't do anything with this at the moment... */ - for (i = 0; i < cnt; i++) - port->sink_caps[i] = le32_to_cpu(msg->payload[i]); - port->nr_sink_caps = cnt; - break; - case PD_DATA_VENDOR_DEF: - tcpm_handle_vdm_request(port, msg->payload, cnt); - break; - case PD_DATA_BIST: - if (port->state == SRC_READY || port->state == SNK_READY) { - port->bist_request = le32_to_cpu(msg->payload[0]); - tcpm_set_state(port, BIST_RX, 0); - } - break; - default: - tcpm_log(port, "Unhandled data message type %#x", type); - break; - } -} - -static void tcpm_pd_ctrl_request(struct tcpm_port *port, - const struct pd_message *msg) -{ - enum pd_ctrl_msg_type type = pd_header_type_le(msg->header); - enum tcpm_state next_state; - - switch (type) { - case PD_CTRL_GOOD_CRC: - case PD_CTRL_PING: - break; - case PD_CTRL_GET_SOURCE_CAP: - switch (port->state) { - case SRC_READY: - case SNK_READY: - tcpm_queue_message(port, PD_MSG_DATA_SOURCE_CAP); - break; - default: - tcpm_queue_message(port, PD_MSG_CTRL_REJECT); - break; - } - break; - case PD_CTRL_GET_SINK_CAP: - switch (port->state) { - case SRC_READY: - case SNK_READY: - tcpm_queue_message(port, PD_MSG_DATA_SINK_CAP); - break; - default: - tcpm_queue_message(port, PD_MSG_CTRL_REJECT); - break; - } - break; - case PD_CTRL_GOTO_MIN: - break; - case PD_CTRL_PS_RDY: - switch (port->state) { - case SNK_TRANSITION_SINK: - if (port->vbus_present) { - tcpm_set_current_limit(port, - port->current_limit, - port->supply_voltage); - port->explicit_contract = true; - tcpm_set_state(port, SNK_READY, 0); - } else { - /* - * Seen after power swap. Keep waiting for VBUS - * in a transitional state. - */ - tcpm_set_state(port, - SNK_TRANSITION_SINK_VBUS, 0); - } - break; - case PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED: - tcpm_set_state(port, PR_SWAP_SRC_SNK_SINK_ON, 0); - break; - case PR_SWAP_SNK_SRC_SINK_OFF: - tcpm_set_state(port, PR_SWAP_SNK_SRC_SOURCE_ON, 0); - break; - case VCONN_SWAP_WAIT_FOR_VCONN: - tcpm_set_state(port, VCONN_SWAP_TURN_OFF_VCONN, 0); - break; - default: - break; - } - break; - case PD_CTRL_REJECT: - case PD_CTRL_WAIT: - switch (port->state) { - case SNK_NEGOTIATE_CAPABILITIES: - /* USB PD specification, Figure 8-43 */ - if (port->explicit_contract) - next_state = SNK_READY; - else - next_state = SNK_WAIT_CAPABILITIES; - tcpm_set_state(port, next_state, 0); - break; - case DR_SWAP_SEND: - port->swap_status = (type == PD_CTRL_WAIT ? - -EAGAIN : -EOPNOTSUPP); - tcpm_set_state(port, DR_SWAP_CANCEL, 0); - break; - case PR_SWAP_SEND: - port->swap_status = (type == PD_CTRL_WAIT ? - -EAGAIN : -EOPNOTSUPP); - tcpm_set_state(port, PR_SWAP_CANCEL, 0); - break; - case VCONN_SWAP_SEND: - port->swap_status = (type == PD_CTRL_WAIT ? 
- -EAGAIN : -EOPNOTSUPP); - tcpm_set_state(port, VCONN_SWAP_CANCEL, 0); - break; - default: - break; - } - break; - case PD_CTRL_ACCEPT: - switch (port->state) { - case SNK_NEGOTIATE_CAPABILITIES: - tcpm_set_state(port, SNK_TRANSITION_SINK, 0); - break; - case SOFT_RESET_SEND: - port->message_id = 0; - port->rx_msgid = -1; - if (port->pwr_role == TYPEC_SOURCE) - next_state = SRC_SEND_CAPABILITIES; - else - next_state = SNK_WAIT_CAPABILITIES; - tcpm_set_state(port, next_state, 0); - break; - case DR_SWAP_SEND: - tcpm_set_state(port, DR_SWAP_CHANGE_DR, 0); - break; - case PR_SWAP_SEND: - tcpm_set_state(port, PR_SWAP_START, 0); - break; - case VCONN_SWAP_SEND: - tcpm_set_state(port, VCONN_SWAP_START, 0); - break; - default: - break; - } - break; - case PD_CTRL_SOFT_RESET: - tcpm_set_state(port, SOFT_RESET, 0); - break; - case PD_CTRL_DR_SWAP: - if (port->port_type != TYPEC_PORT_DRP) { - tcpm_queue_message(port, PD_MSG_CTRL_REJECT); - break; - } - /* - * XXX - * 6.3.9: If an alternate mode is active, a request to swap - * alternate modes shall trigger a port reset. - */ - switch (port->state) { - case SRC_READY: - case SNK_READY: - tcpm_set_state(port, DR_SWAP_ACCEPT, 0); - break; - default: - tcpm_queue_message(port, PD_MSG_CTRL_WAIT); - break; - } - break; - case PD_CTRL_PR_SWAP: - if (port->port_type != TYPEC_PORT_DRP) { - tcpm_queue_message(port, PD_MSG_CTRL_REJECT); - break; - } - switch (port->state) { - case SRC_READY: - case SNK_READY: - tcpm_set_state(port, PR_SWAP_ACCEPT, 0); - break; - default: - tcpm_queue_message(port, PD_MSG_CTRL_WAIT); - break; - } - break; - case PD_CTRL_VCONN_SWAP: - switch (port->state) { - case SRC_READY: - case SNK_READY: - tcpm_set_state(port, VCONN_SWAP_ACCEPT, 0); - break; - default: - tcpm_queue_message(port, PD_MSG_CTRL_WAIT); - break; - } - break; - default: - tcpm_log(port, "Unhandled ctrl message type %#x", type); - break; - } -} - -static void tcpm_pd_rx_handler(struct work_struct *work) -{ - struct pd_rx_event *event = container_of(work, - struct pd_rx_event, work); - const struct pd_message *msg = &event->msg; - unsigned int cnt = pd_header_cnt_le(msg->header); - struct tcpm_port *port = event->port; - - mutex_lock(&port->lock); - - tcpm_log(port, "PD RX, header: %#x [%d]", le16_to_cpu(msg->header), - port->attached); - - if (port->attached) { - enum pd_ctrl_msg_type type = pd_header_type_le(msg->header); - unsigned int msgid = pd_header_msgid_le(msg->header); - - /* - * USB PD standard, 6.6.1.2: - * "... if MessageID value in a received Message is the - * same as the stored value, the receiver shall return a - * GoodCRC Message with that MessageID value and drop - * the Message (this is a retry of an already received - * Message). Note: this shall not apply to the Soft_Reset - * Message which always has a MessageID value of zero." - */ - if (msgid == port->rx_msgid && type != PD_CTRL_SOFT_RESET) - goto done; - port->rx_msgid = msgid; - - /* - * If both ends believe to be DFP/host, we have a data role - * mismatch. 
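The MessageID rule quoted above (USB PD 6.6.1.2) reduces to a small predicate: drop a repeat unless it is a Soft_Reset, whose MessageID is always zero. rx_msgid is initialised to -1 elsewhere in this file precisely so that the first message after a reset can never match. Sketched as a helper the driver happens to open-code:

    /* stored_msgid is -1 after reset, so nothing matches until a
     * real message has been seen (illustrative helper). */
    static bool pd_rx_is_retry(int stored_msgid, unsigned int msgid,
                               enum pd_ctrl_msg_type type)
    {
        return msgid == (unsigned int)stored_msgid &&
               type != PD_CTRL_SOFT_RESET;
    }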
- */ - if (!!(le16_to_cpu(msg->header) & PD_HEADER_DATA_ROLE) == - (port->data_role == TYPEC_HOST)) { - tcpm_log(port, - "Data role mismatch, initiating error recovery"); - tcpm_set_state(port, ERROR_RECOVERY, 0); - } else { - if (cnt) - tcpm_pd_data_request(port, msg); - else - tcpm_pd_ctrl_request(port, msg); - } - } - -done: - mutex_unlock(&port->lock); - kfree(event); -} - -void tcpm_pd_receive(struct tcpm_port *port, const struct pd_message *msg) -{ - struct pd_rx_event *event; - - event = kzalloc(sizeof(*event), GFP_ATOMIC); - if (!event) - return; - - INIT_WORK(&event->work, tcpm_pd_rx_handler); - event->port = port; - memcpy(&event->msg, msg, sizeof(*msg)); - queue_work(port->wq, &event->work); -} -EXPORT_SYMBOL_GPL(tcpm_pd_receive); - -static int tcpm_pd_send_control(struct tcpm_port *port, - enum pd_ctrl_msg_type type) -{ - struct pd_message msg; - - memset(&msg, 0, sizeof(msg)); - msg.header = PD_HEADER_LE(type, port->pwr_role, - port->data_role, - port->message_id, 0); - - return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); -} - -/* - * Send queued message without affecting state. - * Return true if state machine should go back to sleep, - * false otherwise. - */ -static bool tcpm_send_queued_message(struct tcpm_port *port) -{ - enum pd_msg_request queued_message; - - do { - queued_message = port->queued_message; - port->queued_message = PD_MSG_NONE; - - switch (queued_message) { - case PD_MSG_CTRL_WAIT: - tcpm_pd_send_control(port, PD_CTRL_WAIT); - break; - case PD_MSG_CTRL_REJECT: - tcpm_pd_send_control(port, PD_CTRL_REJECT); - break; - case PD_MSG_DATA_SINK_CAP: - tcpm_pd_send_sink_caps(port); - break; - case PD_MSG_DATA_SOURCE_CAP: - tcpm_pd_send_source_caps(port); - break; - default: - break; - } - } while (port->queued_message != PD_MSG_NONE); - - if (port->delayed_state != INVALID_STATE) { - if (time_is_after_jiffies(port->delayed_runtime)) { - mod_delayed_work(port->wq, &port->state_machine, - port->delayed_runtime - jiffies); - return true; - } - port->delayed_state = INVALID_STATE; - } - return false; -} - -static int tcpm_pd_check_request(struct tcpm_port *port) -{ - u32 pdo, rdo = port->sink_request; - unsigned int max, op, pdo_max, index; - enum pd_pdo_type type; - - index = rdo_index(rdo); - if (!index || index > port->nr_src_pdo) - return -EINVAL; - - pdo = port->src_pdo[index - 1]; - type = pdo_type(pdo); - switch (type) { - case PDO_TYPE_FIXED: - case PDO_TYPE_VAR: - max = rdo_max_current(rdo); - op = rdo_op_current(rdo); - pdo_max = pdo_max_current(pdo); - - if (op > pdo_max) - return -EINVAL; - if (max > pdo_max && !(rdo & RDO_CAP_MISMATCH)) - return -EINVAL; - - if (type == PDO_TYPE_FIXED) - tcpm_log(port, - "Requested %u mV, %u mA for %u / %u mA", - pdo_fixed_voltage(pdo), pdo_max, op, max); - else - tcpm_log(port, - "Requested %u -> %u mV, %u mA for %u / %u mA", - pdo_min_voltage(pdo), pdo_max_voltage(pdo), - pdo_max, op, max); - break; - case PDO_TYPE_BATT: - max = rdo_max_power(rdo); - op = rdo_op_power(rdo); - pdo_max = pdo_max_power(pdo); - - if (op > pdo_max) - return -EINVAL; - if (max > pdo_max && !(rdo & RDO_CAP_MISMATCH)) - return -EINVAL; - tcpm_log(port, - "Requested %u -> %u mV, %u mW for %u / %u mW", - pdo_min_voltage(pdo), pdo_max_voltage(pdo), - pdo_max, op, max); - break; - default: - return -EINVAL; - } - - port->op_vsafe5v = index == 1; - - return 0; -} - -static int tcpm_pd_select_pdo(struct tcpm_port *port) -{ - unsigned int i, max_mw = 0, max_mv = 0; - int ret = -EINVAL; - - /* - * Select the source PDO providing the most power while 
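tcpm_pd_receive() above is the entry point a low-level TCPC driver calls when a PD packet arrives. It only performs a GFP_ATOMIC allocation and a queue_work(), so it is safe to call from interrupt context; all locking is deferred to the worker. A sketch of the receive side of a hypothetical TCPC driver (all my_* names are invented; in-tree users follow the same shape):

    #include <linux/interrupt.h>
    #include <linux/usb/tcpm.h>     /* post-move location; staging used "tcpm.h" */

    struct my_tcpc {                /* hypothetical chip state */
        struct tcpm_port *tcpm_port;
        /* ... register access handles ... */
    };

    static irqreturn_t my_tcpc_irq(int irq, void *dev_id)
    {
        struct my_tcpc *chip = dev_id;
        struct pd_message msg;

        /* my_read_rx_fifo() (invented) fills header + payload */
        while (my_read_rx_fifo(chip, &msg) == 0)
            tcpm_pd_receive(chip->tcpm_port, &msg);

        return IRQ_HANDLED;
    }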
staying within - * the board's voltage limits. Prefer PDO providing exp - */ - for (i = 0; i < port->nr_source_caps; i++) { - u32 pdo = port->source_caps[i]; - enum pd_pdo_type type = pdo_type(pdo); - unsigned int mv, ma, mw; - - if (type == PDO_TYPE_FIXED) - mv = pdo_fixed_voltage(pdo); - else - mv = pdo_min_voltage(pdo); - - if (type == PDO_TYPE_BATT) { - mw = pdo_max_power(pdo); - } else { - ma = min(pdo_max_current(pdo), - port->max_snk_ma); - mw = ma * mv / 1000; - } - - /* Perfer higher voltages if available */ - if ((mw > max_mw || (mw == max_mw && mv > max_mv)) && - mv <= port->max_snk_mv) { - ret = i; - max_mw = mw; - max_mv = mv; - } - } - - return ret; -} - -static int tcpm_pd_build_request(struct tcpm_port *port, u32 *rdo) -{ - unsigned int mv, ma, mw, flags; - unsigned int max_ma, max_mw; - enum pd_pdo_type type; - int index; - u32 pdo; - - index = tcpm_pd_select_pdo(port); - if (index < 0) - return -EINVAL; - pdo = port->source_caps[index]; - type = pdo_type(pdo); - - if (type == PDO_TYPE_FIXED) - mv = pdo_fixed_voltage(pdo); - else - mv = pdo_min_voltage(pdo); - - /* Select maximum available current within the board's power limit */ - if (type == PDO_TYPE_BATT) { - mw = pdo_max_power(pdo); - ma = 1000 * min(mw, port->max_snk_mw) / mv; - } else { - ma = min(pdo_max_current(pdo), - 1000 * port->max_snk_mw / mv); - } - ma = min(ma, port->max_snk_ma); - - flags = RDO_USB_COMM | RDO_NO_SUSPEND; - - /* Set mismatch bit if offered power is less than operating power */ - mw = ma * mv / 1000; - max_ma = ma; - max_mw = mw; - if (mw < port->operating_snk_mw) { - flags |= RDO_CAP_MISMATCH; - max_mw = port->operating_snk_mw; - max_ma = max_mw * 1000 / mv; - } - - tcpm_log(port, "cc=%d cc1=%d cc2=%d vbus=%d vconn=%s polarity=%d", - port->cc_req, port->cc1, port->cc2, port->vbus_source, - port->vconn_role == TYPEC_SOURCE ? "source" : "sink", - port->polarity); - - if (type == PDO_TYPE_BATT) { - *rdo = RDO_BATT(index + 1, mw, max_mw, flags); - - tcpm_log(port, "Requesting PDO %d: %u mV, %u mW%s", - index, mv, mw, - flags & RDO_CAP_MISMATCH ? " [mismatch]" : ""); - } else { - *rdo = RDO_FIXED(index + 1, ma, max_ma, flags); - - tcpm_log(port, "Requesting PDO %d: %u mV, %u mA%s", - index, mv, ma, - flags & RDO_CAP_MISMATCH ? 
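The selection rule above can be checked by hand. With made-up board limits of max_snk_mv = 9000 and max_snk_ma = 3000, a source offering fixed 5 V/3 A and 9 V/2 A PDOs yields 3000 * 5000 / 1000 = 15000 mW versus 2000 * 9000 / 1000 = 18000 mW, so the 9 V PDO wins; voltage only breaks ties at equal power. The same computation as a stand-alone program:

    #include <stdio.h>

    int main(void)
    {
        const unsigned int max_snk_mv = 9000, max_snk_ma = 3000;
        const struct { unsigned int mv, ma; } pdo[] = {
            { 5000, 3000 },     /* 15000 mW */
            { 9000, 2000 },     /* 18000 mW */
        };
        unsigned int i, best = 0, max_mw = 0, max_mv = 0;

        for (i = 0; i < sizeof(pdo) / sizeof(pdo[0]); i++) {
            unsigned int ma = pdo[i].ma < max_snk_ma ? pdo[i].ma : max_snk_ma;
            unsigned int mw = ma * pdo[i].mv / 1000;

            if ((mw > max_mw || (mw == max_mw && pdo[i].mv > max_mv)) &&
                pdo[i].mv <= max_snk_mv) {
                best = i;
                max_mw = mw;
                max_mv = pdo[i].mv;
            }
        }
        printf("selected PDO %u at %u mW\n", best, max_mw); /* PDO 1, 18000 mW */
        return 0;
    }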
" [mismatch]" : ""); - } - - port->current_limit = ma; - port->supply_voltage = mv; - - return 0; -} - -static int tcpm_pd_send_request(struct tcpm_port *port) -{ - struct pd_message msg; - int ret; - u32 rdo; - - ret = tcpm_pd_build_request(port, &rdo); - if (ret < 0) - return ret; - - memset(&msg, 0, sizeof(msg)); - msg.header = PD_HEADER_LE(PD_DATA_REQUEST, - port->pwr_role, - port->data_role, - port->message_id, 1); - msg.payload[0] = cpu_to_le32(rdo); - - return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); -} - -static int tcpm_set_vbus(struct tcpm_port *port, bool enable) -{ - int ret; - - if (enable && port->vbus_charge) - return -EINVAL; - - tcpm_log(port, "vbus:=%d charge=%d", enable, port->vbus_charge); - - ret = port->tcpc->set_vbus(port->tcpc, enable, port->vbus_charge); - if (ret < 0) - return ret; - - port->vbus_source = enable; - return 0; -} - -static int tcpm_set_charge(struct tcpm_port *port, bool charge) -{ - int ret; - - if (charge && port->vbus_source) - return -EINVAL; - - if (charge != port->vbus_charge) { - tcpm_log(port, "vbus=%d charge:=%d", port->vbus_source, charge); - ret = port->tcpc->set_vbus(port->tcpc, port->vbus_source, - charge); - if (ret < 0) - return ret; - } - port->vbus_charge = charge; - return 0; -} - -static bool tcpm_start_drp_toggling(struct tcpm_port *port) -{ - int ret; - - if (port->tcpc->start_drp_toggling && - port->port_type == TYPEC_PORT_DRP) { - tcpm_log_force(port, "Start DRP toggling"); - ret = port->tcpc->start_drp_toggling(port->tcpc, - tcpm_rp_cc(port)); - if (!ret) - return true; - } - - return false; -} - -static void tcpm_set_cc(struct tcpm_port *port, enum typec_cc_status cc) -{ - tcpm_log(port, "cc:=%d", cc); - port->cc_req = cc; - port->tcpc->set_cc(port->tcpc, cc); -} - -static int tcpm_init_vbus(struct tcpm_port *port) -{ - int ret; - - ret = port->tcpc->set_vbus(port->tcpc, false, false); - port->vbus_source = false; - port->vbus_charge = false; - return ret; -} - -static int tcpm_init_vconn(struct tcpm_port *port) -{ - int ret; - - ret = port->tcpc->set_vconn(port->tcpc, false); - port->vconn_role = TYPEC_SINK; - return ret; -} - -static void tcpm_typec_connect(struct tcpm_port *port) -{ - if (!port->connected) { - /* Make sure we don't report stale identity information */ - memset(&port->partner_ident, 0, sizeof(port->partner_ident)); - port->partner_desc.usb_pd = port->pd_capable; - if (tcpm_port_is_debug(port)) - port->partner_desc.accessory = TYPEC_ACCESSORY_DEBUG; - else if (tcpm_port_is_audio(port)) - port->partner_desc.accessory = TYPEC_ACCESSORY_AUDIO; - else - port->partner_desc.accessory = TYPEC_ACCESSORY_NONE; - port->partner = typec_register_partner(port->typec_port, - &port->partner_desc); - port->connected = true; - } -} - -static int tcpm_src_attach(struct tcpm_port *port) -{ - enum typec_cc_polarity polarity = - port->cc2 == TYPEC_CC_RD ? TYPEC_POLARITY_CC2 - : TYPEC_POLARITY_CC1; - int ret; - - if (port->attached) - return 0; - - ret = tcpm_set_polarity(port, polarity); - if (ret < 0) - return ret; - - ret = tcpm_set_roles(port, true, TYPEC_SOURCE, TYPEC_HOST); - if (ret < 0) - return ret; - - ret = port->tcpc->set_pd_rx(port->tcpc, true); - if (ret < 0) - goto out_disable_mux; - - /* - * USB Type-C specification, version 1.2, - * chapter 4.5.2.2.8.1 (Attached.SRC Requirements) - * Enable VCONN only if the non-RD port is set to RA. 
- */ - if ((polarity == TYPEC_POLARITY_CC1 && port->cc2 == TYPEC_CC_RA) || - (polarity == TYPEC_POLARITY_CC2 && port->cc1 == TYPEC_CC_RA)) { - ret = tcpm_set_vconn(port, true); - if (ret < 0) - goto out_disable_pd; - } - - ret = tcpm_set_vbus(port, true); - if (ret < 0) - goto out_disable_vconn; - - port->pd_capable = false; - - port->partner = NULL; - - port->attached = true; - port->send_discover = true; - - return 0; - -out_disable_vconn: - tcpm_set_vconn(port, false); -out_disable_pd: - port->tcpc->set_pd_rx(port->tcpc, false); -out_disable_mux: - tcpm_mux_set(port, TYPEC_MUX_NONE, TCPC_USB_SWITCH_DISCONNECT); - return ret; -} - -static void tcpm_typec_disconnect(struct tcpm_port *port) -{ - if (port->connected) { - typec_unregister_partner(port->partner); - port->partner = NULL; - port->connected = false; - } -} - -static void tcpm_unregister_altmodes(struct tcpm_port *port) -{ - struct pd_mode_data *modep = &port->mode_data; - int i; - - for (i = 0; i < modep->altmodes; i++) { - typec_unregister_altmode(port->partner_altmode[i]); - port->partner_altmode[i] = NULL; - } - - memset(modep, 0, sizeof(*modep)); -} - -static void tcpm_reset_port(struct tcpm_port *port) -{ - tcpm_unregister_altmodes(port); - tcpm_typec_disconnect(port); - port->attached = false; - port->pd_capable = false; - - /* - * First Rx ID should be 0; set this to a sentinel of -1 so that - * we can check tcpm_pd_rx_handler() if we had seen it before. - */ - port->rx_msgid = -1; - - port->tcpc->set_pd_rx(port->tcpc, false); - tcpm_init_vbus(port); /* also disables charging */ - tcpm_init_vconn(port); - tcpm_set_current_limit(port, 0, 0); - tcpm_set_polarity(port, TYPEC_POLARITY_CC1); - tcpm_set_attached_state(port, false); - port->try_src_count = 0; - port->try_snk_count = 0; -} - -static void tcpm_detach(struct tcpm_port *port) -{ - if (!port->attached) - return; - - if (tcpm_port_is_disconnected(port)) - port->hard_reset_count = 0; - - tcpm_reset_port(port); -} - -static void tcpm_src_detach(struct tcpm_port *port) -{ - tcpm_detach(port); -} - -static int tcpm_snk_attach(struct tcpm_port *port) -{ - int ret; - - if (port->attached) - return 0; - - ret = tcpm_set_polarity(port, port->cc2 != TYPEC_CC_OPEN ? - TYPEC_POLARITY_CC2 : TYPEC_POLARITY_CC1); - if (ret < 0) - return ret; - - ret = tcpm_set_roles(port, true, TYPEC_SINK, TYPEC_DEVICE); - if (ret < 0) - return ret; - - port->pd_capable = false; - - port->partner = NULL; - - port->attached = true; - port->send_discover = true; - - return 0; -} - -static void tcpm_snk_detach(struct tcpm_port *port) -{ - tcpm_detach(port); - - /* XXX: (Dis)connect SuperSpeed mux? 
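The Attached.SRC rule cited above, extracted into a pure predicate for readability (the helper name is invented; the driver open-codes the test): VCONN is driven only when the CC pin not chosen for communication presents Ra, i.e. when an electronically marked or active cable is attached.

    static bool src_should_enable_vconn(enum typec_cc_polarity polarity,
                                        enum typec_cc_status cc1,
                                        enum typec_cc_status cc2)
    {
        return (polarity == TYPEC_POLARITY_CC1 && cc2 == TYPEC_CC_RA) ||
               (polarity == TYPEC_POLARITY_CC2 && cc1 == TYPEC_CC_RA);
    }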
*/ -} - -static int tcpm_acc_attach(struct tcpm_port *port) -{ - int ret; - - if (port->attached) - return 0; - - ret = tcpm_set_roles(port, true, TYPEC_SOURCE, TYPEC_HOST); - if (ret < 0) - return ret; - - port->partner = NULL; - - tcpm_typec_connect(port); - - port->attached = true; - - return 0; -} - -static void tcpm_acc_detach(struct tcpm_port *port) -{ - tcpm_detach(port); -} - -static inline enum tcpm_state hard_reset_state(struct tcpm_port *port) -{ - if (port->hard_reset_count < PD_N_HARD_RESET_COUNT) - return HARD_RESET_SEND; - if (port->pd_capable) - return ERROR_RECOVERY; - if (port->pwr_role == TYPEC_SOURCE) - return SRC_UNATTACHED; - if (port->state == SNK_WAIT_CAPABILITIES) - return SNK_READY; - return SNK_UNATTACHED; -} - -static inline enum tcpm_state ready_state(struct tcpm_port *port) -{ - if (port->pwr_role == TYPEC_SOURCE) - return SRC_READY; - else - return SNK_READY; -} - -static inline enum tcpm_state unattached_state(struct tcpm_port *port) -{ - if (port->port_type == TYPEC_PORT_DRP) { - if (port->pwr_role == TYPEC_SOURCE) - return SRC_UNATTACHED; - else - return SNK_UNATTACHED; - } else if (port->port_type == TYPEC_PORT_DFP) { - return SRC_UNATTACHED; - } - - return SNK_UNATTACHED; -} - -static void tcpm_check_send_discover(struct tcpm_port *port) -{ - if (port->data_role == TYPEC_HOST && port->send_discover && - port->pd_capable) { - tcpm_send_vdm(port, USB_SID_PD, CMD_DISCOVER_IDENT, NULL, 0); - port->send_discover = false; - } -} - -static void tcpm_swap_complete(struct tcpm_port *port, int result) -{ - if (port->swap_pending) { - port->swap_status = result; - port->swap_pending = false; - port->non_pd_role_swap = false; - complete(&port->swap_complete); - } -} - -static enum typec_pwr_opmode tcpm_get_pwr_opmode(enum typec_cc_status cc) -{ - switch (cc) { - case TYPEC_CC_RP_1_5: - return TYPEC_PWR_MODE_1_5A; - case TYPEC_CC_RP_3_0: - return TYPEC_PWR_MODE_3_0A; - case TYPEC_CC_RP_DEF: - default: - return TYPEC_PWR_MODE_USB; - } -} - -static void run_state_machine(struct tcpm_port *port) -{ - int ret; - enum typec_pwr_opmode opmode; - unsigned int msecs; - - port->enter_state = port->state; - switch (port->state) { - case DRP_TOGGLING: - break; - /* SRC states */ - case SRC_UNATTACHED: - if (!port->non_pd_role_swap) - tcpm_swap_complete(port, -ENOTCONN); - tcpm_src_detach(port); - if (tcpm_start_drp_toggling(port)) { - tcpm_set_state(port, DRP_TOGGLING, 0); - break; - } - tcpm_set_cc(port, tcpm_rp_cc(port)); - if (port->port_type == TYPEC_PORT_DRP) - tcpm_set_state(port, SNK_UNATTACHED, PD_T_DRP_SNK); - break; - case SRC_ATTACH_WAIT: - if (tcpm_port_is_debug(port)) - tcpm_set_state(port, DEBUG_ACC_ATTACHED, - PD_T_CC_DEBOUNCE); - else if (tcpm_port_is_audio(port)) - tcpm_set_state(port, AUDIO_ACC_ATTACHED, - PD_T_CC_DEBOUNCE); - else if (tcpm_port_is_source(port)) - tcpm_set_state(port, - tcpm_try_snk(port) ? SNK_TRY - : SRC_ATTACHED, - PD_T_CC_DEBOUNCE); - break; - - case SNK_TRY: - port->try_snk_count++; - /* - * Requirements: - * - Do not drive vconn or vbus - * - Terminate CC pins (both) to Rd - * Action: - * - Wait for tDRPTry (PD_T_DRP_TRY). - * Until then, ignore any state changes. 
- */ - tcpm_set_cc(port, TYPEC_CC_RD); - tcpm_set_state(port, SNK_TRY_WAIT, PD_T_DRP_TRY); - break; - case SNK_TRY_WAIT: - if (tcpm_port_is_sink(port)) { - tcpm_set_state(port, SNK_TRY_WAIT_DEBOUNCE, 0); - } else { - tcpm_set_state(port, SRC_TRYWAIT, 0); - port->max_wait = 0; - } - break; - case SNK_TRY_WAIT_DEBOUNCE: - tcpm_set_state(port, SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS, - PD_T_PD_DEBOUNCE); - break; - case SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS: - if (port->vbus_present && tcpm_port_is_sink(port)) { - tcpm_set_state(port, SNK_ATTACHED, 0); - } else { - tcpm_set_state(port, SRC_TRYWAIT, 0); - port->max_wait = 0; - } - break; - case SRC_TRYWAIT: - tcpm_set_cc(port, tcpm_rp_cc(port)); - if (port->max_wait == 0) { - port->max_wait = jiffies + - msecs_to_jiffies(PD_T_DRP_TRY); - tcpm_set_state(port, SRC_TRYWAIT_UNATTACHED, - PD_T_DRP_TRY); - } else { - if (time_is_after_jiffies(port->max_wait)) - tcpm_set_state(port, SRC_TRYWAIT_UNATTACHED, - jiffies_to_msecs(port->max_wait - - jiffies)); - else - tcpm_set_state(port, SNK_UNATTACHED, 0); - } - break; - case SRC_TRYWAIT_DEBOUNCE: - tcpm_set_state(port, SRC_ATTACHED, PD_T_CC_DEBOUNCE); - break; - case SRC_TRYWAIT_UNATTACHED: - tcpm_set_state(port, SNK_UNATTACHED, 0); - break; - - case SRC_ATTACHED: - ret = tcpm_src_attach(port); - tcpm_set_state(port, SRC_UNATTACHED, - ret < 0 ? 0 : PD_T_PS_SOURCE_ON); - break; - case SRC_STARTUP: - opmode = tcpm_get_pwr_opmode(tcpm_rp_cc(port)); - typec_set_pwr_opmode(port->typec_port, opmode); - port->pwr_opmode = TYPEC_PWR_MODE_USB; - port->caps_count = 0; - port->message_id = 0; - port->rx_msgid = -1; - port->explicit_contract = false; - tcpm_set_state(port, SRC_SEND_CAPABILITIES, 0); - break; - case SRC_SEND_CAPABILITIES: - port->caps_count++; - if (port->caps_count > PD_N_CAPS_COUNT) { - tcpm_set_state(port, SRC_READY, 0); - break; - } - ret = tcpm_pd_send_source_caps(port); - if (ret < 0) { - tcpm_set_state(port, SRC_SEND_CAPABILITIES, - PD_T_SEND_SOURCE_CAP); - } else { - /* - * Per standard, we should clear the reset counter here. - * However, that can result in state machine hang-ups. - * Reset it only in READY state to improve stability. - */ - /* port->hard_reset_count = 0; */ - port->caps_count = 0; - port->pd_capable = true; - tcpm_set_state_cond(port, hard_reset_state(port), - PD_T_SEND_SOURCE_CAP); - } - break; - case SRC_NEGOTIATE_CAPABILITIES: - ret = tcpm_pd_check_request(port); - if (ret < 0) { - tcpm_pd_send_control(port, PD_CTRL_REJECT); - if (!port->explicit_contract) { - tcpm_set_state(port, - SRC_WAIT_NEW_CAPABILITIES, 0); - } else { - tcpm_set_state(port, SRC_READY, 0); - } - } else { - tcpm_pd_send_control(port, PD_CTRL_ACCEPT); - tcpm_set_state(port, SRC_TRANSITION_SUPPLY, - PD_T_SRC_TRANSITION); - } - break; - case SRC_TRANSITION_SUPPLY: - /* XXX: regulator_set_voltage(vbus, ...) 
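SRC_TRYWAIT above arms port->max_wait once and, on re-entry, waits only for the unexpired remainder of tDRPTry. Distilled into a helper (illustrative; the driver keeps this bookkeeping inline):

    /* Returns how many ms of the tDRPTry window remain, arming the
     * deadline on first use. *deadline == 0 means "not armed yet". */
    static unsigned int drp_try_remaining_ms(unsigned long *deadline)
    {
        if (*deadline == 0) {
            *deadline = jiffies + msecs_to_jiffies(PD_T_DRP_TRY);
            return PD_T_DRP_TRY;
        }
        if (time_is_after_jiffies(*deadline))
            return jiffies_to_msecs(*deadline - jiffies);
        return 0;
    }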
*/ - tcpm_pd_send_control(port, PD_CTRL_PS_RDY); - port->explicit_contract = true; - typec_set_pwr_opmode(port->typec_port, TYPEC_PWR_MODE_PD); - port->pwr_opmode = TYPEC_PWR_MODE_PD; - tcpm_set_state_cond(port, SRC_READY, 0); - break; - case SRC_READY: -#if 1 - port->hard_reset_count = 0; -#endif - port->try_src_count = 0; - - tcpm_swap_complete(port, 0); - tcpm_typec_connect(port); - tcpm_check_send_discover(port); - /* - * 6.3.5 - * Sending ping messages is not necessary if - * - the source operates at vSafe5V - * or - * - The system is not operating in PD mode - * or - * - Both partners are connected using a Type-C connector - * - * There is no actual need to send PD messages since the local - * port type-c and the spec does not clearly say whether PD is - * possible when type-c is connected to Type-A/B - */ - break; - case SRC_WAIT_NEW_CAPABILITIES: - /* Nothing to do... */ - break; - - /* SNK states */ - case SNK_UNATTACHED: - if (!port->non_pd_role_swap) - tcpm_swap_complete(port, -ENOTCONN); - tcpm_snk_detach(port); - if (tcpm_start_drp_toggling(port)) { - tcpm_set_state(port, DRP_TOGGLING, 0); - break; - } - tcpm_set_cc(port, TYPEC_CC_RD); - if (port->port_type == TYPEC_PORT_DRP) - tcpm_set_state(port, SRC_UNATTACHED, PD_T_DRP_SRC); - break; - case SNK_ATTACH_WAIT: - if ((port->cc1 == TYPEC_CC_OPEN && - port->cc2 != TYPEC_CC_OPEN) || - (port->cc1 != TYPEC_CC_OPEN && - port->cc2 == TYPEC_CC_OPEN)) - tcpm_set_state(port, SNK_DEBOUNCED, - PD_T_CC_DEBOUNCE); - else if (tcpm_port_is_disconnected(port)) - tcpm_set_state(port, SNK_UNATTACHED, - PD_T_PD_DEBOUNCE); - break; - case SNK_DEBOUNCED: - if (tcpm_port_is_disconnected(port)) - tcpm_set_state(port, SNK_UNATTACHED, - PD_T_PD_DEBOUNCE); - else if (port->vbus_present) - tcpm_set_state(port, - tcpm_try_src(port) ? SRC_TRY - : SNK_ATTACHED, - 0); - else - /* Wait for VBUS, but not forever */ - tcpm_set_state(port, PORT_RESET, PD_T_PS_SOURCE_ON); - break; - - case SRC_TRY: - port->try_src_count++; - tcpm_set_cc(port, tcpm_rp_cc(port)); - port->max_wait = 0; - tcpm_set_state(port, SRC_TRY_WAIT, 0); - break; - case SRC_TRY_WAIT: - if (port->max_wait == 0) { - port->max_wait = jiffies + - msecs_to_jiffies(PD_T_DRP_TRY); - msecs = PD_T_DRP_TRY; - } else { - if (time_is_after_jiffies(port->max_wait)) - msecs = jiffies_to_msecs(port->max_wait - - jiffies); - else - msecs = 0; - } - tcpm_set_state(port, SNK_TRYWAIT, msecs); - break; - case SRC_TRY_DEBOUNCE: - tcpm_set_state(port, SRC_ATTACHED, PD_T_PD_DEBOUNCE); - break; - case SNK_TRYWAIT: - tcpm_set_cc(port, TYPEC_CC_RD); - tcpm_set_state(port, SNK_TRYWAIT_VBUS, PD_T_CC_DEBOUNCE); - break; - case SNK_TRYWAIT_VBUS: - /* - * TCPM stays in this state indefinitely until VBUS - * is detected as long as Rp is not detected for - * more than a time period of tPDDebounce. - */ - if (port->vbus_present && tcpm_port_is_sink(port)) { - tcpm_set_state(port, SNK_ATTACHED, 0); - break; - } - if (!tcpm_port_is_sink(port)) - tcpm_set_state(port, SNK_TRYWAIT_DEBOUNCE, 0); - break; - case SNK_TRYWAIT_DEBOUNCE: - tcpm_set_state(port, SNK_UNATTACHED, PD_T_PD_DEBOUNCE); - break; - case SNK_ATTACHED: - ret = tcpm_snk_attach(port); - if (ret < 0) - tcpm_set_state(port, SNK_UNATTACHED, 0); - else - tcpm_set_state(port, SNK_STARTUP, 0); - break; - case SNK_STARTUP: - opmode = tcpm_get_pwr_opmode(port->polarity ? 
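The SNK_ATTACH_WAIT condition above is simply "exactly one CC pin is open", which then debounces into SNK_DEBOUNCED; the same test recurs later in _tcpm_cc_change(). Written as a predicate (invented helper name):

    static bool exactly_one_cc_open(enum typec_cc_status cc1,
                                    enum typec_cc_status cc2)
    {
        return (cc1 == TYPEC_CC_OPEN) != (cc2 == TYPEC_CC_OPEN);
    }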
- port->cc2 : port->cc1); - typec_set_pwr_opmode(port->typec_port, opmode); - port->pwr_opmode = TYPEC_PWR_MODE_USB; - port->message_id = 0; - port->rx_msgid = -1; - port->explicit_contract = false; - tcpm_set_state(port, SNK_DISCOVERY, 0); - break; - case SNK_DISCOVERY: - if (port->vbus_present) { - tcpm_set_current_limit(port, - tcpm_get_current_limit(port), - 5000); - tcpm_set_charge(port, true); - tcpm_set_state(port, SNK_WAIT_CAPABILITIES, 0); - break; - } - /* - * For DRP, timeouts differ. Also, handling is supposed to be - * different and much more complex (dead battery detection; - * see USB power delivery specification, section 8.3.3.6.1.5.1). - */ - tcpm_set_state(port, hard_reset_state(port), - port->port_type == TYPEC_PORT_DRP ? - PD_T_DB_DETECT : PD_T_NO_RESPONSE); - break; - case SNK_DISCOVERY_DEBOUNCE: - tcpm_set_state(port, SNK_DISCOVERY_DEBOUNCE_DONE, - PD_T_CC_DEBOUNCE); - break; - case SNK_DISCOVERY_DEBOUNCE_DONE: - if (!tcpm_port_is_disconnected(port) && - tcpm_port_is_sink(port) && - time_is_after_jiffies(port->delayed_runtime)) { - tcpm_set_state(port, SNK_DISCOVERY, - port->delayed_runtime - jiffies); - break; - } - tcpm_set_state(port, unattached_state(port), 0); - break; - case SNK_WAIT_CAPABILITIES: - ret = port->tcpc->set_pd_rx(port->tcpc, true); - if (ret < 0) { - tcpm_set_state(port, SNK_READY, 0); - break; - } - /* - * If VBUS has never been low, and we time out waiting - * for source cap, try a soft reset first, in case we - * were already in a stable contract before this boot. - * Do this only once. - */ - if (port->vbus_never_low) { - port->vbus_never_low = false; - tcpm_set_state(port, SOFT_RESET_SEND, - PD_T_SINK_WAIT_CAP); - } else { - tcpm_set_state(port, hard_reset_state(port), - PD_T_SINK_WAIT_CAP); - } - break; - case SNK_NEGOTIATE_CAPABILITIES: - port->pd_capable = true; - port->hard_reset_count = 0; - ret = tcpm_pd_send_request(port); - if (ret < 0) { - /* Let the Source send capabilities again. 
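Before an explicit contract exists, SNK_DISCOVERY above limits the sink to whatever the source advertises through its Rp pull-up, at vSafe5V (the 5000 mV argument). A sketch of the usual Rp decode behind tcpm_get_current_limit(), which lives earlier in this file; the default-Rp fallback shown here is an assumption, since hardware may refine it:

    static u32 rp_to_current_limit_ma(enum typec_cc_status cc)
    {
        switch (cc) {
        case TYPEC_CC_RP_1_5:
            return 1500;
        case TYPEC_CC_RP_3_0:
            return 3000;
        case TYPEC_CC_RP_DEF:
        default:
            return 500;     /* assumption: USB 2.0 default power */
        }
    }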
*/ - tcpm_set_state(port, SNK_WAIT_CAPABILITIES, 0); - } else { - tcpm_set_state_cond(port, hard_reset_state(port), - PD_T_SENDER_RESPONSE); - } - break; - case SNK_TRANSITION_SINK: - case SNK_TRANSITION_SINK_VBUS: - tcpm_set_state(port, hard_reset_state(port), - PD_T_PS_TRANSITION); - break; - case SNK_READY: - port->try_snk_count = 0; - if (port->explicit_contract) { - typec_set_pwr_opmode(port->typec_port, - TYPEC_PWR_MODE_PD); - port->pwr_opmode = TYPEC_PWR_MODE_PD; - } - - tcpm_swap_complete(port, 0); - tcpm_typec_connect(port); - tcpm_check_send_discover(port); - break; - - /* Accessory states */ - case ACC_UNATTACHED: - tcpm_acc_detach(port); - tcpm_set_state(port, SRC_UNATTACHED, 0); - break; - case DEBUG_ACC_ATTACHED: - case AUDIO_ACC_ATTACHED: - ret = tcpm_acc_attach(port); - if (ret < 0) - tcpm_set_state(port, ACC_UNATTACHED, 0); - break; - case AUDIO_ACC_DEBOUNCE: - tcpm_set_state(port, ACC_UNATTACHED, PD_T_CC_DEBOUNCE); - break; - - /* Hard_Reset states */ - case HARD_RESET_SEND: - tcpm_pd_transmit(port, TCPC_TX_HARD_RESET, NULL); - tcpm_set_state(port, HARD_RESET_START, 0); - break; - case HARD_RESET_START: - port->hard_reset_count++; - port->tcpc->set_pd_rx(port->tcpc, false); - tcpm_unregister_altmodes(port); - port->send_discover = true; - if (port->pwr_role == TYPEC_SOURCE) - tcpm_set_state(port, SRC_HARD_RESET_VBUS_OFF, - PD_T_PS_HARD_RESET); - else - tcpm_set_state(port, SNK_HARD_RESET_SINK_OFF, 0); - break; - case SRC_HARD_RESET_VBUS_OFF: - tcpm_set_vconn(port, true); - tcpm_set_vbus(port, false); - tcpm_set_roles(port, false, TYPEC_SOURCE, TYPEC_HOST); - tcpm_set_state(port, SRC_HARD_RESET_VBUS_ON, PD_T_SRC_RECOVER); - break; - case SRC_HARD_RESET_VBUS_ON: - tcpm_set_vbus(port, true); - port->tcpc->set_pd_rx(port->tcpc, true); - tcpm_set_attached_state(port, true); - tcpm_set_state(port, SRC_UNATTACHED, PD_T_PS_SOURCE_ON); - break; - case SNK_HARD_RESET_SINK_OFF: - tcpm_set_vconn(port, false); - tcpm_set_charge(port, false); - tcpm_set_roles(port, false, TYPEC_SINK, TYPEC_DEVICE); - /* - * VBUS may or may not toggle, depending on the adapter. - * If it doesn't toggle, transition to SNK_HARD_RESET_SINK_ON - * directly after timeout. - */ - tcpm_set_state(port, SNK_HARD_RESET_SINK_ON, PD_T_SAFE_0V); - break; - case SNK_HARD_RESET_WAIT_VBUS: - /* Assume we're disconnected if VBUS doesn't come back. */ - tcpm_set_state(port, SNK_UNATTACHED, - PD_T_SRC_RECOVER_MAX + PD_T_SRC_TURN_ON); - break; - case SNK_HARD_RESET_SINK_ON: - /* Note: There is no guarantee that VBUS is on in this state */ - /* - * XXX: - * The specification suggests that dual mode ports in sink - * mode should transition to state PE_SRC_Transition_to_default. - * See USB power delivery specification chapter 8.3.3.6.1.3. - * This would mean to to - * - turn off VCONN, reset power supply - * - request hardware reset - * - turn on VCONN - * - Transition to state PE_Src_Startup - * SNK only ports shall transition to state Snk_Startup - * (see chapter 8.3.3.3.8). - * Similar, dual-mode ports in source mode should transition - * to PE_SNK_Transition_to_default. 
- */ - tcpm_set_attached_state(port, true); - tcpm_set_state(port, SNK_STARTUP, 0); - break; - - /* Soft_Reset states */ - case SOFT_RESET: - port->message_id = 0; - port->rx_msgid = -1; - tcpm_pd_send_control(port, PD_CTRL_ACCEPT); - if (port->pwr_role == TYPEC_SOURCE) - tcpm_set_state(port, SRC_SEND_CAPABILITIES, 0); - else - tcpm_set_state(port, SNK_WAIT_CAPABILITIES, 0); - break; - case SOFT_RESET_SEND: - port->message_id = 0; - port->rx_msgid = -1; - if (tcpm_pd_send_control(port, PD_CTRL_SOFT_RESET)) - tcpm_set_state_cond(port, hard_reset_state(port), 0); - else - tcpm_set_state_cond(port, hard_reset_state(port), - PD_T_SENDER_RESPONSE); - break; - - /* DR_Swap states */ - case DR_SWAP_SEND: - tcpm_pd_send_control(port, PD_CTRL_DR_SWAP); - tcpm_set_state_cond(port, DR_SWAP_SEND_TIMEOUT, - PD_T_SENDER_RESPONSE); - break; - case DR_SWAP_ACCEPT: - tcpm_pd_send_control(port, PD_CTRL_ACCEPT); - tcpm_set_state_cond(port, DR_SWAP_CHANGE_DR, 0); - break; - case DR_SWAP_SEND_TIMEOUT: - tcpm_swap_complete(port, -ETIMEDOUT); - tcpm_set_state(port, ready_state(port), 0); - break; - case DR_SWAP_CHANGE_DR: - if (port->data_role == TYPEC_HOST) { - tcpm_unregister_altmodes(port); - tcpm_set_roles(port, true, port->pwr_role, - TYPEC_DEVICE); - } else { - tcpm_set_roles(port, true, port->pwr_role, - TYPEC_HOST); - port->send_discover = true; - } - tcpm_set_state(port, ready_state(port), 0); - break; - - /* PR_Swap states */ - case PR_SWAP_ACCEPT: - tcpm_pd_send_control(port, PD_CTRL_ACCEPT); - tcpm_set_state(port, PR_SWAP_START, 0); - break; - case PR_SWAP_SEND: - tcpm_pd_send_control(port, PD_CTRL_PR_SWAP); - tcpm_set_state_cond(port, PR_SWAP_SEND_TIMEOUT, - PD_T_SENDER_RESPONSE); - break; - case PR_SWAP_SEND_TIMEOUT: - tcpm_swap_complete(port, -ETIMEDOUT); - tcpm_set_state(port, ready_state(port), 0); - break; - case PR_SWAP_START: - if (port->pwr_role == TYPEC_SOURCE) - tcpm_set_state(port, PR_SWAP_SRC_SNK_TRANSITION_OFF, - PD_T_SRC_TRANSITION); - else - tcpm_set_state(port, PR_SWAP_SNK_SRC_SINK_OFF, 0); - break; - case PR_SWAP_SRC_SNK_TRANSITION_OFF: - tcpm_set_vbus(port, false); - port->explicit_contract = false; - /* allow time for Vbus discharge, must be < tSrcSwapStdby */ - tcpm_set_state(port, PR_SWAP_SRC_SNK_SOURCE_OFF, - PD_T_SRCSWAPSTDBY); - break; - case PR_SWAP_SRC_SNK_SOURCE_OFF: - tcpm_set_cc(port, TYPEC_CC_RD); - /* allow CC debounce */ - tcpm_set_state(port, PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED, - PD_T_CC_DEBOUNCE); - break; - case PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED: - /* - * USB-PD standard, 6.2.1.4, Port Power Role: - * "During the Power Role Swap Sequence, for the initial Source - * Port, the Port Power Role field shall be set to Sink in the - * PS_RDY Message indicating that the initial Source’s power - * supply is turned off" - */ - tcpm_set_pwr_role(port, TYPEC_SINK); - if (tcpm_pd_send_control(port, PD_CTRL_PS_RDY)) { - tcpm_set_state(port, ERROR_RECOVERY, 0); - break; - } - tcpm_set_state_cond(port, SNK_UNATTACHED, PD_T_PS_SOURCE_ON); - break; - case PR_SWAP_SRC_SNK_SINK_ON: - tcpm_set_state(port, SNK_STARTUP, 0); - break; - case PR_SWAP_SNK_SRC_SINK_OFF: - tcpm_set_charge(port, false); - tcpm_set_state(port, hard_reset_state(port), - PD_T_PS_SOURCE_OFF); - break; - case PR_SWAP_SNK_SRC_SOURCE_ON: - tcpm_set_cc(port, tcpm_rp_cc(port)); - tcpm_set_vbus(port, true); - /* - * allow time VBUS ramp-up, must be < tNewSrc - * Also, this window overlaps with CC debounce as well. 
- * So, Wait for the max of two which is PD_T_NEWSRC - */ - tcpm_set_state(port, PR_SWAP_SNK_SRC_SOURCE_ON_VBUS_RAMPED_UP, - PD_T_NEWSRC); - break; - case PR_SWAP_SNK_SRC_SOURCE_ON_VBUS_RAMPED_UP: - /* - * USB PD standard, 6.2.1.4: - * "Subsequent Messages initiated by the Policy Engine, - * such as the PS_RDY Message sent to indicate that Vbus - * is ready, will have the Port Power Role field set to - * Source." - */ - tcpm_set_pwr_role(port, TYPEC_SOURCE); - tcpm_pd_send_control(port, PD_CTRL_PS_RDY); - tcpm_set_state(port, SRC_STARTUP, 0); - break; - - case VCONN_SWAP_ACCEPT: - tcpm_pd_send_control(port, PD_CTRL_ACCEPT); - tcpm_set_state(port, VCONN_SWAP_START, 0); - break; - case VCONN_SWAP_SEND: - tcpm_pd_send_control(port, PD_CTRL_VCONN_SWAP); - tcpm_set_state(port, VCONN_SWAP_SEND_TIMEOUT, - PD_T_SENDER_RESPONSE); - break; - case VCONN_SWAP_SEND_TIMEOUT: - tcpm_swap_complete(port, -ETIMEDOUT); - tcpm_set_state(port, ready_state(port), 0); - break; - case VCONN_SWAP_START: - if (port->vconn_role == TYPEC_SOURCE) - tcpm_set_state(port, VCONN_SWAP_WAIT_FOR_VCONN, 0); - else - tcpm_set_state(port, VCONN_SWAP_TURN_ON_VCONN, 0); - break; - case VCONN_SWAP_WAIT_FOR_VCONN: - tcpm_set_state(port, hard_reset_state(port), - PD_T_VCONN_SOURCE_ON); - break; - case VCONN_SWAP_TURN_ON_VCONN: - tcpm_set_vconn(port, true); - tcpm_pd_send_control(port, PD_CTRL_PS_RDY); - tcpm_set_state(port, ready_state(port), 0); - break; - case VCONN_SWAP_TURN_OFF_VCONN: - tcpm_set_vconn(port, false); - tcpm_set_state(port, ready_state(port), 0); - break; - - case DR_SWAP_CANCEL: - case PR_SWAP_CANCEL: - case VCONN_SWAP_CANCEL: - tcpm_swap_complete(port, port->swap_status); - if (port->pwr_role == TYPEC_SOURCE) - tcpm_set_state(port, SRC_READY, 0); - else - tcpm_set_state(port, SNK_READY, 0); - break; - - case BIST_RX: - switch (BDO_MODE_MASK(port->bist_request)) { - case BDO_MODE_CARRIER2: - tcpm_pd_transmit(port, TCPC_TX_BIST_MODE_2, NULL); - break; - default: - break; - } - /* Always switch to unattached state */ - tcpm_set_state(port, unattached_state(port), 0); - break; - case ERROR_RECOVERY: - tcpm_swap_complete(port, -EPROTO); - tcpm_set_state(port, PORT_RESET, 0); - break; - case PORT_RESET: - tcpm_reset_port(port); - tcpm_set_cc(port, TYPEC_CC_OPEN); - tcpm_set_state(port, PORT_RESET_WAIT_OFF, - PD_T_ERROR_RECOVERY); - break; - case PORT_RESET_WAIT_OFF: - tcpm_set_state(port, - tcpm_default_state(port), - port->vbus_present ? PD_T_PS_SOURCE_OFF : 0); - break; - default: - WARN(1, "Unexpected port state %d\n", port->state); - break; - } -} - -static void tcpm_state_machine_work(struct work_struct *work) -{ - struct tcpm_port *port = container_of(work, struct tcpm_port, - state_machine.work); - enum tcpm_state prev_state; - - mutex_lock(&port->lock); - port->state_machine_running = true; - - if (port->queued_message && tcpm_send_queued_message(port)) - goto done; - - /* If we were queued due to a delayed state change, update it now */ - if (port->delayed_state) { - tcpm_log(port, "state change %s -> %s [delayed %ld ms]", - tcpm_states[port->state], - tcpm_states[port->delayed_state], port->delay_ms); - port->prev_state = port->state; - port->state = port->delayed_state; - port->delayed_state = INVALID_STATE; - } - - /* - * Continue running as long as we have (non-delayed) state changes - * to make. 
- */ - do { - prev_state = port->state; - run_state_machine(port); - if (port->queued_message) - tcpm_send_queued_message(port); - } while (port->state != prev_state && !port->delayed_state); - -done: - port->state_machine_running = false; - mutex_unlock(&port->lock); -} - -static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, - enum typec_cc_status cc2) -{ - enum typec_cc_status old_cc1, old_cc2; - enum tcpm_state new_state; - - old_cc1 = port->cc1; - old_cc2 = port->cc2; - port->cc1 = cc1; - port->cc2 = cc2; - - tcpm_log_force(port, - "CC1: %u -> %u, CC2: %u -> %u [state %s, polarity %d, %s]", - old_cc1, cc1, old_cc2, cc2, tcpm_states[port->state], - port->polarity, - tcpm_port_is_disconnected(port) ? "disconnected" - : "connected"); - - switch (port->state) { - case DRP_TOGGLING: - if (tcpm_port_is_debug(port) || tcpm_port_is_audio(port) || - tcpm_port_is_source(port)) - tcpm_set_state(port, SRC_ATTACH_WAIT, 0); - else if (tcpm_port_is_sink(port)) - tcpm_set_state(port, SNK_ATTACH_WAIT, 0); - break; - case SRC_UNATTACHED: - case ACC_UNATTACHED: - if (tcpm_port_is_debug(port) || tcpm_port_is_audio(port) || - tcpm_port_is_source(port)) - tcpm_set_state(port, SRC_ATTACH_WAIT, 0); - break; - case SRC_ATTACH_WAIT: - if (tcpm_port_is_disconnected(port) || - tcpm_port_is_audio_detached(port)) - tcpm_set_state(port, SRC_UNATTACHED, 0); - else if (cc1 != old_cc1 || cc2 != old_cc2) - tcpm_set_state(port, SRC_ATTACH_WAIT, 0); - break; - case SRC_ATTACHED: - case SRC_SEND_CAPABILITIES: - case SRC_READY: - if (tcpm_port_is_disconnected(port) || - !tcpm_port_is_source(port)) - tcpm_set_state(port, SRC_UNATTACHED, 0); - break; - case SNK_UNATTACHED: - if (tcpm_port_is_sink(port)) - tcpm_set_state(port, SNK_ATTACH_WAIT, 0); - break; - case SNK_ATTACH_WAIT: - if ((port->cc1 == TYPEC_CC_OPEN && - port->cc2 != TYPEC_CC_OPEN) || - (port->cc1 != TYPEC_CC_OPEN && - port->cc2 == TYPEC_CC_OPEN)) - new_state = SNK_DEBOUNCED; - else if (tcpm_port_is_disconnected(port)) - new_state = SNK_UNATTACHED; - else - break; - if (new_state != port->delayed_state) - tcpm_set_state(port, SNK_ATTACH_WAIT, 0); - break; - case SNK_DEBOUNCED: - if (tcpm_port_is_disconnected(port)) - new_state = SNK_UNATTACHED; - else if (port->vbus_present) - new_state = tcpm_try_src(port) ? 
SRC_TRY : SNK_ATTACHED; - else - new_state = SNK_UNATTACHED; - if (new_state != port->delayed_state) - tcpm_set_state(port, SNK_DEBOUNCED, 0); - break; - case SNK_READY: - if (tcpm_port_is_disconnected(port)) - tcpm_set_state(port, unattached_state(port), 0); - else if (!port->pd_capable && - (cc1 != old_cc1 || cc2 != old_cc2)) - tcpm_set_current_limit(port, - tcpm_get_current_limit(port), - 5000); - break; - - case AUDIO_ACC_ATTACHED: - if (cc1 == TYPEC_CC_OPEN || cc2 == TYPEC_CC_OPEN) - tcpm_set_state(port, AUDIO_ACC_DEBOUNCE, 0); - break; - case AUDIO_ACC_DEBOUNCE: - if (tcpm_port_is_audio(port)) - tcpm_set_state(port, AUDIO_ACC_ATTACHED, 0); - break; - - case DEBUG_ACC_ATTACHED: - if (cc1 == TYPEC_CC_OPEN || cc2 == TYPEC_CC_OPEN) - tcpm_set_state(port, ACC_UNATTACHED, 0); - break; - - case SNK_TRY: - /* Do nothing, waiting for timeout */ - break; - - case SNK_DISCOVERY: - /* CC line is unstable, wait for debounce */ - if (tcpm_port_is_disconnected(port)) - tcpm_set_state(port, SNK_DISCOVERY_DEBOUNCE, 0); - break; - case SNK_DISCOVERY_DEBOUNCE: - break; - - case SRC_TRYWAIT: - /* Hand over to state machine if needed */ - if (!port->vbus_present && tcpm_port_is_source(port)) - tcpm_set_state(port, SRC_TRYWAIT_DEBOUNCE, 0); - break; - case SRC_TRYWAIT_DEBOUNCE: - if (port->vbus_present || !tcpm_port_is_source(port)) - tcpm_set_state(port, SRC_TRYWAIT, 0); - break; - case SNK_TRY_WAIT_DEBOUNCE: - if (!tcpm_port_is_sink(port)) { - port->max_wait = 0; - tcpm_set_state(port, SRC_TRYWAIT, 0); - } - break; - case SRC_TRY_WAIT: - if (tcpm_port_is_source(port)) - tcpm_set_state(port, SRC_TRY_DEBOUNCE, 0); - break; - case SRC_TRY_DEBOUNCE: - tcpm_set_state(port, SRC_TRY_WAIT, 0); - break; - case SNK_TRYWAIT_DEBOUNCE: - if (tcpm_port_is_sink(port)) - tcpm_set_state(port, SNK_TRYWAIT_VBUS, 0); - break; - case SNK_TRYWAIT_VBUS: - if (!tcpm_port_is_sink(port)) - tcpm_set_state(port, SNK_TRYWAIT_DEBOUNCE, 0); - break; - case SNK_TRYWAIT: - /* Do nothing, waiting for tCCDebounce */ - break; - case PR_SWAP_SNK_SRC_SINK_OFF: - case PR_SWAP_SRC_SNK_TRANSITION_OFF: - case PR_SWAP_SRC_SNK_SOURCE_OFF: - case PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED: - case PR_SWAP_SNK_SRC_SOURCE_ON: - /* - * CC state change is expected in PR_SWAP - * Ignore it. - */ - break; - - default: - if (tcpm_port_is_disconnected(port)) - tcpm_set_state(port, unattached_state(port), 0); - break; - } -} - -static void _tcpm_pd_vbus_on(struct tcpm_port *port) -{ - tcpm_log_force(port, "VBUS on"); - port->vbus_present = true; - switch (port->state) { - case SNK_TRANSITION_SINK_VBUS: - port->explicit_contract = true; - tcpm_set_state(port, SNK_READY, 0); - break; - case SNK_DISCOVERY: - tcpm_set_state(port, SNK_DISCOVERY, 0); - break; - - case SNK_DEBOUNCED: - tcpm_set_state(port, tcpm_try_src(port) ? 
SRC_TRY - : SNK_ATTACHED, - 0); - break; - case SNK_HARD_RESET_WAIT_VBUS: - tcpm_set_state(port, SNK_HARD_RESET_SINK_ON, 0); - break; - case SRC_ATTACHED: - tcpm_set_state(port, SRC_STARTUP, 0); - break; - case SRC_HARD_RESET_VBUS_ON: - tcpm_set_state(port, SRC_STARTUP, 0); - break; - - case SNK_TRY: - /* Do nothing, waiting for timeout */ - break; - case SRC_TRYWAIT: - /* Do nothing, Waiting for Rd to be detected */ - break; - case SRC_TRYWAIT_DEBOUNCE: - tcpm_set_state(port, SRC_TRYWAIT, 0); - break; - case SNK_TRY_WAIT_DEBOUNCE: - /* Do nothing, waiting for PD_DEBOUNCE to do be done */ - break; - case SNK_TRYWAIT: - /* Do nothing, waiting for tCCDebounce */ - break; - case SNK_TRYWAIT_VBUS: - if (tcpm_port_is_sink(port)) - tcpm_set_state(port, SNK_ATTACHED, 0); - break; - case SNK_TRYWAIT_DEBOUNCE: - /* Do nothing, waiting for Rp */ - break; - case SRC_TRY_WAIT: - case SRC_TRY_DEBOUNCE: - /* Do nothing, waiting for sink detection */ - break; - default: - break; - } -} - -static void _tcpm_pd_vbus_off(struct tcpm_port *port) -{ - tcpm_log_force(port, "VBUS off"); - port->vbus_present = false; - port->vbus_never_low = false; - switch (port->state) { - case SNK_HARD_RESET_SINK_OFF: - tcpm_set_state(port, SNK_HARD_RESET_WAIT_VBUS, 0); - break; - case SRC_HARD_RESET_VBUS_OFF: - tcpm_set_state(port, SRC_HARD_RESET_VBUS_ON, 0); - break; - case HARD_RESET_SEND: - break; - - case SNK_TRY: - /* Do nothing, waiting for timeout */ - break; - case SRC_TRYWAIT: - /* Hand over to state machine if needed */ - if (tcpm_port_is_source(port)) - tcpm_set_state(port, SRC_TRYWAIT_DEBOUNCE, 0); - break; - case SNK_TRY_WAIT_DEBOUNCE: - /* Do nothing, waiting for PD_DEBOUNCE to do be done */ - break; - case SNK_TRYWAIT: - case SNK_TRYWAIT_VBUS: - case SNK_TRYWAIT_DEBOUNCE: - break; - case SNK_ATTACH_WAIT: - tcpm_set_state(port, SNK_UNATTACHED, 0); - break; - - case SNK_NEGOTIATE_CAPABILITIES: - break; - - case PR_SWAP_SRC_SNK_TRANSITION_OFF: - tcpm_set_state(port, PR_SWAP_SRC_SNK_SOURCE_OFF, 0); - break; - - case PR_SWAP_SNK_SRC_SINK_OFF: - /* Do nothing, expected */ - break; - - case PORT_RESET_WAIT_OFF: - tcpm_set_state(port, tcpm_default_state(port), 0); - break; - case SRC_TRY_WAIT: - case SRC_TRY_DEBOUNCE: - /* Do nothing, waiting for sink detection */ - break; - default: - if (port->pwr_role == TYPEC_SINK && - port->attached) - tcpm_set_state(port, SNK_UNATTACHED, 0); - break; - } -} - -static void _tcpm_pd_hard_reset(struct tcpm_port *port) -{ - tcpm_log_force(port, "Received hard reset"); - /* - * If we keep receiving hard reset requests, executing the hard reset - * must have failed. Revert to error recovery if that happens. - */ - tcpm_set_state(port, - port->hard_reset_count < PD_N_HARD_RESET_COUNT ? 
- HARD_RESET_START : ERROR_RECOVERY, - 0); -} - -static void tcpm_pd_event_handler(struct work_struct *work) -{ - struct tcpm_port *port = container_of(work, struct tcpm_port, - event_work); - u32 events; - - mutex_lock(&port->lock); - - spin_lock(&port->pd_event_lock); - while (port->pd_events) { - events = port->pd_events; - port->pd_events = 0; - spin_unlock(&port->pd_event_lock); - if (events & TCPM_RESET_EVENT) - _tcpm_pd_hard_reset(port); - if (events & TCPM_VBUS_EVENT) { - bool vbus; - - vbus = port->tcpc->get_vbus(port->tcpc); - if (vbus) - _tcpm_pd_vbus_on(port); - else - _tcpm_pd_vbus_off(port); - } - if (events & TCPM_CC_EVENT) { - enum typec_cc_status cc1, cc2; - - if (port->tcpc->get_cc(port->tcpc, &cc1, &cc2) == 0) - _tcpm_cc_change(port, cc1, cc2); - } - spin_lock(&port->pd_event_lock); - } - spin_unlock(&port->pd_event_lock); - mutex_unlock(&port->lock); -} - -void tcpm_cc_change(struct tcpm_port *port) -{ - spin_lock(&port->pd_event_lock); - port->pd_events |= TCPM_CC_EVENT; - spin_unlock(&port->pd_event_lock); - queue_work(port->wq, &port->event_work); -} -EXPORT_SYMBOL_GPL(tcpm_cc_change); - -void tcpm_vbus_change(struct tcpm_port *port) -{ - spin_lock(&port->pd_event_lock); - port->pd_events |= TCPM_VBUS_EVENT; - spin_unlock(&port->pd_event_lock); - queue_work(port->wq, &port->event_work); -} -EXPORT_SYMBOL_GPL(tcpm_vbus_change); - -void tcpm_pd_hard_reset(struct tcpm_port *port) -{ - spin_lock(&port->pd_event_lock); - port->pd_events = TCPM_RESET_EVENT; - spin_unlock(&port->pd_event_lock); - queue_work(port->wq, &port->event_work); -} -EXPORT_SYMBOL_GPL(tcpm_pd_hard_reset); - -static int tcpm_dr_set(const struct typec_capability *cap, - enum typec_data_role data) -{ - struct tcpm_port *port = typec_cap_to_tcpm(cap); - int ret; - - mutex_lock(&port->swap_lock); - mutex_lock(&port->lock); - - if (port->port_type != TYPEC_PORT_DRP) { - ret = -EINVAL; - goto port_unlock; - } - if (port->state != SRC_READY && port->state != SNK_READY) { - ret = -EAGAIN; - goto port_unlock; - } - - if (port->data_role == data) { - ret = 0; - goto port_unlock; - } - - /* - * XXX - * 6.3.9: If an alternate mode is active, a request to swap - * alternate modes shall trigger a port reset. - * Reject data role swap request in this case. - */ - - if (!port->pd_capable) { - /* - * If the partner is not PD capable, reset the port to - * trigger a role change. This can only work if a preferred - * role is configured, and if it matches the requested role. 
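The event handler above shows the lock split: events are latched into port->pd_events under a spinlock, so the three exported notifiers are IRQ-safe, and the latched set is then drained under port->lock with the spinlock dropped while each event is processed. From a TCPC driver the notifiers are used like this (my_* names invented, as in the earlier receive sketch):

    static irqreturn_t my_tcpc_event_irq(int irq, void *dev_id)
    {
        struct my_tcpc *chip = dev_id;
        u32 status = my_read_and_clear_status(chip);    /* hypothetical */

        if (status & MY_STATUS_CC_CHANGE)
            tcpm_cc_change(chip->tcpm_port);
        if (status & MY_STATUS_VBUS_CHANGE)
            tcpm_vbus_change(chip->tcpm_port);
        if (status & MY_STATUS_HARD_RESET)
            tcpm_pd_hard_reset(chip->tcpm_port);

        return IRQ_HANDLED;
    }

Note that tcpm_pd_hard_reset() assigns the event mask rather than OR-ing into it, deliberately discarding any queued CC/VBUS events that the reset makes stale.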
- */ - if (port->try_role == TYPEC_NO_PREFERRED_ROLE || - port->try_role == port->pwr_role) { - ret = -EINVAL; - goto port_unlock; - } - port->non_pd_role_swap = true; - tcpm_set_state(port, PORT_RESET, 0); - } else { - tcpm_set_state(port, DR_SWAP_SEND, 0); - } - - port->swap_status = 0; - port->swap_pending = true; - reinit_completion(&port->swap_complete); - mutex_unlock(&port->lock); - - if (!wait_for_completion_timeout(&port->swap_complete, - msecs_to_jiffies(PD_ROLE_SWAP_TIMEOUT))) - ret = -ETIMEDOUT; - else - ret = port->swap_status; - - port->non_pd_role_swap = false; - goto swap_unlock; - -port_unlock: - mutex_unlock(&port->lock); -swap_unlock: - mutex_unlock(&port->swap_lock); - return ret; -} - -static int tcpm_pr_set(const struct typec_capability *cap, - enum typec_role role) -{ - struct tcpm_port *port = typec_cap_to_tcpm(cap); - int ret; - - mutex_lock(&port->swap_lock); - mutex_lock(&port->lock); - - if (port->port_type != TYPEC_PORT_DRP) { - ret = -EINVAL; - goto port_unlock; - } - if (port->state != SRC_READY && port->state != SNK_READY) { - ret = -EAGAIN; - goto port_unlock; - } - - if (role == port->pwr_role) { - ret = 0; - goto port_unlock; - } - - port->swap_status = 0; - port->swap_pending = true; - reinit_completion(&port->swap_complete); - tcpm_set_state(port, PR_SWAP_SEND, 0); - mutex_unlock(&port->lock); - - if (!wait_for_completion_timeout(&port->swap_complete, - msecs_to_jiffies(PD_ROLE_SWAP_TIMEOUT))) - ret = -ETIMEDOUT; - else - ret = port->swap_status; - - goto swap_unlock; - -port_unlock: - mutex_unlock(&port->lock); -swap_unlock: - mutex_unlock(&port->swap_lock); - return ret; -} - -static int tcpm_vconn_set(const struct typec_capability *cap, - enum typec_role role) -{ - struct tcpm_port *port = typec_cap_to_tcpm(cap); - int ret; - - mutex_lock(&port->swap_lock); - mutex_lock(&port->lock); - - if (port->state != SRC_READY && port->state != SNK_READY) { - ret = -EAGAIN; - goto port_unlock; - } - - if (role == port->vconn_role) { - ret = 0; - goto port_unlock; - } - - port->swap_status = 0; - port->swap_pending = true; - reinit_completion(&port->swap_complete); - tcpm_set_state(port, VCONN_SWAP_SEND, 0); - mutex_unlock(&port->lock); - - if (!wait_for_completion_timeout(&port->swap_complete, - msecs_to_jiffies(PD_ROLE_SWAP_TIMEOUT))) - ret = -ETIMEDOUT; - else - ret = port->swap_status; - - goto swap_unlock; - -port_unlock: - mutex_unlock(&port->lock); -swap_unlock: - mutex_unlock(&port->swap_lock); - return ret; -} - -static int tcpm_try_role(const struct typec_capability *cap, int role) -{ - struct tcpm_port *port = typec_cap_to_tcpm(cap); - struct tcpc_dev *tcpc = port->tcpc; - int ret = 0; - - mutex_lock(&port->lock); - if (tcpc->try_role) - ret = tcpc->try_role(tcpc, role); - if (!ret && !tcpc->config->try_role_hw) - port->try_role = role; - port->try_src_count = 0; - port->try_snk_count = 0; - mutex_unlock(&port->lock); - - return ret; -} - -static void tcpm_init(struct tcpm_port *port) -{ - enum typec_cc_status cc1, cc2; - - port->tcpc->init(port->tcpc); - - tcpm_reset_port(port); - - /* - * XXX - * Should possibly wait for VBUS to settle if it was enabled locally - * since tcpm_reset_port() will disable VBUS. 
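The three swap entry points (tcpm_dr_set(), tcpm_pr_set(), tcpm_vconn_set()) share one synchronous pattern: kick the state machine, drop the port lock, and block on a completion that tcpm_swap_complete() fires from the state machine. Distilled into a hypothetical helper (not in the driver; note the asymmetric locking, mirroring the callers above):

    /* Called with port->lock held; drops it before sleeping. */
    static int tcpm_swap_and_wait(struct tcpm_port *port,
                                  enum tcpm_state send_state)
    {
        port->swap_status = 0;
        port->swap_pending = true;
        reinit_completion(&port->swap_complete);
        tcpm_set_state(port, send_state, 0);
        mutex_unlock(&port->lock);

        if (!wait_for_completion_timeout(&port->swap_complete,
                        msecs_to_jiffies(PD_ROLE_SWAP_TIMEOUT)))
            return -ETIMEDOUT;
        return port->swap_status;
    }

User space reaches these ops through the typec class attributes (data_role, power_role, vconn_source) under /sys/class/typec/.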
- */ - port->vbus_present = port->tcpc->get_vbus(port->tcpc); - if (port->vbus_present) - port->vbus_never_low = true; - - tcpm_set_state(port, tcpm_default_state(port), 0); - - if (port->tcpc->get_cc(port->tcpc, &cc1, &cc2) == 0) - _tcpm_cc_change(port, cc1, cc2); - - /* - * Some adapters need a clean slate at startup, and won't recover - * otherwise. So do not try to be fancy and force a clean disconnect. - */ - tcpm_set_state(port, PORT_RESET, 0); -} - -static int tcpm_port_type_set(const struct typec_capability *cap, - enum typec_port_type type) -{ - struct tcpm_port *port = typec_cap_to_tcpm(cap); - - mutex_lock(&port->lock); - if (type == port->port_type) - goto port_unlock; - - port->port_type = type; - - if (!port->connected) { - tcpm_set_state(port, PORT_RESET, 0); - } else if (type == TYPEC_PORT_UFP) { - if (!(port->pwr_role == TYPEC_SINK && - port->data_role == TYPEC_DEVICE)) - tcpm_set_state(port, PORT_RESET, 0); - } else if (type == TYPEC_PORT_DFP) { - if (!(port->pwr_role == TYPEC_SOURCE && - port->data_role == TYPEC_HOST)) - tcpm_set_state(port, PORT_RESET, 0); - } - -port_unlock: - mutex_unlock(&port->lock); - return 0; -} - -void tcpm_tcpc_reset(struct tcpm_port *port) -{ - mutex_lock(&port->lock); - /* XXX: Maintain PD connection if possible? */ - tcpm_init(port); - mutex_unlock(&port->lock); -} -EXPORT_SYMBOL_GPL(tcpm_tcpc_reset); - -static int tcpm_copy_pdos(u32 *dest_pdo, const u32 *src_pdo, - unsigned int nr_pdo) -{ - unsigned int i; - - if (nr_pdo > PDO_MAX_OBJECTS) - nr_pdo = PDO_MAX_OBJECTS; - - for (i = 0; i < nr_pdo; i++) - dest_pdo[i] = src_pdo[i]; - - return nr_pdo; -} - -static int tcpm_copy_vdos(u32 *dest_vdo, const u32 *src_vdo, - unsigned int nr_vdo) -{ - unsigned int i; - - if (nr_vdo > VDO_MAX_OBJECTS) - nr_vdo = VDO_MAX_OBJECTS; - - for (i = 0; i < nr_vdo; i++) - dest_vdo[i] = src_vdo[i]; - - return nr_vdo; -} - -void tcpm_update_source_capabilities(struct tcpm_port *port, const u32 *pdo, - unsigned int nr_pdo) -{ - mutex_lock(&port->lock); - port->nr_src_pdo = tcpm_copy_pdos(port->src_pdo, pdo, nr_pdo); - switch (port->state) { - case SRC_UNATTACHED: - case SRC_ATTACH_WAIT: - case SRC_TRYWAIT: - tcpm_set_cc(port, tcpm_rp_cc(port)); - break; - case SRC_SEND_CAPABILITIES: - case SRC_NEGOTIATE_CAPABILITIES: - case SRC_READY: - case SRC_WAIT_NEW_CAPABILITIES: - tcpm_set_cc(port, tcpm_rp_cc(port)); - tcpm_set_state(port, SRC_SEND_CAPABILITIES, 0); - break; - default: - break; - } - mutex_unlock(&port->lock); -} -EXPORT_SYMBOL_GPL(tcpm_update_source_capabilities); - -void tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, - unsigned int nr_pdo, - unsigned int max_snk_mv, - unsigned int max_snk_ma, - unsigned int max_snk_mw, - unsigned int operating_snk_mw) -{ - mutex_lock(&port->lock); - port->nr_snk_pdo = tcpm_copy_pdos(port->snk_pdo, pdo, nr_pdo); - port->max_snk_mv = max_snk_mv; - port->max_snk_ma = max_snk_ma; - port->max_snk_mw = max_snk_mw; - port->operating_snk_mw = operating_snk_mw; - - switch (port->state) { - case SNK_NEGOTIATE_CAPABILITIES: - case SNK_READY: - case SNK_TRANSITION_SINK: - case SNK_TRANSITION_SINK_VBUS: - tcpm_set_state(port, SNK_NEGOTIATE_CAPABILITIES, 0); - break; - default: - break; - } - mutex_unlock(&port->lock); -} -EXPORT_SYMBOL_GPL(tcpm_update_sink_capabilities); - -struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) -{ - struct tcpm_port *port; - int i, err; - - if (!dev || !tcpc || !tcpc->config || - !tcpc->get_vbus || !tcpc->set_cc || !tcpc->get_cc || - !tcpc->set_polarity || 
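tcpm_update_source_capabilities() and tcpm_update_sink_capabilities() above let a TCPC driver republish PDOs at runtime, re-entering negotiation if a contract is in progress. A sketch of a hypothetical caller that learns a new input power budget after boot (my_* names invented):

    /* Republish sink caps with a runtime-discovered operating budget. */
    static void my_update_budget(struct my_tcpc *chip, unsigned int operating_mw)
    {
        static const u32 snk_pdo[] = {
            PDO_FIXED(5000, 3000, PDO_FIXED_USB_COMM),
        };

        tcpm_update_sink_capabilities(chip->tcpm_port, snk_pdo,
                                      ARRAY_SIZE(snk_pdo),
                                      5000, 3000, 15000, operating_mw);
    }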
diff --git a/drivers/staging/typec/tcpm.h b/drivers/staging/typec/tcpm.h
deleted file mode 100644
index 073197f0d2bb..000000000000
--- a/drivers/staging/typec/tcpm.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright 2015-2017 Google, Inc
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#ifndef __LINUX_USB_TCPM_H
-#define __LINUX_USB_TCPM_H
-
-#include <linux/bitops.h>
-#include <linux/usb/typec.h>
-#include "pd.h"
-
-enum typec_cc_status {
-	TYPEC_CC_OPEN,
-	TYPEC_CC_RA,
-	TYPEC_CC_RD,
-	TYPEC_CC_RP_DEF,
-	TYPEC_CC_RP_1_5,
-	TYPEC_CC_RP_3_0,
-};
-
-enum typec_cc_polarity {
-	TYPEC_POLARITY_CC1,
-	TYPEC_POLARITY_CC2,
-};
-
-/* Time to wait for TCPC to complete transmit */
-#define PD_T_TCPC_TX_TIMEOUT	100		/* in ms */
-#define PD_ROLE_SWAP_TIMEOUT	(MSEC_PER_SEC * 10)
-
-enum tcpm_transmit_status {
-	TCPC_TX_SUCCESS = 0,
-	TCPC_TX_DISCARDED = 1,
-	TCPC_TX_FAILED = 2,
-};
-
-enum tcpm_transmit_type {
-	TCPC_TX_SOP = 0,
-	TCPC_TX_SOP_PRIME = 1,
-	TCPC_TX_SOP_PRIME_PRIME = 2,
-	TCPC_TX_SOP_DEBUG_PRIME = 3,
-	TCPC_TX_SOP_DEBUG_PRIME_PRIME = 4,
-	TCPC_TX_HARD_RESET = 5,
-	TCPC_TX_CABLE_RESET = 6,
-	TCPC_TX_BIST_MODE_2 = 7
-};
-
-/**
- * struct tcpc_config - Port configuration
- * @src_pdo:	PDO parameters sent to port partner as response to
- *		PD_CTRL_GET_SOURCE_CAP message
- * @nr_src_pdo:	Number of entries in @src_pdo
- * @snk_pdo:	PDO parameters sent to partner as response to
- *		PD_CTRL_GET_SINK_CAP message
- * @nr_snk_pdo:	Number of entries in @snk_pdo
- * @max_snk_mv:	Maximum acceptable sink voltage in mV
- * @max_snk_ma:	Maximum sink current in mA
- * @max_snk_mw:	Maximum required sink power in mW
- * @operating_snk_mw:
- *		Required operating sink power in mW
- * @type:	Port type (TYPEC_PORT_DFP, TYPEC_PORT_UFP, or
- *		TYPEC_PORT_DRP)
- * @default_role:
- *		Default port role (TYPEC_SINK or TYPEC_SOURCE).
- *		Set to TYPEC_NO_PREFERRED_ROLE if no default role.
- * @try_role_hw:True if try.{Src,Snk} is implemented in hardware
- * @alt_modes:	List of supported alternate modes
- */
-struct tcpc_config {
-	const u32 *src_pdo;
-	unsigned int nr_src_pdo;
-
-	const u32 *snk_pdo;
-	unsigned int nr_snk_pdo;
-
-	const u32 *snk_vdo;
-	unsigned int nr_snk_vdo;
-
-	unsigned int max_snk_mv;
-	unsigned int max_snk_ma;
-	unsigned int max_snk_mw;
-	unsigned int operating_snk_mw;
-
-	enum typec_port_type type;
-	enum typec_role default_role;
-	bool try_role_hw;	/* try.{src,snk} implemented in hardware */
-
-	const struct typec_altmode_desc *alt_modes;
-};
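
For illustration, a minimal sketch of how a low-level driver might fill in this
configuration. The example_* names are invented here, the PDO_FIXED() helper and
its flags come from pd.h, and all of the electrical values are arbitrary:

	static const u32 example_src_pdo[] = {
		/* Offer 5 V at 1.5 A when acting as a source */
		PDO_FIXED(5000, 1500, PDO_FIXED_DUAL_ROLE | PDO_FIXED_USB_COMM),
	};

	static const u32 example_snk_pdo[] = {
		/* Accept 5 V at up to 3 A when acting as a sink */
		PDO_FIXED(5000, 3000, PDO_FIXED_DUAL_ROLE | PDO_FIXED_USB_COMM),
	};

	static const struct tcpc_config example_config = {
		.src_pdo = example_src_pdo,
		.nr_src_pdo = ARRAY_SIZE(example_src_pdo),
		.snk_pdo = example_snk_pdo,
		.nr_snk_pdo = ARRAY_SIZE(example_snk_pdo),
		.max_snk_mv = 5000,
		.max_snk_ma = 3000,
		.max_snk_mw = 15000,
		.operating_snk_mw = 2500,
		.type = TYPEC_PORT_DRP,
		.default_role = TYPEC_SINK,
	};
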
-
-enum tcpc_usb_switch {
-	TCPC_USB_SWITCH_CONNECT,
-	TCPC_USB_SWITCH_DISCONNECT,
-};
-
-/* Mux state attributes */
-#define TCPC_MUX_USB_ENABLED		BIT(0)	/* USB enabled */
-#define TCPC_MUX_DP_ENABLED		BIT(1)	/* DP enabled */
-#define TCPC_MUX_POLARITY_INVERTED	BIT(2)	/* Polarity inverted */
-
-/* Mux modes, decoded to attributes */
-enum tcpc_mux_mode {
-	TYPEC_MUX_NONE	= 0,				/* Open switch */
-	TYPEC_MUX_USB	= TCPC_MUX_USB_ENABLED,		/* USB only */
-	TYPEC_MUX_DP	= TCPC_MUX_DP_ENABLED,		/* DP only */
-	TYPEC_MUX_DOCK	= TCPC_MUX_USB_ENABLED |	/* Both USB and DP */
-			  TCPC_MUX_DP_ENABLED,
-};
-
-struct tcpc_mux_dev {
-	int (*set)(struct tcpc_mux_dev *dev, enum tcpc_mux_mode mux_mode,
-		   enum tcpc_usb_switch usb_config,
-		   enum typec_cc_polarity polarity);
-	bool dfp_only;
-	void *priv_data;
-};
-
-/**
- * struct tcpc_dev - Port configuration and callback functions
- * @config:	Pointer to port configuration
- * @get_vbus:	Called to read current VBUS state
- * @get_current_limit:
- *		Optional; called by the tcpm core when configured as a snk
- *		and cc=Rp-def. This allows the tcpm to provide a fallback
- *		current-limit detection method for the cc=Rp-def case.
- *		For example, some tcpcs may include BC1.2 charger detection
- *		and use that in this case.
- * @set_cc:	Called to set value of CC pins
- * @get_cc:	Called to read current CC pin values
- * @set_polarity:
- *		Called to set polarity
- * @set_vconn:	Called to enable or disable VCONN
- * @set_vbus:	Called to enable or disable VBUS
- * @set_current_limit:
- *		Optional; called to set current limit as negotiated
- *		with partner.
- * @set_pd_rx:	Called to enable or disable reception of PD messages
- * @set_roles:	Called to set power and data roles
- * @start_drp_toggling:
- *		Optional; if supported by hardware, called to start DRP
- *		toggling. DRP toggling is stopped automatically if
- *		a connection is established.
- * @try_role:	Optional; called to set a preferred role
- * @pd_transmit:Called to transmit PD message
- * @mux:	Pointer to multiplexer data
- */
-struct tcpc_dev {
-	const struct tcpc_config *config;
-
-	int (*init)(struct tcpc_dev *dev);
-	int (*get_vbus)(struct tcpc_dev *dev);
-	int (*get_current_limit)(struct tcpc_dev *dev);
-	int (*set_cc)(struct tcpc_dev *dev, enum typec_cc_status cc);
-	int (*get_cc)(struct tcpc_dev *dev, enum typec_cc_status *cc1,
-		      enum typec_cc_status *cc2);
-	int (*set_polarity)(struct tcpc_dev *dev,
-			    enum typec_cc_polarity polarity);
-	int (*set_vconn)(struct tcpc_dev *dev, bool on);
-	int (*set_vbus)(struct tcpc_dev *dev, bool on, bool charge);
-	int (*set_current_limit)(struct tcpc_dev *dev, u32 max_ma, u32 mv);
-	int (*set_pd_rx)(struct tcpc_dev *dev, bool on);
-	int (*set_roles)(struct tcpc_dev *dev, bool attached,
-			 enum typec_role role, enum typec_data_role data);
-	int (*start_drp_toggling)(struct tcpc_dev *dev,
-				  enum typec_cc_status cc);
-	int (*try_role)(struct tcpc_dev *dev, int role);
-	int (*pd_transmit)(struct tcpc_dev *dev, enum tcpm_transmit_type type,
-			   const struct pd_message *msg);
-	struct tcpc_mux_dev *mux;
-};
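
To make the callback contract concrete, here is a rough sketch of how a
port-controller driver might wire up two of the mandatory operations. The
example_* helpers are hypothetical stand-ins for real register access; each
callback returns 0 on success or a negative errno:

	static int example_get_vbus(struct tcpc_dev *dev)
	{
		/* Report whether VBUS is currently present */
		return example_read_status() & EXAMPLE_STATUS_VBUS ? 1 : 0;
	}

	static int example_set_cc(struct tcpc_dev *dev, enum typec_cc_status cc)
	{
		/* Program the requested Rp/Rd/open termination on the CC pins */
		return example_write_cc(cc);
	}

	static struct tcpc_dev example_tcpc = {
		.config = &example_config,
		.get_vbus = example_get_vbus,
		.set_cc = example_set_cc,
		/*
		 * get_cc, set_polarity, set_vconn, set_vbus, set_pd_rx,
		 * set_roles and pd_transmit are mandatory as well; see the
		 * pointer checks at the top of tcpm_register_port().
		 */
	};
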
-
-struct tcpm_port;
-
-struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc);
-void tcpm_unregister_port(struct tcpm_port *port);
-
-void tcpm_update_source_capabilities(struct tcpm_port *port, const u32 *pdo,
-				     unsigned int nr_pdo);
-void tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo,
-				   unsigned int nr_pdo,
-				   unsigned int max_snk_mv,
-				   unsigned int max_snk_ma,
-				   unsigned int max_snk_mw,
-				   unsigned int operating_snk_mw);
-
-void tcpm_vbus_change(struct tcpm_port *port);
-void tcpm_cc_change(struct tcpm_port *port);
-void tcpm_pd_receive(struct tcpm_port *port,
-		     const struct pd_message *msg);
-void tcpm_pd_transmit_complete(struct tcpm_port *port,
-			       enum tcpm_transmit_status status);
-void tcpm_pd_hard_reset(struct tcpm_port *port);
-void tcpm_tcpc_reset(struct tcpm_port *port);
-
-#endif /* __LINUX_USB_TCPM_H */
diff --git a/drivers/usb/typec/Kconfig b/drivers/usb/typec/Kconfig
index bc1b7745f1d4..888605860091 100644
--- a/drivers/usb/typec/Kconfig
+++ b/drivers/usb/typec/Kconfig
@@ -4,6 +4,14 @@ menu "USB Power Delivery and Type-C drivers"
 config TYPEC
 	tristate
 
+config TYPEC_TCPM
+	tristate "USB Type-C Port Controller Manager"
+	depends on USB
+	select TYPEC
+	help
+	  The Type-C Port Controller Manager provides a USB PD and USB Type-C
+	  state machine for use with Type-C Port Controllers.
+
 config TYPEC_WCOVE
 	tristate "Intel WhiskeyCove PMIC USB Type-C PHY driver"
 	depends on ACPI
diff --git a/drivers/usb/typec/Makefile b/drivers/usb/typec/Makefile
index bc214f15f1b5..eb883984724b 100644
--- a/drivers/usb/typec/Makefile
+++ b/drivers/usb/typec/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_TYPEC)		+= typec.o
+obj-$(CONFIG_TYPEC_TCPM)	+= tcpm.o
 obj-$(CONFIG_TYPEC_WCOVE)	+= typec_wcove.o
 obj-$(CONFIG_TYPEC_UCSI)	+= ucsi/
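
Putting the exported interface together before the implementation below: a
low-level driver built on this manager would follow roughly the call flow
sketched here. The example_* names are invented, interrupt decoding and error
handling are elided, and example_tcpc refers to the hypothetical device above:

	static struct tcpm_port *example_port;

	static irqreturn_t example_irq(int irq, void *data)
	{
		/* A real driver would decode its status register first */
		tcpm_cc_change(example_port);	/* CC pins changed */
		tcpm_vbus_change(example_port);	/* VBUS level changed */
		return IRQ_HANDLED;
	}

	static int example_probe(struct platform_device *pdev)
	{
		example_port = tcpm_register_port(&pdev->dev, &example_tcpc);
		return PTR_ERR_OR_ZERO(example_port);
	}

	static int example_remove(struct platform_device *pdev)
	{
		tcpm_unregister_port(example_port);
		return 0;
	}

Received PD messages would be forwarded in the same way via tcpm_pd_receive(),
and transmit completion via tcpm_pd_transmit_complete().
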
diff --git a/drivers/usb/typec/tcpm.c b/drivers/usb/typec/tcpm.c
new file mode 100644
index 000000000000..f557c479fdc2
--- /dev/null
+++ b/drivers/usb/typec/tcpm.c
@@ -0,0 +1,3615 @@
+/*
+ * Copyright 2015-2017 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * USB Power Delivery protocol stack.
+ */
+
+#include <linux/completion.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/sched/clock.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/usb/pd.h>
+#include <linux/usb/pd_bdo.h>
+#include <linux/usb/pd_vdo.h>
+#include <linux/usb/tcpm.h>
+#include <linux/usb/typec.h>
+#include <linux/workqueue.h>
+
+#define FOREACH_STATE(S)			\
+	S(INVALID_STATE),			\
+	S(DRP_TOGGLING),			\
+	S(SRC_UNATTACHED),			\
+	S(SRC_ATTACH_WAIT),			\
+	S(SRC_ATTACHED),			\
+	S(SRC_STARTUP),				\
+	S(SRC_SEND_CAPABILITIES),		\
+	S(SRC_NEGOTIATE_CAPABILITIES),		\
+	S(SRC_TRANSITION_SUPPLY),		\
+	S(SRC_READY),				\
+	S(SRC_WAIT_NEW_CAPABILITIES),		\
+						\
+	S(SNK_UNATTACHED),			\
+	S(SNK_ATTACH_WAIT),			\
+	S(SNK_DEBOUNCED),			\
+	S(SNK_ATTACHED),			\
+	S(SNK_STARTUP),				\
+	S(SNK_DISCOVERY),			\
+	S(SNK_DISCOVERY_DEBOUNCE),		\
+	S(SNK_DISCOVERY_DEBOUNCE_DONE),		\
+	S(SNK_WAIT_CAPABILITIES),		\
+	S(SNK_NEGOTIATE_CAPABILITIES),		\
+	S(SNK_TRANSITION_SINK),			\
+	S(SNK_TRANSITION_SINK_VBUS),		\
+	S(SNK_READY),				\
+						\
+	S(ACC_UNATTACHED),			\
+	S(DEBUG_ACC_ATTACHED),			\
+	S(AUDIO_ACC_ATTACHED),			\
+	S(AUDIO_ACC_DEBOUNCE),			\
+						\
+	S(HARD_RESET_SEND),			\
+	S(HARD_RESET_START),			\
+	S(SRC_HARD_RESET_VBUS_OFF),		\
+	S(SRC_HARD_RESET_VBUS_ON),		\
+	S(SNK_HARD_RESET_SINK_OFF),		\
+	S(SNK_HARD_RESET_WAIT_VBUS),		\
+	S(SNK_HARD_RESET_SINK_ON),		\
+						\
+	S(SOFT_RESET),				\
+	S(SOFT_RESET_SEND),			\
+						\
+	S(DR_SWAP_ACCEPT),			\
+	S(DR_SWAP_SEND),			\
+	S(DR_SWAP_SEND_TIMEOUT),		\
+	S(DR_SWAP_CANCEL),			\
+	S(DR_SWAP_CHANGE_DR),			\
+						\
+	S(PR_SWAP_ACCEPT),			\
+	S(PR_SWAP_SEND),			\
+	S(PR_SWAP_SEND_TIMEOUT),		\
+	S(PR_SWAP_CANCEL),			\
+	S(PR_SWAP_START),			\
+	S(PR_SWAP_SRC_SNK_TRANSITION_OFF),	\
+	S(PR_SWAP_SRC_SNK_SOURCE_OFF),		\
+	S(PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED),	\
+	S(PR_SWAP_SRC_SNK_SINK_ON),		\
+	S(PR_SWAP_SNK_SRC_SINK_OFF),		\
+	S(PR_SWAP_SNK_SRC_SOURCE_ON),		\
+	S(PR_SWAP_SNK_SRC_SOURCE_ON_VBUS_RAMPED_UP),	\
+						\
+	S(VCONN_SWAP_ACCEPT),			\
+	S(VCONN_SWAP_SEND),			\
+	S(VCONN_SWAP_SEND_TIMEOUT),		\
+	S(VCONN_SWAP_CANCEL),			\
+	S(VCONN_SWAP_START),			\
+	S(VCONN_SWAP_WAIT_FOR_VCONN),		\
+	S(VCONN_SWAP_TURN_ON_VCONN),		\
+	S(VCONN_SWAP_TURN_OFF_VCONN),		\
+						\
+	S(SNK_TRY),				\
+	S(SNK_TRY_WAIT),			\
+	S(SNK_TRY_WAIT_DEBOUNCE),		\
+	S(SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS),	\
+	S(SRC_TRYWAIT),				\
+	S(SRC_TRYWAIT_DEBOUNCE),		\
+	S(SRC_TRYWAIT_UNATTACHED),		\
+						\
+	S(SRC_TRY),				\
+	S(SRC_TRY_WAIT),			\
+	S(SRC_TRY_DEBOUNCE),			\
+	S(SNK_TRYWAIT),				\
+	S(SNK_TRYWAIT_DEBOUNCE),		\
+	S(SNK_TRYWAIT_VBUS),			\
+	S(BIST_RX),				\
+						\
+	S(ERROR_RECOVERY),			\
+	S(PORT_RESET),				\
+	S(PORT_RESET_WAIT_OFF)
+
+#define GENERATE_ENUM(e)	e
+#define GENERATE_STRING(s)	#s
+
+enum tcpm_state {
+	FOREACH_STATE(GENERATE_ENUM)
+};
+
+static const char * const tcpm_states[] = {
+	FOREACH_STATE(GENERATE_STRING)
+};
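
The FOREACH_STATE()/GENERATE_ENUM/GENERATE_STRING trio above is the classic
X-macro pattern: the state list is written once and expanded twice, so the enum
and the name table used for logging cannot drift apart. A standalone
illustration of the same technique (names invented):

	#include <stdio.h>

	#define FOREACH_COLOR(C) C(RED), C(GREEN), C(BLUE)
	#define AS_ENUM(x) x
	#define AS_NAME(x) #x

	enum color { FOREACH_COLOR(AS_ENUM) };
	static const char * const color_names[] = { FOREACH_COLOR(AS_NAME) };

	int main(void)
	{
		/* Prints "GREEN"; adding a color updates both expansions */
		printf("%s\n", color_names[GREEN]);
		return 0;
	}
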
+
+enum vdm_states {
+	VDM_STATE_ERR_BUSY = -3,
+	VDM_STATE_ERR_SEND = -2,
+	VDM_STATE_ERR_TMOUT = -1,
+	VDM_STATE_DONE = 0,
+	/* Anything >0 represents an active state */
+	VDM_STATE_READY = 1,
+	VDM_STATE_BUSY = 2,
+	VDM_STATE_WAIT_RSP_BUSY = 3,
+};
+
+enum pd_msg_request {
+	PD_MSG_NONE = 0,
+	PD_MSG_CTRL_REJECT,
+	PD_MSG_CTRL_WAIT,
+	PD_MSG_DATA_SINK_CAP,
+	PD_MSG_DATA_SOURCE_CAP,
+};
+
+/* Events from low level driver */
+
+#define TCPM_CC_EVENT		BIT(0)
+#define TCPM_VBUS_EVENT		BIT(1)
+#define TCPM_RESET_EVENT	BIT(2)
+
+#define LOG_BUFFER_ENTRIES	1024
+#define LOG_BUFFER_ENTRY_SIZE	128
+
+/* Alternate mode support */
+
+#define SVID_DISCOVERY_MAX	16
+
+struct pd_mode_data {
+	int svid_index;		/* current SVID index		*/
+	int nsvids;
+	u16 svids[SVID_DISCOVERY_MAX];
+	int altmodes;		/* number of alternate modes	*/
+	struct typec_altmode_desc altmode_desc[SVID_DISCOVERY_MAX];
+};
+
+struct tcpm_port {
+	struct device *dev;
+
+	struct mutex lock;		/* tcpm state machine lock */
+	struct workqueue_struct *wq;
+
+	struct typec_capability typec_caps;
+	struct typec_port *typec_port;
+
+	struct tcpc_dev *tcpc;
+
+	enum typec_role vconn_role;
+	enum typec_role pwr_role;
+	enum typec_data_role data_role;
+	enum typec_pwr_opmode pwr_opmode;
+
+	struct usb_pd_identity partner_ident;
+	struct typec_partner_desc partner_desc;
+	struct typec_partner *partner;
+
+	enum typec_cc_status cc_req;
+
+	enum typec_cc_status cc1;
+	enum typec_cc_status cc2;
+	enum typec_cc_polarity polarity;
+
+	bool attached;
+	bool connected;
+	enum typec_port_type port_type;
+	bool vbus_present;
+	bool vbus_never_low;
+	bool vbus_source;
+	bool vbus_charge;
+
+	bool send_discover;
+	bool op_vsafe5v;
+
+	int try_role;
+	int try_snk_count;
+	int try_src_count;
+
+	enum pd_msg_request queued_message;
+
+	enum tcpm_state enter_state;
+	enum tcpm_state prev_state;
+	enum tcpm_state state;
+	enum tcpm_state delayed_state;
+	unsigned long delayed_runtime;
+	unsigned long delay_ms;
+
+	spinlock_t pd_event_lock;
+	u32 pd_events;
+
+	struct work_struct event_work;
+	struct delayed_work state_machine;
+	struct delayed_work vdm_state_machine;
+	bool state_machine_running;
+
+	struct completion tx_complete;
+	enum tcpm_transmit_status tx_status;
+
+	struct mutex swap_lock;		/* swap command lock */
+	bool swap_pending;
+	bool non_pd_role_swap;
+	struct completion swap_complete;
+	int swap_status;
+
+	unsigned int message_id;
+	unsigned int caps_count;
+	unsigned int hard_reset_count;
+	bool pd_capable;
+	bool explicit_contract;
+	unsigned int rx_msgid;
+
+	/* Partner capabilities/requests */
+	u32 sink_request;
+	u32 source_caps[PDO_MAX_OBJECTS];
+	unsigned int nr_source_caps;
+	u32 sink_caps[PDO_MAX_OBJECTS];
+	unsigned int nr_sink_caps;
+
+	/* Local capabilities */
+	u32 src_pdo[PDO_MAX_OBJECTS];
+	unsigned int nr_src_pdo;
+	u32 snk_pdo[PDO_MAX_OBJECTS];
+	unsigned int nr_snk_pdo;
+	u32 snk_vdo[VDO_MAX_OBJECTS];
+	unsigned int nr_snk_vdo;
+
+	unsigned int max_snk_mv;
+	unsigned int max_snk_ma;
+	unsigned int max_snk_mw;
+	unsigned int operating_snk_mw;
+
+	/* Requested current / voltage */
+	u32 current_limit;
+	u32 supply_voltage;
+
+	u32 bist_request;
+
+	/* PD state for Vendor Defined Messages */
+	enum vdm_states vdm_state;
+	u32 vdm_retries;
+	/* next Vendor Defined Message to send */
+	u32 vdo_data[VDO_MAX_SIZE];
+	u8 vdo_count;
+	/* VDO to retry if UFP responder replied busy */
+	u32 vdo_retry;
+
+	/* Alternate mode data */
+
+	struct pd_mode_data mode_data;
+	struct typec_altmode *partner_altmode[SVID_DISCOVERY_MAX];
+	struct typec_altmode *port_altmode[SVID_DISCOVERY_MAX];
+
+	/* Deadline in jiffies to exit src_try_wait state */
+	unsigned long max_wait;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+	struct mutex logbuffer_lock;	/* log buffer access lock */
+	int logbuffer_head;
+	int logbuffer_tail;
+	u8 *logbuffer[LOG_BUFFER_ENTRIES];
+#endif
+};
+
+struct pd_rx_event {
+	struct work_struct work;
+	struct tcpm_port *port;
+	struct pd_message msg;
+};
+
+#define tcpm_cc_is_sink(cc) \
+	((cc) == TYPEC_CC_RP_DEF || (cc) == TYPEC_CC_RP_1_5 || \
+	 (cc) == TYPEC_CC_RP_3_0)
+
+#define tcpm_port_is_sink(port) \
+	((tcpm_cc_is_sink((port)->cc1) && !tcpm_cc_is_sink((port)->cc2)) || \
+	 (tcpm_cc_is_sink((port)->cc2) && !tcpm_cc_is_sink((port)->cc1)))
+
+#define tcpm_cc_is_source(cc) ((cc) == TYPEC_CC_RD)
+#define tcpm_cc_is_audio(cc) ((cc) == TYPEC_CC_RA)
+#define tcpm_cc_is_open(cc) ((cc) == TYPEC_CC_OPEN)
+
+#define tcpm_port_is_source(port) \
+	((tcpm_cc_is_source((port)->cc1) && \
+	 !tcpm_cc_is_source((port)->cc2)) || \
+	 (tcpm_cc_is_source((port)->cc2) && \
+	  !tcpm_cc_is_source((port)->cc1)))
+
+#define tcpm_port_is_debug(port) \
+	(tcpm_cc_is_source((port)->cc1) && tcpm_cc_is_source((port)->cc2))
+
+#define tcpm_port_is_audio(port) \
+	(tcpm_cc_is_audio((port)->cc1) && tcpm_cc_is_audio((port)->cc2))
+
+#define tcpm_port_is_audio_detached(port) \
+	((tcpm_cc_is_audio((port)->cc1) && tcpm_cc_is_open((port)->cc2)) || \
+	 (tcpm_cc_is_audio((port)->cc2) && tcpm_cc_is_open((port)->cc1)))
+
+#define tcpm_try_snk(port) \
+	((port)->try_snk_count == 0 && (port)->try_role == TYPEC_SINK && \
+	 (port)->port_type == TYPEC_PORT_DRP)
+
+#define tcpm_try_src(port) \
+	((port)->try_src_count == 0 && (port)->try_role == TYPEC_SOURCE && \
+	 (port)->port_type == TYPEC_PORT_DRP)
+
+static enum tcpm_state tcpm_default_state(struct tcpm_port *port)
+{
+	if (port->port_type == TYPEC_PORT_DRP) {
+		if (port->try_role == TYPEC_SINK)
+			return SNK_UNATTACHED;
+		else if (port->try_role == TYPEC_SOURCE)
+			return SRC_UNATTACHED;
+		else if (port->tcpc->config->default_role == TYPEC_SINK)
+			return SNK_UNATTACHED;
+		/* Fall through to return SRC_UNATTACHED */
+	} else if (port->port_type == TYPEC_PORT_UFP) {
+		return SNK_UNATTACHED;
+	}
+	return SRC_UNATTACHED;
+}
+
+static inline
+struct tcpm_port *typec_cap_to_tcpm(const struct typec_capability *cap)
+{
+	return container_of(cap, struct tcpm_port, typec_caps);
+}
+
+static bool tcpm_port_is_disconnected(struct tcpm_port *port)
+{
+	return (!port->attached && port->cc1 == TYPEC_CC_OPEN &&
+		port->cc2 == TYPEC_CC_OPEN) ||
+	       (port->attached && ((port->polarity == TYPEC_POLARITY_CC1 &&
+				    port->cc1 == TYPEC_CC_OPEN) ||
+				   (port->polarity == TYPEC_POLARITY_CC2 &&
+				    port->cc2 == TYPEC_CC_OPEN)));
+}
+
+/*
+ * Logging
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+static bool tcpm_log_full(struct tcpm_port *port)
+{
+	return port->logbuffer_tail ==
+		(port->logbuffer_head + 1) % LOG_BUFFER_ENTRIES;
+}
+
+__printf(2, 0)
+static void _tcpm_log(struct tcpm_port *port, const char *fmt, va_list args)
+{
+	char tmpbuffer[LOG_BUFFER_ENTRY_SIZE];
+	u64 ts_nsec = local_clock();
+	unsigned long rem_nsec;
+
+	if (!port->logbuffer[port->logbuffer_head]) {
+		port->logbuffer[port->logbuffer_head] =
+			kzalloc(LOG_BUFFER_ENTRY_SIZE, GFP_KERNEL);
+		if (!port->logbuffer[port->logbuffer_head])
+			return;
+	}
+
+	vsnprintf(tmpbuffer, sizeof(tmpbuffer), fmt, args);
+
+	mutex_lock(&port->logbuffer_lock);
+
+	if (tcpm_log_full(port)) {
+		port->logbuffer_head = max(port->logbuffer_head - 1, 0);
+		strcpy(tmpbuffer, "overflow");
+	}
+
+	if (port->logbuffer_head < 0 ||
+	    port->logbuffer_head >= LOG_BUFFER_ENTRIES) {
+		dev_warn(port->dev,
+			 "Bad log buffer index %d\n", port->logbuffer_head);
+		goto abort;
+	}
+
+	if (!port->logbuffer[port->logbuffer_head]) {
+		dev_warn(port->dev,
+			 "Log buffer index %d is NULL\n", port->logbuffer_head);
+		goto abort;
+	}
+
+	rem_nsec = do_div(ts_nsec, 1000000000);
+	scnprintf(port->logbuffer[port->logbuffer_head],
+		  LOG_BUFFER_ENTRY_SIZE, "[%5lu.%06lu] %s",
+		  (unsigned long)ts_nsec, rem_nsec / 1000,
+		  tmpbuffer);
+	port->logbuffer_head = (port->logbuffer_head + 1) % LOG_BUFFER_ENTRIES;
+
+abort:
+	mutex_unlock(&port->logbuffer_lock);
+}
+
+__printf(2, 3)
+static void tcpm_log(struct tcpm_port *port, const char *fmt, ...)
+{
+	va_list args;
+
+	/* Do not log while disconnected and unattached */
+	if (tcpm_port_is_disconnected(port) &&
+	    (port->state == SRC_UNATTACHED || port->state == SNK_UNATTACHED ||
+	     port->state == DRP_TOGGLING))
+		return;
+
+	va_start(args, fmt);
+	_tcpm_log(port, fmt, args);
+	va_end(args);
+}
+
+__printf(2, 3)
+static void tcpm_log_force(struct tcpm_port *port, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	_tcpm_log(port, fmt, args);
+	va_end(args);
+}
+
+static void tcpm_log_source_caps(struct tcpm_port *port)
+{
+	int i;
+
+	for (i = 0; i < port->nr_source_caps; i++) {
+		u32 pdo = port->source_caps[i];
+		enum pd_pdo_type type = pdo_type(pdo);
+		char msg[64];
+
+		switch (type) {
+		case PDO_TYPE_FIXED:
+			scnprintf(msg, sizeof(msg),
+				  "%u mV, %u mA [%s%s%s%s%s%s]",
+				  pdo_fixed_voltage(pdo),
+				  pdo_max_current(pdo),
+				  (pdo & PDO_FIXED_DUAL_ROLE) ?
+							"R" : "",
+				  (pdo & PDO_FIXED_SUSPEND) ?
+							"S" : "",
+				  (pdo & PDO_FIXED_HIGHER_CAP) ?
+							"H" : "",
+				  (pdo & PDO_FIXED_USB_COMM) ?
+							"U" : "",
+				  (pdo & PDO_FIXED_DATA_SWAP) ?
+							"D" : "",
+				  (pdo & PDO_FIXED_EXTPOWER) ?
+							"E" : "");
+			break;
+		case PDO_TYPE_VAR:
+			scnprintf(msg, sizeof(msg),
+				  "%u-%u mV, %u mA",
+				  pdo_min_voltage(pdo),
+				  pdo_max_voltage(pdo),
+				  pdo_max_current(pdo));
+			break;
+		case PDO_TYPE_BATT:
+			scnprintf(msg, sizeof(msg),
+				  "%u-%u mV, %u mW",
+				  pdo_min_voltage(pdo),
+				  pdo_max_voltage(pdo),
+				  pdo_max_power(pdo));
+			break;
+		default:
+			strcpy(msg, "undefined");
+			break;
+		}
+		tcpm_log(port, " PDO %d: type %d, %s",
+			 i, type, msg);
+	}
+}
+
+static int tcpm_seq_show(struct seq_file *s, void *v)
+{
+	struct tcpm_port *port = (struct tcpm_port *)s->private;
+	int tail;
+
+	mutex_lock(&port->logbuffer_lock);
+	tail = port->logbuffer_tail;
+	while (tail != port->logbuffer_head) {
+		seq_printf(s, "%s\n", port->logbuffer[tail]);
+		tail = (tail + 1) % LOG_BUFFER_ENTRIES;
+	}
+	if (!seq_has_overflowed(s))
+		port->logbuffer_tail = tail;
+	mutex_unlock(&port->logbuffer_lock);
+
+	return 0;
+}
+
+static int tcpm_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, tcpm_seq_show, inode->i_private);
+}
+
+static const struct file_operations tcpm_debug_operations = {
+	.open		= tcpm_debug_open,
+	.llseek		= seq_lseek,
+	.read		= seq_read,
+	.release	= single_release,
+};
+
+static struct dentry *rootdir;
+
+static int tcpm_debugfs_init(struct tcpm_port *port)
+{
+	mutex_init(&port->logbuffer_lock);
+	/* /sys/kernel/debug/tcpm/usbcX */
+	if (!rootdir) {
+		rootdir = debugfs_create_dir("tcpm", NULL);
+		if (!rootdir)
+			return -ENOMEM;
+	}
+
+	port->dentry = debugfs_create_file(dev_name(port->dev),
+					   S_IFREG | 0444, rootdir,
+					   port, &tcpm_debug_operations);
+
+	return 0;
+}
+
+static void tcpm_debugfs_exit(struct tcpm_port *port)
+{
+	debugfs_remove(port->dentry);
+}
+
+#else
+
+__printf(2, 3)
+static void tcpm_log(const struct tcpm_port *port, const char *fmt, ...) { }
+__printf(2, 3)
+static void tcpm_log_force(struct tcpm_port *port, const char *fmt, ...) { }
+static void tcpm_log_source_caps(struct tcpm_port *port) { }
+static int tcpm_debugfs_init(const struct tcpm_port *port) { return 0; }
+static void tcpm_debugfs_exit(const struct tcpm_port *port) { }
+
+#endif
+
+static int tcpm_pd_transmit(struct tcpm_port *port,
+			    enum tcpm_transmit_type type,
+			    const struct pd_message *msg)
+{
+	unsigned long timeout;
+	int ret;
+
+	if (msg)
+		tcpm_log(port, "PD TX, header: %#x", le16_to_cpu(msg->header));
+	else
+		tcpm_log(port, "PD TX, type: %#x", type);
+
+	reinit_completion(&port->tx_complete);
+	ret = port->tcpc->pd_transmit(port->tcpc, type, msg);
+	if (ret < 0)
+		return ret;
+
+	mutex_unlock(&port->lock);
+	timeout = wait_for_completion_timeout(&port->tx_complete,
+				msecs_to_jiffies(PD_T_TCPC_TX_TIMEOUT));
+	mutex_lock(&port->lock);
+	if (!timeout)
+		return -ETIMEDOUT;
+
+	switch (port->tx_status) {
+	case TCPC_TX_SUCCESS:
+		port->message_id = (port->message_id + 1) & PD_HEADER_ID_MASK;
+		return 0;
+	case TCPC_TX_DISCARDED:
+		return -EAGAIN;
+	case TCPC_TX_FAILED:
+	default:
+		return -EIO;
+	}
+}
+
+void tcpm_pd_transmit_complete(struct tcpm_port *port,
+			       enum tcpm_transmit_status status)
+{
+	tcpm_log(port, "PD TX complete, status: %u", status);
+	port->tx_status = status;
+	complete(&port->tx_complete);
+}
+EXPORT_SYMBOL_GPL(tcpm_pd_transmit_complete);
+
+static int tcpm_mux_set(struct tcpm_port *port, enum tcpc_mux_mode mode,
+			enum tcpc_usb_switch config)
+{
+	int ret = 0;
+
+	tcpm_log(port, "Requesting mux mode %d, config %d, polarity %d",
+		 mode, config, port->polarity);
+
+	if (port->tcpc->mux)
+		ret = port->tcpc->mux->set(port->tcpc->mux, mode, config,
+					   port->polarity);
+
+	return ret;
+}
+
+static int tcpm_set_polarity(struct tcpm_port *port,
+			     enum typec_cc_polarity polarity)
+{
+	int ret;
+
+	tcpm_log(port, "polarity %d", polarity);
+
+	ret = port->tcpc->set_polarity(port->tcpc, polarity);
+	if (ret < 0)
+		return ret;
+
+	port->polarity = polarity;
+
+	return 0;
+}
+
+static int tcpm_set_vconn(struct tcpm_port *port, bool enable)
+{
+	int ret;
+
+	tcpm_log(port, "vconn:=%d", enable);
+
+	ret = port->tcpc->set_vconn(port->tcpc, enable);
+	if (!ret) {
+		port->vconn_role = enable ? TYPEC_SOURCE : TYPEC_SINK;
+		typec_set_vconn_role(port->typec_port, port->vconn_role);
+	}
+
+	return ret;
+}
+
+static u32 tcpm_get_current_limit(struct tcpm_port *port)
+{
+	enum typec_cc_status cc;
+	u32 limit;
+
+	cc = port->polarity ? port->cc2 : port->cc1;
+	switch (cc) {
+	case TYPEC_CC_RP_1_5:
+		limit = 1500;
+		break;
+	case TYPEC_CC_RP_3_0:
+		limit = 3000;
+		break;
+	case TYPEC_CC_RP_DEF:
+	default:
+		if (port->tcpc->get_current_limit)
+			limit = port->tcpc->get_current_limit(port->tcpc);
+		else
+			limit = 0;
+		break;
+	}
+
+	return limit;
+}
+
+static int tcpm_set_current_limit(struct tcpm_port *port, u32 max_ma, u32 mv)
+{
+	int ret = -EOPNOTSUPP;
+
+	tcpm_log(port, "Setting voltage/current limit %u mV %u mA", mv, max_ma);
+
+	if (port->tcpc->set_current_limit)
+		ret = port->tcpc->set_current_limit(port->tcpc, max_ma, mv);
+
+	return ret;
+}
+
+/*
+ * Determine RP value to set based on maximum current supported
+ * by a port if configured as source.
+ * Returns CC value to report to link partner.
+ */
+static enum typec_cc_status tcpm_rp_cc(struct tcpm_port *port)
+{
+	const u32 *src_pdo = port->src_pdo;
+	int nr_pdo = port->nr_src_pdo;
+	int i;
+
+	/*
+	 * Search for first entry with matching voltage.
+	 * It should report the maximum supported current.
+	 */
+	for (i = 0; i < nr_pdo; i++) {
+		const u32 pdo = src_pdo[i];
+
+		if (pdo_type(pdo) == PDO_TYPE_FIXED &&
+		    pdo_fixed_voltage(pdo) == 5000) {
+			unsigned int curr = pdo_max_current(pdo);
+
+			if (curr >= 3000)
+				return TYPEC_CC_RP_3_0;
+			else if (curr >= 1500)
+				return TYPEC_CC_RP_1_5;
+			return TYPEC_CC_RP_DEF;
+		}
+	}
+
+	return TYPEC_CC_RP_DEF;
+}
+
+static int tcpm_set_attached_state(struct tcpm_port *port, bool attached)
+{
+	return port->tcpc->set_roles(port->tcpc, attached, port->pwr_role,
+				     port->data_role);
+}
+
+static int tcpm_set_roles(struct tcpm_port *port, bool attached,
+			  enum typec_role role, enum typec_data_role data)
+{
+	int ret;
+
+	if (data == TYPEC_HOST)
+		ret = tcpm_mux_set(port, TYPEC_MUX_USB,
+				   TCPC_USB_SWITCH_CONNECT);
+	else
+		ret = tcpm_mux_set(port, TYPEC_MUX_NONE,
+				   TCPC_USB_SWITCH_DISCONNECT);
+	if (ret < 0)
+		return ret;
+
+	ret = port->tcpc->set_roles(port->tcpc, attached, role, data);
+	if (ret < 0)
+		return ret;
+
+	port->pwr_role = role;
+	port->data_role = data;
+	typec_set_data_role(port->typec_port, data);
+	typec_set_pwr_role(port->typec_port, role);
+
+	return 0;
+}
+
+static int tcpm_set_pwr_role(struct tcpm_port *port, enum typec_role role)
+{
+	int ret;
+
+	ret = port->tcpc->set_roles(port->tcpc, true, role,
+				    port->data_role);
+	if (ret < 0)
+		return ret;
+
+	port->pwr_role = role;
+	typec_set_pwr_role(port->typec_port, role);
+
+	return 0;
+}
+
+static int tcpm_pd_send_source_caps(struct tcpm_port *port)
+{
+	struct pd_message msg;
+	int i;
+
+	memset(&msg, 0, sizeof(msg));
+	if (!port->nr_src_pdo) {
+		/* No source capabilities defined, sink only */
+		msg.header = PD_HEADER_LE(PD_CTRL_REJECT,
+					  port->pwr_role,
+					  port->data_role,
+					  port->message_id, 0);
+	} else {
+		msg.header = PD_HEADER_LE(PD_DATA_SOURCE_CAP,
+					  port->pwr_role,
+					  port->data_role,
+					  port->message_id,
+					  port->nr_src_pdo);
+	}
+	for (i = 0; i < port->nr_src_pdo; i++)
+		msg.payload[i] = cpu_to_le32(port->src_pdo[i]);
+
+	return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg);
+}
+
+static int tcpm_pd_send_sink_caps(struct tcpm_port *port)
+{
+	struct pd_message msg;
+	int i;
+
+	memset(&msg, 0, sizeof(msg));
+	if (!port->nr_snk_pdo) {
+		/* No sink capabilities defined, source only */
+		msg.header = PD_HEADER_LE(PD_CTRL_REJECT,
+					  port->pwr_role,
+					  port->data_role,
+					  port->message_id, 0);
+	} else {
+		msg.header = PD_HEADER_LE(PD_DATA_SINK_CAP,
+					  port->pwr_role,
+					  port->data_role,
+					  port->message_id,
+					  port->nr_snk_pdo);
+	}
+	for (i = 0; i < port->nr_snk_pdo; i++)
+		msg.payload[i] = cpu_to_le32(port->snk_pdo[i]);
+
+	return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg);
+}
+
+static void tcpm_set_state(struct tcpm_port *port, enum tcpm_state state,
+			   unsigned int delay_ms)
+{
+	if (delay_ms) {
+		tcpm_log(port, "pending state change %s -> %s @ %u ms",
+			 tcpm_states[port->state], tcpm_states[state],
+			 delay_ms);
+		port->delayed_state = state;
+		mod_delayed_work(port->wq, &port->state_machine,
+				 msecs_to_jiffies(delay_ms));
+		port->delayed_runtime = jiffies + msecs_to_jiffies(delay_ms);
+		port->delay_ms = delay_ms;
+	} else {
+		tcpm_log(port, "state change %s -> %s",
+			 tcpm_states[port->state], tcpm_states[state]);
+		port->delayed_state = INVALID_STATE;
+		port->prev_state = port->state;
+		port->state = state;
+		/*
+		 * Don't re-queue the state machine work item if we're currently
+		 * in the state machine and we're immediately changing states.
+		 * tcpm_state_machine_work() will continue running the state
+		 * machine.
+		 */
+		if (!port->state_machine_running)
+			mod_delayed_work(port->wq, &port->state_machine, 0);
+	}
+}
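
tcpm_set_state() above carries the whole deferred-transition mechanism: an
immediate change re-kicks the work item with zero delay, while a timed change
arms it with the requested timeout and records the deadline. Reduced to its
skeleton, the pattern looks like this (an illustrative sketch with invented
names, not the tcpm code itself):

	struct example_fsm {
		struct workqueue_struct *wq;
		struct delayed_work work;	/* runs the state machine */
		int state, delayed_state;
	};

	static void example_set_state(struct example_fsm *fsm, int state,
				      unsigned int delay_ms)
	{
		if (delay_ms) {
			/* The commit happens when the work item fires */
			fsm->delayed_state = state;
			mod_delayed_work(fsm->wq, &fsm->work,
					 msecs_to_jiffies(delay_ms));
		} else {
			fsm->state = state;
			mod_delayed_work(fsm->wq, &fsm->work, 0);
		}
	}
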
+
+static void tcpm_set_state_cond(struct tcpm_port *port, enum tcpm_state state,
+				unsigned int delay_ms)
+{
+	if (port->enter_state == port->state)
+		tcpm_set_state(port, state, delay_ms);
+	else
+		tcpm_log(port,
+			 "skipped %sstate change %s -> %s [%u ms], context state %s",
+			 delay_ms ? "delayed " : "",
+			 tcpm_states[port->state], tcpm_states[state],
+			 delay_ms, tcpm_states[port->enter_state]);
+}
+
+static void tcpm_queue_message(struct tcpm_port *port,
+			       enum pd_msg_request message)
+{
+	port->queued_message = message;
+	mod_delayed_work(port->wq, &port->state_machine, 0);
+}
+
+/*
+ * VDM/VDO handling functions
+ */
+static void tcpm_queue_vdm(struct tcpm_port *port, const u32 header,
+			   const u32 *data, int cnt)
+{
+	port->vdo_count = cnt + 1;
+	port->vdo_data[0] = header;
+	memcpy(&port->vdo_data[1], data, sizeof(u32) * cnt);
+	/* Set ready, vdm state machine will actually send */
+	port->vdm_retries = 0;
+	port->vdm_state = VDM_STATE_READY;
+}
+
+static void svdm_consume_identity(struct tcpm_port *port, const __le32 *payload,
+				  int cnt)
+{
+	u32 vdo = le32_to_cpu(payload[VDO_INDEX_IDH]);
+	u32 product = le32_to_cpu(payload[VDO_INDEX_PRODUCT]);
+
+	memset(&port->mode_data, 0, sizeof(port->mode_data));
+
+	port->partner_ident.id_header = vdo;
+	port->partner_ident.cert_stat = le32_to_cpu(payload[VDO_INDEX_CSTAT]);
+	port->partner_ident.product = product;
+
+	typec_partner_set_identity(port->partner);
+
+	tcpm_log(port, "Identity: %04x:%04x.%04x",
+		 PD_IDH_VID(vdo),
+		 PD_PRODUCT_PID(product), product & 0xffff);
+}
+
+static bool svdm_consume_svids(struct tcpm_port *port, const __le32 *payload,
+			       int cnt)
+{
+	struct pd_mode_data *pmdata = &port->mode_data;
+	int i;
+
+	for (i = 1; i < cnt; i++) {
+		u32 p = le32_to_cpu(payload[i]);
+		u16 svid;
+
+		svid = (p >> 16) & 0xffff;
+		if (!svid)
+			return false;
+
+		if (pmdata->nsvids >= SVID_DISCOVERY_MAX)
+			goto abort;
+
+		pmdata->svids[pmdata->nsvids++] = svid;
+		tcpm_log(port, "SVID %d: 0x%x", pmdata->nsvids, svid);
+
+		svid = p & 0xffff;
+		if (!svid)
+			return false;
+
+		if (pmdata->nsvids >= SVID_DISCOVERY_MAX)
+			goto abort;
+
+		pmdata->svids[pmdata->nsvids++] = svid;
+		tcpm_log(port, "SVID %d: 0x%x", pmdata->nsvids, svid);
+	}
+	return true;
+abort:
+	tcpm_log(port, "SVID_DISCOVERY_MAX(%d) too low!", SVID_DISCOVERY_MAX);
+	return false;
+}
+
+static void svdm_consume_modes(struct tcpm_port *port, const __le32 *payload,
+			       int cnt)
+{
+	struct pd_mode_data *pmdata = &port->mode_data;
+	struct typec_altmode_desc *paltmode;
+	struct typec_mode_desc *pmode;
+	int i;
+
+	if (pmdata->altmodes >= ARRAY_SIZE(port->partner_altmode)) {
+		/* Already logged in svdm_consume_svids() */
+		return;
+	}
+
+	paltmode = &pmdata->altmode_desc[pmdata->altmodes];
+	memset(paltmode, 0, sizeof(*paltmode));
+
+	paltmode->svid = pmdata->svids[pmdata->svid_index];
+
+	tcpm_log(port, " Alternate mode %d: SVID 0x%04x",
+		 pmdata->altmodes, paltmode->svid);
+
+	for (i = 1; i < cnt && paltmode->n_modes < ALTMODE_MAX_MODES; i++) {
+		pmode = &paltmode->modes[paltmode->n_modes];
+		memset(pmode, 0, sizeof(*pmode));
+		pmode->vdo = le32_to_cpu(payload[i]);
+		pmode->index = i - 1;
+		paltmode->n_modes++;
+		tcpm_log(port, "  VDO %d: 0x%08x",
+			 pmode->index, pmode->vdo);
+	}
+	port->partner_altmode[pmdata->altmodes] =
+		typec_partner_register_altmode(port->partner, paltmode);
+	if (port->partner_altmode[pmdata->altmodes] == NULL) {
+		tcpm_log(port,
+			 "Failed to register alternate modes for SVID 0x%04x",
+			 paltmode->svid);
+		return;
+	}
+	pmdata->altmodes++;
+}
+
+#define supports_modal(port)	PD_IDH_MODAL_SUPP((port)->partner_ident.id_header)
+
+static int tcpm_pd_svdm(struct tcpm_port *port, const __le32 *payload, int cnt,
+			u32 *response)
+{
+	u32 p0 = le32_to_cpu(payload[0]);
+	int cmd_type = PD_VDO_CMDT(p0);
+	int cmd = PD_VDO_CMD(p0);
+	struct pd_mode_data *modep;
+	int rlen = 0;
+	u16 svid;
+	int i;
+
+	tcpm_log(port, "Rx VDM cmd 0x%x type %d cmd %d len %d",
+		 p0, cmd_type, cmd, cnt);
+
+	modep = &port->mode_data;
+
+	switch (cmd_type) {
+	case CMDT_INIT:
+		switch (cmd) {
+		case CMD_DISCOVER_IDENT:
+			/* 6.4.4.3.1: Only respond as UFP (device) */
+			if (port->data_role == TYPEC_DEVICE &&
+			    port->nr_snk_vdo) {
+				for (i = 0; i < port->nr_snk_vdo; i++)
+					response[i + 1] = port->snk_vdo[i];
+				rlen = port->nr_snk_vdo + 1;
+			}
+			break;
+		case CMD_DISCOVER_SVID:
+			break;
+		case CMD_DISCOVER_MODES:
+			break;
+		case CMD_ENTER_MODE:
+			break;
+		case CMD_EXIT_MODE:
+			break;
+		case CMD_ATTENTION:
+			break;
+		default:
+			break;
+		}
+		if (rlen >= 1) {
+			response[0] = p0 | VDO_CMDT(CMDT_RSP_ACK);
+		} else if (rlen == 0) {
+			response[0] = p0 | VDO_CMDT(CMDT_RSP_NAK);
+			rlen = 1;
+		} else {
+			response[0] = p0 | VDO_CMDT(CMDT_RSP_BUSY);
+			rlen = 1;
+		}
+		break;
+	case CMDT_RSP_ACK:
+		/* silently drop message if we are not connected */
+		if (!port->partner)
+			break;
+
+		switch (cmd) {
+		case CMD_DISCOVER_IDENT:
+			/* 6.4.4.3.1 */
+			svdm_consume_identity(port, payload, cnt);
+			response[0] = VDO(USB_SID_PD, 1, CMD_DISCOVER_SVID);
+			rlen = 1;
+			break;
+		case CMD_DISCOVER_SVID:
+			/* 6.4.4.3.2 */
+			if (svdm_consume_svids(port, payload, cnt)) {
+				response[0] = VDO(USB_SID_PD, 1,
+						  CMD_DISCOVER_SVID);
+				rlen = 1;
+			} else if (modep->nsvids && supports_modal(port)) {
+				response[0] = VDO(modep->svids[0], 1,
+						  CMD_DISCOVER_MODES);
+				rlen = 1;
+			}
+			break;
+		case CMD_DISCOVER_MODES:
+			/* 6.4.4.3.3 */
+			svdm_consume_modes(port, payload, cnt);
+			modep->svid_index++;
+			if (modep->svid_index < modep->nsvids) {
+				svid = modep->svids[modep->svid_index];
+				response[0] = VDO(svid, 1, CMD_DISCOVER_MODES);
+				rlen = 1;
+			} else {
+				/* enter alternate mode if/when implemented */
+			}
+			break;
+		case CMD_ENTER_MODE:
+			break;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return rlen;
+}
+
+static void tcpm_handle_vdm_request(struct tcpm_port *port,
+				    const __le32 *payload, int cnt)
+{
+	int rlen = 0;
+	u32 response[8] = { };
+	u32 p0 = le32_to_cpu(payload[0]);
+
+	if (port->vdm_state == VDM_STATE_BUSY) {
+		/* If UFP responded busy retry after timeout */
+		if (PD_VDO_CMDT(p0) == CMDT_RSP_BUSY) {
+			port->vdm_state = VDM_STATE_WAIT_RSP_BUSY;
+			port->vdo_retry = (p0 & ~VDO_CMDT_MASK) |
+				CMDT_INIT;
+			mod_delayed_work(port->wq, &port->vdm_state_machine,
+					 msecs_to_jiffies(PD_T_VDM_BUSY));
+			return;
+		}
+		port->vdm_state = VDM_STATE_DONE;
+	}
+
+	if (PD_VDO_SVDM(p0))
+		rlen = tcpm_pd_svdm(port, payload, cnt, response);
+
+	if (rlen > 0) {
+		tcpm_queue_vdm(port, response[0], &response[1], rlen - 1);
+		mod_delayed_work(port->wq, &port->vdm_state_machine, 0);
+	}
+}
+
+static void tcpm_send_vdm(struct tcpm_port *port, u32 vid, int cmd,
+			  const u32 *data, int count)
+{
+	u32 header;
+
+	if (WARN_ON(count > VDO_MAX_SIZE - 1))
+		count = VDO_MAX_SIZE - 1;
+
+	/* set VDM header with VID & CMD */
+	header = VDO(vid, ((vid & USB_SID_PD) == USB_SID_PD) ?
+			1 : (PD_VDO_CMD(cmd) <= CMD_ATTENTION), cmd);
+	tcpm_queue_vdm(port, header, data, count);
+
+	mod_delayed_work(port->wq, &port->vdm_state_machine, 0);
+}
+
+static unsigned int vdm_ready_timeout(u32 vdm_hdr)
+{
+	unsigned int timeout;
+	int cmd = PD_VDO_CMD(vdm_hdr);
+
+	/* it's not a structured VDM command */
+	if (!PD_VDO_SVDM(vdm_hdr))
+		return PD_T_VDM_UNSTRUCTURED;
+
+	switch (PD_VDO_CMDT(vdm_hdr)) {
+	case CMDT_INIT:
+		if (cmd == CMD_ENTER_MODE || cmd == CMD_EXIT_MODE)
+			timeout = PD_T_VDM_WAIT_MODE_E;
+		else
+			timeout = PD_T_VDM_SNDR_RSP;
+		break;
+	default:
+		if (cmd == CMD_ENTER_MODE || cmd == CMD_EXIT_MODE)
+			timeout = PD_T_VDM_E_MODE;
+		else
+			timeout = PD_T_VDM_RCVR_RSP;
+		break;
+	}
+	return timeout;
+}
+
+static void vdm_run_state_machine(struct tcpm_port *port)
+{
+	struct pd_message msg;
+	int i, res;
+
+	switch (port->vdm_state) {
+	case VDM_STATE_READY:
+		/* Only transmit VDM if attached */
+		if (!port->attached) {
+			port->vdm_state = VDM_STATE_ERR_BUSY;
+			break;
+		}
+
+		/*
+		 * if there's traffic or we're not in PDO ready state,
+		 * don't send a VDM.
+		 */
+		if (port->state != SRC_READY && port->state != SNK_READY)
+			break;
+
+		/* Prepare and send VDM */
+		memset(&msg, 0, sizeof(msg));
+		msg.header = PD_HEADER_LE(PD_DATA_VENDOR_DEF,
+					  port->pwr_role,
+					  port->data_role,
+					  port->message_id, port->vdo_count);
+		for (i = 0; i < port->vdo_count; i++)
+			msg.payload[i] = cpu_to_le32(port->vdo_data[i]);
+		res = tcpm_pd_transmit(port, TCPC_TX_SOP, &msg);
+		if (res < 0) {
+			port->vdm_state = VDM_STATE_ERR_SEND;
+		} else {
+			unsigned long timeout;
+
+			port->vdm_retries = 0;
+			port->vdm_state = VDM_STATE_BUSY;
+			timeout = vdm_ready_timeout(port->vdo_data[0]);
+			mod_delayed_work(port->wq, &port->vdm_state_machine,
+					 timeout);
+		}
+		break;
+	case VDM_STATE_WAIT_RSP_BUSY:
+		port->vdo_data[0] = port->vdo_retry;
+		port->vdo_count = 1;
+		port->vdm_state = VDM_STATE_READY;
+		break;
+	case VDM_STATE_BUSY:
+		port->vdm_state = VDM_STATE_ERR_TMOUT;
+		break;
+	case VDM_STATE_ERR_SEND:
+		/*
+		 * A partner which does not support USB PD will not reply,
+		 * so this is not a fatal error. At the same time, some
+		 * devices may not return GoodCRC under some circumstances,
+		 * so we need to retry.
+		 */
+		if (port->vdm_retries < 3) {
+			tcpm_log(port, "VDM Tx error, retry");
+			port->vdm_retries++;
+			port->vdm_state = VDM_STATE_READY;
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+static void vdm_state_machine_work(struct work_struct *work)
+{
+	struct tcpm_port *port = container_of(work, struct tcpm_port,
+					      vdm_state_machine.work);
+	enum vdm_states prev_state;
+
+	mutex_lock(&port->lock);
+
+	/*
+	 * Continue running as long as the port is not busy and there was
+	 * a state change.
+	 */
+	do {
+		prev_state = port->vdm_state;
+		vdm_run_state_machine(port);
+	} while (port->vdm_state != prev_state &&
+		 port->vdm_state != VDM_STATE_BUSY);
+
+	mutex_unlock(&port->lock);
+}
+
+/*
+ * PD (data, control) command handling functions
+ */
+static void tcpm_pd_data_request(struct tcpm_port *port,
+				 const struct pd_message *msg)
+{
+	enum pd_data_msg_type type = pd_header_type_le(msg->header);
+	unsigned int cnt = pd_header_cnt_le(msg->header);
+	unsigned int i;
+
+	switch (type) {
+	case PD_DATA_SOURCE_CAP:
+		if (port->pwr_role != TYPEC_SINK)
+			break;
+
+		for (i = 0; i < cnt; i++)
+			port->source_caps[i] = le32_to_cpu(msg->payload[i]);
+
+		port->nr_source_caps = cnt;
+
+		tcpm_log_source_caps(port);
+
+		/*
+		 * This message may be received even if VBUS is not
+		 * present. This is quite unexpected; see USB PD
+		 * specification, sections 8.3.3.6.3.1 and 8.3.3.6.3.2.
+		 * However, at the same time, we must be ready to
+		 * receive this message and respond to it 15ms after
+		 * receiving PS_RDY during power swap operations, no matter
+		 * if VBUS is available or not (USB PD specification,
+		 * section 6.5.9.2).
+		 * So we need to accept the message either way,
+		 * but be prepared to keep waiting for VBUS after it was
+		 * handled.
+		 */
+		tcpm_set_state(port, SNK_NEGOTIATE_CAPABILITIES, 0);
+		break;
+	case PD_DATA_REQUEST:
+		if (port->pwr_role != TYPEC_SOURCE ||
+		    cnt != 1) {
+			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
+			break;
+		}
+		port->sink_request = le32_to_cpu(msg->payload[0]);
+		tcpm_set_state(port, SRC_NEGOTIATE_CAPABILITIES, 0);
+		break;
+	case PD_DATA_SINK_CAP:
+		/* We don't do anything with this at the moment... */
+		for (i = 0; i < cnt; i++)
+			port->sink_caps[i] = le32_to_cpu(msg->payload[i]);
+		port->nr_sink_caps = cnt;
+		break;
+	case PD_DATA_VENDOR_DEF:
+		tcpm_handle_vdm_request(port, msg->payload, cnt);
+		break;
+	case PD_DATA_BIST:
+		if (port->state == SRC_READY || port->state == SNK_READY) {
+			port->bist_request = le32_to_cpu(msg->payload[0]);
+			tcpm_set_state(port, BIST_RX, 0);
+		}
+		break;
+	default:
+		tcpm_log(port, "Unhandled data message type %#x", type);
+		break;
+	}
+}
+
+static void tcpm_pd_ctrl_request(struct tcpm_port *port,
+				 const struct pd_message *msg)
+{
+	enum pd_ctrl_msg_type type = pd_header_type_le(msg->header);
+	enum tcpm_state next_state;
+
+	switch (type) {
+	case PD_CTRL_GOOD_CRC:
+	case PD_CTRL_PING:
+		break;
+	case PD_CTRL_GET_SOURCE_CAP:
+		switch (port->state) {
+		case SRC_READY:
+		case SNK_READY:
+			tcpm_queue_message(port, PD_MSG_DATA_SOURCE_CAP);
+			break;
+		default:
+			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
+			break;
+		}
+		break;
+	case PD_CTRL_GET_SINK_CAP:
+		switch (port->state) {
+		case SRC_READY:
+		case SNK_READY:
+			tcpm_queue_message(port, PD_MSG_DATA_SINK_CAP);
+			break;
+		default:
+			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
+			break;
+		}
+		break;
+	case PD_CTRL_GOTO_MIN:
+		break;
+	case PD_CTRL_PS_RDY:
+		switch (port->state) {
+		case SNK_TRANSITION_SINK:
+			if (port->vbus_present) {
+				tcpm_set_current_limit(port,
+						       port->current_limit,
+						       port->supply_voltage);
+				port->explicit_contract = true;
+				tcpm_set_state(port, SNK_READY, 0);
+			} else {
+				/*
+				 * Seen after power swap. Keep waiting for VBUS
+				 * in a transitional state.
+				 */
+				tcpm_set_state(port,
+					       SNK_TRANSITION_SINK_VBUS, 0);
+			}
+			break;
+		case PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED:
+			tcpm_set_state(port, PR_SWAP_SRC_SNK_SINK_ON, 0);
+			break;
+		case PR_SWAP_SNK_SRC_SINK_OFF:
+			tcpm_set_state(port, PR_SWAP_SNK_SRC_SOURCE_ON, 0);
+			break;
+		case VCONN_SWAP_WAIT_FOR_VCONN:
+			tcpm_set_state(port, VCONN_SWAP_TURN_OFF_VCONN, 0);
+			break;
+		default:
+			break;
+		}
+		break;
+	case PD_CTRL_REJECT:
+	case PD_CTRL_WAIT:
+		switch (port->state) {
+		case SNK_NEGOTIATE_CAPABILITIES:
+			/* USB PD specification, Figure 8-43 */
+			if (port->explicit_contract)
+				next_state = SNK_READY;
+			else
+				next_state = SNK_WAIT_CAPABILITIES;
+			tcpm_set_state(port, next_state, 0);
+			break;
+		case DR_SWAP_SEND:
+			port->swap_status = (type == PD_CTRL_WAIT ?
+					     -EAGAIN : -EOPNOTSUPP);
+			tcpm_set_state(port, DR_SWAP_CANCEL, 0);
+			break;
+		case PR_SWAP_SEND:
+			port->swap_status = (type == PD_CTRL_WAIT ?
+					     -EAGAIN : -EOPNOTSUPP);
+			tcpm_set_state(port, PR_SWAP_CANCEL, 0);
+			break;
+		case VCONN_SWAP_SEND:
+			port->swap_status = (type == PD_CTRL_WAIT ?
+					     -EAGAIN : -EOPNOTSUPP);
+			tcpm_set_state(port, VCONN_SWAP_CANCEL, 0);
+			break;
+		default:
+			break;
+		}
+		break;
+	case PD_CTRL_ACCEPT:
+		switch (port->state) {
+		case SNK_NEGOTIATE_CAPABILITIES:
+			tcpm_set_state(port, SNK_TRANSITION_SINK, 0);
+			break;
+		case SOFT_RESET_SEND:
+			port->message_id = 0;
+			port->rx_msgid = -1;
+			if (port->pwr_role == TYPEC_SOURCE)
+				next_state = SRC_SEND_CAPABILITIES;
+			else
+				next_state = SNK_WAIT_CAPABILITIES;
+			tcpm_set_state(port, next_state, 0);
+			break;
+		case DR_SWAP_SEND:
+			tcpm_set_state(port, DR_SWAP_CHANGE_DR, 0);
+			break;
+		case PR_SWAP_SEND:
+			tcpm_set_state(port, PR_SWAP_START, 0);
+			break;
+		case VCONN_SWAP_SEND:
+			tcpm_set_state(port, VCONN_SWAP_START, 0);
+			break;
+		default:
+			break;
+		}
+		break;
+	case PD_CTRL_SOFT_RESET:
+		tcpm_set_state(port, SOFT_RESET, 0);
+		break;
+	case PD_CTRL_DR_SWAP:
+		if (port->port_type != TYPEC_PORT_DRP) {
+			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
+			break;
+		}
+		/*
+		 * XXX
+		 * 6.3.9: If an alternate mode is active, a request to swap
+		 * alternate modes shall trigger a port reset.
+		 */
+		switch (port->state) {
+		case SRC_READY:
+		case SNK_READY:
+			tcpm_set_state(port, DR_SWAP_ACCEPT, 0);
+			break;
+		default:
+			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
+			break;
+		}
+		break;
+	case PD_CTRL_PR_SWAP:
+		if (port->port_type != TYPEC_PORT_DRP) {
+			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
+			break;
+		}
+		switch (port->state) {
+		case SRC_READY:
+		case SNK_READY:
+			tcpm_set_state(port, PR_SWAP_ACCEPT, 0);
+			break;
+		default:
+			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
+			break;
+		}
+		break;
+	case PD_CTRL_VCONN_SWAP:
+		switch (port->state) {
+		case SRC_READY:
+		case SNK_READY:
+			tcpm_set_state(port, VCONN_SWAP_ACCEPT, 0);
+			break;
+		default:
+			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
+			break;
+		}
+		break;
+	default:
+		tcpm_log(port, "Unhandled ctrl message type %#x", type);
+		break;
+	}
+}
+
+static void tcpm_pd_rx_handler(struct work_struct *work)
+{
+	struct pd_rx_event *event = container_of(work,
+						 struct pd_rx_event, work);
+	const struct pd_message *msg = &event->msg;
+	unsigned int cnt = pd_header_cnt_le(msg->header);
+	struct tcpm_port *port = event->port;
+
+	mutex_lock(&port->lock);
+
+	tcpm_log(port, "PD RX, header: %#x [%d]", le16_to_cpu(msg->header),
+		 port->attached);
+
+	if (port->attached) {
+		enum pd_ctrl_msg_type type = pd_header_type_le(msg->header);
+		unsigned int msgid = pd_header_msgid_le(msg->header);
+
+		/*
+		 * USB PD standard, 6.6.1.2:
+		 * "... if MessageID value in a received Message is the
+		 * same as the stored value, the receiver shall return a
+		 * GoodCRC Message with that MessageID value and drop
+		 * the Message (this is a retry of an already received
+		 * Message). Note: this shall not apply to the Soft_Reset
+		 * Message which always has a MessageID value of zero."
+		 */
+		if (msgid == port->rx_msgid && type != PD_CTRL_SOFT_RESET)
+			goto done;
+		port->rx_msgid = msgid;
+
+		/*
+		 * If both ends believe to be DFP/host, we have a data role
+		 * mismatch.
+		 */
+		if (!!(le16_to_cpu(msg->header) & PD_HEADER_DATA_ROLE) ==
+		    (port->data_role == TYPEC_HOST)) {
+			tcpm_log(port,
+				 "Data role mismatch, initiating error recovery");
+			tcpm_set_state(port, ERROR_RECOVERY, 0);
+		} else {
+			if (cnt)
+				tcpm_pd_data_request(port, msg);
+			else
+				tcpm_pd_ctrl_request(port, msg);
+		}
+	}
+
+done:
+	mutex_unlock(&port->lock);
+	kfree(event);
+}
+
+void tcpm_pd_receive(struct tcpm_port *port, const struct pd_message *msg)
+{
+	struct pd_rx_event *event;
+
+	event = kzalloc(sizeof(*event), GFP_ATOMIC);
+	if (!event)
+		return;
+
+	INIT_WORK(&event->work, tcpm_pd_rx_handler);
+	event->port = port;
+	memcpy(&event->msg, msg, sizeof(*msg));
+	queue_work(port->wq, &event->work);
+}
+EXPORT_SYMBOL_GPL(tcpm_pd_receive);
+
+static int tcpm_pd_send_control(struct tcpm_port *port,
+				enum pd_ctrl_msg_type type)
+{
+	struct pd_message msg;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.header = PD_HEADER_LE(type, port->pwr_role,
+				  port->data_role,
+				  port->message_id, 0);
+
+	return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg);
+}
+
+/*
+ * Send queued message without affecting state.
+ * Return true if state machine should go back to sleep,
+ * false otherwise.
+ */
+static bool tcpm_send_queued_message(struct tcpm_port *port)
+{
+	enum pd_msg_request queued_message;
+
+	do {
+		queued_message = port->queued_message;
+		port->queued_message = PD_MSG_NONE;
+
+		switch (queued_message) {
+		case PD_MSG_CTRL_WAIT:
+			tcpm_pd_send_control(port, PD_CTRL_WAIT);
+			break;
+		case PD_MSG_CTRL_REJECT:
+			tcpm_pd_send_control(port, PD_CTRL_REJECT);
+			break;
+		case PD_MSG_DATA_SINK_CAP:
+			tcpm_pd_send_sink_caps(port);
+			break;
+		case PD_MSG_DATA_SOURCE_CAP:
+			tcpm_pd_send_source_caps(port);
+			break;
+		default:
+			break;
+		}
+	} while (port->queued_message != PD_MSG_NONE);
+
+	if (port->delayed_state != INVALID_STATE) {
+		if (time_is_after_jiffies(port->delayed_runtime)) {
+			mod_delayed_work(port->wq, &port->state_machine,
+					 port->delayed_runtime - jiffies);
+			return true;
+		}
+		port->delayed_state = INVALID_STATE;
+	}
+	return false;
+}
+
+static int tcpm_pd_check_request(struct tcpm_port *port)
+{
+	u32 pdo, rdo = port->sink_request;
+	unsigned int max, op, pdo_max, index;
+	enum pd_pdo_type type;
+
+	index = rdo_index(rdo);
+	if (!index || index > port->nr_src_pdo)
+		return -EINVAL;
+
+	pdo = port->src_pdo[index - 1];
+	type = pdo_type(pdo);
+	switch (type) {
+	case PDO_TYPE_FIXED:
+	case PDO_TYPE_VAR:
+		max = rdo_max_current(rdo);
+		op = rdo_op_current(rdo);
+		pdo_max = pdo_max_current(pdo);
+
+		if (op > pdo_max)
+			return -EINVAL;
+		if (max > pdo_max && !(rdo & RDO_CAP_MISMATCH))
+			return -EINVAL;
+
+		if (type == PDO_TYPE_FIXED)
+			tcpm_log(port,
+				 "Requested %u mV, %u mA for %u / %u mA",
+				 pdo_fixed_voltage(pdo), pdo_max, op, max);
+		else
+			tcpm_log(port,
+				 "Requested %u -> %u mV, %u mA for %u / %u mA",
+				 pdo_min_voltage(pdo), pdo_max_voltage(pdo),
+				 pdo_max, op, max);
+		break;
+	case PDO_TYPE_BATT:
+		max = rdo_max_power(rdo);
+		op = rdo_op_power(rdo);
+		pdo_max = pdo_max_power(pdo);
+
+		if (op > pdo_max)
+			return -EINVAL;
+		if (max > pdo_max && !(rdo & RDO_CAP_MISMATCH))
+			return -EINVAL;
+		tcpm_log(port,
+			 "Requested %u -> %u mV, %u mW for %u / %u mW",
+			 pdo_min_voltage(pdo), pdo_max_voltage(pdo),
+			 pdo_max, op, max);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	port->op_vsafe5v = index == 1;
+
+	return 0;
+}
+
+static int tcpm_pd_select_pdo(struct tcpm_port *port)
+{
+	unsigned int i, max_mw = 0, max_mv = 0;
+	int ret = -EINVAL;
+
+	/*
+	 * Select the source PDO providing the most power while staying within
+	 * the board's voltage limits. Prefer higher voltages when the
+	 * available power is equal.
+	 */
+	for (i = 0; i < port->nr_source_caps; i++) {
+		u32 pdo = port->source_caps[i];
+		enum pd_pdo_type type = pdo_type(pdo);
+		unsigned int mv, ma, mw;
+
+		if (type == PDO_TYPE_FIXED)
+			mv = pdo_fixed_voltage(pdo);
+		else
+			mv = pdo_min_voltage(pdo);
+
+		if (type == PDO_TYPE_BATT) {
+			mw = pdo_max_power(pdo);
+		} else {
+			ma = min(pdo_max_current(pdo),
+				 port->max_snk_ma);
+			mw = ma * mv / 1000;
+		}
+
+		/* Prefer higher voltages if available */
+		if ((mw > max_mw || (mw == max_mw && mv > max_mv)) &&
+		    mv <= port->max_snk_mv) {
+			ret = i;
+			max_mw = mw;
+			max_mv = mv;
+		}
+	}
+
+	return ret;
+}
+
+static int tcpm_pd_build_request(struct tcpm_port *port, u32 *rdo)
+{
+	unsigned int mv, ma, mw, flags;
+	unsigned int max_ma, max_mw;
+	enum pd_pdo_type type;
+	int index;
+	u32 pdo;
+
+	index = tcpm_pd_select_pdo(port);
+	if (index < 0)
+		return -EINVAL;
+	pdo = port->source_caps[index];
+	type = pdo_type(pdo);
+
+	if (type == PDO_TYPE_FIXED)
+		mv = pdo_fixed_voltage(pdo);
+	else
+		mv = pdo_min_voltage(pdo);
+
+	/* Select maximum available current within the board's power limit */
+	if (type == PDO_TYPE_BATT) {
+		mw = pdo_max_power(pdo);
+		ma = 1000 * min(mw, port->max_snk_mw) / mv;
+	} else {
+		ma = min(pdo_max_current(pdo),
+			 1000 * port->max_snk_mw / mv);
+	}
+	ma = min(ma, port->max_snk_ma);
+
+	flags = RDO_USB_COMM | RDO_NO_SUSPEND;
+
+	/* Set mismatch bit if offered power is less than operating power */
+	mw = ma * mv / 1000;
+	max_ma = ma;
+	max_mw = mw;
+	if (mw < port->operating_snk_mw) {
+		flags |= RDO_CAP_MISMATCH;
+		max_mw = port->operating_snk_mw;
+		max_ma = max_mw * 1000 / mv;
+	}
+
+	tcpm_log(port, "cc=%d cc1=%d cc2=%d vbus=%d vconn=%s polarity=%d",
+		 port->cc_req, port->cc1, port->cc2, port->vbus_source,
+		 port->vconn_role == TYPEC_SOURCE ? "source" : "sink",
+		 port->polarity);
+
+	if (type == PDO_TYPE_BATT) {
+		*rdo = RDO_BATT(index + 1, mw, max_mw, flags);
+
+		tcpm_log(port, "Requesting PDO %d: %u mV, %u mW%s",
+			 index, mv, mw,
+			 flags & RDO_CAP_MISMATCH ? " [mismatch]" : "");
+	} else {
+		*rdo = RDO_FIXED(index + 1, ma, max_ma, flags);
+
+		tcpm_log(port, "Requesting PDO %d: %u mV, %u mA%s",
+			 index, mv, ma,
+			 flags & RDO_CAP_MISMATCH ? " [mismatch]" : "");
+	}
+
+	port->current_limit = ma;
+	port->supply_voltage = mv;
+
+	return 0;
+}
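
A worked example of the selection and request arithmetic above, with invented
numbers: if the source offers 5V/3A (15 W) and 9V/2A (18 W), and the board
allows up to 9 V, 3 A and 27 W, the 9 V PDO wins on power and the request is
built as follows:

	#include <stdio.h>

	int main(void)
	{
		unsigned int mv = 9000;			/* chosen 9 V PDO */
		unsigned int pdo_ma = 2000;		/* PDO current limit */
		unsigned int board_ma = 1000 * 27000 / mv;	/* 3000 mA */
		unsigned int ma = pdo_ma < board_ma ? pdo_ma : board_ma;

		/* 9000 mV, 2000 mA, 18000 mW; above operating power, no mismatch */
		printf("request %u mV %u mA (%u mW)\n", mv, ma, ma * mv / 1000);
		return 0;
	}
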
" [mismatch]" : ""); + } + + port->current_limit = ma; + port->supply_voltage = mv; + + return 0; +} + +static int tcpm_pd_send_request(struct tcpm_port *port) +{ + struct pd_message msg; + int ret; + u32 rdo; + + ret = tcpm_pd_build_request(port, &rdo); + if (ret < 0) + return ret; + + memset(&msg, 0, sizeof(msg)); + msg.header = PD_HEADER_LE(PD_DATA_REQUEST, + port->pwr_role, + port->data_role, + port->message_id, 1); + msg.payload[0] = cpu_to_le32(rdo); + + return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); +} + +static int tcpm_set_vbus(struct tcpm_port *port, bool enable) +{ + int ret; + + if (enable && port->vbus_charge) + return -EINVAL; + + tcpm_log(port, "vbus:=%d charge=%d", enable, port->vbus_charge); + + ret = port->tcpc->set_vbus(port->tcpc, enable, port->vbus_charge); + if (ret < 0) + return ret; + + port->vbus_source = enable; + return 0; +} + +static int tcpm_set_charge(struct tcpm_port *port, bool charge) +{ + int ret; + + if (charge && port->vbus_source) + return -EINVAL; + + if (charge != port->vbus_charge) { + tcpm_log(port, "vbus=%d charge:=%d", port->vbus_source, charge); + ret = port->tcpc->set_vbus(port->tcpc, port->vbus_source, + charge); + if (ret < 0) + return ret; + } + port->vbus_charge = charge; + return 0; +} + +static bool tcpm_start_drp_toggling(struct tcpm_port *port) +{ + int ret; + + if (port->tcpc->start_drp_toggling && + port->port_type == TYPEC_PORT_DRP) { + tcpm_log_force(port, "Start DRP toggling"); + ret = port->tcpc->start_drp_toggling(port->tcpc, + tcpm_rp_cc(port)); + if (!ret) + return true; + } + + return false; +} + +static void tcpm_set_cc(struct tcpm_port *port, enum typec_cc_status cc) +{ + tcpm_log(port, "cc:=%d", cc); + port->cc_req = cc; + port->tcpc->set_cc(port->tcpc, cc); +} + +static int tcpm_init_vbus(struct tcpm_port *port) +{ + int ret; + + ret = port->tcpc->set_vbus(port->tcpc, false, false); + port->vbus_source = false; + port->vbus_charge = false; + return ret; +} + +static int tcpm_init_vconn(struct tcpm_port *port) +{ + int ret; + + ret = port->tcpc->set_vconn(port->tcpc, false); + port->vconn_role = TYPEC_SINK; + return ret; +} + +static void tcpm_typec_connect(struct tcpm_port *port) +{ + if (!port->connected) { + /* Make sure we don't report stale identity information */ + memset(&port->partner_ident, 0, sizeof(port->partner_ident)); + port->partner_desc.usb_pd = port->pd_capable; + if (tcpm_port_is_debug(port)) + port->partner_desc.accessory = TYPEC_ACCESSORY_DEBUG; + else if (tcpm_port_is_audio(port)) + port->partner_desc.accessory = TYPEC_ACCESSORY_AUDIO; + else + port->partner_desc.accessory = TYPEC_ACCESSORY_NONE; + port->partner = typec_register_partner(port->typec_port, + &port->partner_desc); + port->connected = true; + } +} + +static int tcpm_src_attach(struct tcpm_port *port) +{ + enum typec_cc_polarity polarity = + port->cc2 == TYPEC_CC_RD ? TYPEC_POLARITY_CC2 + : TYPEC_POLARITY_CC1; + int ret; + + if (port->attached) + return 0; + + ret = tcpm_set_polarity(port, polarity); + if (ret < 0) + return ret; + + ret = tcpm_set_roles(port, true, TYPEC_SOURCE, TYPEC_HOST); + if (ret < 0) + return ret; + + ret = port->tcpc->set_pd_rx(port->tcpc, true); + if (ret < 0) + goto out_disable_mux; + + /* + * USB Type-C specification, version 1.2, + * chapter 4.5.2.2.8.1 (Attached.SRC Requirements) + * Enable VCONN only if the non-RD port is set to RA. 
+	 */
+	if ((polarity == TYPEC_POLARITY_CC1 && port->cc2 == TYPEC_CC_RA) ||
+	    (polarity == TYPEC_POLARITY_CC2 && port->cc1 == TYPEC_CC_RA)) {
+		ret = tcpm_set_vconn(port, true);
+		if (ret < 0)
+			goto out_disable_pd;
+	}
+
+	ret = tcpm_set_vbus(port, true);
+	if (ret < 0)
+		goto out_disable_vconn;
+
+	port->pd_capable = false;
+
+	port->partner = NULL;
+
+	port->attached = true;
+	port->send_discover = true;
+
+	return 0;
+
+out_disable_vconn:
+	tcpm_set_vconn(port, false);
+out_disable_pd:
+	port->tcpc->set_pd_rx(port->tcpc, false);
+out_disable_mux:
+	tcpm_mux_set(port, TYPEC_MUX_NONE, TCPC_USB_SWITCH_DISCONNECT);
+	return ret;
+}
+
+static void tcpm_typec_disconnect(struct tcpm_port *port)
+{
+	if (port->connected) {
+		typec_unregister_partner(port->partner);
+		port->partner = NULL;
+		port->connected = false;
+	}
+}
+
+static void tcpm_unregister_altmodes(struct tcpm_port *port)
+{
+	struct pd_mode_data *modep = &port->mode_data;
+	int i;
+
+	for (i = 0; i < modep->altmodes; i++) {
+		typec_unregister_altmode(port->partner_altmode[i]);
+		port->partner_altmode[i] = NULL;
+	}
+
+	memset(modep, 0, sizeof(*modep));
+}
+
+static void tcpm_reset_port(struct tcpm_port *port)
+{
+	tcpm_unregister_altmodes(port);
+	tcpm_typec_disconnect(port);
+	port->attached = false;
+	port->pd_capable = false;
+
+	/*
+	 * First Rx ID should be 0; set this to a sentinel of -1 so that
+	 * tcpm_pd_rx_handler() can tell whether it has seen a message
+	 * from this connection before.
+	 */
+	port->rx_msgid = -1;
+
+	port->tcpc->set_pd_rx(port->tcpc, false);
+	tcpm_init_vbus(port);	/* also disables charging */
+	tcpm_init_vconn(port);
+	tcpm_set_current_limit(port, 0, 0);
+	tcpm_set_polarity(port, TYPEC_POLARITY_CC1);
+	tcpm_set_attached_state(port, false);
+	port->try_src_count = 0;
+	port->try_snk_count = 0;
+}
+
+static void tcpm_detach(struct tcpm_port *port)
+{
+	if (!port->attached)
+		return;
+
+	if (tcpm_port_is_disconnected(port))
+		port->hard_reset_count = 0;
+
+	tcpm_reset_port(port);
+}
+
+static void tcpm_src_detach(struct tcpm_port *port)
+{
+	tcpm_detach(port);
+}
+
+static int tcpm_snk_attach(struct tcpm_port *port)
+{
+	int ret;
+
+	if (port->attached)
+		return 0;
+
+	ret = tcpm_set_polarity(port, port->cc2 != TYPEC_CC_OPEN ?
+				TYPEC_POLARITY_CC2 : TYPEC_POLARITY_CC1);
+	if (ret < 0)
+		return ret;
+
+	ret = tcpm_set_roles(port, true, TYPEC_SINK, TYPEC_DEVICE);
+	if (ret < 0)
+		return ret;
+
+	port->pd_capable = false;
+
+	port->partner = NULL;
+
+	port->attached = true;
+	port->send_discover = true;
+
+	return 0;
+}
+
+static void tcpm_snk_detach(struct tcpm_port *port)
+{
+	tcpm_detach(port);
+
+	/* XXX: (Dis)connect SuperSpeed mux?
*/ +} + +static int tcpm_acc_attach(struct tcpm_port *port) +{ + int ret; + + if (port->attached) + return 0; + + ret = tcpm_set_roles(port, true, TYPEC_SOURCE, TYPEC_HOST); + if (ret < 0) + return ret; + + port->partner = NULL; + + tcpm_typec_connect(port); + + port->attached = true; + + return 0; +} + +static void tcpm_acc_detach(struct tcpm_port *port) +{ + tcpm_detach(port); +} + +static inline enum tcpm_state hard_reset_state(struct tcpm_port *port) +{ + if (port->hard_reset_count < PD_N_HARD_RESET_COUNT) + return HARD_RESET_SEND; + if (port->pd_capable) + return ERROR_RECOVERY; + if (port->pwr_role == TYPEC_SOURCE) + return SRC_UNATTACHED; + if (port->state == SNK_WAIT_CAPABILITIES) + return SNK_READY; + return SNK_UNATTACHED; +} + +static inline enum tcpm_state ready_state(struct tcpm_port *port) +{ + if (port->pwr_role == TYPEC_SOURCE) + return SRC_READY; + else + return SNK_READY; +} + +static inline enum tcpm_state unattached_state(struct tcpm_port *port) +{ + if (port->port_type == TYPEC_PORT_DRP) { + if (port->pwr_role == TYPEC_SOURCE) + return SRC_UNATTACHED; + else + return SNK_UNATTACHED; + } else if (port->port_type == TYPEC_PORT_DFP) { + return SRC_UNATTACHED; + } + + return SNK_UNATTACHED; +} + +static void tcpm_check_send_discover(struct tcpm_port *port) +{ + if (port->data_role == TYPEC_HOST && port->send_discover && + port->pd_capable) { + tcpm_send_vdm(port, USB_SID_PD, CMD_DISCOVER_IDENT, NULL, 0); + port->send_discover = false; + } +} + +static void tcpm_swap_complete(struct tcpm_port *port, int result) +{ + if (port->swap_pending) { + port->swap_status = result; + port->swap_pending = false; + port->non_pd_role_swap = false; + complete(&port->swap_complete); + } +} + +static enum typec_pwr_opmode tcpm_get_pwr_opmode(enum typec_cc_status cc) +{ + switch (cc) { + case TYPEC_CC_RP_1_5: + return TYPEC_PWR_MODE_1_5A; + case TYPEC_CC_RP_3_0: + return TYPEC_PWR_MODE_3_0A; + case TYPEC_CC_RP_DEF: + default: + return TYPEC_PWR_MODE_USB; + } +} + +static void run_state_machine(struct tcpm_port *port) +{ + int ret; + enum typec_pwr_opmode opmode; + unsigned int msecs; + + port->enter_state = port->state; + switch (port->state) { + case DRP_TOGGLING: + break; + /* SRC states */ + case SRC_UNATTACHED: + if (!port->non_pd_role_swap) + tcpm_swap_complete(port, -ENOTCONN); + tcpm_src_detach(port); + if (tcpm_start_drp_toggling(port)) { + tcpm_set_state(port, DRP_TOGGLING, 0); + break; + } + tcpm_set_cc(port, tcpm_rp_cc(port)); + if (port->port_type == TYPEC_PORT_DRP) + tcpm_set_state(port, SNK_UNATTACHED, PD_T_DRP_SNK); + break; + case SRC_ATTACH_WAIT: + if (tcpm_port_is_debug(port)) + tcpm_set_state(port, DEBUG_ACC_ATTACHED, + PD_T_CC_DEBOUNCE); + else if (tcpm_port_is_audio(port)) + tcpm_set_state(port, AUDIO_ACC_ATTACHED, + PD_T_CC_DEBOUNCE); + else if (tcpm_port_is_source(port)) + tcpm_set_state(port, + tcpm_try_snk(port) ? SNK_TRY + : SRC_ATTACHED, + PD_T_CC_DEBOUNCE); + break; + + case SNK_TRY: + port->try_snk_count++; + /* + * Requirements: + * - Do not drive vconn or vbus + * - Terminate CC pins (both) to Rd + * Action: + * - Wait for tDRPTry (PD_T_DRP_TRY). + * Until then, ignore any state changes. 
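+		 *   (PD_T_DRP_TRY is 100 ms here; the spec window for
+		 *   tDRPTry is 75 - 150 ms, see pd.h.)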
+ */ + tcpm_set_cc(port, TYPEC_CC_RD); + tcpm_set_state(port, SNK_TRY_WAIT, PD_T_DRP_TRY); + break; + case SNK_TRY_WAIT: + if (tcpm_port_is_sink(port)) { + tcpm_set_state(port, SNK_TRY_WAIT_DEBOUNCE, 0); + } else { + tcpm_set_state(port, SRC_TRYWAIT, 0); + port->max_wait = 0; + } + break; + case SNK_TRY_WAIT_DEBOUNCE: + tcpm_set_state(port, SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS, + PD_T_PD_DEBOUNCE); + break; + case SNK_TRY_WAIT_DEBOUNCE_CHECK_VBUS: + if (port->vbus_present && tcpm_port_is_sink(port)) { + tcpm_set_state(port, SNK_ATTACHED, 0); + } else { + tcpm_set_state(port, SRC_TRYWAIT, 0); + port->max_wait = 0; + } + break; + case SRC_TRYWAIT: + tcpm_set_cc(port, tcpm_rp_cc(port)); + if (port->max_wait == 0) { + port->max_wait = jiffies + + msecs_to_jiffies(PD_T_DRP_TRY); + tcpm_set_state(port, SRC_TRYWAIT_UNATTACHED, + PD_T_DRP_TRY); + } else { + if (time_is_after_jiffies(port->max_wait)) + tcpm_set_state(port, SRC_TRYWAIT_UNATTACHED, + jiffies_to_msecs(port->max_wait - + jiffies)); + else + tcpm_set_state(port, SNK_UNATTACHED, 0); + } + break; + case SRC_TRYWAIT_DEBOUNCE: + tcpm_set_state(port, SRC_ATTACHED, PD_T_CC_DEBOUNCE); + break; + case SRC_TRYWAIT_UNATTACHED: + tcpm_set_state(port, SNK_UNATTACHED, 0); + break; + + case SRC_ATTACHED: + ret = tcpm_src_attach(port); + tcpm_set_state(port, SRC_UNATTACHED, + ret < 0 ? 0 : PD_T_PS_SOURCE_ON); + break; + case SRC_STARTUP: + opmode = tcpm_get_pwr_opmode(tcpm_rp_cc(port)); + typec_set_pwr_opmode(port->typec_port, opmode); + port->pwr_opmode = TYPEC_PWR_MODE_USB; + port->caps_count = 0; + port->message_id = 0; + port->rx_msgid = -1; + port->explicit_contract = false; + tcpm_set_state(port, SRC_SEND_CAPABILITIES, 0); + break; + case SRC_SEND_CAPABILITIES: + port->caps_count++; + if (port->caps_count > PD_N_CAPS_COUNT) { + tcpm_set_state(port, SRC_READY, 0); + break; + } + ret = tcpm_pd_send_source_caps(port); + if (ret < 0) { + tcpm_set_state(port, SRC_SEND_CAPABILITIES, + PD_T_SEND_SOURCE_CAP); + } else { + /* + * Per standard, we should clear the reset counter here. + * However, that can result in state machine hang-ups. + * Reset it only in READY state to improve stability. + */ + /* port->hard_reset_count = 0; */ + port->caps_count = 0; + port->pd_capable = true; + tcpm_set_state_cond(port, hard_reset_state(port), + PD_T_SEND_SOURCE_CAP); + } + break; + case SRC_NEGOTIATE_CAPABILITIES: + ret = tcpm_pd_check_request(port); + if (ret < 0) { + tcpm_pd_send_control(port, PD_CTRL_REJECT); + if (!port->explicit_contract) { + tcpm_set_state(port, + SRC_WAIT_NEW_CAPABILITIES, 0); + } else { + tcpm_set_state(port, SRC_READY, 0); + } + } else { + tcpm_pd_send_control(port, PD_CTRL_ACCEPT); + tcpm_set_state(port, SRC_TRANSITION_SUPPLY, + PD_T_SRC_TRANSITION); + } + break; + case SRC_TRANSITION_SUPPLY: + /* XXX: regulator_set_voltage(vbus, ...) 
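+		 * A rough sketch of what a regulator-backed supply might do
+		 * here ("vbus_reg" is a hypothetical regulator handle, not
+		 * part of this driver; supply control is left to the
+		 * low-level TCPC driver):
+		 *
+		 *	regulator_set_voltage(vbus_reg,
+		 *			      port->supply_voltage * 1000,
+		 *			      port->supply_voltage * 1000);
+		 *
+		 * Note that supply_voltage is in mV while the regulator API
+		 * takes microvolts.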
+		 */
+		tcpm_pd_send_control(port, PD_CTRL_PS_RDY);
+		port->explicit_contract = true;
+		typec_set_pwr_opmode(port->typec_port, TYPEC_PWR_MODE_PD);
+		port->pwr_opmode = TYPEC_PWR_MODE_PD;
+		tcpm_set_state_cond(port, SRC_READY, 0);
+		break;
+	case SRC_READY:
+#if 1
+		port->hard_reset_count = 0;
+#endif
+		port->try_src_count = 0;
+
+		tcpm_swap_complete(port, 0);
+		tcpm_typec_connect(port);
+		tcpm_check_send_discover(port);
+		/*
+		 * 6.3.5
+		 * Sending ping messages is not necessary if
+		 * - the source operates at vSafe5V
+		 * or
+		 * - The system is not operating in PD mode
+		 * or
+		 * - Both partners are connected using a Type-C connector
+		 *
+		 * There is no actual need to send PD messages, since the
+		 * local port is Type-C, and the spec does not clearly say
+		 * whether PD is possible when a Type-C port is connected
+		 * to Type-A/B.
+		 */
+		break;
+	case SRC_WAIT_NEW_CAPABILITIES:
+		/* Nothing to do... */
+		break;
+
+	/* SNK states */
+	case SNK_UNATTACHED:
+		if (!port->non_pd_role_swap)
+			tcpm_swap_complete(port, -ENOTCONN);
+		tcpm_snk_detach(port);
+		if (tcpm_start_drp_toggling(port)) {
+			tcpm_set_state(port, DRP_TOGGLING, 0);
+			break;
+		}
+		tcpm_set_cc(port, TYPEC_CC_RD);
+		if (port->port_type == TYPEC_PORT_DRP)
+			tcpm_set_state(port, SRC_UNATTACHED, PD_T_DRP_SRC);
+		break;
+	case SNK_ATTACH_WAIT:
+		if ((port->cc1 == TYPEC_CC_OPEN &&
+		     port->cc2 != TYPEC_CC_OPEN) ||
+		    (port->cc1 != TYPEC_CC_OPEN &&
+		     port->cc2 == TYPEC_CC_OPEN))
+			tcpm_set_state(port, SNK_DEBOUNCED,
+				       PD_T_CC_DEBOUNCE);
+		else if (tcpm_port_is_disconnected(port))
+			tcpm_set_state(port, SNK_UNATTACHED,
+				       PD_T_PD_DEBOUNCE);
+		break;
+	case SNK_DEBOUNCED:
+		if (tcpm_port_is_disconnected(port))
+			tcpm_set_state(port, SNK_UNATTACHED,
+				       PD_T_PD_DEBOUNCE);
+		else if (port->vbus_present)
+			tcpm_set_state(port,
+				       tcpm_try_src(port) ? SRC_TRY
+							  : SNK_ATTACHED,
+				       0);
+		else
+			/* Wait for VBUS, but not forever */
+			tcpm_set_state(port, PORT_RESET, PD_T_PS_SOURCE_ON);
+		break;
+
+	case SRC_TRY:
+		port->try_src_count++;
+		tcpm_set_cc(port, tcpm_rp_cc(port));
+		port->max_wait = 0;
+		tcpm_set_state(port, SRC_TRY_WAIT, 0);
+		break;
+	case SRC_TRY_WAIT:
+		if (port->max_wait == 0) {
+			port->max_wait = jiffies +
+					 msecs_to_jiffies(PD_T_DRP_TRY);
+			msecs = PD_T_DRP_TRY;
+		} else {
+			if (time_is_after_jiffies(port->max_wait))
+				msecs = jiffies_to_msecs(port->max_wait -
+							 jiffies);
+			else
+				msecs = 0;
+		}
+		tcpm_set_state(port, SNK_TRYWAIT, msecs);
+		break;
+	case SRC_TRY_DEBOUNCE:
+		tcpm_set_state(port, SRC_ATTACHED, PD_T_PD_DEBOUNCE);
+		break;
+	case SNK_TRYWAIT:
+		tcpm_set_cc(port, TYPEC_CC_RD);
+		tcpm_set_state(port, SNK_TRYWAIT_VBUS, PD_T_CC_DEBOUNCE);
+		break;
+	case SNK_TRYWAIT_VBUS:
+		/*
+		 * TCPM stays in this state indefinitely until VBUS
+		 * is detected, as long as Rp is not absent for
+		 * more than a time period of tPDDebounce.
+		 */
+		if (port->vbus_present && tcpm_port_is_sink(port)) {
+			tcpm_set_state(port, SNK_ATTACHED, 0);
+			break;
+		}
+		if (!tcpm_port_is_sink(port))
+			tcpm_set_state(port, SNK_TRYWAIT_DEBOUNCE, 0);
+		break;
+	case SNK_TRYWAIT_DEBOUNCE:
+		tcpm_set_state(port, SNK_UNATTACHED, PD_T_PD_DEBOUNCE);
+		break;
+	case SNK_ATTACHED:
+		ret = tcpm_snk_attach(port);
+		if (ret < 0)
+			tcpm_set_state(port, SNK_UNATTACHED, 0);
+		else
+			tcpm_set_state(port, SNK_STARTUP, 0);
+		break;
+	case SNK_STARTUP:
+		opmode = tcpm_get_pwr_opmode(port->polarity ?
+ port->cc2 : port->cc1); + typec_set_pwr_opmode(port->typec_port, opmode); + port->pwr_opmode = TYPEC_PWR_MODE_USB; + port->message_id = 0; + port->rx_msgid = -1; + port->explicit_contract = false; + tcpm_set_state(port, SNK_DISCOVERY, 0); + break; + case SNK_DISCOVERY: + if (port->vbus_present) { + tcpm_set_current_limit(port, + tcpm_get_current_limit(port), + 5000); + tcpm_set_charge(port, true); + tcpm_set_state(port, SNK_WAIT_CAPABILITIES, 0); + break; + } + /* + * For DRP, timeouts differ. Also, handling is supposed to be + * different and much more complex (dead battery detection; + * see USB power delivery specification, section 8.3.3.6.1.5.1). + */ + tcpm_set_state(port, hard_reset_state(port), + port->port_type == TYPEC_PORT_DRP ? + PD_T_DB_DETECT : PD_T_NO_RESPONSE); + break; + case SNK_DISCOVERY_DEBOUNCE: + tcpm_set_state(port, SNK_DISCOVERY_DEBOUNCE_DONE, + PD_T_CC_DEBOUNCE); + break; + case SNK_DISCOVERY_DEBOUNCE_DONE: + if (!tcpm_port_is_disconnected(port) && + tcpm_port_is_sink(port) && + time_is_after_jiffies(port->delayed_runtime)) { + tcpm_set_state(port, SNK_DISCOVERY, + port->delayed_runtime - jiffies); + break; + } + tcpm_set_state(port, unattached_state(port), 0); + break; + case SNK_WAIT_CAPABILITIES: + ret = port->tcpc->set_pd_rx(port->tcpc, true); + if (ret < 0) { + tcpm_set_state(port, SNK_READY, 0); + break; + } + /* + * If VBUS has never been low, and we time out waiting + * for source cap, try a soft reset first, in case we + * were already in a stable contract before this boot. + * Do this only once. + */ + if (port->vbus_never_low) { + port->vbus_never_low = false; + tcpm_set_state(port, SOFT_RESET_SEND, + PD_T_SINK_WAIT_CAP); + } else { + tcpm_set_state(port, hard_reset_state(port), + PD_T_SINK_WAIT_CAP); + } + break; + case SNK_NEGOTIATE_CAPABILITIES: + port->pd_capable = true; + port->hard_reset_count = 0; + ret = tcpm_pd_send_request(port); + if (ret < 0) { + /* Let the Source send capabilities again. 
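+			 * (The request never went out, so there is no
+			 * response to arm a timer for.)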
+			 */
+			tcpm_set_state(port, SNK_WAIT_CAPABILITIES, 0);
+		} else {
+			tcpm_set_state_cond(port, hard_reset_state(port),
+					    PD_T_SENDER_RESPONSE);
+		}
+		break;
+	case SNK_TRANSITION_SINK:
+	case SNK_TRANSITION_SINK_VBUS:
+		tcpm_set_state(port, hard_reset_state(port),
+			       PD_T_PS_TRANSITION);
+		break;
+	case SNK_READY:
+		port->try_snk_count = 0;
+		if (port->explicit_contract) {
+			typec_set_pwr_opmode(port->typec_port,
+					     TYPEC_PWR_MODE_PD);
+			port->pwr_opmode = TYPEC_PWR_MODE_PD;
+		}
+
+		tcpm_swap_complete(port, 0);
+		tcpm_typec_connect(port);
+		tcpm_check_send_discover(port);
+		break;
+
+	/* Accessory states */
+	case ACC_UNATTACHED:
+		tcpm_acc_detach(port);
+		tcpm_set_state(port, SRC_UNATTACHED, 0);
+		break;
+	case DEBUG_ACC_ATTACHED:
+	case AUDIO_ACC_ATTACHED:
+		ret = tcpm_acc_attach(port);
+		if (ret < 0)
+			tcpm_set_state(port, ACC_UNATTACHED, 0);
+		break;
+	case AUDIO_ACC_DEBOUNCE:
+		tcpm_set_state(port, ACC_UNATTACHED, PD_T_CC_DEBOUNCE);
+		break;
+
+	/* Hard_Reset states */
+	case HARD_RESET_SEND:
+		tcpm_pd_transmit(port, TCPC_TX_HARD_RESET, NULL);
+		tcpm_set_state(port, HARD_RESET_START, 0);
+		break;
+	case HARD_RESET_START:
+		port->hard_reset_count++;
+		port->tcpc->set_pd_rx(port->tcpc, false);
+		tcpm_unregister_altmodes(port);
+		port->send_discover = true;
+		if (port->pwr_role == TYPEC_SOURCE)
+			tcpm_set_state(port, SRC_HARD_RESET_VBUS_OFF,
+				       PD_T_PS_HARD_RESET);
+		else
+			tcpm_set_state(port, SNK_HARD_RESET_SINK_OFF, 0);
+		break;
+	case SRC_HARD_RESET_VBUS_OFF:
+		tcpm_set_vconn(port, true);
+		tcpm_set_vbus(port, false);
+		tcpm_set_roles(port, false, TYPEC_SOURCE, TYPEC_HOST);
+		tcpm_set_state(port, SRC_HARD_RESET_VBUS_ON, PD_T_SRC_RECOVER);
+		break;
+	case SRC_HARD_RESET_VBUS_ON:
+		tcpm_set_vbus(port, true);
+		port->tcpc->set_pd_rx(port->tcpc, true);
+		tcpm_set_attached_state(port, true);
+		tcpm_set_state(port, SRC_UNATTACHED, PD_T_PS_SOURCE_ON);
+		break;
+	case SNK_HARD_RESET_SINK_OFF:
+		tcpm_set_vconn(port, false);
+		tcpm_set_charge(port, false);
+		tcpm_set_roles(port, false, TYPEC_SINK, TYPEC_DEVICE);
+		/*
+		 * VBUS may or may not toggle, depending on the adapter.
+		 * If it doesn't toggle, transition to SNK_HARD_RESET_SINK_ON
+		 * directly after timeout.
+		 */
+		tcpm_set_state(port, SNK_HARD_RESET_SINK_ON, PD_T_SAFE_0V);
+		break;
+	case SNK_HARD_RESET_WAIT_VBUS:
+		/* Assume we're disconnected if VBUS doesn't come back. */
+		tcpm_set_state(port, SNK_UNATTACHED,
+			       PD_T_SRC_RECOVER_MAX + PD_T_SRC_TURN_ON);
+		break;
+	case SNK_HARD_RESET_SINK_ON:
+		/* Note: There is no guarantee that VBUS is on in this state */
+		/*
+		 * XXX:
+		 * The specification suggests that dual mode ports in sink
+		 * mode should transition to state PE_SRC_Transition_to_default.
+		 * See USB power delivery specification chapter 8.3.3.6.1.3.
+		 * This would mean to
+		 * - turn off VCONN, reset power supply
+		 * - request hardware reset
+		 * - turn on VCONN
+		 * - Transition to state PE_Src_Startup
+		 * SNK only ports shall transition to state Snk_Startup
+		 * (see chapter 8.3.3.3.8).
+		 * Similarly, dual-mode ports in source mode should transition
+		 * to PE_SNK_Transition_to_default.
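+		 * For now, simply mark the port attached again and restart
+		 * the sink state machine via SNK_STARTUP below.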
+		 */
+		tcpm_set_attached_state(port, true);
+		tcpm_set_state(port, SNK_STARTUP, 0);
+		break;
+
+	/* Soft_Reset states */
+	case SOFT_RESET:
+		port->message_id = 0;
+		port->rx_msgid = -1;
+		tcpm_pd_send_control(port, PD_CTRL_ACCEPT);
+		if (port->pwr_role == TYPEC_SOURCE)
+			tcpm_set_state(port, SRC_SEND_CAPABILITIES, 0);
+		else
+			tcpm_set_state(port, SNK_WAIT_CAPABILITIES, 0);
+		break;
+	case SOFT_RESET_SEND:
+		port->message_id = 0;
+		port->rx_msgid = -1;
+		if (tcpm_pd_send_control(port, PD_CTRL_SOFT_RESET))
+			tcpm_set_state_cond(port, hard_reset_state(port), 0);
+		else
+			tcpm_set_state_cond(port, hard_reset_state(port),
+					    PD_T_SENDER_RESPONSE);
+		break;
+
+	/* DR_Swap states */
+	case DR_SWAP_SEND:
+		tcpm_pd_send_control(port, PD_CTRL_DR_SWAP);
+		tcpm_set_state_cond(port, DR_SWAP_SEND_TIMEOUT,
+				    PD_T_SENDER_RESPONSE);
+		break;
+	case DR_SWAP_ACCEPT:
+		tcpm_pd_send_control(port, PD_CTRL_ACCEPT);
+		tcpm_set_state_cond(port, DR_SWAP_CHANGE_DR, 0);
+		break;
+	case DR_SWAP_SEND_TIMEOUT:
+		tcpm_swap_complete(port, -ETIMEDOUT);
+		tcpm_set_state(port, ready_state(port), 0);
+		break;
+	case DR_SWAP_CHANGE_DR:
+		if (port->data_role == TYPEC_HOST) {
+			tcpm_unregister_altmodes(port);
+			tcpm_set_roles(port, true, port->pwr_role,
+				       TYPEC_DEVICE);
+		} else {
+			tcpm_set_roles(port, true, port->pwr_role,
+				       TYPEC_HOST);
+			port->send_discover = true;
+		}
+		tcpm_set_state(port, ready_state(port), 0);
+		break;
+
+	/* PR_Swap states */
+	case PR_SWAP_ACCEPT:
+		tcpm_pd_send_control(port, PD_CTRL_ACCEPT);
+		tcpm_set_state(port, PR_SWAP_START, 0);
+		break;
+	case PR_SWAP_SEND:
+		tcpm_pd_send_control(port, PD_CTRL_PR_SWAP);
+		tcpm_set_state_cond(port, PR_SWAP_SEND_TIMEOUT,
+				    PD_T_SENDER_RESPONSE);
+		break;
+	case PR_SWAP_SEND_TIMEOUT:
+		tcpm_swap_complete(port, -ETIMEDOUT);
+		tcpm_set_state(port, ready_state(port), 0);
+		break;
+	case PR_SWAP_START:
+		if (port->pwr_role == TYPEC_SOURCE)
+			tcpm_set_state(port, PR_SWAP_SRC_SNK_TRANSITION_OFF,
+				       PD_T_SRC_TRANSITION);
+		else
+			tcpm_set_state(port, PR_SWAP_SNK_SRC_SINK_OFF, 0);
+		break;
+	case PR_SWAP_SRC_SNK_TRANSITION_OFF:
+		tcpm_set_vbus(port, false);
+		port->explicit_contract = false;
+		/* allow time for Vbus discharge, must be < tSrcSwapStdby */
+		tcpm_set_state(port, PR_SWAP_SRC_SNK_SOURCE_OFF,
+			       PD_T_SRCSWAPSTDBY);
+		break;
+	case PR_SWAP_SRC_SNK_SOURCE_OFF:
+		tcpm_set_cc(port, TYPEC_CC_RD);
+		/* allow CC debounce */
+		tcpm_set_state(port, PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED,
+			       PD_T_CC_DEBOUNCE);
+		break;
+	case PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED:
+		/*
+		 * USB-PD standard, 6.2.1.4, Port Power Role:
+		 * "During the Power Role Swap Sequence, for the initial Source
+		 * Port, the Port Power Role field shall be set to Sink in the
+		 * PS_RDY Message indicating that the initial Source’s power
+		 * supply is turned off"
+		 */
+		tcpm_set_pwr_role(port, TYPEC_SINK);
+		if (tcpm_pd_send_control(port, PD_CTRL_PS_RDY)) {
+			tcpm_set_state(port, ERROR_RECOVERY, 0);
+			break;
+		}
+		tcpm_set_state_cond(port, SNK_UNATTACHED, PD_T_PS_SOURCE_ON);
+		break;
+	case PR_SWAP_SRC_SNK_SINK_ON:
+		tcpm_set_state(port, SNK_STARTUP, 0);
+		break;
+	case PR_SWAP_SNK_SRC_SINK_OFF:
+		tcpm_set_charge(port, false);
+		tcpm_set_state(port, hard_reset_state(port),
+			       PD_T_PS_SOURCE_OFF);
+		break;
+	case PR_SWAP_SNK_SRC_SOURCE_ON:
+		tcpm_set_cc(port, tcpm_rp_cc(port));
+		tcpm_set_vbus(port, true);
+		/*
+		 * allow time for VBUS ramp-up, must be < tNewSrc
+		 * Also, this window overlaps with CC debounce as well.
+		 * So wait for the max of the two, which is PD_T_NEWSRC.
+		 */
+		tcpm_set_state(port, PR_SWAP_SNK_SRC_SOURCE_ON_VBUS_RAMPED_UP,
+			       PD_T_NEWSRC);
+		break;
+	case PR_SWAP_SNK_SRC_SOURCE_ON_VBUS_RAMPED_UP:
+		/*
+		 * USB PD standard, 6.2.1.4:
+		 * "Subsequent Messages initiated by the Policy Engine,
+		 * such as the PS_RDY Message sent to indicate that Vbus
+		 * is ready, will have the Port Power Role field set to
+		 * Source."
+		 */
+		tcpm_set_pwr_role(port, TYPEC_SOURCE);
+		tcpm_pd_send_control(port, PD_CTRL_PS_RDY);
+		tcpm_set_state(port, SRC_STARTUP, 0);
+		break;
+
+	case VCONN_SWAP_ACCEPT:
+		tcpm_pd_send_control(port, PD_CTRL_ACCEPT);
+		tcpm_set_state(port, VCONN_SWAP_START, 0);
+		break;
+	case VCONN_SWAP_SEND:
+		tcpm_pd_send_control(port, PD_CTRL_VCONN_SWAP);
+		tcpm_set_state(port, VCONN_SWAP_SEND_TIMEOUT,
+			       PD_T_SENDER_RESPONSE);
+		break;
+	case VCONN_SWAP_SEND_TIMEOUT:
+		tcpm_swap_complete(port, -ETIMEDOUT);
+		tcpm_set_state(port, ready_state(port), 0);
+		break;
+	case VCONN_SWAP_START:
+		if (port->vconn_role == TYPEC_SOURCE)
+			tcpm_set_state(port, VCONN_SWAP_WAIT_FOR_VCONN, 0);
+		else
+			tcpm_set_state(port, VCONN_SWAP_TURN_ON_VCONN, 0);
+		break;
+	case VCONN_SWAP_WAIT_FOR_VCONN:
+		tcpm_set_state(port, hard_reset_state(port),
+			       PD_T_VCONN_SOURCE_ON);
+		break;
+	case VCONN_SWAP_TURN_ON_VCONN:
+		tcpm_set_vconn(port, true);
+		tcpm_pd_send_control(port, PD_CTRL_PS_RDY);
+		tcpm_set_state(port, ready_state(port), 0);
+		break;
+	case VCONN_SWAP_TURN_OFF_VCONN:
+		tcpm_set_vconn(port, false);
+		tcpm_set_state(port, ready_state(port), 0);
+		break;
+
+	case DR_SWAP_CANCEL:
+	case PR_SWAP_CANCEL:
+	case VCONN_SWAP_CANCEL:
+		tcpm_swap_complete(port, port->swap_status);
+		if (port->pwr_role == TYPEC_SOURCE)
+			tcpm_set_state(port, SRC_READY, 0);
+		else
+			tcpm_set_state(port, SNK_READY, 0);
+		break;
+
+	case BIST_RX:
+		switch (BDO_MODE_MASK(port->bist_request)) {
+		case BDO_MODE_CARRIER2:
+			tcpm_pd_transmit(port, TCPC_TX_BIST_MODE_2, NULL);
+			break;
+		default:
+			break;
+		}
+		/* Always switch to unattached state */
+		tcpm_set_state(port, unattached_state(port), 0);
+		break;
+	case ERROR_RECOVERY:
+		tcpm_swap_complete(port, -EPROTO);
+		tcpm_set_state(port, PORT_RESET, 0);
+		break;
+	case PORT_RESET:
+		tcpm_reset_port(port);
+		tcpm_set_cc(port, TYPEC_CC_OPEN);
+		tcpm_set_state(port, PORT_RESET_WAIT_OFF,
+			       PD_T_ERROR_RECOVERY);
+		break;
+	case PORT_RESET_WAIT_OFF:
+		tcpm_set_state(port,
+			       tcpm_default_state(port),
+			       port->vbus_present ? PD_T_PS_SOURCE_OFF : 0);
+		break;
+	default:
+		WARN(1, "Unexpected port state %d\n", port->state);
+		break;
+	}
+}
+
+static void tcpm_state_machine_work(struct work_struct *work)
+{
+	struct tcpm_port *port = container_of(work, struct tcpm_port,
+					      state_machine.work);
+	enum tcpm_state prev_state;
+
+	mutex_lock(&port->lock);
+	port->state_machine_running = true;
+
+	if (port->queued_message && tcpm_send_queued_message(port))
+		goto done;
+
+	/* If we were queued due to a delayed state change, update it now */
+	if (port->delayed_state) {
+		tcpm_log(port, "state change %s -> %s [delayed %ld ms]",
+			 tcpm_states[port->state],
+			 tcpm_states[port->delayed_state], port->delay_ms);
+		port->prev_state = port->state;
+		port->state = port->delayed_state;
+		port->delayed_state = INVALID_STATE;
+	}
+
+	/*
+	 * Continue running as long as we have (non-delayed) state changes
+	 * to make.
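+	 * Delayed state changes are picked up when the work item is
+	 * queued again.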
+ */ + do { + prev_state = port->state; + run_state_machine(port); + if (port->queued_message) + tcpm_send_queued_message(port); + } while (port->state != prev_state && !port->delayed_state); + +done: + port->state_machine_running = false; + mutex_unlock(&port->lock); +} + +static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, + enum typec_cc_status cc2) +{ + enum typec_cc_status old_cc1, old_cc2; + enum tcpm_state new_state; + + old_cc1 = port->cc1; + old_cc2 = port->cc2; + port->cc1 = cc1; + port->cc2 = cc2; + + tcpm_log_force(port, + "CC1: %u -> %u, CC2: %u -> %u [state %s, polarity %d, %s]", + old_cc1, cc1, old_cc2, cc2, tcpm_states[port->state], + port->polarity, + tcpm_port_is_disconnected(port) ? "disconnected" + : "connected"); + + switch (port->state) { + case DRP_TOGGLING: + if (tcpm_port_is_debug(port) || tcpm_port_is_audio(port) || + tcpm_port_is_source(port)) + tcpm_set_state(port, SRC_ATTACH_WAIT, 0); + else if (tcpm_port_is_sink(port)) + tcpm_set_state(port, SNK_ATTACH_WAIT, 0); + break; + case SRC_UNATTACHED: + case ACC_UNATTACHED: + if (tcpm_port_is_debug(port) || tcpm_port_is_audio(port) || + tcpm_port_is_source(port)) + tcpm_set_state(port, SRC_ATTACH_WAIT, 0); + break; + case SRC_ATTACH_WAIT: + if (tcpm_port_is_disconnected(port) || + tcpm_port_is_audio_detached(port)) + tcpm_set_state(port, SRC_UNATTACHED, 0); + else if (cc1 != old_cc1 || cc2 != old_cc2) + tcpm_set_state(port, SRC_ATTACH_WAIT, 0); + break; + case SRC_ATTACHED: + case SRC_SEND_CAPABILITIES: + case SRC_READY: + if (tcpm_port_is_disconnected(port) || + !tcpm_port_is_source(port)) + tcpm_set_state(port, SRC_UNATTACHED, 0); + break; + case SNK_UNATTACHED: + if (tcpm_port_is_sink(port)) + tcpm_set_state(port, SNK_ATTACH_WAIT, 0); + break; + case SNK_ATTACH_WAIT: + if ((port->cc1 == TYPEC_CC_OPEN && + port->cc2 != TYPEC_CC_OPEN) || + (port->cc1 != TYPEC_CC_OPEN && + port->cc2 == TYPEC_CC_OPEN)) + new_state = SNK_DEBOUNCED; + else if (tcpm_port_is_disconnected(port)) + new_state = SNK_UNATTACHED; + else + break; + if (new_state != port->delayed_state) + tcpm_set_state(port, SNK_ATTACH_WAIT, 0); + break; + case SNK_DEBOUNCED: + if (tcpm_port_is_disconnected(port)) + new_state = SNK_UNATTACHED; + else if (port->vbus_present) + new_state = tcpm_try_src(port) ? 
SRC_TRY : SNK_ATTACHED; + else + new_state = SNK_UNATTACHED; + if (new_state != port->delayed_state) + tcpm_set_state(port, SNK_DEBOUNCED, 0); + break; + case SNK_READY: + if (tcpm_port_is_disconnected(port)) + tcpm_set_state(port, unattached_state(port), 0); + else if (!port->pd_capable && + (cc1 != old_cc1 || cc2 != old_cc2)) + tcpm_set_current_limit(port, + tcpm_get_current_limit(port), + 5000); + break; + + case AUDIO_ACC_ATTACHED: + if (cc1 == TYPEC_CC_OPEN || cc2 == TYPEC_CC_OPEN) + tcpm_set_state(port, AUDIO_ACC_DEBOUNCE, 0); + break; + case AUDIO_ACC_DEBOUNCE: + if (tcpm_port_is_audio(port)) + tcpm_set_state(port, AUDIO_ACC_ATTACHED, 0); + break; + + case DEBUG_ACC_ATTACHED: + if (cc1 == TYPEC_CC_OPEN || cc2 == TYPEC_CC_OPEN) + tcpm_set_state(port, ACC_UNATTACHED, 0); + break; + + case SNK_TRY: + /* Do nothing, waiting for timeout */ + break; + + case SNK_DISCOVERY: + /* CC line is unstable, wait for debounce */ + if (tcpm_port_is_disconnected(port)) + tcpm_set_state(port, SNK_DISCOVERY_DEBOUNCE, 0); + break; + case SNK_DISCOVERY_DEBOUNCE: + break; + + case SRC_TRYWAIT: + /* Hand over to state machine if needed */ + if (!port->vbus_present && tcpm_port_is_source(port)) + tcpm_set_state(port, SRC_TRYWAIT_DEBOUNCE, 0); + break; + case SRC_TRYWAIT_DEBOUNCE: + if (port->vbus_present || !tcpm_port_is_source(port)) + tcpm_set_state(port, SRC_TRYWAIT, 0); + break; + case SNK_TRY_WAIT_DEBOUNCE: + if (!tcpm_port_is_sink(port)) { + port->max_wait = 0; + tcpm_set_state(port, SRC_TRYWAIT, 0); + } + break; + case SRC_TRY_WAIT: + if (tcpm_port_is_source(port)) + tcpm_set_state(port, SRC_TRY_DEBOUNCE, 0); + break; + case SRC_TRY_DEBOUNCE: + tcpm_set_state(port, SRC_TRY_WAIT, 0); + break; + case SNK_TRYWAIT_DEBOUNCE: + if (tcpm_port_is_sink(port)) + tcpm_set_state(port, SNK_TRYWAIT_VBUS, 0); + break; + case SNK_TRYWAIT_VBUS: + if (!tcpm_port_is_sink(port)) + tcpm_set_state(port, SNK_TRYWAIT_DEBOUNCE, 0); + break; + case SNK_TRYWAIT: + /* Do nothing, waiting for tCCDebounce */ + break; + case PR_SWAP_SNK_SRC_SINK_OFF: + case PR_SWAP_SRC_SNK_TRANSITION_OFF: + case PR_SWAP_SRC_SNK_SOURCE_OFF: + case PR_SWAP_SRC_SNK_SOURCE_OFF_CC_DEBOUNCED: + case PR_SWAP_SNK_SRC_SOURCE_ON: + /* + * CC state change is expected in PR_SWAP + * Ignore it. + */ + break; + + default: + if (tcpm_port_is_disconnected(port)) + tcpm_set_state(port, unattached_state(port), 0); + break; + } +} + +static void _tcpm_pd_vbus_on(struct tcpm_port *port) +{ + tcpm_log_force(port, "VBUS on"); + port->vbus_present = true; + switch (port->state) { + case SNK_TRANSITION_SINK_VBUS: + port->explicit_contract = true; + tcpm_set_state(port, SNK_READY, 0); + break; + case SNK_DISCOVERY: + tcpm_set_state(port, SNK_DISCOVERY, 0); + break; + + case SNK_DEBOUNCED: + tcpm_set_state(port, tcpm_try_src(port) ? 
SRC_TRY
+					       : SNK_ATTACHED,
+			       0);
+		break;
+	case SNK_HARD_RESET_WAIT_VBUS:
+		tcpm_set_state(port, SNK_HARD_RESET_SINK_ON, 0);
+		break;
+	case SRC_ATTACHED:
+		tcpm_set_state(port, SRC_STARTUP, 0);
+		break;
+	case SRC_HARD_RESET_VBUS_ON:
+		tcpm_set_state(port, SRC_STARTUP, 0);
+		break;
+
+	case SNK_TRY:
+		/* Do nothing, waiting for timeout */
+		break;
+	case SRC_TRYWAIT:
+		/* Do nothing, waiting for Rd to be detected */
+		break;
+	case SRC_TRYWAIT_DEBOUNCE:
+		tcpm_set_state(port, SRC_TRYWAIT, 0);
+		break;
+	case SNK_TRY_WAIT_DEBOUNCE:
+		/* Do nothing, waiting for PD_DEBOUNCE to be done */
+		break;
+	case SNK_TRYWAIT:
+		/* Do nothing, waiting for tCCDebounce */
+		break;
+	case SNK_TRYWAIT_VBUS:
+		if (tcpm_port_is_sink(port))
+			tcpm_set_state(port, SNK_ATTACHED, 0);
+		break;
+	case SNK_TRYWAIT_DEBOUNCE:
+		/* Do nothing, waiting for Rp */
+		break;
+	case SRC_TRY_WAIT:
+	case SRC_TRY_DEBOUNCE:
+		/* Do nothing, waiting for sink detection */
+		break;
+	default:
+		break;
+	}
+}
+
+static void _tcpm_pd_vbus_off(struct tcpm_port *port)
+{
+	tcpm_log_force(port, "VBUS off");
+	port->vbus_present = false;
+	port->vbus_never_low = false;
+	switch (port->state) {
+	case SNK_HARD_RESET_SINK_OFF:
+		tcpm_set_state(port, SNK_HARD_RESET_WAIT_VBUS, 0);
+		break;
+	case SRC_HARD_RESET_VBUS_OFF:
+		tcpm_set_state(port, SRC_HARD_RESET_VBUS_ON, 0);
+		break;
+	case HARD_RESET_SEND:
+		break;
+
+	case SNK_TRY:
+		/* Do nothing, waiting for timeout */
+		break;
+	case SRC_TRYWAIT:
+		/* Hand over to state machine if needed */
+		if (tcpm_port_is_source(port))
+			tcpm_set_state(port, SRC_TRYWAIT_DEBOUNCE, 0);
+		break;
+	case SNK_TRY_WAIT_DEBOUNCE:
+		/* Do nothing, waiting for PD_DEBOUNCE to be done */
+		break;
+	case SNK_TRYWAIT:
+	case SNK_TRYWAIT_VBUS:
+	case SNK_TRYWAIT_DEBOUNCE:
+		break;
+	case SNK_ATTACH_WAIT:
+		tcpm_set_state(port, SNK_UNATTACHED, 0);
+		break;
+
+	case SNK_NEGOTIATE_CAPABILITIES:
+		break;
+
+	case PR_SWAP_SRC_SNK_TRANSITION_OFF:
+		tcpm_set_state(port, PR_SWAP_SRC_SNK_SOURCE_OFF, 0);
+		break;
+
+	case PR_SWAP_SNK_SRC_SINK_OFF:
+		/* Do nothing, expected */
+		break;
+
+	case PORT_RESET_WAIT_OFF:
+		tcpm_set_state(port, tcpm_default_state(port), 0);
+		break;
+	case SRC_TRY_WAIT:
+	case SRC_TRY_DEBOUNCE:
+		/* Do nothing, waiting for sink detection */
+		break;
+	default:
+		if (port->pwr_role == TYPEC_SINK &&
+		    port->attached)
+			tcpm_set_state(port, SNK_UNATTACHED, 0);
+		break;
+	}
+}
+
+static void _tcpm_pd_hard_reset(struct tcpm_port *port)
+{
+	tcpm_log_force(port, "Received hard reset");
+	/*
+	 * If we keep receiving hard reset requests, executing the hard reset
+	 * must have failed. Revert to error recovery if that happens.
+	 */
+	tcpm_set_state(port,
+		       port->hard_reset_count < PD_N_HARD_RESET_COUNT ?
+ HARD_RESET_START : ERROR_RECOVERY, + 0); +} + +static void tcpm_pd_event_handler(struct work_struct *work) +{ + struct tcpm_port *port = container_of(work, struct tcpm_port, + event_work); + u32 events; + + mutex_lock(&port->lock); + + spin_lock(&port->pd_event_lock); + while (port->pd_events) { + events = port->pd_events; + port->pd_events = 0; + spin_unlock(&port->pd_event_lock); + if (events & TCPM_RESET_EVENT) + _tcpm_pd_hard_reset(port); + if (events & TCPM_VBUS_EVENT) { + bool vbus; + + vbus = port->tcpc->get_vbus(port->tcpc); + if (vbus) + _tcpm_pd_vbus_on(port); + else + _tcpm_pd_vbus_off(port); + } + if (events & TCPM_CC_EVENT) { + enum typec_cc_status cc1, cc2; + + if (port->tcpc->get_cc(port->tcpc, &cc1, &cc2) == 0) + _tcpm_cc_change(port, cc1, cc2); + } + spin_lock(&port->pd_event_lock); + } + spin_unlock(&port->pd_event_lock); + mutex_unlock(&port->lock); +} + +void tcpm_cc_change(struct tcpm_port *port) +{ + spin_lock(&port->pd_event_lock); + port->pd_events |= TCPM_CC_EVENT; + spin_unlock(&port->pd_event_lock); + queue_work(port->wq, &port->event_work); +} +EXPORT_SYMBOL_GPL(tcpm_cc_change); + +void tcpm_vbus_change(struct tcpm_port *port) +{ + spin_lock(&port->pd_event_lock); + port->pd_events |= TCPM_VBUS_EVENT; + spin_unlock(&port->pd_event_lock); + queue_work(port->wq, &port->event_work); +} +EXPORT_SYMBOL_GPL(tcpm_vbus_change); + +void tcpm_pd_hard_reset(struct tcpm_port *port) +{ + spin_lock(&port->pd_event_lock); + port->pd_events = TCPM_RESET_EVENT; + spin_unlock(&port->pd_event_lock); + queue_work(port->wq, &port->event_work); +} +EXPORT_SYMBOL_GPL(tcpm_pd_hard_reset); + +static int tcpm_dr_set(const struct typec_capability *cap, + enum typec_data_role data) +{ + struct tcpm_port *port = typec_cap_to_tcpm(cap); + int ret; + + mutex_lock(&port->swap_lock); + mutex_lock(&port->lock); + + if (port->port_type != TYPEC_PORT_DRP) { + ret = -EINVAL; + goto port_unlock; + } + if (port->state != SRC_READY && port->state != SNK_READY) { + ret = -EAGAIN; + goto port_unlock; + } + + if (port->data_role == data) { + ret = 0; + goto port_unlock; + } + + /* + * XXX + * 6.3.9: If an alternate mode is active, a request to swap + * alternate modes shall trigger a port reset. + * Reject data role swap request in this case. + */ + + if (!port->pd_capable) { + /* + * If the partner is not PD capable, reset the port to + * trigger a role change. This can only work if a preferred + * role is configured, and if it matches the requested role. 
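+		 * (If the preferred role already equals the current power
+		 * role, a reset would re-attach in the same configuration,
+		 * so the request is rejected below.)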
+ */ + if (port->try_role == TYPEC_NO_PREFERRED_ROLE || + port->try_role == port->pwr_role) { + ret = -EINVAL; + goto port_unlock; + } + port->non_pd_role_swap = true; + tcpm_set_state(port, PORT_RESET, 0); + } else { + tcpm_set_state(port, DR_SWAP_SEND, 0); + } + + port->swap_status = 0; + port->swap_pending = true; + reinit_completion(&port->swap_complete); + mutex_unlock(&port->lock); + + if (!wait_for_completion_timeout(&port->swap_complete, + msecs_to_jiffies(PD_ROLE_SWAP_TIMEOUT))) + ret = -ETIMEDOUT; + else + ret = port->swap_status; + + port->non_pd_role_swap = false; + goto swap_unlock; + +port_unlock: + mutex_unlock(&port->lock); +swap_unlock: + mutex_unlock(&port->swap_lock); + return ret; +} + +static int tcpm_pr_set(const struct typec_capability *cap, + enum typec_role role) +{ + struct tcpm_port *port = typec_cap_to_tcpm(cap); + int ret; + + mutex_lock(&port->swap_lock); + mutex_lock(&port->lock); + + if (port->port_type != TYPEC_PORT_DRP) { + ret = -EINVAL; + goto port_unlock; + } + if (port->state != SRC_READY && port->state != SNK_READY) { + ret = -EAGAIN; + goto port_unlock; + } + + if (role == port->pwr_role) { + ret = 0; + goto port_unlock; + } + + port->swap_status = 0; + port->swap_pending = true; + reinit_completion(&port->swap_complete); + tcpm_set_state(port, PR_SWAP_SEND, 0); + mutex_unlock(&port->lock); + + if (!wait_for_completion_timeout(&port->swap_complete, + msecs_to_jiffies(PD_ROLE_SWAP_TIMEOUT))) + ret = -ETIMEDOUT; + else + ret = port->swap_status; + + goto swap_unlock; + +port_unlock: + mutex_unlock(&port->lock); +swap_unlock: + mutex_unlock(&port->swap_lock); + return ret; +} + +static int tcpm_vconn_set(const struct typec_capability *cap, + enum typec_role role) +{ + struct tcpm_port *port = typec_cap_to_tcpm(cap); + int ret; + + mutex_lock(&port->swap_lock); + mutex_lock(&port->lock); + + if (port->state != SRC_READY && port->state != SNK_READY) { + ret = -EAGAIN; + goto port_unlock; + } + + if (role == port->vconn_role) { + ret = 0; + goto port_unlock; + } + + port->swap_status = 0; + port->swap_pending = true; + reinit_completion(&port->swap_complete); + tcpm_set_state(port, VCONN_SWAP_SEND, 0); + mutex_unlock(&port->lock); + + if (!wait_for_completion_timeout(&port->swap_complete, + msecs_to_jiffies(PD_ROLE_SWAP_TIMEOUT))) + ret = -ETIMEDOUT; + else + ret = port->swap_status; + + goto swap_unlock; + +port_unlock: + mutex_unlock(&port->lock); +swap_unlock: + mutex_unlock(&port->swap_lock); + return ret; +} + +static int tcpm_try_role(const struct typec_capability *cap, int role) +{ + struct tcpm_port *port = typec_cap_to_tcpm(cap); + struct tcpc_dev *tcpc = port->tcpc; + int ret = 0; + + mutex_lock(&port->lock); + if (tcpc->try_role) + ret = tcpc->try_role(tcpc, role); + if (!ret && !tcpc->config->try_role_hw) + port->try_role = role; + port->try_src_count = 0; + port->try_snk_count = 0; + mutex_unlock(&port->lock); + + return ret; +} + +static void tcpm_init(struct tcpm_port *port) +{ + enum typec_cc_status cc1, cc2; + + port->tcpc->init(port->tcpc); + + tcpm_reset_port(port); + + /* + * XXX + * Should possibly wait for VBUS to settle if it was enabled locally + * since tcpm_reset_port() will disable VBUS. 
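+	 * (e.g. by polling tcpc->get_vbus() for a short grace period
+	 * before the read below.)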
+	 */
+	port->vbus_present = port->tcpc->get_vbus(port->tcpc);
+	if (port->vbus_present)
+		port->vbus_never_low = true;
+
+	tcpm_set_state(port, tcpm_default_state(port), 0);
+
+	if (port->tcpc->get_cc(port->tcpc, &cc1, &cc2) == 0)
+		_tcpm_cc_change(port, cc1, cc2);
+
+	/*
+	 * Some adapters need a clean slate at startup, and won't recover
+	 * otherwise. So do not try to be fancy; just force a clean
+	 * disconnect.
+	 */
+	tcpm_set_state(port, PORT_RESET, 0);
+}
+
+static int tcpm_port_type_set(const struct typec_capability *cap,
+			      enum typec_port_type type)
+{
+	struct tcpm_port *port = typec_cap_to_tcpm(cap);
+
+	mutex_lock(&port->lock);
+	if (type == port->port_type)
+		goto port_unlock;
+
+	port->port_type = type;
+
+	if (!port->connected) {
+		tcpm_set_state(port, PORT_RESET, 0);
+	} else if (type == TYPEC_PORT_UFP) {
+		if (!(port->pwr_role == TYPEC_SINK &&
+		      port->data_role == TYPEC_DEVICE))
+			tcpm_set_state(port, PORT_RESET, 0);
+	} else if (type == TYPEC_PORT_DFP) {
+		if (!(port->pwr_role == TYPEC_SOURCE &&
+		      port->data_role == TYPEC_HOST))
+			tcpm_set_state(port, PORT_RESET, 0);
+	}
+
+port_unlock:
+	mutex_unlock(&port->lock);
+	return 0;
+}
+
+void tcpm_tcpc_reset(struct tcpm_port *port)
+{
+	mutex_lock(&port->lock);
+	/* XXX: Maintain PD connection if possible? */
+	tcpm_init(port);
+	mutex_unlock(&port->lock);
+}
+EXPORT_SYMBOL_GPL(tcpm_tcpc_reset);
+
+static int tcpm_copy_pdos(u32 *dest_pdo, const u32 *src_pdo,
+			  unsigned int nr_pdo)
+{
+	unsigned int i;
+
+	if (nr_pdo > PDO_MAX_OBJECTS)
+		nr_pdo = PDO_MAX_OBJECTS;
+
+	for (i = 0; i < nr_pdo; i++)
+		dest_pdo[i] = src_pdo[i];
+
+	return nr_pdo;
+}
+
+static int tcpm_copy_vdos(u32 *dest_vdo, const u32 *src_vdo,
+			  unsigned int nr_vdo)
+{
+	unsigned int i;
+
+	if (nr_vdo > VDO_MAX_OBJECTS)
+		nr_vdo = VDO_MAX_OBJECTS;
+
+	for (i = 0; i < nr_vdo; i++)
+		dest_vdo[i] = src_vdo[i];
+
+	return nr_vdo;
+}
+
+void tcpm_update_source_capabilities(struct tcpm_port *port, const u32 *pdo,
+				     unsigned int nr_pdo)
+{
+	mutex_lock(&port->lock);
+	port->nr_src_pdo = tcpm_copy_pdos(port->src_pdo, pdo, nr_pdo);
+	switch (port->state) {
+	case SRC_UNATTACHED:
+	case SRC_ATTACH_WAIT:
+	case SRC_TRYWAIT:
+		tcpm_set_cc(port, tcpm_rp_cc(port));
+		break;
+	case SRC_SEND_CAPABILITIES:
+	case SRC_NEGOTIATE_CAPABILITIES:
+	case SRC_READY:
+	case SRC_WAIT_NEW_CAPABILITIES:
+		tcpm_set_cc(port, tcpm_rp_cc(port));
+		tcpm_set_state(port, SRC_SEND_CAPABILITIES, 0);
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(&port->lock);
+}
+EXPORT_SYMBOL_GPL(tcpm_update_source_capabilities);
+
+void tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo,
+				   unsigned int nr_pdo,
+				   unsigned int max_snk_mv,
+				   unsigned int max_snk_ma,
+				   unsigned int max_snk_mw,
+				   unsigned int operating_snk_mw)
+{
+	mutex_lock(&port->lock);
+	port->nr_snk_pdo = tcpm_copy_pdos(port->snk_pdo, pdo, nr_pdo);
+	port->max_snk_mv = max_snk_mv;
+	port->max_snk_ma = max_snk_ma;
+	port->max_snk_mw = max_snk_mw;
+	port->operating_snk_mw = operating_snk_mw;
+
+	switch (port->state) {
+	case SNK_NEGOTIATE_CAPABILITIES:
+	case SNK_READY:
+	case SNK_TRANSITION_SINK:
+	case SNK_TRANSITION_SINK_VBUS:
+		tcpm_set_state(port, SNK_NEGOTIATE_CAPABILITIES, 0);
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(&port->lock);
+}
+EXPORT_SYMBOL_GPL(tcpm_update_sink_capabilities);
+
+struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc)
+{
+	struct tcpm_port *port;
+	int i, err;
+
+	if (!dev || !tcpc || !tcpc->config ||
+	    !tcpc->get_vbus || !tcpc->set_cc || !tcpc->get_cc ||
+	    !tcpc->set_polarity ||
!tcpc->set_vconn || !tcpc->set_vbus || + !tcpc->set_pd_rx || !tcpc->set_roles || !tcpc->pd_transmit) + return ERR_PTR(-EINVAL); + + port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + port->dev = dev; + port->tcpc = tcpc; + + mutex_init(&port->lock); + mutex_init(&port->swap_lock); + + port->wq = create_singlethread_workqueue(dev_name(dev)); + if (!port->wq) + return ERR_PTR(-ENOMEM); + INIT_DELAYED_WORK(&port->state_machine, tcpm_state_machine_work); + INIT_DELAYED_WORK(&port->vdm_state_machine, vdm_state_machine_work); + INIT_WORK(&port->event_work, tcpm_pd_event_handler); + + spin_lock_init(&port->pd_event_lock); + + init_completion(&port->tx_complete); + init_completion(&port->swap_complete); + + port->nr_src_pdo = tcpm_copy_pdos(port->src_pdo, tcpc->config->src_pdo, + tcpc->config->nr_src_pdo); + port->nr_snk_pdo = tcpm_copy_pdos(port->snk_pdo, tcpc->config->snk_pdo, + tcpc->config->nr_snk_pdo); + port->nr_snk_vdo = tcpm_copy_vdos(port->snk_vdo, tcpc->config->snk_vdo, + tcpc->config->nr_snk_vdo); + + port->max_snk_mv = tcpc->config->max_snk_mv; + port->max_snk_ma = tcpc->config->max_snk_ma; + port->max_snk_mw = tcpc->config->max_snk_mw; + port->operating_snk_mw = tcpc->config->operating_snk_mw; + if (!tcpc->config->try_role_hw) + port->try_role = tcpc->config->default_role; + else + port->try_role = TYPEC_NO_PREFERRED_ROLE; + + port->typec_caps.prefer_role = tcpc->config->default_role; + port->typec_caps.type = tcpc->config->type; + port->typec_caps.revision = 0x0120; /* Type-C spec release 1.2 */ + port->typec_caps.pd_revision = 0x0200; /* USB-PD spec release 2.0 */ + port->typec_caps.dr_set = tcpm_dr_set; + port->typec_caps.pr_set = tcpm_pr_set; + port->typec_caps.vconn_set = tcpm_vconn_set; + port->typec_caps.try_role = tcpm_try_role; + port->typec_caps.port_type_set = tcpm_port_type_set; + + port->partner_desc.identity = &port->partner_ident; + port->port_type = tcpc->config->type; + + port->typec_port = typec_register_port(port->dev, &port->typec_caps); + if (!port->typec_port) { + err = -ENOMEM; + goto out_destroy_wq; + } + + if (tcpc->config->alt_modes) { + const struct typec_altmode_desc *paltmode = tcpc->config->alt_modes; + + i = 0; + while (paltmode->svid && i < ARRAY_SIZE(port->port_altmode)) { + port->port_altmode[i] = + typec_port_register_altmode(port->typec_port, + paltmode); + if (!port->port_altmode[i]) { + tcpm_log(port, + "%s: failed to register port alternate mode 0x%x", + dev_name(dev), paltmode->svid); + break; + } + i++; + paltmode++; + } + } + + tcpm_debugfs_init(port); + mutex_lock(&port->lock); + tcpm_init(port); + mutex_unlock(&port->lock); + + tcpm_log(port, "%s: registered", dev_name(dev)); + return port; + +out_destroy_wq: + destroy_workqueue(port->wq); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(tcpm_register_port); + +void tcpm_unregister_port(struct tcpm_port *port) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(port->port_altmode); i++) + typec_unregister_altmode(port->port_altmode[i]); + typec_unregister_port(port->typec_port); + tcpm_debugfs_exit(port); + destroy_workqueue(port->wq); +} +EXPORT_SYMBOL_GPL(tcpm_unregister_port); + +MODULE_AUTHOR("Guenter Roeck "); +MODULE_DESCRIPTION("USB Type-C Port Manager"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h new file mode 100644 index 000000000000..e00051ced806 --- /dev/null +++ b/include/linux/usb/pd.h @@ -0,0 +1,298 @@ +/* + * Copyright 2015-2017 Google, Inc + * + * This program is free software; you can redistribute it 
and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_USB_PD_H
+#define __LINUX_USB_PD_H
+
+#include <linux/types.h>
+#include <linux/usb/typec.h>
+
+/* USB PD Messages */
+enum pd_ctrl_msg_type {
+	/* 0 Reserved */
+	PD_CTRL_GOOD_CRC = 1,
+	PD_CTRL_GOTO_MIN = 2,
+	PD_CTRL_ACCEPT = 3,
+	PD_CTRL_REJECT = 4,
+	PD_CTRL_PING = 5,
+	PD_CTRL_PS_RDY = 6,
+	PD_CTRL_GET_SOURCE_CAP = 7,
+	PD_CTRL_GET_SINK_CAP = 8,
+	PD_CTRL_DR_SWAP = 9,
+	PD_CTRL_PR_SWAP = 10,
+	PD_CTRL_VCONN_SWAP = 11,
+	PD_CTRL_WAIT = 12,
+	PD_CTRL_SOFT_RESET = 13,
+	/* 14-15 Reserved */
+};
+
+enum pd_data_msg_type {
+	/* 0 Reserved */
+	PD_DATA_SOURCE_CAP = 1,
+	PD_DATA_REQUEST = 2,
+	PD_DATA_BIST = 3,
+	PD_DATA_SINK_CAP = 4,
+	/* 5-14 Reserved */
+	PD_DATA_VENDOR_DEF = 15,
+};
+
+#define PD_REV10	0x0
+#define PD_REV20	0x1
+
+#define PD_HEADER_CNT_SHIFT	12
+#define PD_HEADER_CNT_MASK	0x7
+#define PD_HEADER_ID_SHIFT	9
+#define PD_HEADER_ID_MASK	0x7
+#define PD_HEADER_PWR_ROLE	BIT(8)
+#define PD_HEADER_REV_SHIFT	6
+#define PD_HEADER_REV_MASK	0x3
+#define PD_HEADER_DATA_ROLE	BIT(5)
+#define PD_HEADER_TYPE_SHIFT	0
+#define PD_HEADER_TYPE_MASK	0xf
+
+#define PD_HEADER(type, pwr, data, id, cnt)				\
+	((((type) & PD_HEADER_TYPE_MASK) << PD_HEADER_TYPE_SHIFT) |	\
+	 ((pwr) == TYPEC_SOURCE ? PD_HEADER_PWR_ROLE : 0) |		\
+	 ((data) == TYPEC_HOST ? PD_HEADER_DATA_ROLE : 0) |		\
+	 (PD_REV20 << PD_HEADER_REV_SHIFT) |				\
+	 (((id) & PD_HEADER_ID_MASK) << PD_HEADER_ID_SHIFT) |		\
+	 (((cnt) & PD_HEADER_CNT_MASK) << PD_HEADER_CNT_SHIFT))
+
+#define PD_HEADER_LE(type, pwr, data, id, cnt) \
+	cpu_to_le16(PD_HEADER((type), (pwr), (data), (id), (cnt)))
+
+static inline unsigned int pd_header_cnt(u16 header)
+{
+	return (header >> PD_HEADER_CNT_SHIFT) & PD_HEADER_CNT_MASK;
+}
+
+static inline unsigned int pd_header_cnt_le(__le16 header)
+{
+	return pd_header_cnt(le16_to_cpu(header));
+}
+
+static inline unsigned int pd_header_type(u16 header)
+{
+	return (header >> PD_HEADER_TYPE_SHIFT) & PD_HEADER_TYPE_MASK;
+}
+
+static inline unsigned int pd_header_type_le(__le16 header)
+{
+	return pd_header_type(le16_to_cpu(header));
+}
+
+static inline unsigned int pd_header_msgid(u16 header)
+{
+	return (header >> PD_HEADER_ID_SHIFT) & PD_HEADER_ID_MASK;
+}
+
+static inline unsigned int pd_header_msgid_le(__le16 header)
+{
+	return pd_header_msgid(le16_to_cpu(header));
+}
+
+#define PD_MAX_PAYLOAD	7
+
+/**
+ * struct pd_message - PD message as seen on wire
+ * @header:	PD message header
+ * @payload:	PD message payload
+ */
+struct pd_message {
+	__le16 header;
+	__le32 payload[PD_MAX_PAYLOAD];
+} __packed;
+
+/* PDO: Power Data Object */
+#define PDO_MAX_OBJECTS		7
+
+enum pd_pdo_type {
+	PDO_TYPE_FIXED = 0,
+	PDO_TYPE_BATT = 1,
+	PDO_TYPE_VAR = 2,
+};
+
+#define PDO_TYPE_SHIFT		30
+#define PDO_TYPE_MASK		0x3
+
+#define PDO_TYPE(t)	((t) << PDO_TYPE_SHIFT)
+
+#define PDO_VOLT_MASK		0x3ff
+#define PDO_CURR_MASK		0x3ff
+#define PDO_PWR_MASK		0x3ff
+
+#define PDO_FIXED_DUAL_ROLE	BIT(29)	/* Power role swap supported */
+#define PDO_FIXED_SUSPEND	BIT(28) /* USB Suspend supported (Source) */
+#define PDO_FIXED_HIGHER_CAP	BIT(28) /* Requires more than vSafe5V (Sink) */
+#define PDO_FIXED_EXTPOWER	BIT(27)
/* Externally powered */ +#define PDO_FIXED_USB_COMM BIT(26) /* USB communications capable */ +#define PDO_FIXED_DATA_SWAP BIT(25) /* Data role swap supported */ +#define PDO_FIXED_VOLT_SHIFT 10 /* 50mV units */ +#define PDO_FIXED_CURR_SHIFT 0 /* 10mA units */ + +#define PDO_FIXED_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_FIXED_VOLT_SHIFT) +#define PDO_FIXED_CURR(ma) ((((ma) / 10) & PDO_CURR_MASK) << PDO_FIXED_CURR_SHIFT) + +#define PDO_FIXED(mv, ma, flags) \ + (PDO_TYPE(PDO_TYPE_FIXED) | (flags) | \ + PDO_FIXED_VOLT(mv) | PDO_FIXED_CURR(ma)) + +#define PDO_BATT_MAX_VOLT_SHIFT 20 /* 50mV units */ +#define PDO_BATT_MIN_VOLT_SHIFT 10 /* 50mV units */ +#define PDO_BATT_MAX_PWR_SHIFT 0 /* 250mW units */ + +#define PDO_BATT_MIN_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_BATT_MIN_VOLT_SHIFT) +#define PDO_BATT_MAX_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_BATT_MAX_VOLT_SHIFT) +#define PDO_BATT_MAX_POWER(mw) ((((mw) / 250) & PDO_PWR_MASK) << PDO_BATT_MAX_PWR_SHIFT) + +#define PDO_BATT(min_mv, max_mv, max_mw) \ + (PDO_TYPE(PDO_TYPE_BATT) | PDO_BATT_MIN_VOLT(min_mv) | \ + PDO_BATT_MAX_VOLT(max_mv) | PDO_BATT_MAX_POWER(max_mw)) + +#define PDO_VAR_MAX_VOLT_SHIFT 20 /* 50mV units */ +#define PDO_VAR_MIN_VOLT_SHIFT 10 /* 50mV units */ +#define PDO_VAR_MAX_CURR_SHIFT 0 /* 10mA units */ + +#define PDO_VAR_MIN_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_VAR_MIN_VOLT_SHIFT) +#define PDO_VAR_MAX_VOLT(mv) ((((mv) / 50) & PDO_VOLT_MASK) << PDO_VAR_MAX_VOLT_SHIFT) +#define PDO_VAR_MAX_CURR(ma) ((((ma) / 10) & PDO_CURR_MASK) << PDO_VAR_MAX_CURR_SHIFT) + +#define PDO_VAR(min_mv, max_mv, max_ma) \ + (PDO_TYPE(PDO_TYPE_VAR) | PDO_VAR_MIN_VOLT(min_mv) | \ + PDO_VAR_MAX_VOLT(max_mv) | PDO_VAR_MAX_CURR(max_ma)) + +static inline enum pd_pdo_type pdo_type(u32 pdo) +{ + return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK; +} + +static inline unsigned int pdo_fixed_voltage(u32 pdo) +{ + return ((pdo >> PDO_FIXED_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; +} + +static inline unsigned int pdo_min_voltage(u32 pdo) +{ + return ((pdo >> PDO_VAR_MIN_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; +} + +static inline unsigned int pdo_max_voltage(u32 pdo) +{ + return ((pdo >> PDO_VAR_MAX_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; +} + +static inline unsigned int pdo_max_current(u32 pdo) +{ + return ((pdo >> PDO_VAR_MAX_CURR_SHIFT) & PDO_CURR_MASK) * 10; +} + +static inline unsigned int pdo_max_power(u32 pdo) +{ + return ((pdo >> PDO_BATT_MAX_PWR_SHIFT) & PDO_PWR_MASK) * 250; +} + +/* RDO: Request Data Object */ +#define RDO_OBJ_POS_SHIFT 28 +#define RDO_OBJ_POS_MASK 0x7 +#define RDO_GIVE_BACK BIT(27) /* Supports reduced operating current */ +#define RDO_CAP_MISMATCH BIT(26) /* Not satisfied by source caps */ +#define RDO_USB_COMM BIT(25) /* USB communications capable */ +#define RDO_NO_SUSPEND BIT(24) /* USB Suspend not supported */ + +#define RDO_PWR_MASK 0x3ff +#define RDO_CURR_MASK 0x3ff + +#define RDO_FIXED_OP_CURR_SHIFT 10 +#define RDO_FIXED_MAX_CURR_SHIFT 0 + +#define RDO_OBJ(idx) (((idx) & RDO_OBJ_POS_MASK) << RDO_OBJ_POS_SHIFT) + +#define PDO_FIXED_OP_CURR(ma) ((((ma) / 10) & RDO_CURR_MASK) << RDO_FIXED_OP_CURR_SHIFT) +#define PDO_FIXED_MAX_CURR(ma) ((((ma) / 10) & RDO_CURR_MASK) << RDO_FIXED_MAX_CURR_SHIFT) + +#define RDO_FIXED(idx, op_ma, max_ma, flags) \ + (RDO_OBJ(idx) | (flags) | \ + PDO_FIXED_OP_CURR(op_ma) | PDO_FIXED_MAX_CURR(max_ma)) + +#define RDO_BATT_OP_PWR_SHIFT 10 /* 250mW units */ +#define RDO_BATT_MAX_PWR_SHIFT 0 /* 250mW units */ + +#define RDO_BATT_OP_PWR(mw) ((((mw) / 250) & RDO_PWR_MASK) << RDO_BATT_OP_PWR_SHIFT) 
+#define RDO_BATT_MAX_PWR(mw) ((((mw) / 250) & RDO_PWR_MASK) << RDO_BATT_MAX_PWR_SHIFT) + +#define RDO_BATT(idx, op_mw, max_mw, flags) \ + (RDO_OBJ(idx) | (flags) | \ + RDO_BATT_OP_PWR(op_mw) | RDO_BATT_MAX_PWR(max_mw)) + +static inline unsigned int rdo_index(u32 rdo) +{ + return (rdo >> RDO_OBJ_POS_SHIFT) & RDO_OBJ_POS_MASK; +} + +static inline unsigned int rdo_op_current(u32 rdo) +{ + return ((rdo >> RDO_FIXED_OP_CURR_SHIFT) & RDO_CURR_MASK) * 10; +} + +static inline unsigned int rdo_max_current(u32 rdo) +{ + return ((rdo >> RDO_FIXED_MAX_CURR_SHIFT) & + RDO_CURR_MASK) * 10; +} + +static inline unsigned int rdo_op_power(u32 rdo) +{ + return ((rdo >> RDO_BATT_OP_PWR_SHIFT) & RDO_PWR_MASK) * 250; +} + +static inline unsigned int rdo_max_power(u32 rdo) +{ + return ((rdo >> RDO_BATT_MAX_PWR_SHIFT) & RDO_PWR_MASK) * 250; +} + +/* USB PD timers and counters */ +#define PD_T_NO_RESPONSE 5000 /* 4.5 - 5.5 seconds */ +#define PD_T_DB_DETECT 10000 /* 10 - 15 seconds */ +#define PD_T_SEND_SOURCE_CAP 150 /* 100 - 200 ms */ +#define PD_T_SENDER_RESPONSE 60 /* 24 - 30 ms, relaxed */ +#define PD_T_SOURCE_ACTIVITY 45 +#define PD_T_SINK_ACTIVITY 135 +#define PD_T_SINK_WAIT_CAP 240 +#define PD_T_PS_TRANSITION 500 +#define PD_T_SRC_TRANSITION 35 +#define PD_T_DRP_SNK 40 +#define PD_T_DRP_SRC 30 +#define PD_T_PS_SOURCE_OFF 920 +#define PD_T_PS_SOURCE_ON 480 +#define PD_T_PS_HARD_RESET 30 +#define PD_T_SRC_RECOVER 760 +#define PD_T_SRC_RECOVER_MAX 1000 +#define PD_T_SRC_TURN_ON 275 +#define PD_T_SAFE_0V 650 +#define PD_T_VCONN_SOURCE_ON 100 +#define PD_T_SINK_REQUEST 100 /* 100 ms minimum */ +#define PD_T_ERROR_RECOVERY 100 /* minimum 25 is insufficient */ +#define PD_T_SRCSWAPSTDBY 625 /* Maximum of 650ms */ +#define PD_T_NEWSRC 250 /* Maximum of 275ms */ + +#define PD_T_DRP_TRY 100 /* 75 - 150 ms */ +#define PD_T_DRP_TRYWAIT 600 /* 400 - 800 ms */ + +#define PD_T_CC_DEBOUNCE 200 /* 100 - 200 ms */ +#define PD_T_PD_DEBOUNCE 20 /* 10 - 20 ms */ + +#define PD_N_CAPS_COUNT (PD_T_NO_RESPONSE / PD_T_SEND_SOURCE_CAP) +#define PD_N_HARD_RESET_COUNT 2 + +#endif /* __LINUX_USB_PD_H */ diff --git a/include/linux/usb/pd_bdo.h b/include/linux/usb/pd_bdo.h new file mode 100644 index 000000000000..90b94d9fea5d --- /dev/null +++ b/include/linux/usb/pd_bdo.h @@ -0,0 +1,31 @@ +/* + * Copyright 2015-2017 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */
+
+#ifndef __LINUX_USB_PD_BDO_H
+#define __LINUX_USB_PD_BDO_H
+
+/* BDO : BIST Data Object */
+#define BDO_MODE_RECV		(0 << 28)
+#define BDO_MODE_TRANSMIT	(1 << 28)
+#define BDO_MODE_COUNTERS	(2 << 28)
+#define BDO_MODE_CARRIER0	(3 << 28)
+#define BDO_MODE_CARRIER1	(4 << 28)
+#define BDO_MODE_CARRIER2	(5 << 28)
+#define BDO_MODE_CARRIER3	(6 << 28)
+#define BDO_MODE_EYE		(7 << 28)
+#define BDO_MODE_TESTDATA	(8 << 28)
+
+#define BDO_MODE_MASK(mode)	((mode) & 0xf0000000)
+
+#endif
diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h
new file mode 100644
index 000000000000..d92259f8de0a
--- /dev/null
+++ b/include/linux/usb/pd_vdo.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright 2015-2017 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_USB_PD_VDO_H
+#define __LINUX_USB_PD_VDO_H
+
+#include "pd.h"
+
+/*
+ * VDO : Vendor Defined Message Object
+ * A VDM consists of a VDM header plus up to 6 additional data objects
+ * (hence VDO_MAX_SIZE below is 7).
+ */
+
+#define VDO_MAX_OBJECTS		6
+#define VDO_MAX_SIZE		(VDO_MAX_OBJECTS + 1)
+
+/*
+ * VDM header
+ * ----------
+ * <31:16>  :: SVID
+ * <15>     :: VDM type ( 1b == structured, 0b == unstructured )
+ * <14:13>  :: Structured VDM version (can only be 00 == 1.0 currently)
+ * <12:11>  :: reserved
+ * <10:8>   :: object position (1-7 valid ... used for enter/exit mode only)
+ * <7:6>    :: command type (SVDM only?)
+ * <5>      :: reserved (SVDM), command type (UVDM)
+ * <4:0>    :: command
+ */
+#define VDO(vid, type, custom)			\
+	(((vid) << 16) |			\
+	 ((type) << 15) |			\
+	 ((custom) & 0x7FFF))
+
+#define VDO_SVDM_TYPE		(1 << 15)
+#define VDO_SVDM_VERS(x)	((x) << 13)
+#define VDO_OPOS(x)		((x) << 8)
+#define VDO_CMDT(x)		((x) << 6)
+#define VDO_OPOS_MASK		VDO_OPOS(0x7)
+#define VDO_CMDT_MASK		VDO_CMDT(0x3)
+
+#define CMDT_INIT		0
+#define CMDT_RSP_ACK		1
+#define CMDT_RSP_NAK		2
+#define CMDT_RSP_BUSY		3
+
+/* reserved for SVDM ...
for Google UVDM */
+#define VDO_SRC_INITIATOR	(0 << 5)
+#define VDO_SRC_RESPONDER	(1 << 5)
+
+#define CMD_DISCOVER_IDENT	1
+#define CMD_DISCOVER_SVID	2
+#define CMD_DISCOVER_MODES	3
+#define CMD_ENTER_MODE		4
+#define CMD_EXIT_MODE		5
+#define CMD_ATTENTION		6
+
+#define VDO_CMD_VENDOR(x)	(((10 + (x)) & 0x1f))
+
+/* ChromeOS specific commands */
+#define VDO_CMD_VERSION		VDO_CMD_VENDOR(0)
+#define VDO_CMD_SEND_INFO	VDO_CMD_VENDOR(1)
+#define VDO_CMD_READ_INFO	VDO_CMD_VENDOR(2)
+#define VDO_CMD_REBOOT		VDO_CMD_VENDOR(5)
+#define VDO_CMD_FLASH_ERASE	VDO_CMD_VENDOR(6)
+#define VDO_CMD_FLASH_WRITE	VDO_CMD_VENDOR(7)
+#define VDO_CMD_ERASE_SIG	VDO_CMD_VENDOR(8)
+#define VDO_CMD_PING_ENABLE	VDO_CMD_VENDOR(10)
+#define VDO_CMD_CURRENT		VDO_CMD_VENDOR(11)
+#define VDO_CMD_FLIP		VDO_CMD_VENDOR(12)
+#define VDO_CMD_GET_LOG		VDO_CMD_VENDOR(13)
+#define VDO_CMD_CCD_EN		VDO_CMD_VENDOR(14)
+
+#define PD_VDO_VID(vdo)		((vdo) >> 16)
+#define PD_VDO_SVDM(vdo)	(((vdo) >> 15) & 1)
+#define PD_VDO_OPOS(vdo)	(((vdo) >> 8) & 0x7)
+#define PD_VDO_CMD(vdo)		((vdo) & 0x1f)
+#define PD_VDO_CMDT(vdo)	(((vdo) >> 6) & 0x3)
+
+/*
+ * SVDM Identity request -> response
+ *
+ * Request is simply a properly formatted SVDM header.
+ *
+ * Response is the SVDM header followed by 4 data objects:
+ * [0] :: SVDM header
+ * [1] :: Identity header
+ * [2] :: Cert Stat VDO
+ * [3] :: (Product | Cable) VDO
+ * [4] :: AMA VDO
+ *
+ */
+#define VDO_INDEX_HDR		0
+#define VDO_INDEX_IDH		1
+#define VDO_INDEX_CSTAT		2
+#define VDO_INDEX_CABLE		3
+#define VDO_INDEX_PRODUCT	3
+#define VDO_INDEX_AMA		4
+
+/*
+ * SVDM Identity Header
+ * --------------------
+ * <31>     :: data capable as a USB host
+ * <30>     :: data capable as a USB device
+ * <29:27>  :: product type
+ * <26>     :: modal operation supported (1b == yes)
+ * <25:16>  :: Reserved, Shall be set to zero
+ * <15:0>   :: USB-IF assigned VID for this cable vendor
+ */
+#define IDH_PTYPE_UNDEF		0
+#define IDH_PTYPE_HUB		1
+#define IDH_PTYPE_PERIPH	2
+#define IDH_PTYPE_PCABLE	3
+#define IDH_PTYPE_ACABLE	4
+#define IDH_PTYPE_AMA		5
+
+#define VDO_IDH(usbh, usbd, ptype, is_modal, vid)		\
+	((usbh) << 31 | (usbd) << 30 | ((ptype) & 0x7) << 27	\
+	 | (is_modal) << 26 | ((vid) & 0xffff))
+
+#define PD_IDH_PTYPE(vdo)	(((vdo) >> 27) & 0x7)
+#define PD_IDH_VID(vdo)		((vdo) & 0xffff)
+#define PD_IDH_MODAL_SUPP(vdo)	((vdo) & (1 << 26))
+
+/*
+ * Cert Stat VDO
+ * -------------
+ * <31:0>  : USB-IF assigned XID for this cable
+ */
+#define PD_CSTAT_XID(vdo)	(vdo)
+
+/*
+ * Product VDO
+ * -----------
+ * <31:16> : USB Product ID
+ * <15:0>  : USB bcdDevice
+ */
+#define VDO_PRODUCT(pid, bcd)	(((pid) & 0xffff) << 16 | ((bcd) & 0xffff))
+#define PD_PRODUCT_PID(vdo)	(((vdo) >> 16) & 0xffff)
+
+/*
+ * Cable VDO
+ * ---------
+ * <31:28> :: Cable HW version
+ * <27:24> :: Cable FW version
+ * <23:20> :: Reserved, Shall be set to zero
+ * <19:18> :: type-C to Type-A/B/C (00b == A, 01 == B, 10 == C)
+ * <17>    :: Type-C to Plug/Receptacle (0b == plug, 1b == receptacle)
+ * <16:13> :: cable latency (0001 == <10ns(~1m length))
+ * <12:11> :: cable termination type (11b == both ends active VCONN req)
+ * <10>    :: SSTX1 Directionality support (0b == fixed, 1b == cfgable)
+ * <9>     :: SSTX2 Directionality support
+ * <8>     :: SSRX1 Directionality support
+ * <7>     :: SSRX2 Directionality support
+ * <6:5>   :: Vbus current handling capability
+ * <4>     :: Vbus through cable (0b == no, 1b == yes)
+ * <3>     :: SOP" controller present?
(0b == no, 1b == yes)
+ * <2:0>   :: USB SS Signaling support
+ */
+#define CABLE_ATYPE		0
+#define CABLE_BTYPE		1
+#define CABLE_CTYPE		2
+#define CABLE_PLUG		0
+#define CABLE_RECEPTACLE	1
+#define CABLE_CURR_1A5		0
+#define CABLE_CURR_3A		1
+#define CABLE_CURR_5A		2
+#define CABLE_USBSS_U2_ONLY	0
+#define CABLE_USBSS_U31_GEN1	1
+#define CABLE_USBSS_U31_GEN2	2
+#define VDO_CABLE(hw, fw, cbl, gdr, lat, term, tx1d, tx2d, rx1d, rx2d, cur,\
+		  vps, sopp, usbss) \
+	(((hw) & 0x7) << 28 | ((fw) & 0x7) << 24 | ((cbl) & 0x3) << 18 \
+	 | (gdr) << 17 | ((lat) & 0x7) << 13 | ((term) & 0x3) << 11 \
+	 | (tx1d) << 10 | (tx2d) << 9 | (rx1d) << 8 | (rx2d) << 7 \
+	 | ((cur) & 0x3) << 5 | (vps) << 4 | (sopp) << 3 \
+	 | ((usbss) & 0x7))
+
+/*
+ * AMA VDO
+ * ---------
+ * <31:28> :: Cable HW version
+ * <27:24> :: Cable FW version
+ * <23:12> :: Reserved, Shall be set to zero
+ * <11>    :: SSTX1 Directionality support (0b == fixed, 1b == cfgable)
+ * <10>    :: SSTX2 Directionality support
+ * <9>     :: SSRX1 Directionality support
+ * <8>     :: SSRX2 Directionality support
+ * <7:5>   :: Vconn power
+ * <4>     :: Vconn power required
+ * <3>     :: Vbus power required
+ * <2:0>   :: USB SS Signaling support
+ */
+#define VDO_AMA(hw, fw, tx1d, tx2d, rx1d, rx2d, vcpwr, vcr, vbr, usbss) \
+	(((hw) & 0x7) << 28 | ((fw) & 0x7) << 24 \
+	 | (tx1d) << 11 | (tx2d) << 10 | (rx1d) << 9 | (rx2d) << 8 \
+	 | ((vcpwr) & 0x7) << 5 | (vcr) << 4 | (vbr) << 3 \
+	 | ((usbss) & 0x7))
+
+#define PD_VDO_AMA_VCONN_REQ(vdo)	(((vdo) >> 4) & 1)
+#define PD_VDO_AMA_VBUS_REQ(vdo)	(((vdo) >> 3) & 1)
+
+#define AMA_VCONN_PWR_1W	0
+#define AMA_VCONN_PWR_1W5	1
+#define AMA_VCONN_PWR_2W	2
+#define AMA_VCONN_PWR_3W	3
+#define AMA_VCONN_PWR_4W	4
+#define AMA_VCONN_PWR_5W	5
+#define AMA_VCONN_PWR_6W	6
+#define AMA_USBSS_U2_ONLY	0
+#define AMA_USBSS_U31_GEN1	1
+#define AMA_USBSS_U31_GEN2	2
+#define AMA_USBSS_BBONLY	3
+
+/*
+ * SVDM Discover SVIDs request -> response
+ *
+ * Request is a properly formatted VDM header with the Discover SVIDs command.
+ * Response is the set of all supported SVIDs, with all-zero entries marking
+ * the end of the list. If more than 12 SVIDs are supported, the command
+ * SHOULD be repeated.
+ */
+#define VDO_SVID(svid0, svid1)	(((svid0) & 0xffff) << 16 | ((svid1) & 0xffff))
+#define PD_VDO_SVID_SVID0(vdo)	((vdo) >> 16)
+#define PD_VDO_SVID_SVID1(vdo)	((vdo) & 0xffff)
+
+/* USB-IF SIDs */
+#define USB_SID_PD		0xff00 /* power delivery */
+#define USB_SID_DISPLAYPORT	0xff01
+#define USB_SID_MHL		0xff02 /* Mobile High-Definition Link */
+
+/* VDM command timeouts (in ms) */
+
+#define PD_T_VDM_UNSTRUCTURED	500
+#define PD_T_VDM_BUSY		100
+#define PD_T_VDM_WAIT_MODE_E	100
+#define PD_T_VDM_SNDR_RSP	30
+#define PD_T_VDM_E_MODE		25
+#define PD_T_VDM_RCVR_RSP	15
+
+#endif /* __LINUX_USB_PD_VDO_H */
diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h
new file mode 100644
index 000000000000..073197f0d2bb
--- /dev/null
+++ b/include/linux/usb/tcpm.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2015-2017 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details. 
+ */ + +#ifndef __LINUX_USB_TCPM_H +#define __LINUX_USB_TCPM_H + +#include +#include +#include "pd.h" + +enum typec_cc_status { + TYPEC_CC_OPEN, + TYPEC_CC_RA, + TYPEC_CC_RD, + TYPEC_CC_RP_DEF, + TYPEC_CC_RP_1_5, + TYPEC_CC_RP_3_0, +}; + +enum typec_cc_polarity { + TYPEC_POLARITY_CC1, + TYPEC_POLARITY_CC2, +}; + +/* Time to wait for TCPC to complete transmit */ +#define PD_T_TCPC_TX_TIMEOUT 100 /* in ms */ +#define PD_ROLE_SWAP_TIMEOUT (MSEC_PER_SEC * 10) + +enum tcpm_transmit_status { + TCPC_TX_SUCCESS = 0, + TCPC_TX_DISCARDED = 1, + TCPC_TX_FAILED = 2, +}; + +enum tcpm_transmit_type { + TCPC_TX_SOP = 0, + TCPC_TX_SOP_PRIME = 1, + TCPC_TX_SOP_PRIME_PRIME = 2, + TCPC_TX_SOP_DEBUG_PRIME = 3, + TCPC_TX_SOP_DEBUG_PRIME_PRIME = 4, + TCPC_TX_HARD_RESET = 5, + TCPC_TX_CABLE_RESET = 6, + TCPC_TX_BIST_MODE_2 = 7 +}; + +/** + * struct tcpc_config - Port configuration + * @src_pdo: PDO parameters sent to port partner as response to + * PD_CTRL_GET_SOURCE_CAP message + * @nr_src_pdo: Number of entries in @src_pdo + * @snk_pdo: PDO parameters sent to partner as response to + * PD_CTRL_GET_SINK_CAP message + * @nr_snk_pdo: Number of entries in @snk_pdo + * @max_snk_mv: Maximum acceptable sink voltage in mV + * @max_snk_ma: Maximum sink current in mA + * @max_snk_mw: Maximum required sink power in mW + * @operating_snk_mw: + * Required operating sink power in mW + * @type: Port type (TYPEC_PORT_DFP, TYPEC_PORT_UFP, or + * TYPEC_PORT_DRP) + * @default_role: + * Default port role (TYPEC_SINK or TYPEC_SOURCE). + * Set to TYPEC_NO_PREFERRED_ROLE if no default role. + * @try_role_hw:True if try.{Src,Snk} is implemented in hardware + * @alt_modes: List of supported alternate modes + */ +struct tcpc_config { + const u32 *src_pdo; + unsigned int nr_src_pdo; + + const u32 *snk_pdo; + unsigned int nr_snk_pdo; + + const u32 *snk_vdo; + unsigned int nr_snk_vdo; + + unsigned int max_snk_mv; + unsigned int max_snk_ma; + unsigned int max_snk_mw; + unsigned int operating_snk_mw; + + enum typec_port_type type; + enum typec_role default_role; + bool try_role_hw; /* try.{src,snk} implemented in hardware */ + + const struct typec_altmode_desc *alt_modes; +}; + +enum tcpc_usb_switch { + TCPC_USB_SWITCH_CONNECT, + TCPC_USB_SWITCH_DISCONNECT, +}; + +/* Mux state attributes */ +#define TCPC_MUX_USB_ENABLED BIT(0) /* USB enabled */ +#define TCPC_MUX_DP_ENABLED BIT(1) /* DP enabled */ +#define TCPC_MUX_POLARITY_INVERTED BIT(2) /* Polarity inverted */ + +/* Mux modes, decoded to attributes */ +enum tcpc_mux_mode { + TYPEC_MUX_NONE = 0, /* Open switch */ + TYPEC_MUX_USB = TCPC_MUX_USB_ENABLED, /* USB only */ + TYPEC_MUX_DP = TCPC_MUX_DP_ENABLED, /* DP only */ + TYPEC_MUX_DOCK = TCPC_MUX_USB_ENABLED | /* Both USB and DP */ + TCPC_MUX_DP_ENABLED, +}; + +struct tcpc_mux_dev { + int (*set)(struct tcpc_mux_dev *dev, enum tcpc_mux_mode mux_mode, + enum tcpc_usb_switch usb_config, + enum typec_cc_polarity polarity); + bool dfp_only; + void *priv_data; +}; + +/** + * struct tcpc_dev - Port configuration and callback functions + * @config: Pointer to port configuration + * @get_vbus: Called to read current VBUS state + * @get_current_limit: + * Optional; called by the tcpm core when configured as a snk + * and cc=Rp-def. This allows the tcpm to provide a fallback + * current-limit detection method for the cc=Rp-def case. + * For example, some tcpcs may include BC1.2 charger detection + * and use that in this case. 
+ * @set_cc: Called to set value of CC pins + * @get_cc: Called to read current CC pin values + * @set_polarity: + * Called to set polarity + * @set_vconn: Called to enable or disable VCONN + * @set_vbus: Called to enable or disable VBUS + * @set_current_limit: + * Optional; called to set current limit as negotiated + * with partner. + * @set_pd_rx: Called to enable or disable reception of PD messages + * @set_roles: Called to set power and data roles + * @start_drp_toggling: + * Optional; if supported by hardware, called to start DRP + * toggling. DRP toggling is stopped automatically if + * a connection is established. + * @try_role: Optional; called to set a preferred role + * @pd_transmit:Called to transmit PD message + * @mux: Pointer to multiplexer data + */ +struct tcpc_dev { + const struct tcpc_config *config; + + int (*init)(struct tcpc_dev *dev); + int (*get_vbus)(struct tcpc_dev *dev); + int (*get_current_limit)(struct tcpc_dev *dev); + int (*set_cc)(struct tcpc_dev *dev, enum typec_cc_status cc); + int (*get_cc)(struct tcpc_dev *dev, enum typec_cc_status *cc1, + enum typec_cc_status *cc2); + int (*set_polarity)(struct tcpc_dev *dev, + enum typec_cc_polarity polarity); + int (*set_vconn)(struct tcpc_dev *dev, bool on); + int (*set_vbus)(struct tcpc_dev *dev, bool on, bool charge); + int (*set_current_limit)(struct tcpc_dev *dev, u32 max_ma, u32 mv); + int (*set_pd_rx)(struct tcpc_dev *dev, bool on); + int (*set_roles)(struct tcpc_dev *dev, bool attached, + enum typec_role role, enum typec_data_role data); + int (*start_drp_toggling)(struct tcpc_dev *dev, + enum typec_cc_status cc); + int (*try_role)(struct tcpc_dev *dev, int role); + int (*pd_transmit)(struct tcpc_dev *dev, enum tcpm_transmit_type type, + const struct pd_message *msg); + struct tcpc_mux_dev *mux; +}; + +struct tcpm_port; + +struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc); +void tcpm_unregister_port(struct tcpm_port *port); + +void tcpm_update_source_capabilities(struct tcpm_port *port, const u32 *pdo, + unsigned int nr_pdo); +void tcpm_update_sink_capabilities(struct tcpm_port *port, const u32 *pdo, + unsigned int nr_pdo, + unsigned int max_snk_mv, + unsigned int max_snk_ma, + unsigned int max_snk_mw, + unsigned int operating_snk_mw); + +void tcpm_vbus_change(struct tcpm_port *port); +void tcpm_cc_change(struct tcpm_port *port); +void tcpm_pd_receive(struct tcpm_port *port, + const struct pd_message *msg); +void tcpm_pd_transmit_complete(struct tcpm_port *port, + enum tcpm_transmit_status status); +void tcpm_pd_hard_reset(struct tcpm_port *port); +void tcpm_tcpc_reset(struct tcpm_port *port); + +#endif /* __LINUX_USB_TCPM_H */ -- cgit v1.2.3 From 14157f861437ebe2d624b0a845b91bbdf8ca9a2d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 13 Sep 2017 11:05:50 +0900 Subject: mtd: nand: introduce NAND_ROW_ADDR_3 flag Several drivers check ->chipsize to see if the third row address cycle is needed. Instead of embedding magic sizes such as 32MB, 128MB in drivers, introduce a new flag NAND_ROW_ADDR_3 for clean-up. Since nand_scan_ident() knows well about the device, it can handle this properly. The flag is set if the row address bit width is greater than 16. Delete comments such as "One more address cycle for ..." because intention is now clear enough from the code. 
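
As an illustrative sketch (editor's note, not part of this patch; the
helper name is made up): the row-address width is chip_shift - page_shift,
i.e. log2(chip size) - log2(page size), and each address cycle carries
8 bits, so two cycles cover 16 bits:

	/* How many row-address cycles a chip needs; 8 address bits per cycle. */
	static int nand_row_addr_cycles(unsigned int chip_shift,
					unsigned int page_shift)
	{
		/* more than 16 row-address bits -> a 3rd address cycle */
		return (chip_shift - page_shift > 16) ? 3 : 2;
	}

This is exactly the condition nand_scan_ident() uses in the hunk below to
set NAND_ROW_ADDR_3.
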
Signed-off-by: Masahiro Yamada Acked-by: Wenyou Yang Signed-off-by: Boris Brezillon --- drivers/mtd/nand/atmel/nand-controller.c | 3 +-- drivers/mtd/nand/au1550nd.c | 3 +-- drivers/mtd/nand/diskonchip.c | 3 +-- drivers/mtd/nand/hisi504_nand.c | 3 +-- drivers/mtd/nand/mxc_nand.c | 3 +-- drivers/mtd/nand/nand_base.c | 9 +++++---- drivers/mtd/nand/nuc900_nand.c | 2 +- include/linux/mtd/rawnand.h | 3 +++ 8 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/atmel/nand-controller.c b/drivers/mtd/nand/atmel/nand-controller.c index f25eca79f4e5..7bc8d20ed885 100644 --- a/drivers/mtd/nand/atmel/nand-controller.c +++ b/drivers/mtd/nand/atmel/nand-controller.c @@ -718,8 +718,7 @@ static void atmel_nfc_set_op_addr(struct nand_chip *chip, int page, int column) nc->op.addrs[nc->op.naddrs++] = page; nc->op.addrs[nc->op.naddrs++] = page >> 8; - if ((mtd->writesize > 512 && chip->chipsize > SZ_128M) || - (mtd->writesize <= 512 && chip->chipsize > SZ_32M)) + if (chip->options & NAND_ROW_ADDR_3) nc->op.addrs[nc->op.naddrs++] = page >> 16; } } diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c index 9d4a28fa6b73..8ab827edf94e 100644 --- a/drivers/mtd/nand/au1550nd.c +++ b/drivers/mtd/nand/au1550nd.c @@ -331,8 +331,7 @@ static void au1550_command(struct mtd_info *mtd, unsigned command, int column, i ctx->write_byte(mtd, (u8)(page_addr >> 8)); - /* One more address cycle for devices > 32MiB */ - if (this->chipsize > (32 << 20)) + if (this->options & NAND_ROW_ADDR_3) ctx->write_byte(mtd, ((page_addr >> 16) & 0x0f)); } diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c index c3aa53caab5c..72671dc52e2e 100644 --- a/drivers/mtd/nand/diskonchip.c +++ b/drivers/mtd/nand/diskonchip.c @@ -705,8 +705,7 @@ static void doc2001plus_command(struct mtd_info *mtd, unsigned command, int colu if (page_addr != -1) { WriteDOC((unsigned char)(page_addr & 0xff), docptr, Mplus_FlashAddress); WriteDOC((unsigned char)((page_addr >> 8) & 0xff), docptr, Mplus_FlashAddress); - /* One more address cycle for higher density devices */ - if (this->chipsize & 0x0c000000) { + if (this->options & NAND_ROW_ADDR_3) { WriteDOC((unsigned char)((page_addr >> 16) & 0x0f), docptr, Mplus_FlashAddress); printk("high density\n"); } diff --git a/drivers/mtd/nand/hisi504_nand.c b/drivers/mtd/nand/hisi504_nand.c index d9ee1a7e6956..0897261c3e17 100644 --- a/drivers/mtd/nand/hisi504_nand.c +++ b/drivers/mtd/nand/hisi504_nand.c @@ -432,8 +432,7 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr) host->addr_value[0] |= (page_addr & 0xffff) << (host->addr_cycle * 8); host->addr_cycle += 2; - /* One more address cycle for devices > 128MiB */ - if (chip->chipsize > (128 << 20)) { + if (chip->options & NAND_ROW_ADDR_3) { host->addr_cycle += 1; if (host->command == NAND_CMD_ERASE1) host->addr_value[0] |= ((page_addr >> 16) & 0xff) << 16; diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index 53e5e0337c3e..bacdd04e765b 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -859,8 +859,7 @@ static void mxc_do_addr_cycle(struct mtd_info *mtd, int column, int page_addr) host->devtype_data->send_addr(host, (page_addr >> 8) & 0xff, true); } else { - /* One more address cycle for higher density devices */ - if (mtd->size >= 0x4000000) { + if (nand_chip->options & NAND_ROW_ADDR_3) { /* paddr_8 - paddr_15 */ host->devtype_data->send_addr(host, (page_addr >> 8) & 0xff, diff --git a/drivers/mtd/nand/nand_base.c 
b/drivers/mtd/nand/nand_base.c
index fae28ec56277..c63e4a88a653 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -727,8 +727,7 @@ static void nand_command(struct mtd_info *mtd, unsigned int command,
 			chip->cmd_ctrl(mtd, page_addr, ctrl);
 			ctrl &= ~NAND_CTRL_CHANGE;
 			chip->cmd_ctrl(mtd, page_addr >> 8, ctrl);
-			/* One more address cycle for devices > 32MiB */
-			if (chip->chipsize > (32 << 20))
+			if (chip->options & NAND_ROW_ADDR_3)
 				chip->cmd_ctrl(mtd, page_addr >> 16, ctrl);
 		}
 		chip->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE | NAND_CTRL_CHANGE);
@@ -854,8 +853,7 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned int command,
 			chip->cmd_ctrl(mtd, page_addr, ctrl);
 			chip->cmd_ctrl(mtd, page_addr >> 8,
 				       NAND_NCE | NAND_ALE);
-			/* One more address cycle for devices > 128MiB */
-			if (chip->chipsize > (128 << 20))
+			if (chip->options & NAND_ROW_ADDR_3)
 				chip->cmd_ctrl(mtd, page_addr >> 16,
 					       NAND_NCE | NAND_ALE);
 		}
@@ -4000,6 +3998,9 @@ ident_done:
 		chip->chip_shift += 32 - 1;
 	}
 
+	if (chip->chip_shift - chip->page_shift > 16)
+		chip->options |= NAND_ROW_ADDR_3;
+
 	chip->badblockbits = 8;
 	chip->erase = single_erase;
 
diff --git a/drivers/mtd/nand/nuc900_nand.c b/drivers/mtd/nand/nuc900_nand.c
index 7bb4d2ea9342..af5b32c9a791 100644
--- a/drivers/mtd/nand/nuc900_nand.c
+++ b/drivers/mtd/nand/nuc900_nand.c
@@ -154,7 +154,7 @@ static void nuc900_nand_command_lp(struct mtd_info *mtd, unsigned int command,
 		if (page_addr != -1) {
 			write_addr_reg(nand, page_addr);
 
-			if (chip->chipsize > (128 << 20)) {
+			if (chip->options & NAND_ROW_ADDR_3) {
 				write_addr_reg(nand, page_addr >> 8);
 				write_addr_reg(nand, page_addr >> 16 | ENDADDR);
 			} else {
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 2b05f4273bab..749bb08c4772 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -177,6 +177,9 @@ enum nand_ecc_algo {
  */
 #define NAND_NEED_SCRAMBLING	0x00002000
 
+/* Device needs 3rd row address cycle */
+#define NAND_ROW_ADDR_3		0x00004000
+
 /* Options valid for Samsung large page devices */
 #define NAND_SAMSUNG_LP_OPTIONS NAND_CACHEPRG
 
-- 
cgit v1.2.3


From ef838a81dd4de1e08454406812e42d7b9b417c4d Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König
Date: Wed, 13 Sep 2017 10:18:27 +0200
Subject: serial: Add common rs485 device tree parsing function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several drivers have the same device tree parsing code. Create a
common helper function for it.

This patch is based on work done by Sascha Hauer.

Signed-off-by: Uwe Kleine-König
Signed-off-by: Greg Kroah-Hartman
---
 drivers/tty/serial/serial_core.c | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/serial_core.h      |  5 +++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 3a14cccbd7ff..f4e6c8662987 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -3026,5 +3026,41 @@ EXPORT_SYMBOL(uart_resume_port);
 EXPORT_SYMBOL(uart_add_one_port);
 EXPORT_SYMBOL(uart_remove_one_port);
 
+/**
+ * of_get_rs485_mode() - Implement parsing rs485 properties
+ * @np: uart node
+ * @rs485conf: output parameter
+ *
+ * This function implements the device tree binding described in
+ * Documentation/devicetree/bindings/serial/rs485.txt. 
+ */
+void of_get_rs485_mode(struct device_node *np, struct serial_rs485 *rs485conf)
+{
+	u32 rs485_delay[2];
+	int ret;
+
+	ret = of_property_read_u32_array(np, "rs485-rts-delay", rs485_delay, 2);
+	if (!ret) {
+		rs485conf->delay_rts_before_send = rs485_delay[0];
+		rs485conf->delay_rts_after_send = rs485_delay[1];
+	} else {
+		rs485conf->delay_rts_before_send = 0;
+		rs485conf->delay_rts_after_send = 0;
+	}
+
+	/*
+	 * clear full-duplex and enabled flags to get to a defined state with
+	 * the two following properties.
+	 */
+	rs485conf->flags &= ~(SER_RS485_RX_DURING_TX | SER_RS485_ENABLED);
+
+	if (of_property_read_bool(np, "rs485-rx-during-tx"))
+		rs485conf->flags |= SER_RS485_RX_DURING_TX;
+
+	if (of_property_read_bool(np, "linux,rs485-enabled-at-boot-time"))
+		rs485conf->flags |= SER_RS485_ENABLED;
+}
+EXPORT_SYMBOL_GPL(of_get_rs485_mode);
+
 MODULE_DESCRIPTION("Serial driver core");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 5553e04e59c9..37b044e78333 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -501,4 +501,9 @@ static inline int uart_handle_break(struct uart_port *port)
 	(cflag) & CRTSCTS || \
 	!((cflag) & CLOCAL))
 
+/*
+ * Common device tree parsing helpers
+ */
+void of_get_rs485_mode(struct device_node *np, struct serial_rs485 *rs485conf);
+
 #endif /* LINUX_SERIAL_CORE_H */
-- 
cgit v1.2.3


From f4ac6476945ff62939420bcf8266e39f8d5d54bd Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Thu, 14 Sep 2017 12:35:36 +0200
Subject: libata: Add new med_power_with_dipm link_power_management_policy
 setting

As described by Matthew Garrett quite a while back:
https://mjg59.dreamwidth.org/34868.html

Intel CPUs starting with the Haswell generation need SATA links to
power down for the "package" part of the CPU to reach low power-states
like PC7 / PC8, which bring a significant power saving with them.

The default max_performance lpm policy does not allow for these high
PC states; both the medium_power and min_power policies do allow this.

The min_power policy saves significantly more power, but there are some
reports of some disks / SSDs not liking min_power, leading to system
crashes, and in some cases even data corruption has been reported.

Matthew has found a document documenting the default settings of
Intel's IRST Windows driver with which most laptops ship:
https://www-ssl.intel.com/content/dam/doc/reference-guide/sata-devices-implementation-recommendations.pdf

Matthew wrote a patch changing med_power to match those defaults, but
that never got anywhere as some people were reporting issues with the
patch-set that patch was a part of.

This commit is another attempt to make the default IRST driver settings
available under Linux, but instead of changing medium_power and
potentially introducing regressions, this commit adds a new
med_power_with_dipm setting, which is identical to the existing
medium_power except that it enables DIPM on top. This makes it match
the Windows IRST driver settings, which should hopefully be safe to
use on most devices. 
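
As an illustrative sketch (editor's note, not part of this patch): a
single new enum value is enough because of ordering. It is inserted
between ATA_LPM_MED_POWER and ATA_LPM_MIN_POWER, so the error handler
can switch from equality checks to ordered comparisons:

	enum ata_lpm_policy {
		ATA_LPM_UNKNOWN,
		ATA_LPM_MAX_POWER,
		ATA_LPM_MED_POWER,
		ATA_LPM_MED_POWER_WITH_DIPM,	/* med power + DIPM */
		ATA_LPM_MIN_POWER,
	};

	/* DIPM is wanted for med_power_with_dipm and every stricter policy */
	static bool ata_policy_wants_dipm(enum ata_lpm_policy policy)
	{
		return policy >= ATA_LPM_MED_POWER_WITH_DIPM;
	}

The helper name above is hypothetical; the actual hunks below open-code
this comparison in ata_eh_set_lpm().
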
The med_power_with_dipm setting is close to min_power, except that:

a) It does not use host-initiated slumber mode (ASP not set),
   but it does allow device-initiated slumber
b) It does not enable DevSlp mode

On my T440s test laptop I get the following power savings when idle:

medium_power		0.9W
med_power_with_dipm	1.2W
min_power		1.2W

Suggested-by: Matthew Garrett
Cc: Matthew Garrett
Signed-off-by: Hans de Goede
Signed-off-by: Tejun Heo
---
 drivers/ata/libata-core.c |  1 +
 drivers/ata/libata-eh.c   | 10 +++++-----
 drivers/ata/libata-scsi.c |  9 +++++----
 include/linux/libata.h    |  1 +
 4 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index ee4c1ec9dca0..65f7574afc55 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3964,6 +3964,7 @@ int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 		scontrol &= ~(0x1 << 8);
 		scontrol |= (0x6 << 8);
 		break;
+	case ATA_LPM_MED_POWER_WITH_DIPM:
 	case ATA_LPM_MIN_POWER:
 		if (ata_link_nr_enabled(link) > 0)
 			/* no restrictions on LPM transitions */
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index e4effef0c83f..49b3745d2c1f 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -3454,9 +3454,9 @@ static int ata_eh_maybe_retry_flush(struct ata_device *dev)
  * @r_failed_dev: out parameter for failed device
  *
  * Enable SATA Interface power management.  This will enable
- * Device Interface Power Management (DIPM) for min_power
- * policy, and then call driver specific callbacks for
- * enabling Host Initiated Power management.
+ * Device Interface Power Management (DIPM) for min_power and
+ * medium_power_with_dipm policies, and then call driver specific
+ * callbacks for enabling Host Initiated Power management.
  *
  * LOCKING:
  * EH context. 
@@ -3502,7 +3502,7 @@ static int ata_eh_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			hints &= ~ATA_LPM_HIPM;
 
 		/* disable DIPM before changing link config */
-		if (policy != ATA_LPM_MIN_POWER && dipm) {
+		if (policy < ATA_LPM_MED_POWER_WITH_DIPM && dipm) {
 			err_mask = ata_dev_set_feature(dev,
 					SETFEATURES_SATA_DISABLE, SATA_DIPM);
 			if (err_mask && err_mask != AC_ERR_DEV) {
@@ -3545,7 +3545,7 @@ static int ata_eh_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 
 	/* host config updated, enable DIPM if transitioning to MIN_POWER */
 	ata_for_each_dev(dev, link, ENABLED) {
-		if (policy == ATA_LPM_MIN_POWER && !no_dipm &&
+		if (policy >= ATA_LPM_MED_POWER_WITH_DIPM && !no_dipm &&
 		    ata_id_has_dipm(dev->id)) {
 			err_mask = ata_dev_set_feature(dev,
 					SETFEATURES_SATA_ENABLE, SATA_DIPM);
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 44ba292f2cd7..673e72f438eb 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -106,10 +106,11 @@ static const u8 def_control_mpage[CONTROL_MPAGE_LEN] = {
 };
 
 static const char *ata_lpm_policy_names[] = {
-	[ATA_LPM_UNKNOWN]	= "max_performance",
-	[ATA_LPM_MAX_POWER]	= "max_performance",
-	[ATA_LPM_MED_POWER]	= "medium_power",
-	[ATA_LPM_MIN_POWER]	= "min_power",
+	[ATA_LPM_UNKNOWN]		= "max_performance",
+	[ATA_LPM_MAX_POWER]		= "max_performance",
+	[ATA_LPM_MED_POWER]		= "medium_power",
+	[ATA_LPM_MED_POWER_WITH_DIPM]	= "med_power_with_dipm",
+	[ATA_LPM_MIN_POWER]		= "min_power",
 };
 
 static ssize_t ata_scsi_lpm_store(struct device *device,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 931c32f1f18d..ed9826b21c5e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -522,6 +522,7 @@ enum ata_lpm_policy {
 	ATA_LPM_UNKNOWN,
 	ATA_LPM_MAX_POWER,
 	ATA_LPM_MED_POWER,
+	ATA_LPM_MED_POWER_WITH_DIPM,	/* Med power + DIPM as win IRST does */
 	ATA_LPM_MIN_POWER,
 };
 
-- 
cgit v1.2.3


From a5c6951c49fbf7509419d877620625ca3cdae9b1 Mon Sep 17 00:00:00 2001
From: Robert Jarzmik
Date: Wed, 13 Sep 2017 21:37:17 +0200
Subject: mfd: wm97xx-core: core support for wm97xx Codec

The WM9705, WM9712 and WM9713 are highly integrated codecs, with an
audio codec, DAC and ADC, GPIO unit and a touchscreen interface.

Historically the support was spread across drivers/input/touchscreen
and sound/soc/codecs. The sharing was done through ac97 bus sharing.
This model does not fit the new AC97 bus model, where codecs are
discovered at runtime.

Signed-off-by: Robert Jarzmik
Acked-by: Charles Keepax
Acked-by: Lee Jones
Signed-off-by: Mark Brown
---
 drivers/mfd/Kconfig        |  14 ++
 drivers/mfd/Makefile       |   1 +
 drivers/mfd/wm97xx-core.c  | 366 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/wm97xx.h |  25 ++++
 4 files changed, 406 insertions(+)
 create mode 100644 drivers/mfd/wm97xx-core.c
 create mode 100644 include/linux/mfd/wm97xx.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index fc5e4fef89d2..ac5ad6d0837c 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1746,6 +1746,20 @@ config MFD_WM8994
 	  core support for the WM8994, in order to use the actual functionaltiy
 	  of the device other drivers must be enabled.
 
+config MFD_WM97xx
+	tristate "Wolfson Microelectronics WM97xx"
+	select MFD_CORE
+	select REGMAP_AC97
+	select AC97_BUS_COMPAT
+	depends on AC97_BUS_NEW
+	help
+	  Each of the WM9705, WM9712 and WM9713 is a highly integrated hi-fi
+	  CODEC designed for smartphone applications. 
As well as audio functionality
+	  it has on-board GPIO and touchscreen functionality, which are
+	  supported via the relevant subsystems. This driver provides core
+	  support for the WM97xx; in order to use the actual functionality of
+	  the device, other drivers must be enabled.
+
 config MFD_STW481X
 	tristate "Support for ST Microelectronics STw481x"
 	depends on I2C && (ARCH_NOMADIK || COMPILE_TEST)
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index c3d0a1b39bb6..3266977f54b8 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -73,6 +73,7 @@ obj-$(CONFIG_MFD_WM8350)	+= wm8350.o
 obj-$(CONFIG_MFD_WM8350_I2C)	+= wm8350-i2c.o
 wm8994-objs			:= wm8994-core.o wm8994-irq.o wm8994-regmap.o
 obj-$(CONFIG_MFD_WM8994)	+= wm8994.o
+obj-$(CONFIG_MFD_WM97xx)	+= wm97xx-core.o
 
 obj-$(CONFIG_TPS6105X)		+= tps6105x.o
 obj-$(CONFIG_TPS65010)		+= tps65010.o
diff --git a/drivers/mfd/wm97xx-core.c b/drivers/mfd/wm97xx-core.c
new file mode 100644
index 000000000000..4141ee52a70b
--- /dev/null
+++ b/drivers/mfd/wm97xx-core.c
@@ -0,0 +1,366 @@
+/*
+ * Wolfson WM97xx -- Core device
+ *
+ * Copyright (C) 2017 Robert Jarzmik
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Features:
+ *  - an AC97 audio codec
+ *  - a touchscreen driver
+ *  - a GPIO block
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define WM9705_VENDOR_ID	0x574d4c05
+#define WM9712_VENDOR_ID	0x574d4c12
+#define WM9713_VENDOR_ID	0x574d4c13
+#define WM97xx_VENDOR_ID_MASK	0xffffffff
+
+struct wm97xx_priv {
+	struct regmap *regmap;
+	struct snd_ac97 *ac97;
+	struct device *dev;
+	struct wm97xx_platform_data codec_pdata;
+};
+
+static bool wm97xx_readable_reg(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case AC97_RESET ... AC97_PCM_SURR_DAC_RATE:
+	case AC97_PCM_LR_ADC_RATE:
+	case AC97_CENTER_LFE_MASTER:
+	case AC97_SPDIF ... AC97_LINE1_LEVEL:
+	case AC97_GPIO_CFG ... 0x5c:
+	case AC97_CODEC_CLASS_REV ... AC97_PCI_SID:
+	case 0x74 ... 
AC97_VENDOR_ID2: + return true; + default: + return false; + } +} + +static bool wm97xx_writeable_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case AC97_VENDOR_ID1: + case AC97_VENDOR_ID2: + return false; + default: + return wm97xx_readable_reg(dev, reg); + } +} + +static const struct reg_default wm9705_reg_defaults[] = { + { 0x02, 0x8000 }, + { 0x04, 0x8000 }, + { 0x06, 0x8000 }, + { 0x0a, 0x8000 }, + { 0x0c, 0x8008 }, + { 0x0e, 0x8008 }, + { 0x10, 0x8808 }, + { 0x12, 0x8808 }, + { 0x14, 0x8808 }, + { 0x16, 0x8808 }, + { 0x18, 0x8808 }, + { 0x1a, 0x0000 }, + { 0x1c, 0x8000 }, + { 0x20, 0x0000 }, + { 0x22, 0x0000 }, + { 0x26, 0x000f }, + { 0x28, 0x0605 }, + { 0x2a, 0x0000 }, + { 0x2c, 0xbb80 }, + { 0x32, 0xbb80 }, + { 0x34, 0x2000 }, + { 0x5a, 0x0000 }, + { 0x5c, 0x0000 }, + { 0x72, 0x0808 }, + { 0x74, 0x0000 }, + { 0x76, 0x0006 }, + { 0x78, 0x0000 }, + { 0x7a, 0x0000 }, +}; + +static const struct regmap_config wm9705_regmap_config = { + .reg_bits = 16, + .reg_stride = 2, + .val_bits = 16, + .max_register = 0x7e, + .cache_type = REGCACHE_RBTREE, + + .reg_defaults = wm9705_reg_defaults, + .num_reg_defaults = ARRAY_SIZE(wm9705_reg_defaults), + .volatile_reg = regmap_ac97_default_volatile, + .readable_reg = wm97xx_readable_reg, + .writeable_reg = wm97xx_writeable_reg, +}; + +static struct mfd_cell wm9705_cells[] = { + { .name = "wm9705-codec", }, + { .name = "wm97xx-ts", }, +}; + +static bool wm9712_volatile_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case AC97_REC_GAIN: + return true; + default: + return regmap_ac97_default_volatile(dev, reg); + } +} + +static const struct reg_default wm9712_reg_defaults[] = { + { 0x02, 0x8000 }, + { 0x04, 0x8000 }, + { 0x06, 0x8000 }, + { 0x08, 0x0f0f }, + { 0x0a, 0xaaa0 }, + { 0x0c, 0xc008 }, + { 0x0e, 0x6808 }, + { 0x10, 0xe808 }, + { 0x12, 0xaaa0 }, + { 0x14, 0xad00 }, + { 0x16, 0x8000 }, + { 0x18, 0xe808 }, + { 0x1a, 0x3000 }, + { 0x1c, 0x8000 }, + { 0x20, 0x0000 }, + { 0x22, 0x0000 }, + { 0x26, 0x000f }, + { 0x28, 0x0605 }, + { 0x2a, 0x0410 }, + { 0x2c, 0xbb80 }, + { 0x2e, 0xbb80 }, + { 0x32, 0xbb80 }, + { 0x34, 0x2000 }, + { 0x4c, 0xf83e }, + { 0x4e, 0xffff }, + { 0x50, 0x0000 }, + { 0x52, 0x0000 }, + { 0x56, 0xf83e }, + { 0x58, 0x0008 }, + { 0x5c, 0x0000 }, + { 0x60, 0xb032 }, + { 0x62, 0x3e00 }, + { 0x64, 0x0000 }, + { 0x76, 0x0006 }, + { 0x78, 0x0001 }, + { 0x7a, 0x0000 }, +}; + +static const struct regmap_config wm9712_regmap_config = { + .reg_bits = 16, + .reg_stride = 2, + .val_bits = 16, + .max_register = 0x7e, + .cache_type = REGCACHE_RBTREE, + + .reg_defaults = wm9712_reg_defaults, + .num_reg_defaults = ARRAY_SIZE(wm9712_reg_defaults), + .volatile_reg = wm9712_volatile_reg, + .readable_reg = wm97xx_readable_reg, + .writeable_reg = wm97xx_writeable_reg, +}; + +static struct mfd_cell wm9712_cells[] = { + { .name = "wm9712-codec", }, + { .name = "wm97xx-ts", }, +}; + +static const struct reg_default wm9713_reg_defaults[] = { + { 0x02, 0x8080 }, /* Speaker Output Volume */ + { 0x04, 0x8080 }, /* Headphone Output Volume */ + { 0x06, 0x8080 }, /* Out3/OUT4 Volume */ + { 0x08, 0xc880 }, /* Mono Volume */ + { 0x0a, 0xe808 }, /* LINEIN Volume */ + { 0x0c, 0xe808 }, /* DAC PGA Volume */ + { 0x0e, 0x0808 }, /* MIC PGA Volume */ + { 0x10, 0x00da }, /* MIC Routing Control */ + { 0x12, 0x8000 }, /* Record PGA Volume */ + { 0x14, 0xd600 }, /* Record Routing */ + { 0x16, 0xaaa0 }, /* PCBEEP Volume */ + { 0x18, 0xaaa0 }, /* VxDAC Volume */ + { 0x1a, 0xaaa0 }, /* AUXDAC Volume */ + { 0x1c, 0x0000 }, /* Output PGA Mux */ + 
{ 0x1e, 0x0000 }, /* DAC 3D control */ + { 0x20, 0x0f0f }, /* DAC Tone Control*/ + { 0x22, 0x0040 }, /* MIC Input Select & Bias */ + { 0x24, 0x0000 }, /* Output Volume Mapping & Jack */ + { 0x26, 0x7f00 }, /* Powerdown Ctrl/Stat*/ + { 0x28, 0x0405 }, /* Extended Audio ID */ + { 0x2a, 0x0410 }, /* Extended Audio Start/Ctrl */ + { 0x2c, 0xbb80 }, /* Audio DACs Sample Rate */ + { 0x2e, 0xbb80 }, /* AUXDAC Sample Rate */ + { 0x32, 0xbb80 }, /* Audio ADCs Sample Rate */ + { 0x36, 0x4523 }, /* PCM codec control */ + { 0x3a, 0x2000 }, /* SPDIF control */ + { 0x3c, 0xfdff }, /* Powerdown 1 */ + { 0x3e, 0xffff }, /* Powerdown 2 */ + { 0x40, 0x0000 }, /* General Purpose */ + { 0x42, 0x0000 }, /* Fast Power-Up Control */ + { 0x44, 0x0080 }, /* MCLK/PLL Control */ + { 0x46, 0x0000 }, /* MCLK/PLL Control */ + + { 0x4c, 0xfffe }, /* GPIO Pin Configuration */ + { 0x4e, 0xffff }, /* GPIO Pin Polarity / Type */ + { 0x50, 0x0000 }, /* GPIO Pin Sticky */ + { 0x52, 0x0000 }, /* GPIO Pin Wake-Up */ + /* GPIO Pin Status */ + { 0x56, 0xfffe }, /* GPIO Pin Sharing */ + { 0x58, 0x4000 }, /* GPIO PullUp/PullDown */ + { 0x5a, 0x0000 }, /* Additional Functions 1 */ + { 0x5c, 0x0000 }, /* Additional Functions 2 */ + { 0x60, 0xb032 }, /* ALC Control */ + { 0x62, 0x3e00 }, /* ALC / Noise Gate Control */ + { 0x64, 0x0000 }, /* AUXDAC input control */ + { 0x74, 0x0000 }, /* Digitiser Reg 1 */ + { 0x76, 0x0006 }, /* Digitiser Reg 2 */ + { 0x78, 0x0001 }, /* Digitiser Reg 3 */ + { 0x7a, 0x0000 }, /* Digitiser Read Back */ +}; + +static const struct regmap_config wm9713_regmap_config = { + .reg_bits = 16, + .reg_stride = 2, + .val_bits = 16, + .max_register = 0x7e, + .cache_type = REGCACHE_RBTREE, + + .reg_defaults = wm9713_reg_defaults, + .num_reg_defaults = ARRAY_SIZE(wm9713_reg_defaults), + .volatile_reg = regmap_ac97_default_volatile, + .readable_reg = wm97xx_readable_reg, + .writeable_reg = wm97xx_writeable_reg, +}; + +static struct mfd_cell wm9713_cells[] = { + { .name = "wm9713-codec", }, + { .name = "wm97xx-ts", }, +}; + +static int wm97xx_ac97_probe(struct ac97_codec_device *adev) +{ + struct wm97xx_priv *wm97xx; + const struct regmap_config *config; + struct wm97xx_platform_data *codec_pdata; + struct mfd_cell *cells; + int ret = -ENODEV, nb_cells, i; + struct wm97xx_pdata *pdata = snd_ac97_codec_get_platdata(adev); + + wm97xx = devm_kzalloc(ac97_codec_dev2dev(adev), + sizeof(*wm97xx), GFP_KERNEL); + if (!wm97xx) + return -ENOMEM; + + wm97xx->dev = ac97_codec_dev2dev(adev); + wm97xx->ac97 = snd_ac97_compat_alloc(adev); + if (IS_ERR(wm97xx->ac97)) + return PTR_ERR(wm97xx->ac97); + + + ac97_set_drvdata(adev, wm97xx); + dev_info(wm97xx->dev, "wm97xx core found, id=0x%x\n", + adev->vendor_id); + + codec_pdata = &wm97xx->codec_pdata; + codec_pdata->ac97 = wm97xx->ac97; + codec_pdata->batt_pdata = pdata->batt_pdata; + + switch (adev->vendor_id) { + case WM9705_VENDOR_ID: + config = &wm9705_regmap_config; + cells = wm9705_cells; + nb_cells = ARRAY_SIZE(wm9705_cells); + break; + case WM9712_VENDOR_ID: + config = &wm9712_regmap_config; + cells = wm9712_cells; + nb_cells = ARRAY_SIZE(wm9712_cells); + break; + case WM9713_VENDOR_ID: + config = &wm9713_regmap_config; + cells = wm9713_cells; + nb_cells = ARRAY_SIZE(wm9713_cells); + break; + default: + goto err_free_compat; + } + + for (i = 0; i < nb_cells; i++) { + cells[i].platform_data = codec_pdata; + cells[i].pdata_size = sizeof(*codec_pdata); + } + + codec_pdata->regmap = devm_regmap_init_ac97(wm97xx->ac97, config); + if (IS_ERR(codec_pdata->regmap)) { + ret = 
PTR_ERR(codec_pdata->regmap); + goto err_free_compat; + } + + ret = devm_mfd_add_devices(wm97xx->dev, PLATFORM_DEVID_NONE, + cells, nb_cells, NULL, 0, NULL); + if (ret) + goto err_free_compat; + + return ret; + +err_free_compat: + snd_ac97_compat_release(wm97xx->ac97); + return ret; +} + +static int wm97xx_ac97_remove(struct ac97_codec_device *adev) +{ + struct wm97xx_priv *wm97xx = ac97_get_drvdata(adev); + + snd_ac97_compat_release(wm97xx->ac97); + + return 0; +} + +static const struct ac97_id wm97xx_ac97_ids[] = { + { .id = WM9705_VENDOR_ID, .mask = WM97xx_VENDOR_ID_MASK }, + { .id = WM9712_VENDOR_ID, .mask = WM97xx_VENDOR_ID_MASK }, + { .id = WM9713_VENDOR_ID, .mask = WM97xx_VENDOR_ID_MASK }, + { } +}; + +static struct ac97_codec_driver wm97xx_ac97_driver = { + .driver = { + .name = "wm97xx-core", + }, + .probe = wm97xx_ac97_probe, + .remove = wm97xx_ac97_remove, + .id_table = wm97xx_ac97_ids, +}; + +static int __init wm97xx_module_init(void) +{ + return snd_ac97_codec_driver_register(&wm97xx_ac97_driver); +} +module_init(wm97xx_module_init); + +static void __exit wm97xx_module_exit(void) +{ + snd_ac97_codec_driver_unregister(&wm97xx_ac97_driver); +} +module_exit(wm97xx_module_exit); + +MODULE_DESCRIPTION("WM9712, WM9713 core driver"); +MODULE_AUTHOR("Robert Jarzmik "); +MODULE_LICENSE("GPL"); + diff --git a/include/linux/mfd/wm97xx.h b/include/linux/mfd/wm97xx.h new file mode 100644 index 000000000000..45fb54f19d09 --- /dev/null +++ b/include/linux/mfd/wm97xx.h @@ -0,0 +1,25 @@ +/* + * wm97xx client interface + * + * Copyright (C) 2017 Robert Jarzmik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __LINUX_MFD_WM97XX_H +#define __LINUX_MFD_WM97XX_H + +struct regmap; +struct wm97xx_batt_pdata; +struct snd_ac97; + +struct wm97xx_platform_data { + struct snd_ac97 *ac97; + struct regmap *regmap; + struct wm97xx_batt_pdata *batt_pdata; +}; + +#endif -- cgit v1.2.3 From f454322efbf6faee695f517c6b52c4dc03cacd3e Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:11 +0300 Subject: signal: replace sigset_to_compat() with put_compat_sigset() There are 4 callers of sigset_to_compat() in the entire kernel. One is in sparc compat rt_sigaction(2), the rest are in kernel/signal.c itself. All are followed by copy_to_user(), and all but the sparc one are under "if it's big-endian..." ifdefs. Let's transform sigset_to_compat() into put_compat_sigset() that also calls copy_to_user(). Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- arch/sparc/kernel/sys_sparc32.c | 6 +++--- include/linux/compat.h | 3 ++- kernel/compat.c | 20 ++++++++++++++------ kernel/signal.c | 27 ++++++--------------------- 4 files changed, 25 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index bca44f3e6b86..5e2bec9e41b2 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -159,7 +159,6 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, { struct k_sigaction new_ka, old_ka; int ret; - compat_sigset_t set32; /* XXX: Don't preclude handling different sized sigset_t's. 
*/ if (sigsetsize != sizeof(compat_sigset_t)) @@ -167,6 +166,7 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, if (act) { u32 u_handler, u_restorer; + compat_sigset_t set32; new_ka.ka_restorer = restorer; ret = get_user(u_handler, &act->sa_handler); @@ -183,9 +183,9 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - sigset_to_compat(&set32, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); + ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, + sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); if (ret) diff --git a/include/linux/compat.h b/include/linux/compat.h index a5619de3437d..ab1baa79abe8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -456,7 +456,8 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat); -extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set); +extern int put_compat_sigset(compat_sigset_t __user *compat, + const sigset_t *set, unsigned int size); asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, diff --git a/kernel/compat.c b/kernel/compat.c index 772e038d04d9..18dd902c9052 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -497,15 +497,23 @@ sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) } EXPORT_SYMBOL_GPL(sigset_from_compat); -void -sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +int +put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, + unsigned int size) { + /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ +#ifdef __BIG_ENDIAN + compat_sigset_t v; switch (_NSIG_WORDS) { - case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; - case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; - case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; - case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; + case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; + case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; + case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; + case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; } + return copy_to_user(compat, &v, size) ? -EFAULT : 0; +#else + return copy_to_user(compat, set, size) ? -EFAULT : 0; +#endif } #ifdef CONFIG_NUMA diff --git a/kernel/signal.c b/kernel/signal.c index 800a18f77732..14ad6bb90dad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2621,13 +2621,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, if (error) return error; } - if (oset) { - compat_sigset_t old32; - sigset_to_compat(&old32, &old_set); - if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return 0; + return oset ? 
put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; #else return sys_rt_sigprocmask(how, (sigset_t __user *)nset, (sigset_t __user *)oset, sigsetsize); @@ -2669,20 +2663,11 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t set; int err = do_sigpending(&set, sigsetsize); - if (!err) { - compat_sigset_t set32; - sigset_to_compat(&set32, &set); - /* we can get here only if sigsetsize <= sizeof(set) */ - if (copy_to_user(uset, &set32, sigsetsize)) - err = -EFAULT; - } + if (!err) + err = put_compat_sigset(uset, &set, sigsetsize); return err; -#else - return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); -#endif } #endif @@ -3451,7 +3436,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; - compat_sigset_t mask; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif @@ -3463,6 +3447,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, if (act) { compat_uptr_t handler; + compat_sigset_t mask; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER @@ -3478,10 +3463,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - sigset_to_compat(&mask, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); + ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, + sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), -- cgit v1.2.3 From b8e8e1aa9f14110da180569908bbe538c9e9dc63 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Sep 2017 20:42:54 -0400 Subject: get rid of {get,put}_compat_itimerspec() no users left Signed-off-by: Al Viro --- include/linux/compat.h | 5 ----- kernel/compat.c | 18 ------------------ 2 files changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index ab1baa79abe8..21d30be5c0a5 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -443,11 +443,6 @@ static inline int compat_timespec_compare(struct compat_timespec *lhs, return lhs->tv_nsec - rhs->tv_nsec; } -extern int get_compat_itimerspec(struct itimerspec *dst, - const struct compat_itimerspec __user *src); -extern int put_compat_itimerspec(struct compat_itimerspec __user *dst, - const struct itimerspec *src); - asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz); asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, diff --git a/kernel/compat.c b/kernel/compat.c index 18dd902c9052..d43b18031116 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } -int get_compat_itimerspec(struct itimerspec *dst, - const struct compat_itimerspec __user *src) -{ - if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || - __compat_get_timespec(&dst->it_value, &src->it_value)) - return -EFAULT; - return 0; -} - -int put_compat_itimerspec(struct compat_itimerspec __user *dst, - const struct itimerspec *src) -{ - if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || - 
__compat_put_timespec(&src->it_value, &dst->it_value)) - return -EFAULT; - return 0; -} - int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { -- cgit v1.2.3 From 3968cf623892d710e651070243fd16af312a9797 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Sep 2017 21:45:17 -0400 Subject: get_compat_sigset() similar to put_compat_sigset() Signed-off-by: Al Viro --- arch/sparc/kernel/sys_sparc32.c | 4 +--- fs/eventpoll.c | 4 +--- fs/select.c | 8 ++------ fs/signalfd.c | 4 +--- include/linux/compat.h | 2 +- kernel/compat.c | 23 ++++++++++++++++------- kernel/signal.c | 27 ++++----------------------- virt/kvm/kvm_main.c | 7 ++----- 8 files changed, 28 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 5e2bec9e41b2..34ece61ee970 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -166,13 +166,11 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, if (act) { u32 u_handler, u_restorer; - compat_sigset_t set32; new_ka.ka_restorer = restorer; ret = get_user(u_handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(u_handler); - ret |= copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); - sigset_from_compat(&new_ka.sa.sa_mask, &set32); + ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); ret |= get_user(u_restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(u_restorer); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..396a3c075fd4 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2259,7 +2259,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, compat_size_t, sigsetsize) { long err; - compat_sigset_t csigmask; sigset_t ksigmask, sigsaved; /* @@ -2269,9 +2268,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); sigsaved = current->blocked; set_current_blocked(&ksigmask); } diff --git a/fs/select.c b/fs/select.c index 20a7d061904f..9c980162c9fe 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1301,7 +1301,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { - compat_sigset_t ss32; sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1318,9 +1317,8 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); @@ -1369,7 +1367,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct compat_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - compat_sigset_t ss32; sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1386,9 +1383,8 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, 
sizeof(ss32))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); diff --git a/fs/signalfd.c b/fs/signalfd.c index d2c434112f42..9de5beeb771d 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -312,15 +312,13 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, compat_size_t, sigsetsize, int, flags) { - compat_sigset_t ss32; sigset_t tmp; sigset_t __user *ksigmask; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&tmp, sigmask)) return -EFAULT; - sigset_from_compat(&tmp, &ss32); ksigmask = compat_alloc_user_space(sizeof(sigset_t)); if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; diff --git a/include/linux/compat.h b/include/linux/compat.h index 21d30be5c0a5..57cb6ecafa86 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -450,7 +450,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); -extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat); +extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat); extern int put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, unsigned int size); diff --git a/kernel/compat.c b/kernel/compat.c index d43b18031116..a46a4a40bb8b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -467,17 +467,26 @@ Efault: return -EFAULT; } -void -sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) +int +get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) { +#ifdef __BIG_ENDIAN + compat_sigset_t v; + if (copy_from_user(&v, compat, sizeof(compat_sigset_t))) + return -EFAULT; switch (_NSIG_WORDS) { - case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); - case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); - case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); - case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); + case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } +#else + if (copy_from_user(set, compat, sizeof(compat_sigset_t))) + return -EFAULT; +#endif + return 0; } -EXPORT_SYMBOL_GPL(sigset_from_compat); +EXPORT_SYMBOL_GPL(get_compat_sigset); int put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, diff --git a/kernel/signal.c b/kernel/signal.c index 9fbc574ced10..36a523640894 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2600,7 +2600,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t old_set = current->blocked; /* XXX: Don't preclude handling different sized sigset_t's. 
*/ @@ -2608,13 +2607,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return -EINVAL; if (nset) { - compat_sigset_t new32; sigset_t new_set; int error; - if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&new_set, nset)) return -EFAULT; - - sigset_from_compat(&new_set, &new32); sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); @@ -2622,10 +2618,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return error; } return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; -#else - return sys_rt_sigprocmask(how, (sigset_t __user *)nset, - (sigset_t __user *)oset, sigsetsize); -#endif } #endif @@ -2908,7 +2900,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct compat_timespec __user *, uts, compat_size_t, sigsetsize) { - compat_sigset_t s32; sigset_t s; struct timespec t; siginfo_t info; @@ -2917,9 +2908,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + if (get_compat_sigset(&s, uthese)) return -EFAULT; - sigset_from_compat(&s, &s32); if (uts) { if (compat_get_timespec(&t, uts)) @@ -3450,18 +3440,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, if (act) { compat_uptr_t handler; - compat_sigset_t mask; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif - ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); + ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; - sigset_from_compat(&new_ka.sa.sa_mask, &mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); @@ -3649,22 +3637,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t newset; - compat_sigset_t newset32; /* XXX: Don't preclude handling different sized sigset_t's. 
*/ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&newset, unewset)) return -EFAULT; - sigset_from_compat(&newset, &newset32); return sigsuspend(&newset); -#else - /* on little-endian bitmaps don't care about granularity */ - return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); -#endif } #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9deb5a245b83..99bfe50a0589 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2724,7 +2724,6 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, case KVM_SET_SIGNAL_MASK: { struct kvm_signal_mask __user *sigmask_arg = argp; struct kvm_signal_mask kvm_sigmask; - compat_sigset_t csigset; sigset_t sigset; if (argp) { @@ -2733,13 +2732,11 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, sizeof(kvm_sigmask))) goto out; r = -EINVAL; - if (kvm_sigmask.len != sizeof(csigset)) + if (kvm_sigmask.len != sizeof(compat_sigset_t)) goto out; r = -EFAULT; - if (copy_from_user(&csigset, sigmask_arg->sigset, - sizeof(csigset))) + if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) goto out; - sigset_from_compat(&sigset, &csigset); r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); } else r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); -- cgit v1.2.3 From bffa72cf7f9df842f0016ba03586039296b4caaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 05:14:24 -0700 Subject: net: sk_buff rbnode reorg skb->rbnode shares space with skb->next, skb->prev and skb->tstamp Current uses (TCP receive ofo queue and netem) need to save/restore tstamp, while skb->dev is either NULL (TCP) or a constant for a given queue (netem). Since we plan using an RB tree for TCP retransmit queue to speedup SACK processing with large BDP, this patch exchanges skb->dev and skb->tstamp. This saves some overhead in both TCP and netem. v2: removes the swtstamp field from struct tcp_skb_cb Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Wei Wang Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/skbuff.h | 16 ++++++++-------- include/net/tcp.h | 6 ------ net/ipv4/tcp_input.c | 27 +++++---------------------- net/sched/sch_netem.c | 7 ++++--- 4 files changed, 17 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 72299ef00061..492828801acb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -661,8 +661,12 @@ struct sk_buff { struct sk_buff *prev; union { - ktime_t tstamp; - u64 skb_mstamp; + struct net_device *dev; + /* Some protocols might use this space to store information, + * while device pointer would be NULL. + * UDP receive path is one user. + */ + unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ @@ -670,12 +674,8 @@ struct sk_buff { struct sock *sk; union { - struct net_device *dev; - /* Some protocols might use this space to store information, - * while device pointer would be NULL. - * UDP receive path is one user. - */ - unsigned long dev_scratch; + ktime_t tstamp; + u64 skb_mstamp; }; /* * This is the control buffer. 
It is free to use for every diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..49a8a46466f3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -797,12 +797,6 @@ struct tcp_skb_cb { u16 tcp_gso_segs; u16 tcp_gso_size; }; - - /* Used to stash the receive timestamp while this skb is in the - * out of order queue, as skb->tstamp is overwritten by the - * rbnode. - */ - ktime_t swtstamp; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bddf724f5c02..db9bb46b5776 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4266,11 +4266,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } -enum tcp_queue { - OOO_QUEUE, - RCV_QUEUE, -}; - /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket @@ -4286,7 +4281,6 @@ enum tcp_queue { * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, - enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4311,10 +4305,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; - if (dest == OOO_QUEUE) - TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; - else - to->tstamp = from->tstamp; + to->tstamp = from->tstamp; } return true; @@ -4351,9 +4342,6 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - /* Replace tstamp which was stomped by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4365,8 +4353,7 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, - tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4420,10 +4407,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Stash tstamp to avoid being stomped on by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - TCP_SKB_CB(skb)->swtstamp = skb->tstamp; - /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4451,7 +4434,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); @@ -4502,7 +4485,7 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } @@ -4554,7 +4537,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, RCV_QUEUE, tail, + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 
1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b1266e75ca43..063a4bdb9ee6 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -146,7 +146,6 @@ struct netem_sched_data { */ struct netem_skb_cb { psched_time_t time_to_send; - ktime_t tstamp_save; }; @@ -561,7 +560,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } cb->time_to_send = now + delay; - cb->tstamp_save = skb->tstamp; ++q->counter; tfifo_enqueue(skb, sch); } else { @@ -629,7 +627,10 @@ deliver: qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; skb->prev = NULL; - skb->tstamp = netem_skb_cb(skb)->tstamp_save; + /* skb->dev shares skb->rbnode area, + * we need to restore its value. + */ + skb->dev = qdisc_dev(sch); #ifdef CONFIG_NET_CLS_ACT /* -- cgit v1.2.3 From 85e7dd3f871b988702973c80d9ef128e10dd3dad Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 4 Sep 2017 16:41:53 +0100 Subject: ASoC: arizona: Add support for setting the output volume limits The output volume limits allow signals to be limited to specific levels appropriate for the hardware attached. As this is a property of the hardware itself these will be configured through device tree. Signed-off-by: Charles Keepax Acked-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/mfd/arizona/pdata.h | 3 +++ sound/soc/codecs/arizona.c | 25 +++++++++++++++++++++++++ sound/soc/codecs/arizona.h | 1 + sound/soc/codecs/cs47l24.c | 3 +++ sound/soc/codecs/wm5102.c | 3 +++ sound/soc/codecs/wm5110.c | 3 +++ sound/soc/codecs/wm8997.c | 3 +++ 7 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index bfeecf179895..f72dc53848d7 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -174,6 +174,9 @@ struct arizona_pdata { /** Mode for outputs */ int out_mono[ARIZONA_MAX_OUTPUT]; + /** Limit output volumes */ + unsigned int out_vol_limit[2 * ARIZONA_MAX_OUTPUT]; + /** PDM speaker mute setting */ unsigned int spk_mute[ARIZONA_MAX_PDM_SPK]; diff --git a/sound/soc/codecs/arizona.c b/sound/soc/codecs/arizona.c index e6967385dccb..b3375e19598a 100644 --- a/sound/soc/codecs/arizona.c +++ b/sound/soc/codecs/arizona.c @@ -372,6 +372,22 @@ int arizona_init_common(struct arizona *arizona) } EXPORT_SYMBOL_GPL(arizona_init_common); +int arizona_init_vol_limit(struct arizona *arizona) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(arizona->pdata.out_vol_limit); ++i) { + if (arizona->pdata.out_vol_limit[i]) + regmap_update_bits(arizona->regmap, + ARIZONA_DAC_VOLUME_LIMIT_1L + i * 4, + ARIZONA_OUT1L_VOL_LIM_MASK, + arizona->pdata.out_vol_limit[i]); + } + + return 0; +} +EXPORT_SYMBOL_GPL(arizona_init_vol_limit); + const char * const arizona_mixer_texts[ARIZONA_NUM_MIXER_INPUTS] = { "None", "Tone Generator 1", @@ -2810,6 +2826,15 @@ int arizona_of_get_audio_pdata(struct arizona *arizona) count++; } + count = 0; + of_property_for_each_u32(np, "wlf,out-volume-limit", prop, cur, val) { + if (count == ARRAY_SIZE(pdata->out_vol_limit)) + break; + + pdata->out_vol_limit[count] = val; + count++; + } + ret = of_property_read_u32_array(np, "wlf,spk-fmt", pdm_val, ARRAY_SIZE(pdm_val)); diff --git a/sound/soc/codecs/arizona.h b/sound/soc/codecs/arizona.h index 2d198fb2ce97..dfdf6d8c9687 100644 --- a/sound/soc/codecs/arizona.h +++ b/sound/soc/codecs/arizona.h @@ -315,6 +315,7 @@ int arizona_init_gpio(struct snd_soc_codec *codec); int 
arizona_init_mono(struct snd_soc_codec *codec); int arizona_init_common(struct arizona *arizona); +int arizona_init_vol_limit(struct arizona *arizona); int arizona_init_spk_irqs(struct arizona *arizona); int arizona_free_spk_irqs(struct arizona *arizona); diff --git a/sound/soc/codecs/cs47l24.c b/sound/soc/codecs/cs47l24.c index 0fe7d7a87ff3..94c0209977d0 100644 --- a/sound/soc/codecs/cs47l24.c +++ b/sound/soc/codecs/cs47l24.c @@ -1297,6 +1297,9 @@ static int cs47l24_probe(struct platform_device *pdev) arizona_init_common(arizona); + ret = arizona_init_vol_limit(arizona); + if (ret < 0) + goto err_dsp_irq; ret = arizona_init_spk_irqs(arizona); if (ret < 0) goto err_dsp_irq; diff --git a/sound/soc/codecs/wm5102.c b/sound/soc/codecs/wm5102.c index 5a917dd73f32..4f0481d3c7a7 100644 --- a/sound/soc/codecs/wm5102.c +++ b/sound/soc/codecs/wm5102.c @@ -2107,6 +2107,9 @@ static int wm5102_probe(struct platform_device *pdev) arizona_init_common(arizona); + ret = arizona_init_vol_limit(arizona); + if (ret < 0) + goto err_dsp_irq; ret = arizona_init_spk_irqs(arizona); if (ret < 0) goto err_dsp_irq; diff --git a/sound/soc/codecs/wm5110.c b/sound/soc/codecs/wm5110.c index ba1e90ca8be4..6ed1e1f9ce51 100644 --- a/sound/soc/codecs/wm5110.c +++ b/sound/soc/codecs/wm5110.c @@ -2463,6 +2463,9 @@ static int wm5110_probe(struct platform_device *pdev) arizona_init_common(arizona); + ret = arizona_init_vol_limit(arizona); + if (ret < 0) + goto err_dsp_irq; ret = arizona_init_spk_irqs(arizona); if (ret < 0) goto err_dsp_irq; diff --git a/sound/soc/codecs/wm8997.c b/sound/soc/codecs/wm8997.c index c5aef9ecdecc..77f512767273 100644 --- a/sound/soc/codecs/wm8997.c +++ b/sound/soc/codecs/wm8997.c @@ -1176,6 +1176,9 @@ static int wm8997_probe(struct platform_device *pdev) arizona_init_common(arizona); + ret = arizona_init_vol_limit(arizona); + if (ret < 0) + return ret; ret = arizona_init_spk_irqs(arizona); if (ret < 0) return ret; -- cgit v1.2.3 From 3f7632e1ba2c6f5ca2499d1ee8acf95599d4b7b6 Mon Sep 17 00:00:00 2001 From: Stefan Brüns Date: Tue, 12 Sep 2017 01:44:44 +0200 Subject: dmaengine: List all allowed values for src/dst_addr_width in kernel doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 93c6ee94c140 ("dma: Support for 3 bytes word size") and commit 534a729866f9 ("dmaengine: Add 16 bytes, 32 bytes and 64 bytes bus widths") added additional values for the allowed word size, but omitted these from the struct dma_slave_config documentation. Signed-off-by: Stefan Brüns Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 8319101170fc..2910e7dadc7f 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -329,7 +329,7 @@ enum dma_slave_buswidth { * @src_addr_width: this is the width in bytes of the source (RX) * register where DMA data shall be read. If the source * is memory this may be ignored depending on architecture. - * Legal values: 1, 2, 4, 8. + * Legal values: 1, 2, 3, 4, 8, 16, 32, 64. * @dst_addr_width: same as src_addr_width but for destination * target (TX) mutatis mutandis. 
* @src_maxburst: the maximum number of words (note: words, as in -- cgit v1.2.3 From c2cbd4276eea1d3b204754653dd74d6e10ca207f Mon Sep 17 00:00:00 2001 From: Stefan Brüns Date: Tue, 12 Sep 2017 01:44:45 +0200 Subject: dmaengine: Mark struct dma_slave_caps kernel-doc correctly, clarify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct dma_slave_caps documentation omitted the correct kernel-doc opening comment mark. Document byte granularity and interpretation of the src/dst_addr_widths bit flag fields used by struct dma_slave_caps and struct dma_device. Add punctuation to their "directions" member documentation, and clean up the wording of the description. Signed-off-by: Stefan Brüns Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 2910e7dadc7f..f838764993eb 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -404,14 +404,16 @@ enum dma_residue_granularity { DMA_RESIDUE_GRANULARITY_BURST = 2, }; -/* struct dma_slave_caps - expose capabilities of a slave channel only - * - * @src_addr_widths: bit mask of src addr widths the channel supports - * @dst_addr_widths: bit mask of dstn addr widths the channel supports - * @directions: bit mask of slave direction the channel supported - * since the enum dma_transfer_direction is not defined as bits for each - * type of direction, the dma controller should fill (1 << ) and same - * should be checked by controller as well +/** + * struct dma_slave_caps - expose capabilities of a slave channel only + * @src_addr_widths: bit mask of src addr widths the channel supports. + * Width is specified in bytes, e.g. for a channel supporting + * a width of 4 the mask should have BIT(4) set. + * @dst_addr_widths: bit mask of dst addr widths the channel supports + * @directions: bit mask of slave directions the channel supports. + * Since the enum dma_transfer_direction is not defined as bit flag for + * each type, the dma controller should set BIT() and same + * should be checked by controller as well * @max_burst: max burst capability per-transfer * @cmd_pause: true, if pause and thereby resume is supported * @cmd_terminate: true, if terminate cmd is supported @@ -678,11 +680,13 @@ struct dma_filter { * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @src_addr_widths: bit mask of src addr widths the device supports + * Width is specified in bytes, e.g. for a device supporting + * a width of 4 the mask should have BIT(4) set. * @dst_addr_widths: bit mask of dst addr widths the device supports - * @directions: bit mask of slave direction the device supports since - * the enum dma_transfer_direction is not defined as bits for - * each type of direction, the dma controller should fill (1 << - * ) and same should be checked by controller as well + * @directions: bit mask of slave directions the device supports.
+ * Since the enum dma_transfer_direction is not defined as bit flag for + * each type, the dma controller should set BIT() and same + * should be checked by controller as well * @max_burst: max burst capability per-transfer * @residue_granularity: granularity of the transfer residue reported * by tx_status -- cgit v1.2.3 From 6e617de84e87d626d1e976fc30e1322239fd4d2d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Sep 2017 18:26:53 +0200 Subject: net: avoid a full fib lookup when rp_filter is disabled. Since commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") a full fib lookup is needed even if the rp_filter is disabled, if accept_local is false - which is the default. What we really need in the above scenario is just checking that the source IP address is not local, and in most cases we can do that in a cheaper way by looking up the ifaddr hash table. This commit adds a helper for such a lookup, and uses it to validate the src address when rp_filter is disabled and no 'local' routes are created by the user space in the relevant namespace. A new ipv4 netns flag is added to account for such routes. We need that to preserve the same behavior we had before this patch. It also drops the checks to bail early from __fib_validate_source, added by commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()"), as they do not give any measurable performance improvement: if we do the full lookup, we are on a slower path anyway. This improves UDP performance for unconnected sockets by 5% when rp_filter is disabled, and also gives a small but measurable performance improvement for TCP flood scenarios. v1 -> v2: - use the ifaddr lookup helper in __ip_dev_find(), as suggested by Eric - fall-back to full lookup if custom local routes are present Signed-off-by: Paolo Abeni Signed-off-by: David S.
Miller --- include/linux/inetdevice.h | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/devinet.c | 30 ++++++++++++++++++------------ net/ipv4/fib_frontend.c | 22 +++++++++++++++++----- 4 files changed, 37 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index fb3f809e34e4..751d051f0bc7 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -179,6 +179,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr); static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa) { return !((addr^ifa->ifa_address)&ifa->ifa_mask); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 20d061c805e3..20720721da4b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; + bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..7ce22a2c07ce 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { - if (ifa->ifa_local == addr) { - struct net_device *dev = ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; - result = dev; - break; - } - } - if (!result) { + ifa = inet_lookup_ifaddr_rcu(net, addr); + if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; @@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); + } else { + result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); @@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) } EXPORT_SYMBOL(__ip_dev_find); +/* called under RCU lock */ +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) +{ + u32 hash = inet_addr_hash(net, addr); + struct in_ifaddr *ifa; + + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) + if (ifa->ifa_local == addr && + net_eq(dev_net(ifa->ifa_dev->dev), net)) + return ifa; + + return NULL; +} + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..f02819134ba2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -345,9 +345,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(net) && - (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) - goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -402,13 +399,26 @@ int fib_validate_source(struct sk_buff *skb, __be32 
src, __be32 dst, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + struct net *net = dev_net(dev); - if (!r && !fib_num_tclassid_users(dev_net(dev)) && - IN_DEV_ACCEPT_LOCAL(idev) && + if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + if (IN_DEV_ACCEPT_LOCAL(idev)) + goto ok; + /* if no local routes are added from user space we can check + * for local addresses looking-up the ifaddr table + */ + if (net->ipv4.fib_has_custom_local_routes) + goto full_check; + if (inet_lookup_ifaddr_rcu(net, src)) + return -EINVAL; + +ok: *itag = 0; return 0; } + +full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } @@ -759,6 +769,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, } err = fib_table_insert(net, tb, &cfg, extack); + if (!err && cfg.fc_type == RTN_LOCAL) + net->ipv4.fib_has_custom_local_routes = true; errout: return err; } -- cgit v1.2.3 From a9a1d2a7827c9cf780966d0879c73ef5a91380e9 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 22 Sep 2017 11:02:10 +0200 Subject: pinctrl/gpio: Unify namespace for cross-calls The pinctrl_request_gpio() and pinctrl_free_gpio() break the nice namespacing in the other cross-calls like pinctrl_gpio_foo(). Just rename them and all references so we have one namespace with all cross-calls under pinctrl_gpio_*(). Signed-off-by: Linus Walleij --- Documentation/driver-api/pinctl.rst | 6 +++--- Documentation/gpio/gpio-legacy.txt | 10 +++++----- Documentation/translations/zh_CN/gpio.txt | 6 +++--- drivers/gpio/gpio-aspeed.c | 4 ++-- drivers/gpio/gpio-em.c | 4 ++-- drivers/gpio/gpio-pxa.c | 4 ++-- drivers/gpio/gpio-rcar.c | 4 ++-- drivers/gpio/gpio-tegra.c | 4 ++-- drivers/gpio/gpio-tz1090.c | 4 ++-- drivers/gpio/gpiolib.c | 4 ++-- drivers/pinctrl/bcm/pinctrl-iproc-gpio.c | 4 ++-- drivers/pinctrl/bcm/pinctrl-nsp-gpio.c | 4 ++-- drivers/pinctrl/core.c | 12 ++++++------ drivers/pinctrl/core.h | 2 +- drivers/pinctrl/meson/pinctrl-meson.c | 4 ++-- drivers/pinctrl/sh-pfc/gpio.c | 4 ++-- drivers/pinctrl/sirf/pinctrl-atlas7.c | 4 ++-- drivers/pinctrl/sirf/pinctrl-sirf.c | 4 ++-- drivers/pinctrl/spear/pinctrl-plgpio.c | 6 +++--- drivers/pinctrl/stm32/pinctrl-stm32.c | 4 ++-- include/linux/pinctrl/consumer.h | 8 ++++---- 21 files changed, 53 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/pinctl.rst b/Documentation/driver-api/pinctl.rst index 48f15b4f9d3e..6cb68d67fa75 100644 --- a/Documentation/driver-api/pinctl.rst +++ b/Documentation/driver-api/pinctl.rst @@ -757,8 +757,8 @@ that your datasheet calls "GPIO mode", but actually is just an electrical configuration for a certain device. See the section below named "GPIO mode pitfalls" for more details on this scenario. -The public pinmux API contains two functions named pinctrl_request_gpio() -and pinctrl_free_gpio(). These two functions shall *ONLY* be called from +The public pinmux API contains two functions named pinctrl_gpio_request() +and pinctrl_gpio_free(). These two functions shall *ONLY* be called from gpiolib-based drivers as part of their gpio_request() and gpio_free() semantics. Likewise the pinctrl_gpio_direction_[input|output] shall only be called from within respective gpio_direction_[input|output] @@ -790,7 +790,7 @@ gpiolib driver and the affected GPIO range, pin offset and desired direction will be passed along to this function. 
Alternatively to using these special functions, it is fully allowed to use -named functions for each GPIO pin, the pinctrl_request_gpio() will attempt to +named functions for each GPIO pin, the pinctrl_gpio_request() will attempt to obtain the function "gpioN" where "N" is the global GPIO pin number if no special GPIO-handler is registered. diff --git a/Documentation/gpio/gpio-legacy.txt b/Documentation/gpio/gpio-legacy.txt index 5eacc147ea87..8356d0e78f67 100644 --- a/Documentation/gpio/gpio-legacy.txt +++ b/Documentation/gpio/gpio-legacy.txt @@ -273,8 +273,8 @@ easily, gating off unused clocks. For GPIOs that use pins known to the pinctrl subsystem, that subsystem should be informed of their use; a gpiolib driver's .request() operation may call -pinctrl_request_gpio(), and a gpiolib driver's .free() operation may call -pinctrl_free_gpio(). The pinctrl subsystem allows a pinctrl_request_gpio() +pinctrl_gpio_request(), and a gpiolib driver's .free() operation may call +pinctrl_gpio_free(). The pinctrl subsystem allows a pinctrl_gpio_request() to succeed concurrently with a pin or pingroup being "owned" by a device for pin multiplexing. @@ -448,8 +448,8 @@ together with an optional gpio feature. We have already covered the case where e.g. a GPIO controller need to reserve a pin or set the direction of a pin by calling any of: -pinctrl_request_gpio() -pinctrl_free_gpio() +pinctrl_gpio_request() +pinctrl_gpio_free() pinctrl_gpio_direction_input() pinctrl_gpio_direction_output() @@ -466,7 +466,7 @@ gpio (under gpiolib) is still maintained by gpio drivers. It may happen that different pin ranges in a SoC is managed by different gpio drivers. This makes it logical to let gpio drivers announce their pin ranges to -the pin ctrl subsystem before it will call 'pinctrl_request_gpio' in order +the pin ctrl subsystem before it will call 'pinctrl_gpio_request' in order to request the corresponding pin to be prepared by the pinctrl subsystem before any gpio usage. 
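As a usage sketch (not part of the patch; the "foo" driver is hypothetical), this is the gpiolib-driver pattern the documentation above describes, matching the conversions in the drivers below:

static int foo_gpio_request(struct gpio_chip *chip, unsigned int offset)
{
	/* hand the pin over to pinctrl before it is used as a GPIO */
	return pinctrl_gpio_request(chip->base + offset);
}

static void foo_gpio_free(struct gpio_chip *chip, unsigned int offset)
{
	/* drop the pinctrl claim taken in .request() */
	pinctrl_gpio_free(chip->base + offset);
}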
diff --git a/Documentation/translations/zh_CN/gpio.txt b/Documentation/translations/zh_CN/gpio.txt index bce972521065..4f8bf30a41dc 100644 --- a/Documentation/translations/zh_CN/gpio.txt +++ b/Documentation/translations/zh_CN/gpio.txt @@ -257,9 +257,9 @@ GPIO 值的命令需要等待其信息排到队首才发送命令,再获得其 简单地关闭未使用时钟)。 对于 GPIO 使用 pinctrl 子系统已知的引脚,子系统应该被告知其使用情况; -一个 gpiolib 驱动的 .request()操作应调用 pinctrl_request_gpio(), -而 gpiolib 驱动的 .free()操作应调用 pinctrl_free_gpio()。pinctrl -子系统允许 pinctrl_request_gpio()在某个引脚或引脚组以复用形式“属于” +一个 gpiolib 驱动的 .request()操作应调用 pinctrl_gpio_request(), +而 gpiolib 驱动的 .free()操作应调用 pinctrl_gpio_free()。pinctrl +子系统允许 pinctrl_gpio_request()在某个引脚或引脚组以复用形式“属于” 一个设备时都成功返回。 任何须将 GPIO 信号导向适当引脚的引脚复用硬件的编程应该发生在 GPIO diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c index bfc53995064a..c269cc199707 100644 --- a/drivers/gpio/gpio-aspeed.c +++ b/drivers/gpio/gpio-aspeed.c @@ -536,12 +536,12 @@ static int aspeed_gpio_request(struct gpio_chip *chip, unsigned int offset) if (!have_gpio(gpiochip_get_data(chip), offset)) return -ENODEV; - return pinctrl_request_gpio(chip->base + offset); + return pinctrl_gpio_request(chip->base + offset); } static void aspeed_gpio_free(struct gpio_chip *chip, unsigned int offset) { - pinctrl_free_gpio(chip->base + offset); + pinctrl_gpio_free(chip->base + offset); } static inline void __iomem *bank_debounce_reg(struct aspeed_gpio *gpio, diff --git a/drivers/gpio/gpio-em.c b/drivers/gpio/gpio-em.c index 8d32ccc980d9..b86e09e1b13b 100644 --- a/drivers/gpio/gpio-em.c +++ b/drivers/gpio/gpio-em.c @@ -239,12 +239,12 @@ static int em_gio_to_irq(struct gpio_chip *chip, unsigned offset) static int em_gio_request(struct gpio_chip *chip, unsigned offset) { - return pinctrl_request_gpio(chip->base + offset); + return pinctrl_gpio_request(chip->base + offset); } static void em_gio_free(struct gpio_chip *chip, unsigned offset) { - pinctrl_free_gpio(chip->base + offset); + pinctrl_gpio_free(chip->base + offset); /* Set the GPIO as an input to ensure that the next GPIO request won't * drive the GPIO pin as an output. 
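A chip that has registered its pin ranges can skip bespoke wrappers entirely and point its ops at the generic gpiolib helpers converted further below, which forward to the same cross-calls. A minimal sketch, assuming a hypothetical chip (not part of the patch):

static struct gpio_chip foo_chip = {
	.label		= "foo-gpio",			/* hypothetical */
	.request	= gpiochip_generic_request,	/* calls pinctrl_gpio_request() */
	.free		= gpiochip_generic_free,	/* calls pinctrl_gpio_free() */
};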
diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c index 6029899789f3..da68d6cbc1e4 100644 --- a/drivers/gpio/gpio-pxa.c +++ b/drivers/gpio/gpio-pxa.c @@ -332,12 +332,12 @@ static int pxa_gpio_of_xlate(struct gpio_chip *gc, static int pxa_gpio_request(struct gpio_chip *chip, unsigned int offset) { - return pinctrl_request_gpio(chip->base + offset); + return pinctrl_gpio_request(chip->base + offset); } static void pxa_gpio_free(struct gpio_chip *chip, unsigned int offset) { - pinctrl_free_gpio(chip->base + offset); + pinctrl_gpio_free(chip->base + offset); } static int pxa_init_gpio_chip(struct pxa_gpio_chip *pchip, int ngpio, diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c index 1f0871553fd2..43b51045aa47 100644 --- a/drivers/gpio/gpio-rcar.c +++ b/drivers/gpio/gpio-rcar.c @@ -249,7 +249,7 @@ static int gpio_rcar_request(struct gpio_chip *chip, unsigned offset) if (error < 0) return error; - error = pinctrl_request_gpio(chip->base + offset); + error = pinctrl_gpio_request(chip->base + offset); if (error) pm_runtime_put(&p->pdev->dev); @@ -260,7 +260,7 @@ static void gpio_rcar_free(struct gpio_chip *chip, unsigned offset) { struct gpio_rcar_priv *p = gpiochip_get_data(chip); - pinctrl_free_gpio(chip->base + offset); + pinctrl_gpio_free(chip->base + offset); /* * Set the GPIO as an input to ensure that the next GPIO request won't diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c index fbaf974277df..8db47f671708 100644 --- a/drivers/gpio/gpio-tegra.c +++ b/drivers/gpio/gpio-tegra.c @@ -141,14 +141,14 @@ static void tegra_gpio_disable(struct tegra_gpio_info *tgi, unsigned int gpio) static int tegra_gpio_request(struct gpio_chip *chip, unsigned int offset) { - return pinctrl_request_gpio(offset); + return pinctrl_gpio_request(offset); } static void tegra_gpio_free(struct gpio_chip *chip, unsigned int offset) { struct tegra_gpio_info *tgi = gpiochip_get_data(chip); - pinctrl_free_gpio(offset); + pinctrl_gpio_free(offset); tegra_gpio_disable(tgi, offset); } diff --git a/drivers/gpio/gpio-tz1090.c b/drivers/gpio/gpio-tz1090.c index 22c5be65051f..0bb9bb583889 100644 --- a/drivers/gpio/gpio-tz1090.c +++ b/drivers/gpio/gpio-tz1090.c @@ -232,7 +232,7 @@ static int tz1090_gpio_request(struct gpio_chip *chip, unsigned int offset) struct tz1090_gpio_bank *bank = gpiochip_get_data(chip); int ret; - ret = pinctrl_request_gpio(chip->base + offset); + ret = pinctrl_gpio_request(chip->base + offset); if (ret) return ret; @@ -246,7 +246,7 @@ static void tz1090_gpio_free(struct gpio_chip *chip, unsigned int offset) { struct tz1090_gpio_bank *bank = gpiochip_get_data(chip); - pinctrl_free_gpio(chip->base + offset); + pinctrl_gpio_free(chip->base + offset); tz1090_gpio_clear_bit(bank, REG_GPIO_BIT_EN, offset); } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index eb80dac4e26a..a9cb825ef335 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1859,7 +1859,7 @@ static inline void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip) */ int gpiochip_generic_request(struct gpio_chip *chip, unsigned offset) { - return pinctrl_request_gpio(chip->gpiodev->base + offset); + return pinctrl_gpio_request(chip->gpiodev->base + offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_request); @@ -1870,7 +1870,7 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_request); */ void gpiochip_generic_free(struct gpio_chip *chip, unsigned offset) { - pinctrl_free_gpio(chip->gpiodev->base + offset); + pinctrl_gpio_free(chip->gpiodev->base + offset); } 
EXPORT_SYMBOL_GPL(gpiochip_generic_free); diff --git a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c index 85a8c97d9dfe..5d08d989b1d0 100644 --- a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c +++ b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c @@ -311,7 +311,7 @@ static int iproc_gpio_request(struct gpio_chip *gc, unsigned offset) if (!chip->pinmux_is_supported) return 0; - return pinctrl_request_gpio(gpio); + return pinctrl_gpio_request(gpio); } static void iproc_gpio_free(struct gpio_chip *gc, unsigned offset) @@ -322,7 +322,7 @@ static void iproc_gpio_free(struct gpio_chip *gc, unsigned offset) if (!chip->pinmux_is_supported) return; - pinctrl_free_gpio(gpio); + pinctrl_gpio_free(gpio); } static int iproc_gpio_direction_input(struct gpio_chip *gc, unsigned gpio) diff --git a/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c b/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c index 1cfe45fd391f..c1887072936e 100644 --- a/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c +++ b/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c @@ -282,14 +282,14 @@ static int nsp_gpio_request(struct gpio_chip *gc, unsigned offset) { unsigned gpio = gc->base + offset; - return pinctrl_request_gpio(gpio); + return pinctrl_gpio_request(gpio); } static void nsp_gpio_free(struct gpio_chip *gc, unsigned offset) { unsigned gpio = gc->base + offset; - pinctrl_free_gpio(gpio); + pinctrl_gpio_free(gpio); } static int nsp_gpio_direction_input(struct gpio_chip *gc, unsigned gpio) diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 56fbe4c3e800..4c8d5b23e4d0 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -733,14 +733,14 @@ int pinctrl_get_group_selector(struct pinctrl_dev *pctldev, } /** - * pinctrl_request_gpio() - request a single pin to be used as GPIO + * pinctrl_gpio_request() - request a single pin to be used as GPIO * @gpio: the GPIO pin number from the GPIO subsystem number space * * This function should *ONLY* be used from gpiolib-based GPIO drivers, * as part of their gpio_request() semantics, platforms and individual drivers * shall *NOT* request GPIO pins to be muxed in. */ -int pinctrl_request_gpio(unsigned gpio) +int pinctrl_gpio_request(unsigned gpio) { struct pinctrl_dev *pctldev; struct pinctrl_gpio_range *range; @@ -765,17 +765,17 @@ int pinctrl_request_gpio(unsigned gpio) return ret; } -EXPORT_SYMBOL_GPL(pinctrl_request_gpio); +EXPORT_SYMBOL_GPL(pinctrl_gpio_request); /** - * pinctrl_free_gpio() - free control on a single pin, currently used as GPIO + * pinctrl_gpio_free() - free control on a single pin, currently used as GPIO * @gpio: the GPIO pin number from the GPIO subsystem number space * * This function should *ONLY* be used from gpiolib-based GPIO drivers, * as part of their gpio_free() semantics, platforms and individual drivers * shall *NOT* request GPIO pins to be muxed out. */ -void pinctrl_free_gpio(unsigned gpio) +void pinctrl_gpio_free(unsigned gpio) { struct pinctrl_dev *pctldev; struct pinctrl_gpio_range *range; @@ -795,7 +795,7 @@ void pinctrl_free_gpio(unsigned gpio) mutex_unlock(&pctldev->mutex); } -EXPORT_SYMBOL_GPL(pinctrl_free_gpio); +EXPORT_SYMBOL_GPL(pinctrl_gpio_free); static int pinctrl_gpio_direction(unsigned gpio, bool input) { diff --git a/drivers/pinctrl/core.h b/drivers/pinctrl/core.h index 7880c3adc450..8cf2eba17c8c 100644 --- a/drivers/pinctrl/core.h +++ b/drivers/pinctrl/core.h @@ -154,7 +154,7 @@ struct pinctrl_setting { * or pin, and each of these will increment the @usecount. * @mux_owner: The name of device that called pinctrl_get(). 
* @mux_setting: The most recent selected mux setting for this pin, if any. - * @gpio_owner: If pinctrl_request_gpio() was called for this pin, this is + * @gpio_owner: If pinctrl_gpio_request() was called for this pin, this is * the name of the GPIO that "owns" this pin. */ struct pin_desc { diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c index 66ed70c12733..e6e12a7b21e0 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.c +++ b/drivers/pinctrl/meson/pinctrl-meson.c @@ -412,14 +412,14 @@ static const struct pinconf_ops meson_pinconf_ops = { static int meson_gpio_request(struct gpio_chip *chip, unsigned gpio) { - return pinctrl_request_gpio(chip->base + gpio); + return pinctrl_gpio_request(chip->base + gpio); } static void meson_gpio_free(struct gpio_chip *chip, unsigned gpio) { struct meson_pinctrl *pc = gpiochip_get_data(chip); - pinctrl_free_gpio(pc->data->pin_base + gpio); + pinctrl_gpio_free(pc->data->pin_base + gpio); } static int meson_gpio_direction_input(struct gpio_chip *chip, unsigned gpio) diff --git a/drivers/pinctrl/sh-pfc/gpio.c b/drivers/pinctrl/sh-pfc/gpio.c index 6b5422766f13..946d9be50b62 100644 --- a/drivers/pinctrl/sh-pfc/gpio.c +++ b/drivers/pinctrl/sh-pfc/gpio.c @@ -139,12 +139,12 @@ static int gpio_pin_request(struct gpio_chip *gc, unsigned offset) if (idx < 0 || pfc->info->pins[idx].enum_id == 0) return -EINVAL; - return pinctrl_request_gpio(offset); + return pinctrl_gpio_request(offset); } static void gpio_pin_free(struct gpio_chip *gc, unsigned offset) { - return pinctrl_free_gpio(offset); + return pinctrl_gpio_free(offset); } static void gpio_pin_set_value(struct sh_pfc_chip *chip, unsigned offset, diff --git a/drivers/pinctrl/sirf/pinctrl-atlas7.c b/drivers/pinctrl/sirf/pinctrl-atlas7.c index 4db9323251e3..f4b192b493a0 100644 --- a/drivers/pinctrl/sirf/pinctrl-atlas7.c +++ b/drivers/pinctrl/sirf/pinctrl-atlas7.c @@ -5860,7 +5860,7 @@ static int atlas7_gpio_request(struct gpio_chip *chip, if (ret < 0) return ret; - if (pinctrl_request_gpio(chip->base + gpio)) + if (pinctrl_gpio_request(chip->base + gpio)) return -ENODEV; raw_spin_lock_irqsave(&a7gc->lock, flags); @@ -5890,7 +5890,7 @@ static void atlas7_gpio_free(struct gpio_chip *chip, raw_spin_unlock_irqrestore(&a7gc->lock, flags); - pinctrl_free_gpio(chip->base + gpio); + pinctrl_gpio_free(chip->base + gpio); } static int atlas7_gpio_direction_input(struct gpio_chip *chip, diff --git a/drivers/pinctrl/sirf/pinctrl-sirf.c b/drivers/pinctrl/sirf/pinctrl-sirf.c index d3ef05973901..d64add0b84cc 100644 --- a/drivers/pinctrl/sirf/pinctrl-sirf.c +++ b/drivers/pinctrl/sirf/pinctrl-sirf.c @@ -614,7 +614,7 @@ static int sirfsoc_gpio_request(struct gpio_chip *chip, unsigned offset) struct sirfsoc_gpio_bank *bank = sirfsoc_gpio_to_bank(sgpio, offset); unsigned long flags; - if (pinctrl_request_gpio(chip->base + offset)) + if (pinctrl_gpio_request(chip->base + offset)) return -ENODEV; spin_lock_irqsave(&bank->lock, flags); @@ -644,7 +644,7 @@ static void sirfsoc_gpio_free(struct gpio_chip *chip, unsigned offset) spin_unlock_irqrestore(&bank->lock, flags); - pinctrl_free_gpio(chip->base + offset); + pinctrl_gpio_free(chip->base + offset); } static int sirfsoc_gpio_direction_input(struct gpio_chip *chip, unsigned gpio) diff --git a/drivers/pinctrl/spear/pinctrl-plgpio.c b/drivers/pinctrl/spear/pinctrl-plgpio.c index cf6d68c7345b..7a33e2e1e3e7 100644 --- a/drivers/pinctrl/spear/pinctrl-plgpio.c +++ b/drivers/pinctrl/spear/pinctrl-plgpio.c @@ -204,7 +204,7 @@ static int 
plgpio_request(struct gpio_chip *chip, unsigned offset) if (offset >= chip->ngpio) return -EINVAL; - ret = pinctrl_request_gpio(gpio); + ret = pinctrl_gpio_request(gpio); if (ret) return ret; @@ -242,7 +242,7 @@ err1: if (!IS_ERR(plgpio->clk)) clk_disable(plgpio->clk); err0: - pinctrl_free_gpio(gpio); + pinctrl_gpio_free(gpio); return ret; } @@ -273,7 +273,7 @@ disable_clk: if (!IS_ERR(plgpio->clk)) clk_disable(plgpio->clk); - pinctrl_free_gpio(gpio); + pinctrl_gpio_free(gpio); } /* PLGPIO IRQ */ diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 50299ad96659..a954d25bac4e 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -150,12 +150,12 @@ static int stm32_gpio_request(struct gpio_chip *chip, unsigned offset) return -EINVAL; } - return pinctrl_request_gpio(chip->base + offset); + return pinctrl_gpio_request(chip->base + offset); } static void stm32_gpio_free(struct gpio_chip *chip, unsigned offset) { - pinctrl_free_gpio(chip->base + offset); + pinctrl_gpio_free(chip->base + offset); } static int stm32_gpio_get(struct gpio_chip *chip, unsigned offset) diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index a0f2aba72fa9..0412cc9833e9 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -25,8 +25,8 @@ struct device; #ifdef CONFIG_PINCTRL /* External interface to pin control */ -extern int pinctrl_request_gpio(unsigned gpio); -extern void pinctrl_free_gpio(unsigned gpio); +extern int pinctrl_gpio_request(unsigned gpio); +extern void pinctrl_gpio_free(unsigned gpio); extern int pinctrl_gpio_direction_input(unsigned gpio); extern int pinctrl_gpio_direction_output(unsigned gpio); extern int pinctrl_gpio_set_config(unsigned gpio, unsigned long config); @@ -62,12 +62,12 @@ static inline int pinctrl_pm_select_idle_state(struct device *dev) #else /* !CONFIG_PINCTRL */ -static inline int pinctrl_request_gpio(unsigned gpio) +static inline int pinctrl_gpio_request(unsigned gpio) { return 0; } -static inline void pinctrl_free_gpio(unsigned gpio) +static inline void pinctrl_gpio_free(unsigned gpio) { } -- cgit v1.2.3 From 242c1a28eb61cb34974e8aa05235d84355940a8a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 22 Sep 2017 10:25:22 +0800 Subject: net: Remove useless function skb_header_release There is no caller left that invokes the function skb_header_release, so just remove it now. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 2 +- include/linux/skbuff.h | 19 ------------------- net/batman-adv/soft-interface.c | 2 +- 3 files changed, 2 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 522d2900cd1d..f4d7362eb325 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -245,7 +245,7 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb, * - We are allowed to put 4 bytes at tail if skb_cloned() * is false (and if we have 4 bytes of tailroom) * - * TCP packets for example are cloned, but skb_header_release() + * TCP packets for example are cloned, but __skb_header_release() * was called in tcp stack, allowing us to use headroom for our needs.
*/ if (!skb_header_cloned(skb) && diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 492828801acb..f9db5539a6fb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1456,28 +1456,9 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) return 0; } -/** - * skb_header_release - release reference to header - * @skb: buffer to operate on - * - * Drop a reference to the header part of the buffer. This is done - * by acquiring a payload reference. You must not read from the header - * part of skb->data after this. - * Note : Check if you can use __skb_header_release() instead. - */ -static inline void skb_header_release(struct sk_buff *skb) -{ - BUG_ON(skb->nohdr); - skb->nohdr = 1; - atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); -} - /** * __skb_header_release - release reference to header * @skb: buffer to operate on - * - * Variant of skb_header_release() assuming skb is private to caller. - * We can avoid one atomic operation. */ static inline void __skb_header_release(struct sk_buff *skb) { diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 10f7edfb176e..c2c986746d0b 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -69,7 +69,7 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) int result; /* TODO: We must check if we can release all references to non-payload - * data using skb_header_release in our skbs to allow skb_cow_header to + * data using __skb_header_release in our skbs to allow skb_cow_header to * work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer -- cgit v1.2.3 From cfb766da54d98ceb145e1bb0bd11c559569dcbfc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: sched/cputime: Expose cputime_adjust() Will be used by basic cgroup resource stat reporting later. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- include/linux/sched/cputime.h | 3 ++- kernel/sched/cputime.c | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 4c5b9735c1ae..9251044335c5 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -53,7 +53,8 @@ static inline void task_cputime_scaled(struct task_struct *t, extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); - +extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st); /* * Thread group CPU time accounting. diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..8839b6e8a104 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -585,9 +585,8 @@ drop_precision: * * Assuming that rtime_i+1 >= rtime_i. 
*/ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - u64 *ut, u64 *st) +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) { u64 rtime, stime, utime; unsigned long flags; -- cgit v1.2.3 From d2cc5ed6949085cfba30ec5228816cf6eb1d02b9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: cpuacct: Introduce cgroup_account_cputime[_field]() Introduce cgroup_account_cputime[_field]() which wrap cpuacct_charge() and cgroup_account_field(). This doesn't introduce any functional changes and will be used to add cgroup basic resource accounting. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar --- include/linux/cgroup.h | 38 ++++++++++++++++++++++++++++++++++++++ kernel/sched/cpuacct.h | 17 ----------------- kernel/sched/cputime.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/stop_task.c | 2 +- 8 files changed, 44 insertions(+), 23 deletions(-) delete mode 100644 kernel/sched/cpuacct.h (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d023ac5e377f..6cd579329310 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -688,6 +689,43 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ +/* + * Basic resource stats. + */ +#ifdef CONFIG_CGROUPS + +#ifdef CONFIG_CGROUP_CPUACCT +void cpuacct_charge(struct task_struct *tsk, u64 cputime); +void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +static inline void cpuacct_account_field(struct task_struct *tsk, int index, + u64 val) {} +#endif + +static inline void cgroup_account_cputime(struct task_struct *task, + u64 delta_exec) +{ + cpuacct_charge(task, delta_exec); +} + +static inline void cgroup_account_cputime_field(struct task_struct *task, + enum cpu_usage_stat index, + u64 delta_exec) +{ + cpuacct_account_field(task, index, delta_exec); +} + +#else /* CONFIG_CGROUPS */ + +static inline void cgroup_account_cputime(struct task_struct *task, + u64 delta_exec) {} +static inline void cgroup_account_cputime_field(struct task_struct *task, + enum cpu_usage_stat index, + u64 delta_exec) {} + +#endif /* CONFIG_CGROUPS */ + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. 
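The scheduler call sites converted below all follow one shape; as an illustrative sketch (a toy update_curr()-style hook, not part of the patch), the new wrapper simply takes the place of the direct cpuacct_charge() call:

static void foo_update_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	u64 delta_exec = rq_clock_task(rq) - curr->se.exec_start;

	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);
	curr->se.exec_start = rq_clock_task(rq);

	/* was: cpuacct_charge(curr, delta_exec); */
	cgroup_account_cputime(curr, delta_exec);
}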
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h deleted file mode 100644 index ba72807c73d4..000000000000 --- a/kernel/sched/cpuacct.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifdef CONFIG_CGROUP_CPUACCT - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); - -#else - -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ -} - -static inline void -cpuacct_account_field(struct task_struct *tsk, int index, u64 val) -{ -} - -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8839b6e8a104..e01b699bbd5b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __this_cpu_add(kernel_cpustat.cpustat[index], tmp); - cpuacct_account_field(p, index, tmp); + cgroup_account_cputime_field(p, index, tmp); } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7667c3..abd913c1b99e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1143,7 +1143,7 @@ static void update_curr_dl(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..0ae69af95b8b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -851,7 +851,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); + cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0af5ca9e3e3f..fdc2c5d1f82e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -979,7 +979,7 @@ static void update_curr_rt(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14db76cd496f..f0b98f978843 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_PARAVIRT #include @@ -36,7 +37,6 @@ #include "cpupri.h" #include "cpudeadline.h" -#include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 9f69fb630853..ec0bb5ab9024 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -71,7 +71,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) -- cgit v1.2.3 From 041cd640b2f3c5607171c59d8712b503659d21f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:05 -0700 Subject: cgroup: Implement cgroup2 basic CPU usage accounting In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to combination of two factors - 1. enabling cpu controller has significant side effects, and 2. 
we have to pick one of the hierarchies to account CPU usages on. cpuacct controller is effectively used to designate a hierarchy to track CPU usages on. cgroup2's unified hierarchy removes the second reason and we can account basic CPU usages by default. While we can use cpuacct for this purpose, both its interface and implementation leave a lot to be desired - it collects and exposes two sources of truth which don't agree with each other and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event which is unnecessary. This patch adds basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it. * All accountings are done per-cpu and don't propagate immediately. It just bumps the per-cgroup per-cpu counters and links to the parent's updated list if not already on it. * On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated. * CPU usage stats are collected and shown in "cgroup.stat" with "cpu." prefix. Total usage is collected from scheduling events. User/sys breakdown is sourced from tick sampling and adjusted to the usage using cputime_adjust(). This keeps the accounting side hot path O(1) and per-cpu and the read side O(nr_updated_since_last_read). v2: Minor changes and documentation updates as suggested by Waiman and Roman. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner Cc: Waiman Long Cc: Roman Gushchin --- Documentation/cgroup-v2.txt | 9 ++ include/linux/cgroup-defs.h | 57 +++++++ include/linux/cgroup.h | 22 +++ kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup-internal.h | 8 + kernel/cgroup/cgroup.c | 24 ++- kernel/cgroup/stat.c | 334 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 453 insertions(+), 3 deletions(-) create mode 100644 kernel/cgroup/stat.c (limited to 'include/linux') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index dc44785dc0fa..3f8216912df0 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -886,6 +886,15 @@ All cgroup core files are prefixed with "cgroup." A dying cgroup can consume system resources not exceeding limits, which were active at the moment of cgroup deletion. + cpu.usage_usec + CPU time consumed in the subtree. + + cpu.user_usec + User CPU time consumed in the subtree. + + cpu.system_usec + System CPU time consumed in the subtree. + Controllers =========== diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ade4a78a54c2..3e55bbd31ad1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -254,6 +255,57 @@ struct css_set { struct rcu_head rcu_head; }; +/* + * cgroup basic resource usage statistics. Accounting is done per-cpu in + * cgroup_cpu_stat which is then lazily propagated up the hierarchy on + * reads. + * + * When a stat gets updated, the cgroup_cpu_stat and its ancestors are + * linked into the updated tree. On the following read, propagation only + * considers and consumes the updated tree. This makes reading O(the + * number of descendants which have been active since last read) instead of + * O(the total number of descendants). + * + * This is important because there can be a lot of (draining) cgroups which + * aren't active and stat may be read frequently. 
The combination can + become very expensive. By propagating selectively, increasing reading + frequency decreases the cost of each read. + */ +struct cgroup_cpu_stat { + /* + * ->sync protects all the current counters. These are the only + * fields which get updated in the hot path. + */ + struct u64_stats_sync sync; + struct task_cputime cputime; + + /* + * Snapshots at the last reading. These are used to calculate the + * deltas to propagate to the global counters. + */ + struct task_cputime last_cputime; + + /* + * Child cgroups with stat updates on this cpu since the last read + * are linked on the parent's ->updated_children through + * ->updated_next. + * + * In addition to being more compact, singly-linked list pointing + * to the cgroup makes it unnecessary for each per-cpu struct to + * point back to the associated cgroup. + * + * Protected by per-cpu cgroup_cpu_stat_lock. + */ + struct cgroup *updated_children; /* terminated by self cgroup */ + struct cgroup *updated_next; /* NULL iff not on the list */ +}; + +struct cgroup_stat { + /* per-cpu statistics are collected into the following global counters */ + struct task_cputime cputime; + struct prev_cputime prev_cputime; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; @@ -353,6 +405,11 @@ struct cgroup { */ struct cgroup *dom_cgrp; + /* cgroup basic resource statistics */ + struct cgroup_cpu_stat __percpu *cpu_stat; + struct cgroup_stat pending_stat; /* pending from children */ + struct cgroup_stat stat; + /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6cd579329310..328a70ce0e23 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -703,17 +703,39 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif +void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix); + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec); + static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { + struct cgroup *cgrp; + cpuacct_charge(task, delta_exec); + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + if (cgroup_parent(cgrp)) + __cgroup_account_cputime(cgrp, delta_exec); + rcu_read_unlock(); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { + struct cgroup *cgrp; + cpuacct_account_field(task, index, delta_exec); + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + if (cgroup_parent(cgrp)) + __cgroup_account_cputime_field(cgrp, index, delta_exec); + rcu_read_unlock(); } #else /* CONFIG_CGROUPS */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ce693ccb8c58..0acee616e06c 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o namespace.o cgroup-v1.o +obj-y := cgroup.o stat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5151ff256c29..fa642c99586a 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -199,6 +199,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); +/*
+ * stat.c + */ +void cgroup_stat_flush(struct cgroup *cgrp); +int cgroup_stat_init(struct cgroup *cgrp); +void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_boot(void); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..d036625556c9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS +static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); + cgroup_stat_show_cputime(seq, "cpu."); + return 0; } @@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_exit(cgrp); kfree(cgrp); } else { /* @@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ trace_cgroup_release(cgrp); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_flush(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; @@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_free_cgrp; + if (cgroup_on_dfl(parent)) { + ret = cgroup_stat_init(cgrp); + if (ret) + goto out_cancel_ref; + } + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. @@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_cancel_ref; + goto out_stat_exit; } init_cgroup_housekeeping(cgrp); @@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_stat_exit: + if (cgroup_on_dfl(parent)) + cgroup_stat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5148,6 +5166,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + cgroup_stat_boot(); + /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c new file mode 100644 index 000000000000..9cce79e89320 --- /dev/null +++ b/kernel/cgroup/stat.c @@ -0,0 +1,334 @@ +#include "cgroup-internal.h" + +#include + +static DEFINE_MUTEX(cgroup_stat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); + +static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->cpu_stat, cpu); +} + +/** + * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * @cgrp: target cgroup + * @cpu: cpu on which cpu_stat was updated + * + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching + * cpu_stat->updated_children list. See the comment on top of + * cgroup_cpu_stat definition for details. 
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+    raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+    struct cgroup *parent;
+    unsigned long flags;
+
+    /*
+     * Speculative already-on-list test. This may race leading to
+     * temporary inaccuracies, which is fine.
+     *
+     * Because @parent's updated_children is terminated with @parent
+     * instead of NULL, we can tell whether @cgrp is on the list by
+     * testing the next pointer for NULL.
+     */
+    if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+        return;
+
+    raw_spin_lock_irqsave(cpu_lock, flags);
+
+    /* put @cgrp and all ancestors on the corresponding updated lists */
+    for (parent = cgroup_parent(cgrp); parent;
+         cgrp = parent, parent = cgroup_parent(cgrp)) {
+        struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+        struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+        /*
+         * Both additions and removals are bottom-up. If a cgroup
+         * is already in the tree, all ancestors are.
+         */
+        if (cstat->updated_next)
+            break;
+
+        cstat->updated_next = pcstat->updated_children;
+        pcstat->updated_children = cgrp;
+    }
+
+    raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traverse
+ * @cpu: target cpu
+ *
+ * Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+                                                  struct cgroup *root, int cpu)
+{
+    struct cgroup_cpu_stat *cstat;
+    struct cgroup *parent;
+
+    if (pos == root)
+        return NULL;
+
+    /*
+     * We're going to walk down to the first leaf and visit/remove it.
+     * We can pick any unvisited node as the starting point.
+     */
+    if (!pos)
+        pos = root;
+    else
+        pos = cgroup_parent(pos);
+
+    /* walk down to the first leaf */
+    while (true) {
+        cstat = cgroup_cpu_stat(pos, cpu);
+        if (cstat->updated_children == pos)
+            break;
+        pos = cstat->updated_children;
+    }
+
+    /*
+     * Unlink @pos from the tree. As the updated_children list is
+     * singly linked, we have to walk it to find the removal point.
+     * However, due to the way we traverse, @pos will be the first
+     * child in most cases. The only exception is @root.
+ */
+    parent = cgroup_parent(pos);
+    if (parent && cstat->updated_next) {
+        struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+        struct cgroup_cpu_stat *ncstat;
+        struct cgroup **nextp;
+
+        nextp = &pcstat->updated_children;
+        while (true) {
+            ncstat = cgroup_cpu_stat(*nextp, cpu);
+            if (*nextp == pos)
+                break;
+
+            WARN_ON_ONCE(*nextp == parent);
+            nextp = &ncstat->updated_next;
+        }
+
+        *nextp = cstat->updated_next;
+        cstat->updated_next = NULL;
+    }
+
+    return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+                                   struct cgroup_stat *src_stat)
+{
+    dst_stat->cputime.utime += src_stat->cputime.utime;
+    dst_stat->cputime.stime += src_stat->cputime.stime;
+    dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+    struct cgroup *parent = cgroup_parent(cgrp);
+    struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+    struct task_cputime *last_cputime = &cstat->last_cputime;
+    struct task_cputime cputime;
+    struct cgroup_stat delta;
+    unsigned seq;
+
+    lockdep_assert_held(&cgroup_stat_mutex);
+
+    /* fetch the current per-cpu values */
+    do {
+        seq = __u64_stats_fetch_begin(&cstat->sync);
+        cputime = cstat->cputime;
+    } while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+    /* accumulate the deltas to propagate */
+    delta.cputime.utime = cputime.utime - last_cputime->utime;
+    delta.cputime.stime = cputime.stime - last_cputime->stime;
+    delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+                                     last_cputime->sum_exec_runtime;
+    *last_cputime = cputime;
+
+    /* transfer the pending stat into delta */
+    cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+    memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+    /* propagate delta into the global stat and the parent's pending */
+    cgroup_stat_accumulate(&cgrp->stat, &delta);
+    if (parent)
+        cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+    int cpu;
+
+    lockdep_assert_held(&cgroup_stat_mutex);
+
+    for_each_possible_cpu(cpu) {
+        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+        struct cgroup *pos = NULL;
+
+        raw_spin_lock_irq(cpu_lock);
+        while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+            cgroup_cpu_stat_flush_one(pos, cpu);
+        raw_spin_unlock_irq(cpu_lock);
+    }
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */ +void cgroup_stat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_stat_mutex); + cgroup_stat_flush_locked(cgrp); + mutex_unlock(&cgroup_stat_mutex); +} + +static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +{ + struct cgroup_cpu_stat *cstat; + + cstat = get_cpu_ptr(cgrp->cpu_stat); + u64_stats_update_begin(&cstat->sync); + return cstat; +} + +static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, + struct cgroup_cpu_stat *cstat) +{ + u64_stats_update_end(&cstat->sync); + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(cstat); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + cstat->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + cstat->cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + cstat->cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + mutex_lock(&cgroup_stat_mutex); + + cgroup_stat_flush_locked(cgrp); + + usage = cgrp->stat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, + &utime, &stime); + + mutex_unlock(&cgroup_stat_mutex); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "%susage_usec %llu\n" + "%suser_usec %llu\n" + "%ssystem_usec %llu\n", + prefix, usage, prefix, utime, prefix, stime); +} + +int cgroup_stat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has cpu_stat preallocated */ + if (!cgrp->cpu_stat) { + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); + if (!cgrp->cpu_stat) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) + cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + + prev_cputime_init(&cgrp->stat.prev_cputime); + + return 0; +} + +void cgroup_stat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_stat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || + WARN_ON_ONCE(cstat->updated_next)) + return; + } + + free_percpu(cgrp->cpu_stat); + cgrp->cpu_stat = NULL; +} + +void __init cgroup_stat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); +} -- cgit v1.2.3 From 07557ccb8c83f315e409ddee181e9ffbf87c6ad1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:05 +0200 Subject: genirq/msi: Capture device name for debugfs For debugging the allocation of unused or potentially leaked interrupt descriptor it's helpful to have some information about the site which allocated them. In case of MSI this is simple because the caller hands the device struct pointer into the domain allocation function. 
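As a rough illustration of the ownership pattern this change introduces
(a freestanding userspace sketch, not kernel code; the struct and
function names are invented for the example), the descriptor keeps its
own duplicated copy of the device name and releases that copy when the
debugfs entry is torn down:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-in for an interrupt descriptor with a dev_name field. */
struct toy_desc {
    int irq;
    char *dev_name;   /* owned copy, freed on teardown */
};

/* Duplicate the name (the kernel side would use kstrdup()) so it stays
 * valid independent of the lifetime of the caller's buffer. */
static void toy_copy_devname(struct toy_desc *desc, const char *name)
{
    if (name)
        desc->dev_name = strdup(name);
}

static void toy_debug_show(const struct toy_desc *desc)
{
    printf("irq %d device: %s\n", desc->irq,
           desc->dev_name ? desc->dev_name : "(none)");
}

/* Counterpart of the kfree() in the debugfs teardown path. */
static void toy_remove_entry(struct toy_desc *desc)
{
    free(desc->dev_name);
    desc->dev_name = NULL;
}

int main(void)
{
    struct toy_desc desc = { .irq = 42, .dev_name = NULL };

    toy_copy_devname(&desc, "0000:00:1c.0");  /* e.g. a PCI device name */
    toy_debug_show(&desc);
    toy_remove_entry(&desc);
    return 0;
}

The copy avoids a dangling pointer if the storage behind the original
name goes away before the interrupt descriptor does.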
Duplicate the device name and show it in the debugfs entry of the interrupt descriptor. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.433038426@linutronix.de --- include/linux/irqdesc.h | 1 + kernel/irq/debugfs.c | 10 ++++++++++ kernel/irq/internals.h | 5 +++++ kernel/irq/msi.c | 6 +++++- 4 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 3e90a094798d..b55b113c049b 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -93,6 +93,7 @@ struct irq_desc { #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS struct dentry *debugfs_file; + const char *dev_name; #endif #ifdef CONFIG_SPARSE_IRQ struct rcu_head rcu; diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c3fdb36dec30..b7d1023b9b22 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -149,6 +149,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, ARRAY_SIZE(irqdesc_states)); @@ -226,6 +227,15 @@ static const struct file_operations dfs_irq_ops = { .release = single_release, }; +void irq_debugfs_copy_devname(int irq, struct device *dev) +{ + struct irq_desc *desc = irq_to_desc(irq); + const char *name = dev_name(dev); + + if (name) + desc->dev_name = kstrdup(name, GFP_KERNEL); +} + void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) { char name [10]; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a4aa39009f0d..cfaec2669093 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -443,7 +443,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); static inline void irq_remove_debugfs_entry(struct irq_desc *desc) { debugfs_remove(desc->debugfs_file); + kfree(desc->dev_name); } +void irq_debugfs_copy_devname(int irq, struct device *dev); # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else @@ -458,4 +460,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) static inline void irq_remove_debugfs_entry(struct irq_desc *d) { } +static inline void irq_debugfs_copy_devname(int irq, struct device *dev) +{ +} #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3fa4bd59f569..94ecc0293844 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -16,6 +16,8 @@ #include #include +#include "internals.h" + /** * alloc_msi_entry - Allocate an initialize msi_entry * @dev: Pointer to the device for which this is allocated @@ -373,8 +375,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return ret; } - for (i = 0; i < desc->nvec_used; i++) + for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); + irq_debugfs_copy_devname(virq + i, dev); + } } if (ops->msi_finish) -- cgit v1.2.3 From c3e7239a7f43ce1ff407df5f5944bf0d42dc21bf Mon Sep 
17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 13 Sep 2017 23:29:06 +0200
Subject: irqdomain/debugfs: Provide domain specific debug callback

Some interrupt domains, like the X86 vector domain, have special
requirements for debugging, like showing the vector usage on the CPUs.

Add a callback to the irqdomain ops which can be filled in by domains
which require it and add conditional invocations to the irqdomain and the
per irq debug files.

Signed-off-by: Thomas Gleixner
Tested-by: Juergen Gross
Tested-by: Yu Chen
Acked-by: Juergen Gross
Cc: Boris Ostrovsky
Cc: Tony Luck
Cc: Marc Zyngier
Cc: Alok Kataria
Cc: Joerg Roedel
Cc: "Rafael J. Wysocki"
Cc: Steven Rostedt
Cc: Christoph Hellwig
Cc: Peter Zijlstra
Cc: Borislav Petkov
Cc: Paolo Bonzini
Cc: Rui Zhang
Cc: "K. Y. Srinivasan"
Cc: Arjan van de Ven
Cc: Dan Williams
Cc: Len Brown
Link: https://lkml.kernel.org/r/20170913213152.512937505@linutronix.de
---
 include/linux/irqdomain.h | 6 +++++-
 kernel/irq/debugfs.c      | 2 ++
 kernel/irq/irqdomain.c    | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 81e4889ca6dd..1fb50438a868 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -40,6 +40,7 @@ struct of_device_id;
 struct irq_chip;
 struct irq_data;
 struct cpumask;
+struct seq_file;
 
 /* Number of irqs reserved for a legacy isa controller */
 #define NUM_ISA_INTERRUPTS 16
@@ -104,7 +105,6 @@ struct irq_domain_ops {
     int (*xlate)(struct irq_domain *d, struct device_node *node,
                  const u32 *intspec, unsigned int intsize,
                  unsigned long *out_hwirq, unsigned int *out_type);
-
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
     /* extended V2 interfaces to support hierarchy irq_domains */
     int (*alloc)(struct irq_domain *d, unsigned int virq,
@@ -116,6 +116,10 @@ struct irq_domain_ops {
     int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec,
                      unsigned long *out_hwirq, unsigned int *out_type);
 #endif
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+    void (*debug_show)(struct seq_file *m, struct irq_domain *d,
+                       struct irq_data *irqd, int ind);
+#endif
 };
 
 extern struct irq_domain_ops irq_generic_chip_ops;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index b7d1023b9b22..7f608ac39653 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -81,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
            data->domain ?
data->domain->name : "");
     seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq);
     irq_debug_show_chip(m, data, ind + 1);
+    if (data->domain && data->domain->ops && data->domain->ops->debug_show)
+        data->domain->ops->debug_show(m, NULL, data, ind + 1);
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
     if (!data->parent_data)
         return;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e84b7056bb08..1a75a00ee01f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1810,6 +1810,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
            d->revmap_size + d->revmap_direct_max_irq);
     seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
     seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
+    if (d->ops && d->ops->debug_show)
+        d->ops->debug_show(m, d, NULL, ind + 1);
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
     if (!d->parent)
         return;
-- cgit v1.2.3

From 457f6d35072f395508255ef09fd08f824382cf85 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 13 Sep 2017 23:29:07 +0200
Subject: genirq: Make state consistent for !IRQ_DOMAIN_HIERARCHY

In the !IRQ_DOMAIN_HIERARCHY case the activation stubs are not
setting/clearing the activation status bits. This is not a problem at the
moment, but upcoming changes require a correct status.

Add the set/clear invocations to the stub functions and move them to the
core internal header to avoid duplication and visibility outside the core.

Signed-off-by: Thomas Gleixner
Tested-by: Juergen Gross
Tested-by: Yu Chen
Acked-by: Juergen Gross
Cc: Boris Ostrovsky
Cc: Tony Luck
Cc: Marc Zyngier
Cc: Alok Kataria
Cc: Joerg Roedel
Cc: "Rafael J. Wysocki"
Cc: Steven Rostedt
Cc: Christoph Hellwig
Cc: Peter Zijlstra
Cc: Borislav Petkov
Cc: Paolo Bonzini
Cc: Rui Zhang
Cc: "K. Y. Srinivasan"
Cc: Arjan van de Ven
Cc: Dan Williams
Cc: Len Brown
Link: https://lkml.kernel.org/r/20170913213152.591985591@linutronix.de
---
 include/linux/irqdomain.h |  4 ----
 kernel/irq/internals.h    | 11 +++++++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 1fb50438a868..de06ce992a2d 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -511,8 +511,6 @@ static inline bool irq_domain_is_msi_remap(struct irq_domain *domain)
 extern bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain);
 
 #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
-static inline void irq_domain_activate_irq(struct irq_data *data) { }
-static inline void irq_domain_deactivate_irq(struct irq_data *data) { }
 static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
                                         unsigned int nr_irqs, int node, void *arg)
 {
@@ -561,8 +559,6 @@ irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
 
 #else /* CONFIG_IRQ_DOMAIN */
 static inline void irq_dispose_mapping(unsigned int virq) { }
-static inline void irq_domain_activate_irq(struct irq_data *data) { }
-static inline void irq_domain_deactivate_irq(struct irq_data *data) { }
 static inline struct irq_domain *irq_find_matching_fwnode(
     struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token)
 {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index cfaec2669093..72b8da2a7c39 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -436,6 +436,17 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 }
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
+#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
+static inline void
irq_domain_activate_irq(struct irq_data *data) +{ + irqd_set_activated(data); +} +static inline void irq_domain_deactivate_irq(struct irq_data *data) +{ + irqd_clr_activated(data); +} +#endif + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include -- cgit v1.2.3 From 72491643469aab736536ae71dcd199b19dabd891 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:10 +0200 Subject: genirq/irqdomain: Update irq_domain_ops.activate() signature The irq_domain_ops.activate() callback has no return value and no way to tell the function that the activation is early. The upcoming changes to support a reservation scheme which allows to assign interrupt vectors on x86 only when the interrupt is actually requested requires: - A return value, so activation can fail at request_irq() time - Information that the activate invocation is early, i.e. before request_irq(). Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.848490816@linutronix.de --- arch/x86/include/asm/irqdomain.h | 4 ++-- arch/x86/kernel/apic/htirq.c | 5 +++-- arch/x86/kernel/apic/io_apic.c | 5 +++-- arch/x86/platform/uv/uv_irq.c | 5 +++-- drivers/gpio/gpio-xgene-sb.c | 8 +++++--- drivers/iommu/amd_iommu.c | 5 +++-- drivers/iommu/intel_irq_remapping.c | 5 +++-- drivers/irqchip/irq-gic-v3-its.c | 10 ++++++---- drivers/pinctrl/stm32/pinctrl-stm32.c | 5 +++-- include/linux/irqdomain.h | 2 +- kernel/irq/irqdomain.c | 2 +- kernel/irq/msi.c | 5 +++-- 12 files changed, 36 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index d26075b52885..1d9091ffa140 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -41,8 +41,8 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg); extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); -extern void mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data); +extern int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early); extern void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 56ccf9346b08..b07075dce8b7 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -112,8 +112,8 @@ static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } -static void htirq_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct ht_irq_msg msg; struct irq_cfg *cfg = irqd_cfg(irq_data); @@ -132,6 +132,7 @@ static void htirq_domain_activate(struct irq_domain *domain, HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; write_ht_irq_msg(irq_data->irq, &msg); + return 0; } static void htirq_domain_deactivate(struct irq_domain *domain, diff --git a/arch/x86/kernel/apic/io_apic.c 
b/arch/x86/kernel/apic/io_apic.c index 70e48aa6af98..d50e46757f6d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2977,8 +2977,8 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } -void mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { unsigned long flags; struct irq_pin_list *entry; @@ -2988,6 +2988,7 @@ void mp_irqdomain_activate(struct irq_domain *domain, for_each_irq_pin(entry, data->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, data->entry); raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return 0; } void mp_irqdomain_deactivate(struct irq_domain *domain, diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 03fc397335b7..5f6fd860820a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -127,10 +127,11 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq, * Re-target the irq to the specified CPU and enable the specified MMR located * on the specified blade to allow the sending of MSIs to the specified CPU. */ -static void uv_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int uv_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); + return 0; } /* diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index 033258634b8c..b5843fe6a44d 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -140,8 +140,9 @@ static int xgene_gpio_sb_to_irq(struct gpio_chip *gc, u32 gpio) return irq_create_fwspec_mapping(&fwspec); } -static void xgene_gpio_sb_domain_activate(struct irq_domain *d, - struct irq_data *irq_data) +static int xgene_gpio_sb_domain_activate(struct irq_domain *d, + struct irq_data *irq_data, + bool early) { struct xgene_gpio_sb *priv = d->host_data; u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq); @@ -150,11 +151,12 @@ static void xgene_gpio_sb_domain_activate(struct irq_domain *d, dev_err(priv->gc.parent, "Unable to configure XGene GPIO standby pin %d as IRQ\n", gpio); - return; + return -ENOSPC; } xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO, gpio * 2, 1); + return 0; } static void xgene_gpio_sb_domain_deactivate(struct irq_domain *d, diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 51f8215877f5..ea03f4138f5f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4170,8 +4170,8 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_common(domain, virq, nr_irqs); } -static void irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int irq_remapping_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; @@ -4180,6 +4180,7 @@ static void irq_remapping_activate(struct irq_domain *domain, if (iommu) iommu->irte_ops->activate(data->entry, irte_info->devid, irte_info->index); + return 0; } static void irq_remapping_deactivate(struct irq_domain *domain, diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index a5b89f6bcdbf..762d84713b7a 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ 
b/drivers/iommu/intel_irq_remapping.c @@ -1389,12 +1389,13 @@ static void intel_irq_remapping_free(struct irq_domain *domain, irq_domain_free_irqs_common(domain, virq, nr_irqs); } -static void intel_irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int intel_irq_remapping_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct intel_ir_data *data = irq_data->chip_data; modify_irte(&data->irq_2_iommu, &data->irte_entry); + return 0; } static void intel_irq_remapping_deactivate(struct irq_domain *domain, diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index e8d89343d613..20e2b5fac7b9 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2186,8 +2186,8 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, return 0; } -static void its_irq_domain_activate(struct irq_domain *domain, - struct irq_data *d) +static int its_irq_domain_activate(struct irq_domain *domain, + struct irq_data *d, bool early) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); @@ -2205,6 +2205,7 @@ static void its_irq_domain_activate(struct irq_domain *domain, /* Map the GIC IRQ and event to the device */ its_send_mapti(its_dev, d->hwirq, event); + return 0; } static void its_irq_domain_deactivate(struct irq_domain *domain, @@ -2678,8 +2679,8 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq return err; } -static void its_vpe_irq_domain_activate(struct irq_domain *domain, - struct irq_data *d) +static int its_vpe_irq_domain_activate(struct irq_domain *domain, + struct irq_data *d, bool early) { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); @@ -2687,6 +2688,7 @@ static void its_vpe_irq_domain_activate(struct irq_domain *domain, vpe->col_idx = cpumask_first(cpu_online_mask); its_send_vmapp(vpe, true); its_send_vinvall(vpe); + return 0; } static void its_vpe_irq_domain_deactivate(struct irq_domain *domain, diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 50299ad96659..02b66588cac6 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -289,13 +289,14 @@ static int stm32_gpio_domain_translate(struct irq_domain *d, return 0; } -static void stm32_gpio_domain_activate(struct irq_domain *d, - struct irq_data *irq_data) +static int stm32_gpio_domain_activate(struct irq_domain *d, + struct irq_data *irq_data, bool early) { struct stm32_gpio_bank *bank = d->host_data; struct stm32_pinctrl *pctl = dev_get_drvdata(bank->gpio_chip.parent); regmap_field_write(pctl->irqmux[irq_data->hwirq], bank->bank_nr); + return 0; } static int stm32_gpio_domain_alloc(struct irq_domain *d, diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index de06ce992a2d..9bc07f59b090 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -111,7 +111,7 @@ struct irq_domain_ops { unsigned int nr_irqs, void *arg); void (*free)(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs); - void (*activate)(struct irq_domain *d, struct irq_data *irq_data); + int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early); void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 
1a75a00ee01f..1423f8a6c75d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1690,7 +1690,7 @@ static void __irq_domain_activate_irq(struct irq_data *irq_data) if (irq_data->parent_data) __irq_domain_activate_irq(irq_data->parent_data); if (domain->ops->activate) - domain->ops->activate(domain, irq_data); + domain->ops->activate(domain, irq_data, false); } } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 94ecc0293844..6155fff9f647 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -102,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data, return ret; } -static void msi_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int msi_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct msi_msg msg; BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); irq_chip_write_msi_msg(irq_data, &msg); + return 0; } static void msi_domain_deactivate(struct irq_domain *domain, -- cgit v1.2.3 From bb9b428a5c832d7abb494fbabac37c515c01c6c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:11 +0200 Subject: genirq/irqdomain: Allow irq_domain_activate_irq() to fail Allow irq_domain_activate_irq() to fail. This is required to support a reservation and late vector assignment scheme. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.933882227@linutronix.de --- include/linux/irqdomain.h | 2 +- kernel/irq/chip.c | 9 +++++++-- kernel/irq/internals.h | 3 ++- kernel/irq/irqdomain.c | 40 +++++++++++++++++++++++++--------------- kernel/irq/msi.c | 19 +++++++++++++++++-- 5 files changed, 52 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9bc07f59b090..8fb877121984 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -441,7 +441,7 @@ extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct cpumask *affinity); extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); -extern void irq_domain_activate_irq(struct irq_data *irq_data); +extern int irq_domain_activate_irq(struct irq_data *irq_data); extern void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 37dd34d922f4..cd5b3eb38082 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -219,7 +219,12 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) */ return IRQ_STARTUP_ABORT; } - irq_domain_activate_irq(d); + /* + * Managed interrupts have reserved resources, so this should not + * happen. 
+ */ + if (WARN_ON(irq_domain_activate_irq(d))) + return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } #else @@ -285,7 +290,7 @@ int irq_activate(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) - irq_domain_activate_irq(d); + return irq_domain_activate_irq(d); return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8bd131e4c944..e84d0e3899f6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -439,9 +439,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline void irq_domain_activate_irq(struct irq_data *data) +static inline int irq_domain_activate_irq(struct irq_data *data) { irqd_set_activated(data); + return 0; } static inline void irq_domain_deactivate_irq(struct irq_data *data) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1423f8a6c75d..47e8ddd9e8cf 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1682,28 +1682,35 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); -static void __irq_domain_activate_irq(struct irq_data *irq_data) +static void __irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irq_data && irq_data->domain) { struct irq_domain *domain = irq_data->domain; + if (domain->ops->deactivate) + domain->ops->deactivate(domain, irq_data); if (irq_data->parent_data) - __irq_domain_activate_irq(irq_data->parent_data); - if (domain->ops->activate) - domain->ops->activate(domain, irq_data, false); + __irq_domain_deactivate_irq(irq_data->parent_data); } } -static void __irq_domain_deactivate_irq(struct irq_data *irq_data) +static int __irq_domain_activate_irq(struct irq_data *irqd) { - if (irq_data && irq_data->domain) { - struct irq_domain *domain = irq_data->domain; + int ret = 0; - if (domain->ops->deactivate) - domain->ops->deactivate(domain, irq_data); - if (irq_data->parent_data) - __irq_domain_deactivate_irq(irq_data->parent_data); + if (irqd && irqd->domain) { + struct irq_domain *domain = irqd->domain; + + if (irqd->parent_data) + ret = __irq_domain_activate_irq(irqd->parent_data); + if (!ret && domain->ops->activate) { + ret = domain->ops->activate(domain, irqd, false); + /* Rollback in case of error */ + if (ret && irqd->parent_data) + __irq_domain_deactivate_irq(irqd->parent_data); + } } + return ret; } /** @@ -1714,12 +1721,15 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. 
*/ -void irq_domain_activate_irq(struct irq_data *irq_data) +int irq_domain_activate_irq(struct irq_data *irq_data) { - if (!irqd_is_activated(irq_data)) { - __irq_domain_activate_irq(irq_data); + int ret = 0; + + if (!irqd_is_activated(irq_data)) + ret = __irq_domain_activate_irq(irq_data); + if (!ret) irqd_set_activated(irq_data); - } + return ret; } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6155fff9f647..5ece369950ec 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -401,11 +401,26 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct irq_data *irq_data; irq_data = irq_domain_get_irq_data(domain, desc->irq); - irq_domain_activate_irq(irq_data); + ret = irq_domain_activate_irq(irq_data); + if (ret) + goto cleanup; } } - return 0; + +cleanup: + for_each_msi_entry(desc, dev) { + struct irq_data *irqd; + + if (desc->irq == virq) + break; + + irqd = irq_domain_get_irq_data(domain, desc->irq); + if (irqd_is_activated(irqd)) + irq_domain_deactivate_irq(irqd); + } + msi_domain_free_irqs(domain, dev); + return ret; } /** -- cgit v1.2.3 From 42e1cc2dc5b698181ab1ffb7972bd880230c506e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:12 +0200 Subject: genirq/irqdomain: Propagate early activation Propagate the early activation mode to the irqdomain activate() callbacks. This is required for the upcoming reservation, late vector assignment scheme, so that the early activation call can act accordingly. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.028353660@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 4 ++-- include/linux/irqdomain.h | 2 +- kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 2 +- kernel/irq/irqdomain.c | 11 ++++++----- kernel/irq/msi.c | 2 +- 6 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d50e46757f6d..6f1007fd9783 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2096,7 +2096,7 @@ static inline void __init check_timer(void) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); - irq_domain_activate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2118,7 +2118,7 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); irq_domain_deactivate_irq(irq_data); - irq_domain_activate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... 
works.\n"); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 8fb877121984..7d0c6c144708 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -441,7 +441,7 @@ extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct cpumask *affinity); extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); -extern int irq_domain_activate_irq(struct irq_data *irq_data); +extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early); extern void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cd5b3eb38082..82333835ac66 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -223,7 +223,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) * Managed interrupts have reserved resources, so this should not * happen. */ - if (WARN_ON(irq_domain_activate_irq(d))) + if (WARN_ON(irq_domain_activate_irq(d, false))) return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } @@ -290,7 +290,7 @@ int irq_activate(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) - return irq_domain_activate_irq(d); + return irq_domain_activate_irq(d, false); return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e84d0e3899f6..a0327136e469 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -439,7 +439,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data) +static inline int irq_domain_activate_irq(struct irq_data *data, bool early) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 47e8ddd9e8cf..b50f737574ae 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1694,7 +1694,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) { int ret = 0; @@ -1702,9 +1702,10 @@ static int __irq_domain_activate_irq(struct irq_data *irqd) struct irq_domain *domain = irqd->domain; if (irqd->parent_data) - ret = __irq_domain_activate_irq(irqd->parent_data); + ret = __irq_domain_activate_irq(irqd->parent_data, + early); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, false); + ret = domain->ops->activate(domain, irqd, early); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1721,12 +1722,12 @@ static int __irq_domain_activate_irq(struct irq_data *irqd) * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. 
*/ -int irq_domain_activate_irq(struct irq_data *irq_data) +int irq_domain_activate_irq(struct irq_data *irq_data, bool early) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data); + ret = __irq_domain_activate_irq(irq_data, early); if (!ret) irqd_set_activated(irq_data); return ret; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5ece369950ec..d7553d015175 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -401,7 +401,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct irq_data *irq_data; irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data); + ret = irq_domain_activate_irq(irq_data, true); if (ret) goto cleanup; } -- cgit v1.2.3 From 22d0b12f3560d3b3264ee79faa1c05a5060fb916 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:13 +0200 Subject: genirq/irqdomain: Add force reactivation flag to irq domains Allow irqdomains to tell the core code, that after early activation the interrupt needs to be reactivated at request_irq() time. This allows reservation of vectors at early activation time and actual vector assignment at request_irq() time. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.106242536@linutronix.de --- include/linux/msi.h | 5 +++++ kernel/irq/msi.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 80e3b562bef6..eff16ef81f43 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -283,6 +283,11 @@ enum { MSI_FLAG_PCI_MSIX = (1 << 3), /* Needs early activate, required for PCI */ MSI_FLAG_ACTIVATE_EARLY = (1 << 4), + /* + * Must reactivate when irq is started even when + * MSI_FLAG_ACTIVATE_EARLY has been set. + */ + MSI_FLAG_MUST_REACTIVATE = (1 << 5), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index d7553d015175..edb987b2c58d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -404,6 +404,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ret = irq_domain_activate_irq(irq_data, true); if (ret) goto cleanup; + if (info->flags & MSI_FLAG_MUST_REACTIVATE) + irqd_clr_activated(irq_data); } } return 0; -- cgit v1.2.3 From 2f75d9e1c90511bff6d1ce4de94503cc28fec032 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:14 +0200 Subject: genirq: Implement bitmap matrix allocator Implement the infrastructure for a simple bitmap based allocator, which will replace the x86 vector allocator. It's in the core code as other architectures might be able to reuse/extend it. For now it only implements allocations for single CPUs, but it's simple to add multi CPU allocation support if required. The concept is rather simple: Global information: system_vector bitmap global accounting PerCPU information: allocation bitmap managed allocation bitmap local accounting The system vector bitmap is used to exclude vectors system wide from the allocation space. The allocation bitmap is used to keep track of per cpu used vectors. 
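To make the two allocation rules spelled out in the following paragraphs
concrete, here is a freestanding sketch (plain C with a single toy map of
each kind and invented helper names; the real allocator below keeps the
alloc/managed maps per CPU and does full accounting):

#include <stdint.h>
#include <stdio.h>

#define NVEC 16                 /* toy vector space, one word is enough */

static uint32_t system_map;     /* vectors excluded system wide */
static uint32_t alloc_map;      /* vectors in use on this "CPU" */
static uint32_t managed_map;    /* vectors reserved for managed irqs */

/* Regular allocation: OR all three maps, take the first zero bit. */
static int toy_alloc_regular(void)
{
    uint32_t tmp = system_map | alloc_map | managed_map;
    int bit;

    for (bit = 0; bit < NVEC; bit++) {
        if (!(tmp & (1u << bit))) {
            alloc_map |= 1u << bit;
            return bit;
        }
    }
    return -1;                  /* -ENOSPC in the real allocator */
}

/* Managed allocation: first bit reserved in managed_map but not yet
 * marked in alloc_map. */
static int toy_alloc_managed(void)
{
    uint32_t tmp = managed_map & ~alloc_map;
    int bit;

    for (bit = 0; bit < NVEC; bit++) {
        if (tmp & (1u << bit)) {
            alloc_map |= 1u << bit;
            return bit;
        }
    }
    return -1;
}

int main(void)
{
    system_map = 0x3;           /* vectors 0 and 1 are system vectors */
    managed_map = 1u << 4;      /* vector 4 is reserved for a managed irq */

    printf("regular: %d\n", toy_alloc_regular());  /* 2: first bit outside system+managed */
    printf("managed: %d\n", toy_alloc_managed());  /* 4: the reserved bit */
    return 0;
}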
The managed allocation bitmap is used to reserve vectors for managed
interrupts.

When a regular (non-managed) interrupt allocation happens then the
following rule applies:

    tmpmap = system_map | alloc_map | managed_map
    find_zero_bit(tmpmap)

ORing the bitmaps together gives the real available space. The same rule
applies for reserving a managed interrupt vector. But contrary to the
regular interrupts the reservation only marks the bit in the managed map
and therefore excludes it from the regular allocations.

The managed map is only cleaned out when a managed interrupt is
completely released and it stays alive across CPU offline/online
operations.

For managed interrupt allocations the rule is:

    tmpmap = managed_map & ~alloc_map
    find_first_bit(tmpmap)

This returns the first bit which is in the managed map, but not yet
allocated in the allocation map. The allocation marks it in the
allocation map and hands it back to the caller for use.

The rest of the code consists of helper functions which handle the
various requirements and the accounting necessary to replace the x86
vector allocation code.

The result is a single patch as the evolution of this infrastructure
cannot be represented in bits and pieces.

Signed-off-by: Thomas Gleixner
Tested-by: Juergen Gross
Tested-by: Yu Chen
Acked-by: Juergen Gross
Cc: Boris Ostrovsky
Cc: Tony Luck
Cc: Marc Zyngier
Cc: Alok Kataria
Cc: Joerg Roedel
Cc: "Rafael J. Wysocki"
Cc: Steven Rostedt
Cc: Christoph Hellwig
Cc: Peter Zijlstra
Cc: Borislav Petkov
Cc: Paolo Bonzini
Cc: Rui Zhang
Cc: "K. Y. Srinivasan"
Cc: Arjan van de Ven
Cc: Dan Williams
Cc: Chris Metcalf
Cc: Len Brown
Link: https://lkml.kernel.org/r/20170913213153.185437174@linutronix.de
---
 include/linux/irq.h |  22 +++
 kernel/irq/Kconfig  |   3 +
 kernel/irq/Makefile |   1 +
 kernel/irq/matrix.c | 428 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 454 insertions(+)
 create mode 100644 kernel/irq/matrix.c

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d4728bf6a537..fda8da7c45e7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1113,6 +1113,28 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
     return readl(gc->reg_base + reg_offset);
 }
 
+struct irq_matrix;
+struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
+                                    unsigned int alloc_start,
+                                    unsigned int alloc_end);
+void irq_matrix_online(struct irq_matrix *m);
+void irq_matrix_offline(struct irq_matrix *m);
+void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace);
+int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk);
+void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk);
+int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu);
+void irq_matrix_reserve(struct irq_matrix *m);
+void irq_matrix_remove_reserved(struct irq_matrix *m);
+int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+                     bool reserved, unsigned int *mapped_cpu);
+void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+                     unsigned int bit, bool managed);
+void irq_matrix_assign(struct irq_matrix *m, unsigned int bit);
+unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown);
+unsigned int irq_matrix_allocated(struct irq_matrix *m);
+unsigned int irq_matrix_reserved(struct irq_matrix *m);
+void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind);
+
 /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */
 #define INVALID_HWIRQ (~0UL)
irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a117adf7084b..ac1a3e29d3b9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -97,6 +97,9 @@ config HANDLE_DOMAIN_IRQ config IRQ_TIMINGS bool +config GENERIC_IRQ_MATRIX_ALLOCATOR + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1970cafe8f2a..329c9193b4bf 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o +obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c new file mode 100644 index 000000000000..7b2b4fbde1e2 --- /dev/null +++ b/kernel/irq/matrix.c @@ -0,0 +1,428 @@ +/* + * Copyright (C) 2017 Thomas Gleixner + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include +#include +#include +#include +#include +#include + +#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long)) + +struct cpumap { + unsigned int available; + unsigned int allocated; + unsigned int managed; + bool online; + unsigned long alloc_map[IRQ_MATRIX_SIZE]; + unsigned long managed_map[IRQ_MATRIX_SIZE]; +}; + +struct irq_matrix { + unsigned int matrix_bits; + unsigned int alloc_start; + unsigned int alloc_end; + unsigned int alloc_size; + unsigned int global_available; + unsigned int global_reserved; + unsigned int systembits_inalloc; + unsigned int total_allocated; + unsigned int online_maps; + struct cpumap __percpu *maps; + unsigned long scratch_map[IRQ_MATRIX_SIZE]; + unsigned long system_map[IRQ_MATRIX_SIZE]; +}; + +/** + * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it + * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS + * @alloc_start: From which bit the allocation search starts + * @alloc_end: At which bit the allocation search ends, i.e first + * invalid bit + */ +__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, + unsigned int alloc_start, + unsigned int alloc_end) +{ + struct irq_matrix *m; + + if (matrix_bits > IRQ_MATRIX_BITS) + return NULL; + + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + return NULL; + + m->matrix_bits = matrix_bits; + m->alloc_start = alloc_start; + m->alloc_end = alloc_end; + m->alloc_size = alloc_end - alloc_start; + m->maps = alloc_percpu(*m->maps); + if (!m->maps) { + kfree(m); + return NULL; + } + return m; +} + +/** + * irq_matrix_online - Bring the local CPU matrix online + * @m: Matrix pointer + */ +void irq_matrix_online(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + BUG_ON(cm->online); + + bitmap_zero(cm->alloc_map, m->matrix_bits); + cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); + cm->allocated = 0; + m->global_available += cm->available; + cm->online = true; + m->online_maps++; +} + +/** + * irq_matrix_offline - Bring the local CPU matrix offline + * @m: Matrix pointer + */ +void irq_matrix_offline(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + /* Update the global available size */ + m->global_available -= cm->available; + cm->online = false; + m->online_maps--; +} + +static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm, + unsigned int num, bool managed) +{ + unsigned int area, start = 
m->alloc_start;
+    unsigned int end = m->alloc_end;
+
+    bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end);
+    bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end);
+    area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0);
+    if (area >= end)
+        return area;
+    if (managed)
+        bitmap_set(cm->managed_map, area, num);
+    else
+        bitmap_set(cm->alloc_map, area, num);
+    return area;
+}
+
+/**
+ * irq_matrix_assign_system - Assign system wide entry in the matrix
+ * @m: Matrix pointer
+ * @bit: Which bit to reserve
+ * @replace: Replace an already allocated vector with a system
+ * vector at the same bit position.
+ *
+ * The BUG_ON()s below are on purpose. If this goes wrong in the
+ * early boot process, then the chance to survive is about zero.
+ * If this happens when the system is live, it's not much better.
+ */
+void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit,
+                              bool replace)
+{
+    struct cpumap *cm = this_cpu_ptr(m->maps);
+
+    BUG_ON(bit > m->matrix_bits);
+    BUG_ON(m->online_maps > 1 || (m->online_maps && !replace));
+
+    set_bit(bit, m->system_map);
+    if (replace) {
+        BUG_ON(!test_and_clear_bit(bit, cm->alloc_map));
+        cm->allocated--;
+        m->total_allocated--;
+    }
+    if (bit >= m->alloc_start && bit < m->alloc_end)
+        m->systembits_inalloc++;
+}
+
+/**
+ * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be reserved.
+ *
+ * Can be called for offline CPUs. Note, this will only reserve one bit
+ * on all CPUs in @msk, but it's not guaranteed that the bits are at the
+ * same offset on all CPUs.
+ */
+int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+    unsigned int cpu, failed_cpu;
+
+    for_each_cpu(cpu, msk) {
+        struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+        unsigned int bit;
+
+        bit = matrix_alloc_area(m, cm, 1, true);
+        if (bit >= m->alloc_end)
+            goto cleanup;
+        cm->managed++;
+        if (cm->online) {
+            cm->available--;
+            m->global_available--;
+        }
+    }
+    return 0;
+cleanup:
+    failed_cpu = cpu;
+    for_each_cpu(cpu, msk) {
+        if (cpu == failed_cpu)
+            break;
+        irq_matrix_remove_managed(m, cpumask_of(cpu));
+    }
+    return -ENOSPC;
+}
+
+/**
+ * irq_matrix_remove_managed - Remove managed interrupts in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be removed
+ *
+ * Can be called for offline CPUs.
+ *
+ * This removes not yet allocated managed interrupts from the map. It does
+ * not matter which one because the managed interrupts free their
+ * allocation when they shut down. If not, the accounting is screwed,
+ * but all that can be done at this point is warn about it.
+ */
+void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+    unsigned int cpu;
+
+    for_each_cpu(cpu, msk) {
+        struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+        unsigned int bit, end = m->alloc_end;
+
+        if (WARN_ON_ONCE(!cm->managed))
+            continue;
+
+        /* Get the managed bits which are not allocated */
+        bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+
+        bit = find_first_bit(m->scratch_map, end);
+        if (WARN_ON_ONCE(bit >= end))
+            continue;
+
+        clear_bit(bit, cm->managed_map);
+
+        cm->managed--;
+        if (cm->online) {
+            cm->available++;
+            m->global_available++;
+        }
+    }
+}
+
+/**
+ * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @cpu: On which CPU the interrupt should be allocated
+ */
+int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu)
+{
+    struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+    unsigned int bit, end = m->alloc_end;
+
+    /* Get the managed bits which are not allocated */
+    bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+    bit = find_first_bit(m->scratch_map, end);
+    if (bit >= end)
+        return -ENOSPC;
+    set_bit(bit, cm->alloc_map);
+    cm->allocated++;
+    m->total_allocated++;
+    return bit;
+}
+
+/**
+ * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map
+ * @m: Matrix pointer
+ * @bit: Which bit to mark
+ *
+ * This should only be used to mark preallocated vectors
+ */
+void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
+{
+    struct cpumap *cm = this_cpu_ptr(m->maps);
+
+    if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+        return;
+    if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map)))
+        return;
+    cm->allocated++;
+    m->total_allocated++;
+    cm->available--;
+    m->global_available--;
+}
+
+/**
+ * irq_matrix_reserve - Reserve interrupts
+ * @m: Matrix pointer
+ *
+ * This is merely a bookkeeping call. It increments the number of globally
+ * reserved interrupt bits w/o actually allocating them. This allows
+ * setting up interrupt descriptors w/o assigning low level resources to
+ * them. The actual allocation happens when the interrupt gets activated.
+ */
+void irq_matrix_reserve(struct irq_matrix *m)
+{
+    if (m->global_reserved <= m->global_available &&
+        m->global_reserved + 1 > m->global_available)
+        pr_warn("Interrupt reservation exceeds available resources\n");
+
+    m->global_reserved++;
+}
+
+/**
+ * irq_matrix_remove_reserved - Remove interrupt reservation
+ * @m: Matrix pointer
+ *
+ * This is merely a bookkeeping call. It decrements the number of globally
+ * reserved interrupt bits. This is used to undo irq_matrix_reserve() when
+ * the interrupt was never in use and a real vector allocated, which undid
+ * the reservation.
+ */
+void irq_matrix_remove_reserved(struct irq_matrix *m)
+{
+ m->global_reserved--;
+}
+
+/**
+ * irq_matrix_alloc - Allocate a regular interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: Which CPUs to search in
+ * @reserved: Allocate previously reserved interrupts
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
+ */
+int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+ bool reserved, unsigned int *mapped_cpu)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit;
+
+ if (!cm->online)
+ continue;
+
+ bit = matrix_alloc_area(m, cm, 1, false);
+ if (bit < m->alloc_end) {
+ cm->allocated++;
+ cm->available--;
+ m->total_allocated++;
+ m->global_available--;
+ if (reserved)
+ m->global_reserved--;
+ *mapped_cpu = cpu;
+ return bit;
+ }
+ }
+ return -ENOSPC;
+}
+
+/**
+ * irq_matrix_free - Free allocated interrupt in the matrix
+ * @m: Matrix pointer
+ * @cpu: Which CPU map needs to be updated
+ * @bit: The bit to remove
+ * @managed: If true, the interrupt is managed and not accounted
+ * as available.
+ */
+void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+ unsigned int bit, bool managed)
+{
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+
+ if (cm->online) {
+ clear_bit(bit, cm->alloc_map);
+ cm->allocated--;
+ m->total_allocated--;
+ if (!managed) {
+ cm->available++;
+ m->global_available++;
+ }
+ }
+}
+
+/**
+ * irq_matrix_available - Get the number of globally available irqs
+ * @m: Pointer to the matrix to query
+ * @cpudown: If true, the local CPU is about to go down, adjust
+ * the number of available irqs accordingly
+ */
+unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return m->global_available - (cpudown ? cm->available : 0);
+}
+
+/**
+ * irq_matrix_reserved - Get the number of globally reserved irqs
+ * @m: Pointer to the matrix to query
+ */
+unsigned int irq_matrix_reserved(struct irq_matrix *m)
+{
+ return m->global_reserved;
+}
+
+/**
+ * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * @m: Pointer to the matrix to search
+ *
+ * This returns the number of allocated irqs
+ */
+unsigned int irq_matrix_allocated(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return cm->allocated;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+/**
+ * irq_matrix_debug_show - Show detailed allocation information
+ * @sf: Pointer to the seq_file to print to
+ * @m: Pointer to the matrix allocator
+ * @ind: Indentation for the print format
+ *
+ * Note, this is a lockless snapshot.
+ */
+void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
+{
+ unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits);
+ int cpu;
+
+ seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps);
+ seq_printf(sf, "Global available: %6u\n", m->global_available);
+ seq_printf(sf, "Global reserved: %6u\n", m->global_reserved);
+ seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
+ seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
+ m->system_map);
+ seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ",
+ cpu, cm->available, cm->managed, cm->allocated,
+ m->matrix_bits, cm->alloc_map);
+ }
+ cpus_read_unlock();
+}
+#endif
-- cgit v1.2.3

From 1d66af81905a4e2f3d03913f5449a8e9b5d3facd Mon Sep 17 00:00:00 2001
From: Danilo Krummrich
Date: Fri, 9 Jun 2017 00:29:00 +0200
Subject: clk: bcm2835: remove remains from stub clk driver

This commit removes the fixed clocks that were introduced with the stub
clock driver added in commit 75fabc3f6448 ("ARM: bcm2835: add stub clock
driver"). Originally they were used to drive the AMBA bus and the PL011
uart driver. Now these clocks are derived by the CPRMAN clock driver and
configured in DT.

Additionally, get rid of the init_machine function in the bcm2835 board
file, as there is nothing left for it to do.

Signed-off-by: Danilo Krummrich
Acked-by: Eric Anholt
Acked-by: Stephen Boyd
Signed-off-by: Stefan Wahren
---
 arch/arm/mach-bcm/board_bcm2835.c | 7 -------
 drivers/clk/bcm/clk-bcm2835-aux.c | 1 -
 drivers/clk/bcm/clk-bcm2835.c | 30 ------------------------------
 include/linux/clk/bcm2835.h | 24 ------------------------
 4 files changed, 62 deletions(-)
 delete mode 100644 include/linux/clk/bcm2835.h
(limited to 'include/linux')

diff --git a/arch/arm/mach-bcm/board_bcm2835.c b/arch/arm/mach-bcm/board_bcm2835.c
index 0c1edfc98696..24af33f91705 100644
--- a/arch/arm/mach-bcm/board_bcm2835.c
+++ b/arch/arm/mach-bcm/board_bcm2835.c
@@ -15,16 +15,10 @@
 #include
 #include
 #include
-#include
 #include
 #include

-static void __init bcm2835_init(void)
-{
- bcm2835_init_clocks();
-}
-
 static const char * const bcm2835_compat[] = {
 #ifdef CONFIG_ARCH_MULTI_V6
 "brcm,bcm2835",
@@ -36,6 +30,5 @@ static const char * const bcm2835_compat[] = {
 };

 DT_MACHINE_START(BCM2835, "BCM2835")
- .init_machine = bcm2835_init,
 .dt_compat = bcm2835_compat
 MACHINE_END

diff --git a/drivers/clk/bcm/clk-bcm2835-aux.c b/drivers/clk/bcm/clk-bcm2835-aux.c
index bd750cf2238d..77e276d61702 100644
--- a/drivers/clk/bcm/clk-bcm2835-aux.c
+++ b/drivers/clk/bcm/clk-bcm2835-aux.c
@@ -14,7 +14,6 @@
 #include
 #include
-#include
 #include
 #include
 #include

diff --git a/drivers/clk/bcm/clk-bcm2835.c b/drivers/clk/bcm/clk-bcm2835.c
index 58ce6af8452d..44301a3d9963 100644
--- a/drivers/clk/bcm/clk-bcm2835.c
+++ b/drivers/clk/bcm/clk-bcm2835.c
@@ -37,7 +37,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -416,35 +415,6 @@ static int bcm2835_debugfs_regset(struct bcm2835_cprman *cprman, u32 base,
 return regdump ? 0 : -ENOMEM;
 }

-/*
- * These are fixed clocks. They're probably not all root clocks and it may
- * be possible to turn them on and off but until this is mapped out better
- * it's the only way they can be used.
- */
-void __init bcm2835_init_clocks(void)
-{
- struct clk_hw *hw;
- int ret;
-
- hw = clk_hw_register_fixed_rate(NULL, "apb_pclk", NULL, 0, 126000000);
- if (IS_ERR(hw))
- pr_err("apb_pclk not registered\n");
-
- hw = clk_hw_register_fixed_rate(NULL, "uart0_pclk", NULL, 0, 3000000);
- if (IS_ERR(hw))
- pr_err("uart0_pclk not registered\n");
- ret = clk_hw_register_clkdev(hw, NULL, "20201000.uart");
- if (ret)
- pr_err("uart0_pclk alias not registered\n");
-
- hw = clk_hw_register_fixed_rate(NULL, "uart1_pclk", NULL, 0, 125000000);
- if (IS_ERR(hw))
- pr_err("uart1_pclk not registered\n");
- ret = clk_hw_register_clkdev(hw, NULL, "20215000.uart");
- if (ret)
- pr_err("uart1_pclk alias not registered\n");
-}
-
 struct bcm2835_pll_data {
 const char *name;
 u32 cm_ctrl_reg;

diff --git a/include/linux/clk/bcm2835.h b/include/linux/clk/bcm2835.h
deleted file mode 100644
index aa937f6c17da..000000000000
--- a/include/linux/clk/bcm2835.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (C) 2010 Broadcom
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef __LINUX_CLK_BCM2835_H_
-#define __LINUX_CLK_BCM2835_H_
-
-void __init bcm2835_init_clocks(void);
-
-#endif
-- cgit v1.2.3

From 88bbe85dcd37aa2662c1a83962c15009fc12503e Mon Sep 17 00:00:00 2001
From: Stefan Wahren
Date: Sun, 6 Aug 2017 17:52:02 +0200
Subject: irqchip: bcm2836: Move SMP startup code to arch/arm (v2)

In order to easily provide SMP for BCM2837 on 32-bit and 64-bit, the SMP
startup code was placed in irq-bcm2836. That's not the right approach. So
move this code to where it belongs.
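
For reference, the pattern the patch relies on, exposing SMP ops both
through the machine descriptor and through a DT "enable-method" string
matched by CPU_METHOD_OF_DECLARE(), looks roughly like the sketch below;
my_boot_secondary, my_smp_ops and "vendor,my-smp" are hypothetical names,
not part of the patch:

#include <linux/init.h>
#include <linux/smp.h>
#include <asm/smp.h>

/* Hypothetical platform hook: kick a secondary core out of its pen. */
static int my_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
	/* Platform specific wakeup, e.g. a mailbox write as done below. */
	return 0;
}

static const struct smp_operations my_smp_ops __initconst = {
	.smp_boot_secondary = my_boot_secondary,
};
CPU_METHOD_OF_DECLARE(my_smp, "vendor,my-smp", &my_smp_ops);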
Signed-off-by: Stefan Wahren Fixes: 41f4988cc287 ("irqchip/bcm2836: Add SMP support for the 2836") Tested-by: Eric Anholt Acked-by: Marc Zyngier --- arch/arm/mach-bcm/Makefile | 5 +++ arch/arm/mach-bcm/board_bcm2835.c | 6 ++- arch/arm/mach-bcm/platsmp.c | 35 ++++++++++++++++ arch/arm/mach-bcm/platsmp.h | 10 +++++ drivers/irqchip/irq-bcm2836.c | 79 +------------------------------------ include/linux/irqchip/irq-bcm2836.h | 70 ++++++++++++++++++++++++++++++++ 6 files changed, 127 insertions(+), 78 deletions(-) create mode 100644 arch/arm/mach-bcm/platsmp.h create mode 100644 include/linux/irqchip/irq-bcm2836.h (limited to 'include/linux') diff --git a/arch/arm/mach-bcm/Makefile b/arch/arm/mach-bcm/Makefile index 980f5850097c..62a59008c5a8 100644 --- a/arch/arm/mach-bcm/Makefile +++ b/arch/arm/mach-bcm/Makefile @@ -43,6 +43,11 @@ endif # BCM2835 obj-$(CONFIG_ARCH_BCM2835) += board_bcm2835.o +ifeq ($(CONFIG_ARCH_BCM2835),y) +ifeq ($(CONFIG_ARM),y) +obj-$(CONFIG_SMP) += platsmp.o +endif +endif # BCM5301X obj-$(CONFIG_ARCH_BCM_5301X) += bcm_5301x.o diff --git a/arch/arm/mach-bcm/board_bcm2835.c b/arch/arm/mach-bcm/board_bcm2835.c index 24af33f91705..8cff865ace04 100644 --- a/arch/arm/mach-bcm/board_bcm2835.c +++ b/arch/arm/mach-bcm/board_bcm2835.c @@ -19,16 +19,20 @@ #include #include +#include "platsmp.h" + static const char * const bcm2835_compat[] = { #ifdef CONFIG_ARCH_MULTI_V6 "brcm,bcm2835", #endif #ifdef CONFIG_ARCH_MULTI_V7 "brcm,bcm2836", + "brcm,bcm2837", #endif NULL }; DT_MACHINE_START(BCM2835, "BCM2835") - .dt_compat = bcm2835_compat + .dt_compat = bcm2835_compat, + .smp = smp_ops(bcm2836_smp_ops), MACHINE_END diff --git a/arch/arm/mach-bcm/platsmp.c b/arch/arm/mach-bcm/platsmp.c index 9e3f275934eb..34506a608fea 100644 --- a/arch/arm/mach-bcm/platsmp.c +++ b/arch/arm/mach-bcm/platsmp.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -287,6 +288,35 @@ out: return ret; } +static int bcm2836_boot_secondary(unsigned int cpu, struct task_struct *idle) +{ + void __iomem *intc_base; + struct device_node *dn; + char *name; + + name = "brcm,bcm2836-l1-intc"; + dn = of_find_compatible_node(NULL, NULL, name); + if (!dn) { + pr_err("unable to find intc node\n"); + return -ENODEV; + } + + intc_base = of_iomap(dn, 0); + of_node_put(dn); + + if (!intc_base) { + pr_err("unable to remap intc base register\n"); + return -ENOMEM; + } + + writel(virt_to_phys(secondary_startup), + intc_base + LOCAL_MAILBOX3_SET0 + 16 * cpu); + + iounmap(intc_base); + + return 0; +} + static const struct smp_operations kona_smp_ops __initconst = { .smp_prepare_cpus = bcm_smp_prepare_cpus, .smp_boot_secondary = kona_boot_secondary, @@ -305,3 +335,8 @@ static const struct smp_operations nsp_smp_ops __initconst = { .smp_boot_secondary = nsp_boot_secondary, }; CPU_METHOD_OF_DECLARE(bcm_smp_nsp, "brcm,bcm-nsp-smp", &nsp_smp_ops); + +const struct smp_operations bcm2836_smp_ops __initconst = { + .smp_boot_secondary = bcm2836_boot_secondary, +}; +CPU_METHOD_OF_DECLARE(bcm_smp_bcm2836, "brcm,bcm2836-smp", &bcm2836_smp_ops); diff --git a/arch/arm/mach-bcm/platsmp.h b/arch/arm/mach-bcm/platsmp.h new file mode 100644 index 000000000000..b8b8b3fa350d --- /dev/null +++ b/arch/arm/mach-bcm/platsmp.h @@ -0,0 +1,10 @@ +/* + * Copyright (C) 2017 Stefan Wahren + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. 
+ * + */ + +extern const struct smp_operations bcm2836_smp_ops; diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c index dc8c1e3eafe7..667b9e14b032 100644 --- a/drivers/irqchip/irq-bcm2836.c +++ b/drivers/irqchip/irq-bcm2836.c @@ -19,62 +19,9 @@ #include #include #include -#include - -#define LOCAL_CONTROL 0x000 -#define LOCAL_PRESCALER 0x008 +#include -/* - * The low 2 bits identify the CPU that the GPU IRQ goes to, and the - * next 2 bits identify the CPU that the GPU FIQ goes to. - */ -#define LOCAL_GPU_ROUTING 0x00c -/* When setting bits 0-3, enables PMU interrupts on that CPU. */ -#define LOCAL_PM_ROUTING_SET 0x010 -/* When setting bits 0-3, disables PMU interrupts on that CPU. */ -#define LOCAL_PM_ROUTING_CLR 0x014 -/* - * The low 4 bits of this are the CPU's timer IRQ enables, and the - * next 4 bits are the CPU's timer FIQ enables (which override the IRQ - * bits). - */ -#define LOCAL_TIMER_INT_CONTROL0 0x040 -/* - * The low 4 bits of this are the CPU's per-mailbox IRQ enables, and - * the next 4 bits are the CPU's per-mailbox FIQ enables (which - * override the IRQ bits). - */ -#define LOCAL_MAILBOX_INT_CONTROL0 0x050 -/* - * The CPU's interrupt status register. Bits are defined by the the - * LOCAL_IRQ_* bits below. - */ -#define LOCAL_IRQ_PENDING0 0x060 -/* Same status bits as above, but for FIQ. */ -#define LOCAL_FIQ_PENDING0 0x070 -/* - * Mailbox write-to-set bits. There are 16 mailboxes, 4 per CPU, and - * these bits are organized by mailbox number and then CPU number. We - * use mailbox 0 for IPIs. The mailbox's interrupt is raised while - * any bit is set. - */ -#define LOCAL_MAILBOX0_SET0 0x080 -#define LOCAL_MAILBOX3_SET0 0x08c -/* Mailbox write-to-clear bits. */ -#define LOCAL_MAILBOX0_CLR0 0x0c0 -#define LOCAL_MAILBOX3_CLR0 0x0cc - -#define LOCAL_IRQ_CNTPSIRQ 0 -#define LOCAL_IRQ_CNTPNSIRQ 1 -#define LOCAL_IRQ_CNTHPIRQ 2 -#define LOCAL_IRQ_CNTVIRQ 3 -#define LOCAL_IRQ_MAILBOX0 4 -#define LOCAL_IRQ_MAILBOX1 5 -#define LOCAL_IRQ_MAILBOX2 6 -#define LOCAL_IRQ_MAILBOX3 7 -#define LOCAL_IRQ_GPU_FAST 8 -#define LOCAL_IRQ_PMU_FAST 9 -#define LAST_IRQ LOCAL_IRQ_PMU_FAST +#include struct bcm2836_arm_irqchip_intc { struct irq_domain *domain; @@ -215,24 +162,6 @@ static int bcm2836_cpu_dying(unsigned int cpu) cpu); return 0; } - -#ifdef CONFIG_ARM -static int __init bcm2836_smp_boot_secondary(unsigned int cpu, - struct task_struct *idle) -{ - unsigned long secondary_startup_phys = - (unsigned long)virt_to_phys((void *)secondary_startup); - - writel(secondary_startup_phys, - intc.base + LOCAL_MAILBOX3_SET0 + 16 * cpu); - - return 0; -} - -static const struct smp_operations bcm2836_smp_ops __initconst = { - .smp_boot_secondary = bcm2836_smp_boot_secondary, -}; -#endif #endif static const struct irq_domain_ops bcm2836_arm_irqchip_intc_ops = { @@ -249,10 +178,6 @@ bcm2836_arm_irqchip_smp_init(void) bcm2836_cpu_dying); set_smp_cross_call(bcm2836_arm_irqchip_send_ipi); - -#ifdef CONFIG_ARM - smp_set_ops(&bcm2836_smp_ops); -#endif #endif } diff --git a/include/linux/irqchip/irq-bcm2836.h b/include/linux/irqchip/irq-bcm2836.h new file mode 100644 index 000000000000..218a6e1b18d8 --- /dev/null +++ b/include/linux/irqchip/irq-bcm2836.h @@ -0,0 +1,70 @@ +/* + * Root interrupt controller for the BCM2836 (Raspberry Pi 2). 
+ *
+ * Copyright 2015 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define LOCAL_CONTROL 0x000
+#define LOCAL_PRESCALER 0x008
+
+/*
+ * The low 2 bits identify the CPU that the GPU IRQ goes to, and the
+ * next 2 bits identify the CPU that the GPU FIQ goes to.
+ */
+#define LOCAL_GPU_ROUTING 0x00c
+/* When setting bits 0-3, enables PMU interrupts on that CPU. */
+#define LOCAL_PM_ROUTING_SET 0x010
+/* When setting bits 0-3, disables PMU interrupts on that CPU. */
+#define LOCAL_PM_ROUTING_CLR 0x014
+/*
+ * The low 4 bits of this are the CPU's timer IRQ enables, and the
+ * next 4 bits are the CPU's timer FIQ enables (which override the IRQ
+ * bits).
+ */
+#define LOCAL_TIMER_INT_CONTROL0 0x040
+/*
+ * The low 4 bits of this are the CPU's per-mailbox IRQ enables, and
+ * the next 4 bits are the CPU's per-mailbox FIQ enables (which
+ * override the IRQ bits).
+ */
+#define LOCAL_MAILBOX_INT_CONTROL0 0x050
+/*
+ * The CPU's interrupt status register. Bits are defined by the
+ * LOCAL_IRQ_* bits below.
+ */
+#define LOCAL_IRQ_PENDING0 0x060
+/* Same status bits as above, but for FIQ. */
+#define LOCAL_FIQ_PENDING0 0x070
+/*
+ * Mailbox write-to-set bits. There are 16 mailboxes, 4 per CPU, and
+ * these bits are organized by mailbox number and then CPU number. We
+ * use mailbox 0 for IPIs. The mailbox's interrupt is raised while
+ * any bit is set.
+ */
+#define LOCAL_MAILBOX0_SET0 0x080
+#define LOCAL_MAILBOX3_SET0 0x08c
+/* Mailbox write-to-clear bits. */
+#define LOCAL_MAILBOX0_CLR0 0x0c0
+#define LOCAL_MAILBOX3_CLR0 0x0cc
+
+#define LOCAL_IRQ_CNTPSIRQ 0
+#define LOCAL_IRQ_CNTPNSIRQ 1
+#define LOCAL_IRQ_CNTHPIRQ 2
+#define LOCAL_IRQ_CNTVIRQ 3
+#define LOCAL_IRQ_MAILBOX0 4
+#define LOCAL_IRQ_MAILBOX1 5
+#define LOCAL_IRQ_MAILBOX2 6
+#define LOCAL_IRQ_MAILBOX3 7
+#define LOCAL_IRQ_GPU_FAST 8
+#define LOCAL_IRQ_PMU_FAST 9
+#define LAST_IRQ LOCAL_IRQ_PMU_FAST
-- cgit v1.2.3

From 4c3711d7fb4763c63b2654f2d07cbe21ca5aadd4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 31 Aug 2017 17:12:48 +0200
Subject: timekeeping: Provide NMI safe access to clock realtime

The configurable printk timestamping wants access to clock realtime. Right
now there is no ktime_get_real_fast_ns() accessor because reading the
monotonic base and the realtime offset cannot be done atomically. Unlike
the boot time offset, this offset can change during runtime and cause
half-updated readouts.

struct tk_read_base was fully packed when the fast timekeeper access was
implemented. commit ceea5e3771ed ("time: Fix clock->read(clock) race
around clocksource changes") removed the 'read' function pointer from the
structure, but of course left the comment stale. So now the structure can
fit a new 64bit member w/o violating the cache line constraints.

Add real_base to tk_read_base and update it in the fast timekeeper update
sequence.

Implement an accessor which follows the same scheme as the accessor to
clock monotonic, but uses the new real_base to access clock real time.
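
As a hedged usage sketch (the caller below is hypothetical; only
ktime_get_real_fast_ns() is introduced by this patch), the new accessor
can be called from any context, including NMI, to obtain CLOCK_REALTIME
timestamps:

#include <linux/timekeeping.h>
#include <linux/printk.h>

/* Hypothetical consumer, e.g. a tracer or the printk timestamping code. */
static void example_stamp_event(void)
{
	u64 wall_ns = ktime_get_real_fast_ns();	/* NMI safe, lockless */

	pr_debug("event at %llu ns CLOCK_REALTIME\n", wall_ns);
}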
The runtime overhead for updating real_base is minimal as it just adds two cache hot values and stores them into an already dirtied cache line along with the other fast timekeeper updates. Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: John Stultz Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/1505757060-2004-3-git-send-email-prarit@redhat.com --- include/linux/timekeeper_internal.h | 6 +++++- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 0a0a53daf2a2..98f645ee8409 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -13,19 +13,22 @@ /** * struct tk_read_base - base structure for timekeeping readout * @clock: Current clocksource used for timekeeping. - * @read: Read function of @clock * @mask: Bitmask for two's complement subtraction of non 64bit clocks * @cycle_last: @clock cycle value at last update * @mult: (NTP adjusted) multiplier for scaled math conversion * @shift: Shift value for scaled math conversion * @xtime_nsec: Shifted (fractional) nano seconds offset for readout * @base: ktime_t (nanoseconds) base time for readout + * @base_real: Nanoseconds base value for clock REALTIME readout * * This struct has size 56 byte on 64 bit. Together with a seqcount it * occupies a single 64byte cache line. * * The struct is separate from struct timekeeper as it is also used * for a fast NMI safe accessors. + * + * @base_real is for the fast NMI safe accessor to allow reading clock + * realtime from any context. */ struct tk_read_base { struct clocksource *clock; @@ -35,6 +38,7 @@ struct tk_read_base { u32 shift; u64 xtime_nsec; ktime_t base; + u64 base_real; }; /** diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ddc229ff6d1e..eb98cbdbb323 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -239,6 +239,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); +extern u64 ktime_get_real_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a92794427c9..8af77006e937 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -496,6 +496,39 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + +/* + * See comment for __ktime_get_fast_ns() vs. timestamp ordering + */ +static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base_real); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; +} + +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + */ +u64 ktime_get_real_fast_ns(void) +{ + return __ktime_get_real_fast_ns(&tk_fast_mono); +} + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. 
@@ -514,6 +547,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; + tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; @@ -661,6 +695,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); -- cgit v1.2.3 From a380f2edef65b2447a043251bb3c00a9d2153a8b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2017 14:56:44 +0200 Subject: PM / core: Drop legacy class suspend/resume operations There are no classes using the legacy suspend/resume operations in the tree any more, so drop these operations and update the code referring to them accordingly. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- drivers/base/power/main.c | 32 +++++++++----------------------- include/linux/device.h | 5 ----- 2 files changed, 9 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 770b1539a083..12abcf6084a5 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -848,16 +848,10 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) goto Driver; } - if (dev->class) { - if (dev->class->pm) { - info = "class "; - callback = pm_op(dev->class->pm, state); - goto Driver; - } else if (dev->class->resume) { - info = "legacy class "; - callback = dev->class->resume; - goto End; - } + if (dev->class && dev->class->pm) { + info = "class "; + callback = pm_op(dev->class->pm, state); + goto Driver; } if (dev->bus) { @@ -1508,17 +1502,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) goto Run; } - if (dev->class) { - if (dev->class->pm) { - info = "class "; - callback = pm_op(dev->class->pm, state); - goto Run; - } else if (dev->class->suspend) { - pm_dev_dbg(dev, state, "legacy class "); - error = legacy_suspend(dev, state, dev->class->suspend, - "legacy class "); - goto End; - } + if (dev->class && dev->class->pm) { + info = "class "; + callback = pm_op(dev->class->pm, state); + goto Run; } if (dev->bus) { @@ -1862,8 +1849,7 @@ void device_pm_check_callbacks(struct device *dev) dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && - (!dev->class || (pm_ops_is_empty(dev->class->pm) && - !dev->class->suspend && !dev->class->resume)) && + (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && diff --git a/include/linux/device.h b/include/linux/device.h index 1d2607923a24..c1527f887050 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -372,9 +372,6 @@ int subsys_virtual_register(struct bus_type *subsys, * @devnode: Callback to provide the devtmpfs. * @class_release: Called to release this class. * @dev_release: Called to release the device. - * @suspend: Used to put the device to sleep mode, usually to a low power - * state. - * @resume: Used to bring the device from the sleep mode. 
 * @shutdown_pre: Called at shut-down time before driver shutdown.
 * @ns_type: Callbacks so sysfs can detemine namespaces.
 * @namespace: Namespace of the device belongs to this class.
@@ -402,8 +399,6 @@ struct class {
 void (*class_release)(struct class *class);
 void (*dev_release)(struct device *dev);

- int (*suspend)(struct device *dev, pm_message_t state);
- int (*resume)(struct device *dev);
 int (*shutdown_pre)(struct device *dev);

 const struct kobj_ns_type_operations *ns_type;
-- cgit v1.2.3

From 003278e431bffa4070d18c821eff1d95867f24db Mon Sep 17 00:00:00 2001
From: Corentin Labbe
Date: Tue, 26 Sep 2017 09:14:07 +0200
Subject: nfs_common: convert int to bool

Since __state_in_grace returns only true/false, make it return bool
instead of int. Same change for its two users,
locks_in_grace/opens_in_grace.

Signed-off-by: Corentin Labbe
Reviewed-by: Jeff Layton
Signed-off-by: J. Bruce Fields
---
 fs/nfs_common/grace.c | 10 +++++-----
 include/linux/fs.h | 4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)
(limited to 'include/linux')

diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index c030cd618b99..897b299db55e 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -55,7 +55,7 @@ locks_end_grace(struct lock_manager *lm)
 }
 EXPORT_SYMBOL_GPL(locks_end_grace);

-static int
+static bool
 __state_in_grace(struct net *net, bool open)
 {
 struct list_head *grace_list = net_generic(net, grace_net_id);
@@ -78,15 +78,15 @@ __state_in_grace(struct net *net, bool open)
 * to answer ordinary lock requests, and when they should accept only
 * lock reclaims.
 */
-int locks_in_grace(struct net *net)
+bool locks_in_grace(struct net *net)
 {
- return __state_in_grace(net, 0);
+ return __state_in_grace(net, false);
 }
 EXPORT_SYMBOL_GPL(locks_in_grace);

-int opens_in_grace(struct net *net)
+bool opens_in_grace(struct net *net)
 {
- return __state_in_grace(net, 1);
+ return __state_in_grace(net, true);
 }
 EXPORT_SYMBOL_GPL(opens_in_grace);

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 339e73742e73..8cc0493c7c39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -970,8 +970,8 @@ struct lock_manager {
 struct net;
 void locks_start_grace(struct net *, struct lock_manager *);
 void locks_end_grace(struct lock_manager *);
-int locks_in_grace(struct net *);
-int opens_in_grace(struct net *);
+bool locks_in_grace(struct net *);
+bool opens_in_grace(struct net *);

 /* that will die - we need it for nfs_lock_info */
 #include
-- cgit v1.2.3

From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 14 Sep 2017 14:02:04 -0700
Subject: kthread: add a mechanism to store cgroup info

A kthread usually runs jobs on behalf of other threads. The jobs should
be charged to the cgroup of the original threads. But the jobs run in a
kthread, where we lose the cgroup context of the original threads. The
patch adds a mechanism to record the cgroup info of the original threads
in the kthread context. Later we can retrieve the cgroup info and attach
it to jobs.

Since this mechanism is only required by kthread, we store the cgroup
info in kthread data instead of the generic task_struct.
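
As a hedged usage sketch (struct my_work and its fields are hypothetical;
kthread_associate_blkcg() is what this patch adds), a kthread would
bracket job execution with the originating thread's css so the IO is
charged to the right cgroup:

#include <linux/kthread.h>
#include <linux/bio.h>

struct my_work {				/* hypothetical work item */
	struct cgroup_subsys_state *blkcg_css;	/* saved from the originator */
	struct bio *bio;
};

static void my_kthread_do_work(struct my_work *work)
{
	kthread_associate_blkcg(work->blkcg_css); /* charge the originator */
	submit_bio(work->bio);
	kthread_associate_blkcg(NULL);		  /* drop the association */
}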
Acked-by: Tejun Heo
Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 include/linux/kthread.h | 11 +++++++++
 kernel/kthread.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 75 insertions(+), 2 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 82e197eeac91..bd4369c83dfb 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -3,6 +3,7 @@
 /* Simple interface for creating and stopping kernel threads without mess. */
 #include
 #include
+#include

 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);

 void kthread_destroy_worker(struct kthread_worker *worker);

+#ifdef CONFIG_CGROUPS
+void kthread_associate_blkcg(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *kthread_blkcg(void);
+#else
+static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
+static inline struct cgroup_subsys_state *kthread_blkcg(void)
+{
+ return NULL;
+}
+#endif
 #endif /* _LINUX_KTHREAD_H */

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c19edf82427..b011ea08967f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include
 #include

 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
 void *data;
 struct completion parked;
 struct completion exited;
+#ifdef CONFIG_CGROUPS
+ struct cgroup_subsys_state *blkcg_css;
+#endif
 };

 enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k)

 void free_kthread_struct(struct task_struct *k)
 {
+ struct kthread *kthread;
+
 /*
 * Can be NULL if this kthread was created by kernel_thread()
 * or if kmalloc() in kthread() failed.
 */
- kfree(to_kthread(k));
+ kthread = to_kthread(k);
+#ifdef CONFIG_CGROUPS
+ WARN_ON_ONCE(kthread && kthread->blkcg_css);
+#endif
+ kfree(kthread);
 }

 /**
@@ -216,6 +224,9 @@ static int kthread(void *_create)
 self->data = data;
 init_completion(&self->exited);
 init_completion(&self->parked);
+#ifdef CONFIG_CGROUPS
+ self->blkcg_css = NULL;
+#endif
 current->vfork_done = &self->exited;

 /* OK, tell user we're spawned, wait for stop or wakeup */
@@ -1154,3 +1165,54 @@ void kthread_destroy_worker(struct kthread_worker *worker)
 kfree(worker);
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs to attach the cgroup info
+ * of the original threads instead of that of the current thread. This
+ * function stores the original thread's cgroup info in the current kthread
+ * context for later retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+ struct kthread *kthread;
+
+ if (!(current->flags & PF_KTHREAD))
+ return;
+ kthread = to_kthread(current);
+ if (!kthread)
+ return;
+
+ if (kthread->blkcg_css) {
+ css_put(kthread->blkcg_css);
+ kthread->blkcg_css = NULL;
+ }
+ if (css) {
+ css_get(css);
+ kthread->blkcg_css = css;
+ }
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */ +struct cgroup_subsys_state *kthread_blkcg(void) +{ + struct kthread *kthread; + + if (current->flags & PF_KTHREAD) { + kthread = to_kthread(current); + if (kthread) + return kthread->blkcg_css; + } + return NULL; +} +EXPORT_SYMBOL(kthread_blkcg); +#endif -- cgit v1.2.3 From af551fb3be26a22b7a6b345b3b7e7e6acfc41758 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:05 -0700 Subject: blkcg: delete unused APIs Nobody uses the APIs right now. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bio.c | 31 ------------------------------- include/linux/bio.h | 2 -- include/linux/blk-cgroup.h | 12 ------------ 3 files changed, 45 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index b38e962fa83e..8338304ea256 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2032,37 +2032,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) } EXPORT_SYMBOL_GPL(bio_associate_blkcg); -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. - */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - - if (bio->bi_css) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - get_io_context_active(ioc); - bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, io_cgrp_id); - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_current); - /** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 275c91c99516..9c75f58f6a50 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -522,13 +522,11 @@ do { \ #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); -int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } -static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9d92153dd856..0cfa8d2ef4d6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -235,12 +235,6 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return task_blkcg(current); } -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return task_get_css(task, io_cgrp_id); -} - /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -735,12 +729,6 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return NULL; -} - #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { 
 return NULL;
 }
-- cgit v1.2.3

From 902ec5b6de0678ac2effba0faaafdd1c3e7fcf2e Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 14 Sep 2017 14:02:06 -0700
Subject: block: make blkcg aware of kthread stored original cgroup info

bio_blkcg is the only API to get the cgroup info for a bio right now. If
bio_blkcg finds the current task is a kthread with an original blkcg
associated, it will use that css instead of associating the bio with the
current task. This makes it possible for a kthread to dispatch bios on
behalf of other threads.

Acked-by: Tejun Heo
Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 include/linux/blk-cgroup.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 0cfa8d2ef4d6..f57e54d64529 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include

 /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
 #define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
@@ -223,16 +224,16 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 return css ? container_of(css, struct blkcg, css) : NULL;
 }

-static inline struct blkcg *task_blkcg(struct task_struct *tsk)
-{
- return css_to_blkcg(task_css(tsk, io_cgrp_id));
-}
-
 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
+ struct cgroup_subsys_state *css;
+
 if (bio && bio->bi_css)
 return css_to_blkcg(bio->bi_css);
- return task_blkcg(current);
+ css = kthread_blkcg();
+ if (css)
+ return css_to_blkcg(css);
+ return css_to_blkcg(task_css(current, io_cgrp_id));
 }

 /**
-- cgit v1.2.3

From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 26 Sep 2017 11:02:12 -0700
Subject: block: fix a build error

The code is only for blkcg, not for all cgroups.

Fixes: d4478e92d618 ("block/loop: make loop cgroup aware")
Reported-by: kbuild test robot
Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 2 +-
 include/linux/kthread.h | 2 +-
 kernel/kthread.c | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)
(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fd4eff5f5b76..bc8e61506968 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1692,7 +1692,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 }

 /* always use the first bio's css */
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) {
 cmd->css = cmd->rq->bio->bi_css;
 css_get(cmd->css);

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index bd4369c83dfb..fb201842c635 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -199,7 +199,7 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);

 void kthread_destroy_worker(struct kthread_worker *worker);

-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 void kthread_associate_blkcg(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *kthread_blkcg(void);
 #else

diff --git a/kernel/kthread.c b/kernel/kthread.c
index b011ea08967f..f87cd8b4eb2a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -46,7 +46,7 @@ struct kthread {
 void *data;
 struct completion parked;
 struct completion exited;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 struct cgroup_subsys_state *blkcg_css;
 #endif
 };
@@ -83,7 +83,7 @@ void free_kthread_struct(struct task_struct *k)
 * or if kmalloc() in kthread() failed.
*/ kthread = to_kthread(k); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread && kthread->blkcg_css); #endif kfree(kthread); @@ -224,7 +224,7 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP self->blkcg_css = NULL; #endif current->vfork_done = &self->exited; @@ -1166,7 +1166,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info -- cgit v1.2.3 From 1e99c497012cd8647972876f1bd18545bc907aea Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Sun, 24 Sep 2017 12:09:45 +0300 Subject: qed: iWARP - Add check for errors on a SYN packet A SYN packet which arrives with errors from FW should be dropped. This required adding an additional field to the ll2 rx completion data. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 8 ++++++++ drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 + include/linux/qed/qed_ll2_if.h | 1 + 3 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 568e9853cc8d..8fc9c811f6e3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -1733,6 +1733,14 @@ qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data) memset(&cm_info, 0, sizeof(cm_info)); ll2_syn_handle = p_hwfn->p_rdma_info->iwarp.ll2_syn_handle; + + /* Check if packet was received with errors... */ + if (data->err_flags) { + DP_NOTICE(p_hwfn, "Error received on SYN packet: 0x%x\n", + data->err_flags); + goto err; + } + if (GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED) && GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMERROR)) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index c06ad4f0758e..250afa5486cf 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -413,6 +413,7 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn, struct qed_ll2_comp_rx_data *data) { data->parse_flags = le16_to_cpu(p_cqe->rx_cqe_fp.parse_flags.flags); + data->err_flags = le16_to_cpu(p_cqe->rx_cqe_fp.err_flags.flags); data->length.packet_length = le16_to_cpu(p_cqe->rx_cqe_fp.packet_length); data->vlan = le16_to_cpu(p_cqe->rx_cqe_fp.vlan); diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index dd7a3b86bb9e..89fa0bbd54f3 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -101,6 +101,7 @@ struct qed_ll2_comp_rx_data { void *cookie; dma_addr_t rx_buf_addr; u16 parse_flags; + u16 err_flags; u16 vlan; bool b_last_packet; u8 connection_handle; -- cgit v1.2.3 From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:50 +0200 Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers Just do the rename into bpf_compute_data_pointers() as we'll add one more pointer here to recompute. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. 
Miller --- include/linux/filter.h | 9 ++++++--- kernel/bpf/sockmap.c | 4 ++-- net/bpf/test_run.c | 2 +- net/core/filter.c | 14 +++++++------- net/core/lwt_bpf.c | 2 +- net/sched/act_bpf.c | 4 ++-- net/sched/cls_bpf.c | 4 ++-- 7 files changed, 21 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index d29e58fde364..052bab3d62e7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -496,10 +496,13 @@ struct xdp_buff { void *data_hard_start; }; -/* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf, act_bpf and lwt programs +/* Compute the linear packet data range [data, data_end) which + * will be accessed by various program types (cls_bpf, act_bpf, + * lwt, ...). Subsystems allowing direct data access must (!) + * ensure that cb[] area can be written to when BPF program is + * invoked (otherwise cb[] save/restore is necessary). */ -static inline void bpf_compute_data_end(struct sk_buff *skb) +static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..a298d6666698 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) skb_orphan(skb); skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; @@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6be41a44d688..df672517b4fd 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (is_l2) __skb_push(skb, ETH_HLEN); if (is_direct_pkt_access) - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); if (!is_l2) __skb_push(skb, ETH_HLEN); diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..c468e7cfad19 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1402,7 +1402,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, { int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return err; } @@ -1962,7 +1962,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -1984,7 +1984,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2178,7 +2178,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2303,7 +2303,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) ret = shrink ? 
bpf_skb_net_shrink(skb, len_diff_abs) :
 bpf_skb_net_grow(skb, len_diff_abs);

- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
 return ret;
 }

@@ -2394,7 +2394,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
 skb_gso_reset(skb);
 }

- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
 return ret;
 }

@@ -2434,7 +2434,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
 skb_reset_mac_header(skb);
 }

- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
 return 0;
 }

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 1307731ddfe4..e7e626fb87bb 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 */
 preempt_disable();
 rcu_read_lock();
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
 ret = bpf_prog_run_save_cb(lwt->prog, skb);
 rcu_read_unlock();

diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c0c707eb2c96..5ef8ce8c83d4 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 filter = rcu_dereference(prog->filter);
 if (at_ingress) {
 __skb_push(skb, skb->mac_len);
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
 filter_res = BPF_PROG_RUN(filter, skb);
 __skb_pull(skb, skb->mac_len);
 } else {
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
 filter_res = BPF_PROG_RUN(filter, skb);
 }
 rcu_read_unlock();

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 520c5027646a..36671b0fb125 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -99,11 +99,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 } else if (at_ingress) {
 /* It is safe to push/pull even if skb_shared() */
 __skb_push(skb, skb->mac_len);
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
 filter_res = BPF_PROG_RUN(prog->filter, skb);
 __skb_pull(skb, skb->mac_len);
 } else {
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
 filter_res = BPF_PROG_RUN(prog->filter, skb);
 }
-- cgit v1.2.3

From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Mon, 25 Sep 2017 02:25:51 +0200
Subject: bpf: add meta pointer for direct access

This work enables generic transfer of metadata from XDP into skb. The
basic idea is that we can make use of the fact that the resulting skb
must be linear and already comes with a larger headroom for supporting
bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
for adjusting a new pointer called xdp->data_meta. Thus, the packet has
flexible and programmable room for meta data, followed by the actual
packet data. struct xdp_buff is therefore laid out such that we first
point to data_hard_start, then to data_meta directly prepended to data,
followed by data_end marking the end of the packet. bpf_xdp_adjust_head()
takes into account whether we have meta data already prepended and if so,
memmove()s this along with the given offset, provided there's enough room.

xdp->data_meta is optional and programs are not required to use it. The
rationale is that when we process the packet in XDP (e.g. as a DoS
filter), we can push further meta data along with it for the XDP_PASS
case, and give the guarantee that a clsact ingress BPF program on the
same device can pick this up for further post-processing.
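
As a hedged illustration of this flow (restricted BPF C, not part of the
patch; the SEC() macro and the function-pointer helper declaration follow
the usual selftest conventions), an XDP program can reserve four bytes of
metadata and store a scratch value that a clsact ingress program on the
same device could read back via __sk_buff->data_meta:

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

static int (*bpf_xdp_adjust_meta)(void *ctx, int delta) =
	(void *) BPF_FUNC_xdp_adjust_meta;

SEC("xdp")
int xdp_store_meta(struct xdp_md *ctx)
{
	__u32 *meta;

	/* Grow the meta area in front of xdp->data by 4 bytes; this
	 * fails on drivers that set xdp->data_meta to data + 1.
	 */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
		return XDP_PASS;

	meta = (void *)(long)ctx->data_meta;
	/* Verifier-mandated bounds check against ctx->data. */
	if ((void *)(meta + 1) > (void *)(long)ctx->data)
		return XDP_PASS;

	*meta = 0x42;	/* scratch value for post-processing at tc ingress */
	return XDP_PASS;
}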
Since we work with skb there, we can also set skb->mark, skb->priority or
other skb meta data out of BPF; keeping this scratch space generic and
programmable allows for more flexibility than defining a direct 1:1
transfer of potentially new XDP members into skb (it's also more efficient
as we don't need to initialize/handle each of such new members). The
facility also works together with GRO aggregation. The scratch space at
the head of the packet can be a multiple of 4 bytes, up to 32 bytes large.

Drivers not yet supporting xdp->data_meta can simply be set up with
xdp->data_meta as xdp->data + 1, since bpf_xdp_adjust_meta() will detect
this and bail out, such that the subsequent match against xdp->data for
later access is guaranteed to fail.

The verifier treats xdp->data_meta/xdp->data the same way as we treat
xdp->data/xdp->data_end pointer comparisons. The requirement for doing
the compare against xdp->data is that it hasn't been modified from its
original address we got from ctx access. It may have a range marking
already from prior successful xdp->data/xdp->data_end pointer comparisons
though.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 +
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c | 1 +
 drivers/net/ethernet/qlogic/qede/qede_fp.c | 1 +
 drivers/net/tun.c | 1 +
 drivers/net/virtio_net.c | 2 +
 include/linux/bpf.h | 1 +
 include/linux/filter.h | 21 +++-
 include/linux/skbuff.h | 68 +++++++++++-
 include/uapi/linux/bpf.h | 13 ++-
 kernel/bpf/verifier.c | 114 ++++++++++++++++-----
 net/bpf/test_run.c | 1 +
 net/core/dev.c | 31 +++++-
 net/core/filter.c | 77 +++++++++++++-
 net/core/skbuff.c | 2 +
 19 files changed, 297 insertions(+), 42 deletions(-)
(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index d8f0c837b72c..06ce63c00821 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 xdp.data_hard_start = *data_ptr - offset;
 xdp.data = *data_ptr;
+ xdp_set_data_meta_invalid(&xdp);
 xdp.data_end = *data_ptr + *len;
 orig_data = xdp.data;
 mapping = rx_buf->mapping - bp->rx_dma_offset;

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 49b80da51ba7..d68478afccbf 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 xdp.data_hard_start = page_address(page);
 xdp.data = (void *)cpu_addr;
+ xdp_set_data_meta_invalid(&xdp);
 xdp.data_end = xdp.data + len;
 orig_data = xdp.data;

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 1519dfb851d0..f426762bd83a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 if (!skb) {
 xdp.data = page_address(rx_buffer->page) +
 rx_buffer->page_offset;
+ xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - i40e_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d962368d08d0..04bb03bda1cd 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b97a55c827eb..8f9cb8abc497 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud xdp.data_hard_start = va - frags[0].page_offset; xdp.data = va; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + length; orig_data = xdp.data; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f1dd638384d3..30b3f3fbd719 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, return false; xdp.data = va + *rx_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; xdp.data_hard_start = va; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 1c0187f0af51..e3a38be3600a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start, xdp.data_hard_start = hard_start; xdp.data = data + *off; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = data + *off + *len; orig_data = xdp.data; diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 6fc854b120b0..48ec4c56cddf 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp.data_hard_start = page_address(bd->data); xdp.data = xdp.data_hard_start + *data_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; /* Queues always have a full reset currently, so for the time diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2c36f6ebad79..a6e0bffe3d29 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp.data_hard_start = buf; xdp.data = buf + pad; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index dd14a4547932..fc059f193e7d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = 
bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 data = page_address(xdp_page) + offset;
 xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 xdp.data = data + vi->hdr_len;
+ xdp_set_data_meta_invalid(&xdp);
 xdp.data_end = xdp.data + (len - vi->hdr_len);
 act = bpf_prog_run_xdp(xdp_prog, &xdp);

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..2b672c50f160 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -137,6 +137,7 @@ enum bpf_reg_type {
 PTR_TO_MAP_VALUE, /* reg points to map element value */
 PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
 PTR_TO_STACK, /* reg == frame_pointer + offset */
+ PTR_TO_PACKET_META, /* skb->data - meta_len */
 PTR_TO_PACKET, /* reg points to skb->data */
 PTR_TO_PACKET_END, /* skb->data + headlen */
 };

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 052bab3d62e7..911d454af107 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -487,12 +487,14 @@ struct sk_filter {

 struct bpf_skb_data_end {
 struct qdisc_skb_cb qdisc_cb;
+ void *data_meta;
 void *data_end;
 };

 struct xdp_buff {
 void *data;
 void *data_end;
+ void *data_meta;
 void *data_hard_start;
 };

@@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;

 BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
- cb->data_end = skb->data + skb_headlen(skb);
+ cb->data_meta = skb->data - skb_metadata_len(skb);
+ cb->data_end = skb->data + skb_headlen(skb);
 }

 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
 struct bpf_prog *prog);
 void xdp_do_flush_map(void);

+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+ xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+ return unlikely(xdp->data_meta > xdp->data);
+}
+
 void bpf_warn_invalid_xdp_action(u32 act);
-void bpf_warn_invalid_xdp_redirect(u32 ifindex);

 struct sock *do_sk_redirect_map(void);

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f9db5539a6fb..19e64bfb1a66 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 * the end of the header data, ie. at skb->end.
 */
 struct skb_shared_info {
- unsigned short _unused;
- unsigned char nr_frags;
+ __u8 __unused;
+ __u8 meta_len;
+ __u8 nr_frags;
 __u8 tx_flags;
 unsigned short gso_size;
 /* Warning: this field is not always filled in (UFO)! */
@@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
 return 0;
 }

+static inline u8 skb_metadata_len(const struct sk_buff *skb)
+{
+ return skb_shinfo(skb)->meta_len;
+}
+
+static inline void *skb_metadata_end(const struct sk_buff *skb)
+{
+ return skb_mac_header(skb);
+}
+
+static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
+ const struct sk_buff *skb_b,
+ u8 meta_len)
+{
+ const void *a = skb_metadata_end(skb_a);
+ const void *b = skb_metadata_end(skb_b);
+ /* Using more efficient variant than plain call to memcmp().
*/ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + u64 diffs = 0; + + switch (meta_len) { +#define __it(x, op) (x -= sizeof(u##op)) +#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) + case 32: diffs |= __it_diff(a, b, 64); + case 24: diffs |= __it_diff(a, b, 64); + case 16: diffs |= __it_diff(a, b, 64); + case 8: diffs |= __it_diff(a, b, 64); + break; + case 28: diffs |= __it_diff(a, b, 64); + case 20: diffs |= __it_diff(a, b, 64); + case 12: diffs |= __it_diff(a, b, 64); + case 4: diffs |= __it_diff(a, b, 32); + break; + } + return diffs; +#else + return memcmp(a - meta_len, b - meta_len, meta_len); +#endif +} + +static inline bool skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b) +{ + u8 len_a = skb_metadata_len(skb_a); + u8 len_b = skb_metadata_len(skb_b); + + if (!(len_a | len_b)) + return false; + + return len_a != len_b ? + true : __skb_metadata_differs(skb_a, skb_b, len_a); +} + +static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) +{ + skb_shinfo(skb)->meta_len = meta_len; +} + +static inline void skb_metadata_clear(struct sk_buff *skb) +{ + skb_metadata_set(skb, 0); +} + struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..e43491ac4823 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -582,6 +582,12 @@ union bpf_attr { * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem + * + * int bpf_xdp_adjust_meta(xdp_md, delta) + * Adjust the xdp_md.data_meta by delta + * @xdp_md: pointer to xdp_md + * @delta: A positive/negative integer to be added to xdp_md.data_meta + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -638,6 +644,7 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ + FN(xdp_adjust_meta), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -715,7 +722,7 @@ struct __sk_buff { __u32 data_end; __u32 napi_id; - /* accessed by BPF_PROG_TYPE_sk_skb types */ + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ @@ -723,6 +730,9 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; }; struct bpf_tunnel_key { @@ -783,6 +793,7 @@ enum xdp_action { struct xdp_md { __u32 data; __u32 data_end; + __u32 data_meta; }; enum sk_action { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..f849eca36052 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
va_end(args); } +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -187,6 +193,7 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose("(id=%d", reg->id); if (t != SCALAR_VALUE) verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) + if (type_is_pkt_pointer(t)) verbose(",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || @@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; @@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. 
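 * Freshly loaded pointers of this kind carry id 0, off 0 and a
 * constant-zero var_off, which is exactly what
 * reg_is_init_pkt_pointer() above tests for.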
*/ if (reg_type == SCALAR_VALUE) mark_reg_unknown(state->regs, value_regno); @@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { err = check_stack_read(state, off, size, value_regno); } - } else if (reg->type == PTR_TO_PACKET) { + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: + case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); @@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; @@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else @@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else @@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. 
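 * This runs after any helper for which bpf_helper_changes_pkt_data()
 * returns true, e.g. bpf_xdp_adjust_head() and the new
 * bpf_xdp_adjust_meta().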
*/ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) + if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } @@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->range = 0; @@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
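 * The same range propagation now applies to PTR_TO_PACKET_META: after
 * a check like "if (data_meta + 8 > data) goto out;", the fall-through
 * path may access up to 8 bytes of metadata.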
*/ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max_t(u16, regs[i].range, dst_reg->off); @@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, dst_reg->off); } } @@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; @@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index df672517b4fd..a86e6687026e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, 
const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/core/dev.c b/net/core/dev.c index 97abddd9039a..e350c768d4b5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3864,8 +3864,8 @@ drop: static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - if (skb_linearize(skb)) - goto do_drop; + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (troom > 0 && __skb_linearize(skb)) + goto do_drop; + } /* The XDP program wants to see the packet starting at the MAC * header. @@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; @@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - /* fall through */ + break; case XDP_PASS: + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); break; - default: bpf_warn_invalid_xdp_action(act); /* fall through */ @@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); diff --git a/net/core/filter.c b/net/core/filter.c index c468e7cfad19..9b6e7e84aafd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 
0 : + xdp->data - xdp->data_meta; +} + BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < xdp->data_hard_start || + if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; xdp->data = data; return 0; @@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp->data_hard_start || + meta > xdp->data)) + return -EINVAL; + if (unlikely((metalen & (sizeof(__u32) - 1)) || + (metalen > 32))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head) + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta) return true; return false; @@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; @@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; @@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; + case offsetof(struct xdp_md, 
data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + return false; + } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size, } switch (off) { - case bpf_ctx_range(struct __sk_buff, tc_classid): - return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 000ce735fa8d..d98c2e3ce2bf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + skb_metadata_clear(skb); + /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). -- cgit v1.2.3 From 404376af788a76cca760efdc05f26fd73bd94b17 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 17 Sep 2017 19:07:10 -0700 Subject: Documentation: kernel-api: add bitmap operations from linux/bitmap.h Add to kernel-api Bitmap Operations section. Fix kernel-doc nitpicks in . Signed-off-by: Randy Dunlap Acked-by: Yury Norov Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 3 +++ include/linux/bitmap.h | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index e557509574a0..f37c73faa5b7 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -59,6 +59,9 @@ Bitmap Operations .. kernel-doc:: lib/bitmap.c :internal: +.. kernel-doc:: include/linux/bitmap.h + :internal: + Command-line Parsing -------------------- diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 700cf5f67118..5c4178016b1e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -360,8 +360,9 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); } -/* +/** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. 
+ * @n: u64 value * * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit * integers in 32-bit environment, and 64-bit integers in 64-bit one. @@ -392,14 +393,14 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, ((unsigned long) ((u64)(n) >> 32)) #endif -/* +/** * bitmap_from_u64 - Check and swap words within u64. * @mask: source bitmap * @dst: destination bitmap * - * In 32-bit Big Endian kernel, when using (u32 *)(&val)[*] + * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]`` * to read u64 mask, we will get the wrong word. - * That is "(u32 *)(&val)[0]" gets the upper 32 bits, + * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) -- cgit v1.2.3 From e60aa7b53845a261dd419652f12ab9f89e668843 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 21 Sep 2017 16:52:44 +0100 Subject: iommu/iova: Extend rbtree node caching The cached node mechanism provides a significant performance benefit for allocations using a 32-bit DMA mask, but in the case of non-PCI devices or where the 32-bit space is full, the loss of this benefit can be significant - on large systems there can be many thousands of entries in the tree, such that walking all the way down to find free space every time becomes increasingly awful. Maintain a similar cached node for the whole IOVA space as a superset of the 32-bit space so that performance can remain much more consistent. Inspired by work by Zhen Lei . Tested-by: Ard Biesheuvel Tested-by: Zhen Lei Tested-by: Nate Watterson Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 60 ++++++++++++++++++++++++---------------------------- include/linux/iova.h | 3 ++- 2 files changed, 30 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 20be9a8b3188..c6f5a22f8d20 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -48,6 +48,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, spin_lock_init(&iovad->iova_rbtree_lock); iovad->rbroot = RB_ROOT; + iovad->cached_node = NULL; iovad->cached32_node = NULL; iovad->granule = granule; iovad->start_pfn = start_pfn; @@ -110,48 +111,44 @@ EXPORT_SYMBOL_GPL(init_iova_flush_queue); static struct rb_node * __get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn) { - if ((*limit_pfn > iovad->dma_32bit_pfn) || - (iovad->cached32_node == NULL)) + struct rb_node *cached_node = NULL; + struct iova *curr_iova; + + if (*limit_pfn <= iovad->dma_32bit_pfn) + cached_node = iovad->cached32_node; + if (!cached_node) + cached_node = iovad->cached_node; + if (!cached_node) return rb_last(&iovad->rbroot); - else { - struct rb_node *prev_node = rb_prev(iovad->cached32_node); - struct iova *curr_iova = - rb_entry(iovad->cached32_node, struct iova, node); - *limit_pfn = curr_iova->pfn_lo; - return prev_node; - } + + curr_iova = rb_entry(cached_node, struct iova, node); + *limit_pfn = min(*limit_pfn, curr_iova->pfn_lo); + + return rb_prev(cached_node); } static void -__cached_rbnode_insert_update(struct iova_domain *iovad, - unsigned long limit_pfn, struct iova *new) +__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new) { - if (limit_pfn != iovad->dma_32bit_pfn) - return; - iovad->cached32_node = &new->node; + if (new->pfn_hi < iovad->dma_32bit_pfn) + iovad->cached32_node = &new->node; + else + iovad->cached_node = &new->node; } static void 
__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) { struct iova *cached_iova; - struct rb_node *curr; - if (!iovad->cached32_node) - return; - curr = iovad->cached32_node; - cached_iova = rb_entry(curr, struct iova, node); + cached_iova = rb_entry(iovad->cached32_node, struct iova, node); + if (free->pfn_hi < iovad->dma_32bit_pfn && + iovad->cached32_node && free->pfn_lo >= cached_iova->pfn_lo) + iovad->cached32_node = rb_next(&free->node); - if (free->pfn_lo >= cached_iova->pfn_lo) { - struct rb_node *node = rb_next(&free->node); - struct iova *iova = rb_entry(node, struct iova, node); - - /* only cache if it's below 32bit pfn */ - if (node && iova->pfn_lo < iovad->dma_32bit_pfn) - iovad->cached32_node = node; - else - iovad->cached32_node = NULL; - } + cached_iova = rb_entry(iovad->cached_node, struct iova, node); + if (iovad->cached_node && free->pfn_lo >= cached_iova->pfn_lo) + iovad->cached_node = rb_next(&free->node); } /* Insert the iova into domain rbtree by holding writer lock */ @@ -188,7 +185,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, { struct rb_node *prev, *curr = NULL; unsigned long flags; - unsigned long saved_pfn, new_pfn; + unsigned long new_pfn; unsigned long align_mask = ~0UL; if (size_aligned) @@ -196,7 +193,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, /* Walk the tree backwards */ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); - saved_pfn = limit_pfn; curr = __get_cached_rbnode(iovad, &limit_pfn); prev = curr; while (curr) { @@ -226,7 +222,7 @@ move_left: /* If we have 'prev', it's a valid place to start the insertion. */ iova_insert_rbtree(&iovad->rbroot, new, prev); - __cached_rbnode_insert_update(iovad, saved_pfn, new); + __cached_rbnode_insert_update(iovad, new); spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); diff --git a/include/linux/iova.h b/include/linux/iova.h index d179b9bf7814..69ea3e258ff2 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -70,7 +70,8 @@ struct iova_fq { struct iova_domain { spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ struct rb_root rbroot; /* iova domain rbtree root */ - struct rb_node *cached32_node; /* Save last alloced node */ + struct rb_node *cached_node; /* Save last alloced node */ + struct rb_node *cached32_node; /* Save last 32-bit alloced node */ unsigned long granule; /* pfn granularity for this domain */ unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; -- cgit v1.2.3 From aa3ac9469c1850ed00741955b975c3a19029763a Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 21 Sep 2017 16:52:45 +0100 Subject: iommu/iova: Make dma_32bit_pfn implicit Now that the cached node optimisation can apply to all allocations, the couple of users which were playing tricks with dma_32bit_pfn in order to benefit from it can stop doing so. Conversely, there is also no need for all the other users to explicitly calculate a 'real' 32-bit PFN, when init_iova_domain() can happily do that itself from the page granularity. 
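As a sketch of what the conversion looks like for a caller (illustrative only, mirroring the tegra/host1x and dma-iommu hunks below):

	/* before: callers derived and passed a 32-bit pfn limit themselves */
	init_iova_domain(&iovad, 1UL << order, base_pfn, end_pfn);

	/* after: the domain derives dma_32bit_pfn from its own granule */
	init_iova_domain(&iovad, 1UL << order, base_pfn);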
CC: Thierry Reding CC: Jonathan Hunter CC: David Airlie CC: Sudeep Dutt CC: Ashutosh Dixit Signed-off-by: Zhen Lei Tested-by: Ard Biesheuvel Tested-by: Zhen Lei Tested-by: Nate Watterson [rm: use iova_shift(), rewrote commit message] Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/gpu/drm/tegra/drm.c | 3 +-- drivers/gpu/host1x/dev.c | 3 +-- drivers/iommu/amd_iommu.c | 7 ++----- drivers/iommu/dma-iommu.c | 18 +----------------- drivers/iommu/intel-iommu.c | 11 +++-------- drivers/iommu/iova.c | 4 ++-- drivers/misc/mic/scif/scif_rma.c | 3 +-- include/linux/iova.h | 5 ++--- 8 files changed, 13 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index 597d563d636a..b822e484b7e5 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -155,8 +155,7 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags) order = __ffs(tegra->domain->pgsize_bitmap); init_iova_domain(&tegra->carveout.domain, 1UL << order, - carveout_start >> order, - carveout_end >> order); + carveout_start >> order); tegra->carveout.shift = iova_shift(&tegra->carveout.domain); tegra->carveout.limit = carveout_end >> tegra->carveout.shift; diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 7f22c5c37660..5267c62e8896 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -198,8 +198,7 @@ static int host1x_probe(struct platform_device *pdev) order = __ffs(host->domain->pgsize_bitmap); init_iova_domain(&host->iova, 1UL << order, - geometry->aperture_start >> order, - geometry->aperture_end >> order); + geometry->aperture_start >> order); host->iova_end = geometry->aperture_end; } diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 51f8215877f5..647ab7691aee 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -63,7 +63,6 @@ /* IO virtual address start page frame number */ #define IOVA_START_PFN (1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) -#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32)) /* Reserved IOVA ranges */ #define MSI_RANGE_START (0xfee00000) @@ -1788,8 +1787,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(void) if (!dma_dom->domain.pt_root) goto free_dma_dom; - init_iova_domain(&dma_dom->iovad, PAGE_SIZE, - IOVA_START_PFN, DMA_32BIT_PFN); + init_iova_domain(&dma_dom->iovad, PAGE_SIZE, IOVA_START_PFN); if (init_iova_flush_queue(&dma_dom->iovad, iova_domain_flush_tlb, NULL)) goto free_dma_dom; @@ -2696,8 +2694,7 @@ static int init_reserved_iova_ranges(void) struct pci_dev *pdev = NULL; struct iova *val; - init_iova_domain(&reserved_iova_ranges, PAGE_SIZE, - IOVA_START_PFN, DMA_32BIT_PFN); + init_iova_domain(&reserved_iova_ranges, PAGE_SIZE, IOVA_START_PFN); lockdep_set_class(&reserved_iova_ranges.iova_rbtree_lock, &reserved_rbtree_key); diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 9d1cebe7f6cb..191be9c80a8a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -292,18 +292,7 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, /* ...then finally give it a kicking to make sure it fits */ base_pfn = max_t(unsigned long, base_pfn, domain->geometry.aperture_start >> order); - end_pfn = min_t(unsigned long, end_pfn, - domain->geometry.aperture_end >> order); } - /* - * PCI devices may have larger DMA masks, but still prefer allocating - * within a 32-bit mask to avoid DAC addressing. 
Such limitations don't - * apply to the typical platform device, so for those we may as well - * leave the cache limit at the top of their range to save an rb_last() - * traversal on every allocation. - */ - if (dev && dev_is_pci(dev)) - end_pfn &= DMA_BIT_MASK(32) >> order; /* start_pfn is always nonzero for an already-initialised domain */ if (iovad->start_pfn) { @@ -312,16 +301,11 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, pr_warn("Incompatible range for DMA domain\n"); return -EFAULT; } - /* - * If we have devices with different DMA masks, move the free - * area cache limit down for the benefit of the smaller one. - */ - iovad->dma_32bit_pfn = min(end_pfn + 1, iovad->dma_32bit_pfn); return 0; } - init_iova_domain(iovad, 1UL << order, base_pfn, end_pfn); + init_iova_domain(iovad, 1UL << order, base_pfn); if (!dev) return 0; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 6784a05dd6b2..ebb48353dd39 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -82,8 +82,6 @@ #define IOVA_START_PFN (1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) -#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32)) -#define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64)) /* page table handling */ #define LEVEL_STRIDE (9) @@ -1878,8 +1876,7 @@ static int dmar_init_reserved_ranges(void) struct iova *iova; int i; - init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN, - DMA_32BIT_PFN); + init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN); lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, &reserved_rbtree_key); @@ -1938,8 +1935,7 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, unsigned long sagaw; int err; - init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN, - DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); err = init_iova_flush_queue(&domain->iovad, iommu_flush_iova, iova_entry_free); @@ -4897,8 +4893,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; - init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN, - DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index c6f5a22f8d20..65032e60a5d1 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -37,7 +37,7 @@ static void fq_flush_timeout(unsigned long data); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, - unsigned long start_pfn, unsigned long pfn_32bit) + unsigned long start_pfn) { /* * IOVA granularity will normally be equal to the smallest @@ -52,7 +52,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, iovad->cached32_node = NULL; iovad->granule = granule; iovad->start_pfn = start_pfn; - iovad->dma_32bit_pfn = pfn_32bit + 1; + iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad)); iovad->flush_cb = NULL; iovad->fq = NULL; init_iova_rcaches(iovad); diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index 329727e00e97..c824329f7012 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -39,8 +39,7 @@ void scif_rma_ep_init(struct scif_endpt *ep) struct scif_endpt_rma_info *rma = &ep->rma_info; mutex_init(&rma->rma_lock); - init_iova_domain(&rma->iovad, PAGE_SIZE, SCIF_IOVA_START_PFN, - SCIF_DMA_64BIT_PFN); + init_iova_domain(&rma->iovad, PAGE_SIZE, 
SCIF_IOVA_START_PFN); spin_lock_init(&rma->tc_lock); mutex_init(&rma->mmn_lock); INIT_LIST_HEAD(&rma->reg_list); diff --git a/include/linux/iova.h b/include/linux/iova.h index 69ea3e258ff2..953cfd20f152 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -154,7 +154,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, - unsigned long start_pfn, unsigned long pfn_32bit); + unsigned long start_pfn); int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); @@ -230,8 +230,7 @@ static inline void copy_reserved_iova(struct iova_domain *from, static inline void init_iova_domain(struct iova_domain *iovad, unsigned long granule, - unsigned long start_pfn, - unsigned long pfn_32bit) + unsigned long start_pfn) { } -- cgit v1.2.3 From bb68b2fbfbd643d4407541f9c7a16a2c9b3a57c7 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 21 Sep 2017 16:52:46 +0100 Subject: iommu/iova: Add rbtree anchor node Add a permanent dummy IOVA reservation to the rbtree, such that we can always access the top of the address space instantly. The immediate benefit is that we remove the overhead of the rb_last() traversal when not using the cached node, but it also paves the way for further simplifications. Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 15 +++++++++++++-- include/linux/iova.h | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 65032e60a5d1..9e04c1f3e740 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -24,6 +24,9 @@ #include #include +/* The anchor node sits above the top of the usable address space */ +#define IOVA_ANCHOR ~0UL + static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn, unsigned long size); @@ -55,6 +58,9 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad)); iovad->flush_cb = NULL; iovad->fq = NULL; + iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR; + rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node); + rb_insert_color(&iovad->anchor.node, &iovad->rbroot); init_iova_rcaches(iovad); } EXPORT_SYMBOL_GPL(init_iova_domain); @@ -119,7 +125,7 @@ __get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn) if (!cached_node) cached_node = iovad->cached_node; if (!cached_node) - return rb_last(&iovad->rbroot); + return rb_prev(&iovad->anchor.node); curr_iova = rb_entry(cached_node, struct iova, node); *limit_pfn = min(*limit_pfn, curr_iova->pfn_lo); @@ -242,7 +248,8 @@ EXPORT_SYMBOL(alloc_iova_mem); void free_iova_mem(struct iova *iova) { - kmem_cache_free(iova_cache, iova); + if (iova->pfn_lo != IOVA_ANCHOR) + kmem_cache_free(iova_cache, iova); } EXPORT_SYMBOL(free_iova_mem); @@ -676,6 +683,10 @@ reserve_iova(struct iova_domain *iovad, struct iova *iova; unsigned int overlap = 0; + /* Don't allow nonsensical pfns */ + if (WARN_ON((pfn_hi | pfn_lo) > (ULLONG_MAX >> iova_shift(iovad)))) + return NULL; + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) { if (__is_range_overlap(node, pfn_lo, pfn_hi)) { diff --git a/include/linux/iova.h 
b/include/linux/iova.h index 953cfd20f152..c696ee81054e 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -75,6 +75,7 @@ struct iova_domain { unsigned long granule; /* pfn granularity for this domain */ unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; + struct iova anchor; /* rbtree lookup anchor */ struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE]; /* IOVA range caches */ iova_flush_cb flush_cb; /* Call-Back function to flush IOMMU -- cgit v1.2.3 From 310ebbba3b7396b00bce08a33f1d2de2c74fa257 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:12 +0200 Subject: ipmr: Add reference count to MFC entries Next commits will introduce MFC notifications through the atomic fib_notification chain, thus allowing modules to be aware of MFC entries. Due to the fact that modules may need to hold a reference to an MFC entry, add reference count to MFC entries to prevent them from being freed while these modules use them. The reference counting is done only on resolved MFC entries currently. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 21 +++++++++++++++++++++ net/ipv4/ipmr.c | 8 +++++--- 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index d7f63339ef0b..10028f208efb 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -109,6 +109,7 @@ struct mfc_cache_cmp_arg { * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array + * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction */ @@ -138,6 +139,7 @@ struct mfc_cache { unsigned long wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; + refcount_t refcount; } res; } mfc_un; struct list_head list; @@ -148,4 +150,23 @@ struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); + +#ifdef CONFIG_IP_MROUTE +void ipmr_cache_free(struct mfc_cache *mfc_cache); +#else +static inline void ipmr_cache_free(struct mfc_cache *mfc_cache) +{ +} +#endif + +static inline void ipmr_cache_put(struct mfc_cache *c) +{ + if (refcount_dec_and_test(&c->mfc_un.res.refcount)) + ipmr_cache_free(c); +} +static inline void ipmr_cache_hold(struct mfc_cache *c) +{ + refcount_inc(&c->mfc_un.res.refcount); +} + #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..86dc5f98c5dd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -652,10 +652,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. 
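 * Resolved entries, by contrast, are dropped via ipmr_cache_put() and
 * only reach ipmr_cache_free() once their new refcount falls to zero.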
@@ -949,6 +950,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1162,7 +1164,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1264,7 +1266,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { -- cgit v1.2.3 From 4d65b9487831170e699b2fc64a91b839d729bd78 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:13 +0200 Subject: ipmr: Add FIB notification access functions Make the ipmr module register as a FIB notifier. To do that, implement both the ipmr_seq_read and ipmr_dump ops. The ipmr_seq_read op returns a sequence counter that is incremented on every notification related operation done by the ipmr. To implement that, add a sequence counter in the netns_ipv4 struct and increment it whenever a new MFC route or VIF are added or deleted. The sequence operations are protected by the RTNL lock. The ipmr_dump iterates the list of MFC routes and the list of VIF entries and sends notifications about them. The entries dump is done under RCU where the VIF dump uses the mrt_lock too, as the vif->dev field can change under RCU. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/mroute.h | 15 ++++++ include/net/netns/ipv4.h | 3 ++ net/ipv4/ipmr.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 153 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 10028f208efb..54c5cb82ddcb 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #ifdef CONFIG_IP_MROUTE @@ -58,6 +59,14 @@ struct vif_device { int link; /* Physical interface index */ }; +struct vif_entry_notifier_info { + struct fib_notifier_info info; + struct net_device *dev; + vifi_t vif_index; + unsigned short vif_flags; + u32 tb_id; +}; + #define VIFF_STATIC 0x8000 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) @@ -146,6 +155,12 @@ struct mfc_cache { struct rcu_head rcu; }; +struct mfc_entry_notifier_info { + struct fib_notifier_info info; + struct mfc_cache *mfc; + u32 tb_id; +}; + struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8387f099115e..abc84d986da4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -163,6 +163,9 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* protected by rtnl_mutex */ + struct fib_notifier_ops *ipmr_notifier_ops; + unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t rt_genid; }; #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 86dc5f98c5dd..49879c338357 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -264,6 +264,16 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +308,16 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +607,43 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -3050,14 +3107,87 @@ static const struct 
net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block *nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notify on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3074,7 +3204,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3084,6 +3216,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); ipmr_rules_exit(net); } -- cgit v1.2.3 From c7c0bbeae9501a7e42f2fd306d6a6399aca688b6 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:15 +0200 Subject: net: ipmr: Add MFC offload indication Allow drivers registered to the fib notification chain to indicate whether a multicast MFC route is offloaded or not, similarly to unicast routes. The indication of whether a route is offloaded is done using the mfc_flags field on an mfc_cache struct, and the information is sent to userspace via the RTNetlink interface only. Currently, MFC routes are either offloaded or not, thus there is no need to add per-VIF offload indication. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- include/linux/mroute.h | 2 ++ net/ipv4/ipmr.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 54c5cb82ddcb..5566580811ce 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -90,9 +90,11 @@ struct mr_table { /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) + * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), + MFC_OFFLOAD = BIT(1), }; struct mfc_cache_cmp_arg { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ba71bc402336..2a795d2c0502 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2268,6 +2268,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; -- cgit v1.2.3 From 478e4c2f0067d57d7c17059caafab026ca32084a Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:16 +0200 Subject: net: mroute: Check if rule is a default rule When the ipmr starts, it adds one default FIB rule that matches all packets and sends them to the DEFAULT (multicast) FIB table. A more complex rule can be added by the user to specify that for a specific interface, a packet should be looked up either in an arbitrary table or according to the l3mdev of the interface. For drivers willing to offload the ipmr logic into hardware but not wanting to offload all the FIB rules functionality, provide a function that can indicate whether the FIB rule is the default multicast rule, thus only one routing table is needed. This way, a driver can register to the FIB notification chain, get notifications about FIB rules added and trigger some kind of an internal abort mechanism when a non-default rule is added by the user. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- include/linux/mroute.h | 7 +++++++ net/ipv4/ipmr.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 5566580811ce..b072a84fbe1c 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); +bool ipmr_rule_default(const struct fib_rule *rule); #else static inline int ip_mroute_setsockopt(struct sock *sock, int optname, char __user *optval, unsigned int optlen) @@ -46,6 +48,11 @@ static inline int ip_mroute_opt(int opt) { return 0; } + +static inline bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} #endif struct vif_device { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2a795d2c0502..292a8e80bdfa 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -274,6 +274,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); } + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -318,6 +324,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net) { return 0; } + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, -- cgit v1.2.3 From 511d57dc71a22514e106f79a878e788cb22f73e3 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 30 Aug 2017 08:04:24 -0500 Subject: ipmi: Get the device id through a function This makes getting the device id consistent, and makes it possible to add a function to fetch it dynamically later. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 155 +++++++++++++++++++++++++++--------- drivers/char/ipmi/ipmi_watchdog.c | 11 ++- include/linux/ipmi.h | 6 +- 3 files changed, 130 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index d92767225b15..89b72425f9ce 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -272,6 +272,9 @@ struct bmc_device { }; #define to_bmc_device(x) container_of((x), struct bmc_device, pdev.dev) +static int bmc_get_device_id(ipmi_smi_t intf, struct bmc_device *bmc, + struct ipmi_device_id *id); + /* * Various statistics for IPMI, these index stats[] in the ipmi_smi * structure. */ @@ -396,10 +399,6 @@ struct ipmi_smi { */ struct list_head users; - /* Information to supply to users. */ - unsigned char ipmi_version_major; - unsigned char ipmi_version_minor; - /* Used for wake ups at startup.
*/ wait_queue_head_t waitq; @@ -1194,12 +1193,21 @@ int ipmi_destroy_user(ipmi_user_t user) } EXPORT_SYMBOL(ipmi_destroy_user); -void ipmi_get_version(ipmi_user_t user, - unsigned char *major, - unsigned char *minor) +int ipmi_get_version(ipmi_user_t user, + unsigned char *major, + unsigned char *minor) { - *major = user->intf->ipmi_version_major; - *minor = user->intf->ipmi_version_minor; + struct ipmi_device_id id; + int rv; + + rv = bmc_get_device_id(user->intf, NULL, &id); + if (rv) + return rv; + + *major = ipmi_version_major(&id); + *minor = ipmi_version_minor(&id); + + return 0; } EXPORT_SYMBOL(ipmi_get_version); @@ -2072,6 +2080,16 @@ int ipmi_request_supply_msgs(ipmi_user_t user, } EXPORT_SYMBOL(ipmi_request_supply_msgs); +static int bmc_get_device_id(ipmi_smi_t intf, struct bmc_device *bmc, + struct ipmi_device_id *id) +{ + if (!bmc) + bmc = intf->bmc; + + *id = bmc->id; + return 0; +} + #ifdef CONFIG_PROC_FS static int smi_ipmb_proc_show(struct seq_file *m, void *v) { @@ -2101,10 +2119,16 @@ static const struct file_operations smi_ipmb_proc_ops = { static int smi_version_proc_show(struct seq_file *m, void *v) { ipmi_smi_t intf = m->private; + struct ipmi_device_id id; + int rv; + + rv = bmc_get_device_id(intf, NULL, &id); + if (rv) + return rv; seq_printf(m, "%u.%u\n", - ipmi_version_major(&intf->bmc->id), - ipmi_version_minor(&intf->bmc->id)); + ipmi_version_major(&id), + ipmi_version_minor(&id)); return 0; } @@ -2287,8 +2311,14 @@ static ssize_t device_id_show(struct device *dev, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; + + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; - return snprintf(buf, 10, "%u\n", bmc->id.device_id); + return snprintf(buf, 10, "%u\n", id.device_id); } static DEVICE_ATTR(device_id, S_IRUGO, device_id_show, NULL); @@ -2297,9 +2327,14 @@ static ssize_t provides_device_sdrs_show(struct device *dev, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; - return snprintf(buf, 10, "%u\n", - (bmc->id.device_revision & 0x80) >> 7); + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; + + return snprintf(buf, 10, "%u\n", (id.device_revision & 0x80) >> 7); } static DEVICE_ATTR(provides_device_sdrs, S_IRUGO, provides_device_sdrs_show, NULL); @@ -2308,9 +2343,14 @@ static ssize_t revision_show(struct device *dev, struct device_attribute *attr, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; - return snprintf(buf, 20, "%u\n", - bmc->id.device_revision & 0x0F); + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; + + return snprintf(buf, 20, "%u\n", id.device_revision & 0x0F); } static DEVICE_ATTR(revision, S_IRUGO, revision_show, NULL); @@ -2319,9 +2359,15 @@ static ssize_t firmware_revision_show(struct device *dev, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; - return snprintf(buf, 20, "%u.%x\n", bmc->id.firmware_revision_1, - bmc->id.firmware_revision_2); + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; + + return snprintf(buf, 20, "%u.%x\n", id.firmware_revision_1, + id.firmware_revision_2); } static DEVICE_ATTR(firmware_revision, S_IRUGO, firmware_revision_show, NULL); @@ -2330,10 +2376,16 @@ static ssize_t ipmi_version_show(struct device *dev, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; + + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; return 
snprintf(buf, 20, "%u.%u\n", - ipmi_version_major(&bmc->id), - ipmi_version_minor(&bmc->id)); + ipmi_version_major(&id), + ipmi_version_minor(&id)); } static DEVICE_ATTR(ipmi_version, S_IRUGO, ipmi_version_show, NULL); @@ -2342,9 +2394,14 @@ static ssize_t add_dev_support_show(struct device *dev, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; - return snprintf(buf, 10, "0x%02x\n", - bmc->id.additional_device_support); + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; + + return snprintf(buf, 10, "0x%02x\n", id.additional_device_support); } static DEVICE_ATTR(additional_device_support, S_IRUGO, add_dev_support_show, NULL); @@ -2354,8 +2411,14 @@ static ssize_t manufacturer_id_show(struct device *dev, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; + + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; - return snprintf(buf, 20, "0x%6.6x\n", bmc->id.manufacturer_id); + return snprintf(buf, 20, "0x%6.6x\n", id.manufacturer_id); } static DEVICE_ATTR(manufacturer_id, S_IRUGO, manufacturer_id_show, NULL); @@ -2364,8 +2427,14 @@ static ssize_t product_id_show(struct device *dev, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; + + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; - return snprintf(buf, 10, "0x%4.4x\n", bmc->id.product_id); + return snprintf(buf, 10, "0x%4.4x\n", id.product_id); } static DEVICE_ATTR(product_id, S_IRUGO, product_id_show, NULL); @@ -2374,12 +2443,18 @@ static ssize_t aux_firmware_rev_show(struct device *dev, char *buf) { struct bmc_device *bmc = to_bmc_device(dev); + struct ipmi_device_id id; + int rv; + + rv = bmc_get_device_id(NULL, bmc, &id); + if (rv) + return rv; return snprintf(buf, 21, "0x%02x 0x%02x 0x%02x 0x%02x\n", - bmc->id.aux_firmware_revision[3], - bmc->id.aux_firmware_revision[2], - bmc->id.aux_firmware_revision[1], - bmc->id.aux_firmware_revision[0]); + id.aux_firmware_revision[3], + id.aux_firmware_revision[2], + id.aux_firmware_revision[1], + id.aux_firmware_revision[0]); } static DEVICE_ATTR(aux_firmware_revision, S_IRUGO, aux_firmware_rev_show, NULL); @@ -2417,9 +2492,13 @@ static umode_t bmc_dev_attr_is_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct bmc_device *bmc = to_bmc_device(dev); umode_t mode = attr->mode; + struct ipmi_device_id id; + int rv; - if (attr == &dev_attr_aux_firmware_revision.attr) - return bmc->id.aux_firmware_revision_set ? mode : 0; + if (attr == &dev_attr_aux_firmware_revision.attr) { + rv = bmc_get_device_id(NULL, bmc, &id); + return (!rv && id.aux_firmware_revision_set) ? mode : 0; + } if (attr == &dev_attr_guid.attr) return bmc->guid_set ? 
mode : 0; return mode; @@ -2884,6 +2963,7 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, ipmi_smi_t intf; ipmi_smi_t tintf; struct list_head *link; + struct ipmi_device_id id; /* * Make sure the driver is actually initialized, this handles @@ -2905,9 +2985,6 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, if (!intf) return -ENOMEM; - intf->ipmi_version_major = ipmi_version_major(device_id); - intf->ipmi_version_minor = ipmi_version_minor(device_id); - intf->bmc = kzalloc(sizeof(*intf->bmc), GFP_KERNEL); if (!intf->bmc) { kfree(intf); @@ -2982,13 +3059,19 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, get_guid(intf); + rv = bmc_get_device_id(intf, NULL, &id); + if (rv) { + dev_err(si_dev, "Unable to get the device id: %d\n", rv); + goto out; + } + rv = ipmi_bmc_register(intf, i); if (rv) goto out; - if ((intf->ipmi_version_major > 1) - || ((intf->ipmi_version_major == 1) - && (intf->ipmi_version_minor >= 5))) { + if (ipmi_version_major(&id) > 1 + || (ipmi_version_major(&id) == 1 + && ipmi_version_minor(&id) >= 5)) { /* * Start scanning the channels to see what is * available. diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 3d832d0362a4..76b270678b50 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -1009,9 +1009,14 @@ static void ipmi_register_watchdog(int ipmi_intf) goto out; } - ipmi_get_version(watchdog_user, - &ipmi_version_major, - &ipmi_version_minor); + rv = ipmi_get_version(watchdog_user, + &ipmi_version_major, + &ipmi_version_minor); + if (rv) { + pr_warn(PFX "Unable to get IPMI version, assuming 1.0\n"); + ipmi_version_major = 1; + ipmi_version_minor = 0; + } rv = misc_register(&ipmi_wdog_miscdev); if (rv < 0) { diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index f1045b2c6a00..80fc3f798984 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -113,9 +113,9 @@ int ipmi_create_user(unsigned int if_num, int ipmi_destroy_user(ipmi_user_t user); /* Get the IPMI version of the BMC we are talking to. */ -void ipmi_get_version(ipmi_user_t user, - unsigned char *major, - unsigned char *minor); +int ipmi_get_version(ipmi_user_t user, + unsigned char *major, + unsigned char *minor); /* Set and get the slave address and LUN that we will use for our source messages. Note that this affects the interface, not just -- cgit v1.2.3 From c468f911b73beb39b20f7e5f97a35d41f038b31b Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 25 Aug 2017 15:47:23 +0800 Subject: ipmi: Make ipmi_demangle_device_id more generic Currently, ipmi_demangle_device_id requires a full response buffer in its data argument. This means we can't use it to parse a response in a struct ipmi_recv_msg, which has the netfn and cmd as separate bytes. This change alters the definition and users of ipmi_demangle_device_id to use a split netfn, cmd and data buffer, so it can be used with non-sequential responses. Signed-off-by: Jeremy Kerr Fixed the ipmi_ssif.c and ipmi_si_intf.c changes to use data from the response, not the data from the message, when passing info to the ipmi_demangle_device_id() function.
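As an illustration of the new calling convention, here is a minimal sketch (not part of the patch; the function name and the resp/resp_len variables are invented) of how a caller splits a raw system-interface response of the form netfn << 2, cmd, completion code, data:

	/*
	 * Hedged sketch only: "resp" and "resp_len" stand for a raw
	 * response buffer obtained elsewhere. The netfn/cmd pair is
	 * peeled off the front and the remaining bytes, starting at the
	 * completion code, become the data argument, mirroring the
	 * try_get_dev_id() change below.
	 */
	static int example_parse_device_id(const unsigned char *resp,
					   unsigned int resp_len,
					   struct ipmi_device_id *id)
	{
		if (resp_len < 2)
			return -EINVAL;

		return ipmi_demangle_device_id(resp[0] >> 2, resp[1],
					       resp + 2, resp_len - 2, id);
	}
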
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 3 ++- drivers/char/ipmi/ipmi_ssif.c | 3 ++- include/linux/ipmi_smi.h | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index f8e28bad6d56..bc99369fca49 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2926,7 +2926,8 @@ static int try_get_dev_id(struct smi_info *smi_info) resp, IPMI_MAX_MSG_LENGTH); /* Check and record info from the get device id, in case we need it. */ - rv = ipmi_demangle_device_id(resp, resp_len, &smi_info->device_id); + rv = ipmi_demangle_device_id(resp[0] >> 2, resp[1], + resp + 2, resp_len - 2, &smi_info->device_id); out: kfree(resp); diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 0aea3bcb6158..20ab098cd661 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1491,7 +1491,8 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id) if (rv) goto out; - rv = ipmi_demangle_device_id(resp, len, &ssif_info->device_id); + rv = ipmi_demangle_device_id(resp[0] >> 2, resp[1], + resp + 2, len - 2, &ssif_info->device_id); if (rv) goto out; diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index f8cea14485dd..75542c837c07 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -162,27 +162,27 @@ struct ipmi_device_id { #define ipmi_version_major(v) ((v)->ipmi_version & 0xf) #define ipmi_version_minor(v) ((v)->ipmi_version >> 4) -/* Take a pointer to a raw data buffer and a length and extract device - id information from it. The first byte of data must point to the - netfn << 2, the data should be of the format: - netfn << 2, cmd, completion code, data - as normally comes from a device interface. */ -static inline int ipmi_demangle_device_id(const unsigned char *data, +/* Take a pointer to an IPMI response and extract device id information from + * it. @netfn is in the IPMI_NETFN_ format, so may need to be shifted from + * a SI response. + */ +static inline int ipmi_demangle_device_id(uint8_t netfn, uint8_t cmd, + const unsigned char *data, unsigned int data_len, struct ipmi_device_id *id) { - if (data_len < 9) + if (data_len < 7) return -EINVAL; - if (data[0] != IPMI_NETFN_APP_RESPONSE << 2 || - data[1] != IPMI_GET_DEVICE_ID_CMD) + if (netfn != IPMI_NETFN_APP_RESPONSE || cmd != IPMI_GET_DEVICE_ID_CMD) /* Strange, didn't get the response we expected. */ return -EINVAL; - if (data[2] != 0) + if (data[0] != 0) /* That's odd, it shouldn't be able to fail. */ return -EINVAL; - data += 3; - data_len -= 3; + data++; + data_len--; + id->device_id = data[0]; id->device_revision = data[1]; id->firmware_revision_1 = data[2]; -- cgit v1.2.3 From 1e5058ea21010883b1e1d288637f7390bb8d1c61 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 31 Aug 2017 16:45:40 -0500 Subject: ipmi: Remove the device id from ipmi_register_smi() It's no longer used, dynamic device id handling is in place now. 
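For clarity, a hedged before-and-after sketch of the registration call; my_handlers, my_info, my_device_id and my_dev are placeholder names, not identifiers from the patch:

	/* Before: the caller had to supply a pre-fetched device id. */
	rv = ipmi_register_smi(&my_handlers, my_info, &my_device_id,
			       my_dev, slave_addr);

	/*
	 * After: the device id argument is gone; the core fetches the id
	 * dynamically when it needs it.
	 */
	rv = ipmi_register_smi(&my_handlers, my_info, my_dev, slave_addr);
	if (rv)
		dev_warn(my_dev, "IPMI SMI registration failed (%d)\n", rv);
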
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 1 - drivers/char/ipmi/ipmi_powernv.c | 4 +--- drivers/char/ipmi/ipmi_si_intf.c | 1 - drivers/char/ipmi/ipmi_ssif.c | 19 ------------------- include/linux/ipmi_smi.h | 1 - 5 files changed, 1 insertion(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 9157a9e17c36..ee108be13348 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -3112,7 +3112,6 @@ EXPORT_SYMBOL(ipmi_poll_interface); int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, void *send_info, - struct ipmi_device_id *device_id, struct device *si_dev, unsigned char slave_addr) { diff --git a/drivers/char/ipmi/ipmi_powernv.c b/drivers/char/ipmi/ipmi_powernv.c index b338a4becbf8..07fddbefefe4 100644 --- a/drivers/char/ipmi/ipmi_powernv.c +++ b/drivers/char/ipmi/ipmi_powernv.c @@ -23,7 +23,6 @@ struct ipmi_smi_powernv { u64 interface_id; - struct ipmi_device_id ipmi_id; ipmi_smi_t intf; unsigned int irq; @@ -266,8 +265,7 @@ static int ipmi_powernv_probe(struct platform_device *pdev) } /* todo: query actual ipmi_device_id */ - rc = ipmi_register_smi(&ipmi_powernv_smi_handlers, ipmi, - &ipmi->ipmi_id, dev, 0); + rc = ipmi_register_smi(&ipmi_powernv_smi_handlers, ipmi, dev, 0); if (rc) { dev_warn(dev, "IPMI SMI registration failed (%d)\n", rc); goto err_free_msg; diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index bc99369fca49..4caa793a6765 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -3631,7 +3631,6 @@ static int try_smi_init(struct smi_info *new_smi) rv = ipmi_register_smi(&handlers, new_smi, - &new_smi->device_id, new_smi->dev, new_smi->slave_addr); if (rv) { diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 20ab098cd661..dd716d06ce73 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -267,9 +267,6 @@ struct ssif_info { unsigned char *i2c_data; unsigned int i2c_size; - /* From the device id response. */ - struct ipmi_device_id device_id; - struct timer_list retry_timer; int retries_left; @@ -1481,21 +1478,6 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id) ipmi_addr_src_to_str(ssif_info->addr_source), client->addr, client->adapter->name, slave_addr); - /* - * Do a Get Device ID command, since it comes back with some - * useful info. - */ - msg[0] = IPMI_NETFN_APP_REQUEST << 2; - msg[1] = IPMI_GET_DEVICE_ID_CMD; - rv = do_cmd(client, 2, msg, &len, resp); - if (rv) - goto out; - - rv = ipmi_demangle_device_id(resp[0] >> 2, resp[1], - resp + 2, len - 2, &ssif_info->device_id); - if (rv) - goto out; - ssif_info->client = client; i2c_set_clientdata(client, ssif_info); @@ -1685,7 +1667,6 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id) rv = ipmi_register_smi(&ssif_info->handlers, ssif_info, - &ssif_info->device_id, &ssif_info->client->dev, slave_addr); if (rv) { diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 75542c837c07..97771e36b7f0 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -214,7 +214,6 @@ static inline int ipmi_demangle_device_id(uint8_t netfn, uint8_t cmd, call. 
*/ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, void *send_info, - struct ipmi_device_id *device_id, struct device *dev, unsigned char slave_addr); -- cgit v1.2.3 From da61e7308df868c20a41861c0a11f6eefd822993 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 18 Sep 2017 20:10:36 +0200 Subject: sbus: char: Move D7S_MINOR to include/linux/miscdevice.h This patch moves the define for D7S_MINOR to include/linux/miscdevice.h. It's better that all minor numbers are in the same place. Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- drivers/sbus/char/display7seg.c | 1 - include/linux/miscdevice.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c index f32765d3cbd8..5c8ed7350a04 100644 --- a/drivers/sbus/char/display7seg.c +++ b/drivers/sbus/char/display7seg.c @@ -22,7 +22,6 @@ #include -#define D7S_MINOR 193 #define DRIVER_NAME "d7s" #define PFX DRIVER_NAME ": " diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 58751eae5f77..05c3892b3e92 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -35,6 +35,7 @@ #define HWRNG_MINOR 183 #define MICROCODE_MINOR 184 #define IRNET_MINOR 187 +#define D7S_MINOR 193 #define VFIO_MINOR 196 #define TUN_MINOR 200 #define CUSE_MINOR 203 -- cgit v1.2.3 From 55f91cb6f1dfc873359674f35a8ffb1e78429d22 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Sat, 16 Sep 2017 15:51:25 -0500 Subject: ipmi: Make the IPMI proc interface configurable So we can remove it later. Signed-off-by: Corey Minyard --- drivers/char/ipmi/Kconfig | 8 +++++++ drivers/char/ipmi/ipmi_msghandler.c | 43 +++++++++++++++++++------------------ drivers/char/ipmi/ipmi_si_intf.c | 4 ++++ drivers/char/ipmi/ipmi_ssif.c | 6 ++++++ include/linux/ipmi_smi.h | 2 ++ 5 files changed, 42 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig index 152ccefdaecb..3544abc0f9f9 100644 --- a/drivers/char/ipmi/Kconfig +++ b/drivers/char/ipmi/Kconfig @@ -22,6 +22,14 @@ config IPMI_DMI_DECODE if IPMI_HANDLER +config IPMI_PROC_INTERFACE + bool 'Provide an interface for IPMI stats in /proc (deprecated)' + depends on PROC_FS + default y + help + Do not use this any more, use sysfs for this info. It will be + removed in future kernel versions. + config IPMI_PANIC_EVENT bool 'Generate a panic event to all BMCs on a panic' help diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 1c8bef2e1dc1..fd3ac6b50412 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -132,9 +132,9 @@ module_param_cb(panic_op, &panic_op_ops, NULL, 0600); MODULE_PARM_DESC(panic_op, "Sets if the IPMI driver will attempt to store panic information in the event log in the event of a panic. Set to 'none' for no, 'event' for a single event, or 'string' for a generic event and the panic string in IPMI OEM events."); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IPMI_PROC_INTERFACE static struct proc_dir_entry *proc_ipmi_root; -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_IPMI_PROC_INTERFACE */ /* Remain in auto-maintenance mode for this amount of time (in ms).
*/ #define IPMI_MAINTENANCE_MODE_TIMEOUT 30000 @@ -267,7 +267,7 @@ struct ipmi_my_addrinfo { unsigned char lun; }; -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IPMI_PROC_INTERFACE struct ipmi_proc_entry { char *name; struct ipmi_proc_entry *next; @@ -449,10 +449,13 @@ struct ipmi_smi { const struct ipmi_smi_handlers *handlers; void *send_info; -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IPMI_PROC_INTERFACE /* A list of proc entries for this interface. */ struct mutex proc_entry_lock; struct ipmi_proc_entry *proc_entries; + + struct proc_dir_entry *proc_dir; + char proc_dir_name[10]; #endif /* Driver-model device for the system interface. */ @@ -542,10 +545,6 @@ struct ipmi_smi { struct ipmi_my_addrinfo addrinfo[IPMI_MAX_CHANNELS]; bool channels_ready; - /* Proc FS stuff. */ - struct proc_dir_entry *proc_dir; - char proc_dir_name[10]; - atomic_t stats[IPMI_NUM_STATS]; /* @@ -2363,7 +2362,7 @@ static int bmc_get_device_id(ipmi_smi_t intf, struct bmc_device *bmc, return __bmc_get_device_id(intf, bmc, id, guid_set, guid, -1); } -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IPMI_PROC_INTERFACE static int smi_ipmb_proc_show(struct seq_file *m, void *v) { ipmi_smi_t intf = m->private; @@ -2492,14 +2491,12 @@ static const struct file_operations smi_stats_proc_ops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_PROC_FS */ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, const struct file_operations *proc_ops, void *data) { int rv = 0; -#ifdef CONFIG_PROC_FS struct proc_dir_entry *file; struct ipmi_proc_entry *entry; @@ -2525,7 +2522,6 @@ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, smi->proc_entries = entry; mutex_unlock(&smi->proc_entry_lock); } -#endif /* CONFIG_PROC_FS */ return rv; } @@ -2535,7 +2531,6 @@ static int add_proc_entries(ipmi_smi_t smi, int num) { int rv = 0; -#ifdef CONFIG_PROC_FS sprintf(smi->proc_dir_name, "%d", num); smi->proc_dir = proc_mkdir(smi->proc_dir_name, proc_ipmi_root); if (!smi->proc_dir) @@ -2555,14 +2550,12 @@ static int add_proc_entries(ipmi_smi_t smi, int num) rv = ipmi_smi_add_proc_entry(smi, "version", &smi_version_proc_ops, smi); -#endif /* CONFIG_PROC_FS */ return rv; } static void remove_proc_entries(ipmi_smi_t smi) { -#ifdef CONFIG_PROC_FS struct ipmi_proc_entry *entry; mutex_lock(&smi->proc_entry_lock); @@ -2576,8 +2569,8 @@ static void remove_proc_entries(ipmi_smi_t smi) } mutex_unlock(&smi->proc_entry_lock); remove_proc_entry(smi->proc_dir_name, proc_ipmi_root); -#endif /* CONFIG_PROC_FS */ } +#endif /* CONFIG_IPMI_PROC_INTERFACE */ static ssize_t device_id_show(struct device *dev, struct device_attribute *attr, @@ -3419,7 +3412,7 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, intf->seq_table[j].seqid = 0; } intf->curr_seq = 0; -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IPMI_PROC_INTERFACE mutex_init(&intf->proc_entry_lock); #endif spin_lock_init(&intf->waiting_rcv_msgs_lock); @@ -3443,7 +3436,9 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, for (i = 0; i < IPMI_NUM_STATS; i++) atomic_set(&intf->stats[i], 0); +#ifdef CONFIG_IPMI_PROC_INTERFACE intf->proc_dir = NULL; +#endif mutex_lock(&smi_watchers_mutex); mutex_lock(&ipmi_interfaces_mutex); @@ -3479,13 +3474,17 @@ int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, if (rv) goto out; +#ifdef CONFIG_IPMI_PROC_INTERFACE rv = add_proc_entries(intf, i); +#endif out: if (rv) { ipmi_bmc_unregister(intf); +#ifdef CONFIG_IPMI_PROC_INTERFACE if (intf->proc_dir) remove_proc_entries(intf); +#endif intf->handlers = NULL; list_del_rcu(&intf->link); 
mutex_unlock(&ipmi_interfaces_mutex); @@ -3590,7 +3589,9 @@ int ipmi_unregister_smi(ipmi_smi_t intf) intf->handlers = NULL; mutex_unlock(&ipmi_interfaces_mutex); +#ifdef CONFIG_IPMI_PROC_INTERFACE remove_proc_entries(intf); +#endif ipmi_bmc_unregister(intf); /* @@ -5170,7 +5171,7 @@ static int ipmi_init_msghandler(void) printk(KERN_INFO "ipmi message handler version " IPMI_DRIVER_VERSION "\n"); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IPMI_PROC_INTERFACE proc_ipmi_root = proc_mkdir("ipmi", NULL); if (!proc_ipmi_root) { printk(KERN_ERR PFX "Unable to create IPMI proc dir"); @@ -5178,7 +5179,7 @@ static int ipmi_init_msghandler(void) return -ENOMEM; } -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_IPMI_PROC_INTERFACE */ setup_timer(&ipmi_timer, ipmi_timeout, 0); mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES); @@ -5218,9 +5219,9 @@ static void __exit cleanup_ipmi(void) atomic_inc(&stop_operation); del_timer_sync(&ipmi_timer); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IPMI_PROC_INTERFACE proc_remove(proc_ipmi_root); -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_IPMI_PROC_INTERFACE */ driver_unregister(&ipmidriver.driver); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 6ce5b7c8cf93..efc8ee9b5071 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1605,6 +1605,7 @@ out: return rv; } +#ifdef CONFIG_IPMI_PROC_INTERFACE static int smi_type_proc_show(struct seq_file *m, void *v) { struct smi_info *smi = m->private; @@ -1698,6 +1699,7 @@ static const struct file_operations smi_params_proc_ops = { .llseek = seq_lseek, .release = single_release, }; +#endif #define IPMI_SI_ATTR(name) \ static ssize_t ipmi_##name##_show(struct device *dev, \ @@ -2191,6 +2193,7 @@ static int try_smi_init(struct smi_info *new_smi) goto out_err_remove_attrs; } +#ifdef CONFIG_IPMI_PROC_INTERFACE rv = ipmi_smi_add_proc_entry(new_smi->intf, "type", &smi_type_proc_ops, new_smi); @@ -2217,6 +2220,7 @@ static int try_smi_init(struct smi_info *new_smi) "Unable to create proc entry: %d\n", rv); goto out_err_stop_timer; } +#endif /* Don't increment till we know we have succeeded. 
*/ smi_num++; diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 7b2d4000b11e..aadec879d052 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1344,6 +1344,7 @@ static int ssif_detect(struct i2c_client *client, struct i2c_board_info *info) return rv; } +#ifdef CONFIG_IPMI_PROC_INTERFACE static int smi_type_proc_show(struct seq_file *m, void *v) { seq_puts(m, "ssif\n"); @@ -1407,6 +1408,7 @@ static const struct file_operations smi_stats_proc_ops = { .llseek = seq_lseek, .release = single_release, }; +#endif static int strcmp_nospace(char *s1, char *s2) { @@ -1742,6 +1744,7 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id) goto out_remove_attr; } +#ifdef CONFIG_IPMI_PROC_INTERFACE rv = ipmi_smi_add_proc_entry(ssif_info->intf, "type", &smi_type_proc_ops, ssif_info); @@ -1757,6 +1760,7 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id) pr_err(PFX "Unable to create proc entry: %d\n", rv); goto out_err_unreg; } +#endif out: if (rv) { @@ -1775,8 +1779,10 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id) kfree(resp); return rv; +#ifdef CONFIG_IPMI_PROC_INTERFACE out_err_unreg: ipmi_unregister_smi(ssif_info->intf); +#endif out_remove_attr: device_remove_group(&ssif_info->client->dev, &ipmi_ssif_dev_attr_group); diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 97771e36b7f0..5be51281e14d 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -241,11 +241,13 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) msg->done(msg); } +#ifdef CONFIG_IPMI_PROC_INTERFACE /* Allow the lower layer to add things to the proc filesystem directory for this interface. Note that the entry will automatically be dstroyed when the interface is destroyed. */ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, const struct file_operations *proc_ops, void *data); +#endif #endif /* __LINUX_IPMI_SMI_H */ -- cgit v1.2.3 From 95e300c052fd9dbb05f289a912c138ed03320ec5 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 18 Sep 2017 12:38:17 -0500 Subject: ipmi: Make the DMI probe into a generic platform probe Rework the DMI probe function to be a generic platform probe, and then rework the DMI code (and a few other things) to use the more generic information. This is so other things can declare platform IPMI devices. 
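To make the point about other subsystems declaring platform IPMI devices concrete, here is a hedged sketch of such a declaration, written against the properties platform_ipmi_probe() reads in the diff below; the "ipmi_si" device name, the KCS port 0xca2 and the 0x20 slave address are assumed example values, not taken from the patch:

	/*
	 * Sketch only. "ipmi-type" and "slave-addr" are the properties
	 * the generic probe reads; "addr-source" is optional and defaults
	 * to SI_PLATFORM when absent, so it is omitted here.
	 */
	static const struct property_entry my_ipmi_props[] = {
		PROPERTY_ENTRY_U8("ipmi-type", SI_KCS),
		PROPERTY_ENTRY_U8("slave-addr", 0x20),
		{ }
	};

	static const struct resource my_ipmi_res = DEFINE_RES_IO(0xca2, 2);

	static int __init my_ipmi_declare(void)
	{
		struct platform_device_info pinfo = {
			.name		= "ipmi_si",
			.id		= PLATFORM_DEVID_AUTO,
			.res		= &my_ipmi_res,
			.num_res	= 1,
			.properties	= my_ipmi_props,
		};

		return PTR_ERR_OR_ZERO(platform_device_register_full(&pinfo));
	}
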
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_dmi.c | 70 ++++++++++++++++-------- drivers/char/ipmi/ipmi_dmi.h | 8 +-- drivers/char/ipmi/ipmi_msghandler.c | 2 +- drivers/char/ipmi/ipmi_si_intf.c | 2 +- drivers/char/ipmi/ipmi_si_platform.c | 101 ++++++++++++++++------------------- drivers/char/ipmi/ipmi_si_sm.h | 2 +- drivers/char/ipmi/ipmi_ssif.c | 12 ++--- include/linux/ipmi.h | 2 +- 8 files changed, 104 insertions(+), 95 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_dmi.c b/drivers/char/ipmi/ipmi_dmi.c index 2a84401dea05..d08d41903b0f 100644 --- a/drivers/char/ipmi/ipmi_dmi.c +++ b/drivers/char/ipmi/ipmi_dmi.c @@ -8,10 +8,16 @@ #include #include #include +#include "ipmi_si_sm.h" #include "ipmi_dmi.h" +#define IPMI_DMI_TYPE_KCS 0x01 +#define IPMI_DMI_TYPE_SMIC 0x02 +#define IPMI_DMI_TYPE_BT 0x03 +#define IPMI_DMI_TYPE_SSIF 0x04 + struct ipmi_dmi_info { - int type; + enum si_type si_type; u32 flags; unsigned long addr; u8 slave_addr; @@ -22,6 +28,15 @@ static struct ipmi_dmi_info *ipmi_dmi_infos; static int ipmi_dmi_nr __initdata; +#define set_prop_entry(_p_, _name_, type, val) \ +do { \ + struct property_entry *_p = &_p_; \ + _p->name = _name_; \ + _p->length = sizeof(type); \ + _p->is_string = false; \ + _p->value.type##_data = val; \ +} while(0) + static void __init dmi_add_platform_ipmi(unsigned long base_addr, u32 flags, u8 slave_addr, @@ -32,27 +47,14 @@ static void __init dmi_add_platform_ipmi(unsigned long base_addr, struct platform_device *pdev; struct resource r[4]; unsigned int num_r = 1, size; - struct property_entry p[4] = { - PROPERTY_ENTRY_U8("slave-addr", slave_addr), - PROPERTY_ENTRY_U8("ipmi-type", type), - PROPERTY_ENTRY_U16("i2c-addr", base_addr), - { } - }; + struct property_entry p[5]; + unsigned int pidx = 0; char *name, *override; int rv; + enum si_type si_type; struct ipmi_dmi_info *info; - info = kmalloc(sizeof(*info), GFP_KERNEL); - if (!info) { - pr_warn("ipmi:dmi: Could not allocate dmi info\n"); - } else { - info->type = type; - info->flags = flags; - info->addr = base_addr; - info->slave_addr = slave_addr; - info->next = ipmi_dmi_infos; - ipmi_dmi_infos = info; - } + memset(p, 0, sizeof(p)); name = "dmi-ipmi-si"; override = "ipmi_si"; @@ -62,19 +64,42 @@ static void __init dmi_add_platform_ipmi(unsigned long base_addr, override = "ipmi_ssif"; offset = 1; size = 1; + si_type = SI_TYPE_INVALID; break; case IPMI_DMI_TYPE_BT: size = 3; + si_type = SI_BT; break; case IPMI_DMI_TYPE_KCS: + size = 2; + si_type = SI_KCS; + break; case IPMI_DMI_TYPE_SMIC: size = 2; + si_type = SI_SMIC; break; default: pr_err("ipmi:dmi: Invalid IPMI type: %d", type); return; } + if (si_type != SI_TYPE_INVALID) + set_prop_entry(p[pidx++], "ipmi-type", u8, si_type); + set_prop_entry(p[pidx++], "slave-addr", u8, slave_addr); + set_prop_entry(p[pidx++], "addr-source", u8, SI_SMBIOS); + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + pr_warn("ipmi:dmi: Could not allocate dmi info\n"); + } else { + info->si_type = si_type; + info->flags = flags; + info->addr = base_addr; + info->slave_addr = slave_addr; + info->next = ipmi_dmi_infos; + ipmi_dmi_infos = info; + } + pdev = platform_device_alloc(name, ipmi_dmi_nr); if (!pdev) { pr_err("ipmi:dmi: Error allocation IPMI platform device"); @@ -82,8 +107,10 @@ static void __init dmi_add_platform_ipmi(unsigned long base_addr, } pdev->driver_override = override; - if (type == IPMI_DMI_TYPE_SSIF) + if (type == IPMI_DMI_TYPE_SSIF) { + set_prop_entry(p[pidx++], "i2c-addr", u16, base_addr); goto 
add_properties; + } memset(r, 0, sizeof(r)); @@ -151,12 +178,13 @@ err: * This function allows an ACPI-specified IPMI device to look up the * slave address from the DMI table. */ -int ipmi_dmi_get_slave_addr(int type, u32 flags, unsigned long base_addr) +int ipmi_dmi_get_slave_addr(enum si_type si_type, u32 flags, + unsigned long base_addr) { struct ipmi_dmi_info *info = ipmi_dmi_infos; while (info) { - if (info->type == type && + if (info->si_type == si_type && info->flags == flags && info->addr == base_addr) return info->slave_addr; diff --git a/drivers/char/ipmi/ipmi_dmi.h b/drivers/char/ipmi/ipmi_dmi.h index 0a1afe5ceb1e..062015b8f520 100644 --- a/drivers/char/ipmi/ipmi_dmi.h +++ b/drivers/char/ipmi/ipmi_dmi.h @@ -2,11 +2,7 @@ * DMI defines for use by IPMI */ -#define IPMI_DMI_TYPE_KCS 0x01 -#define IPMI_DMI_TYPE_SMIC 0x02 -#define IPMI_DMI_TYPE_BT 0x03 -#define IPMI_DMI_TYPE_SSIF 0x04 - #ifdef CONFIG_IPMI_DMI_DECODE -int ipmi_dmi_get_slave_addr(int type, u32 flags, unsigned long base_addr); +int ipmi_dmi_get_slave_addr(enum si_type si_type, u32 flags, + unsigned long base_addr); #endif diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index fd3ac6b50412..9d1eaf70f406 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -594,7 +594,7 @@ static DEFINE_MUTEX(smi_watchers_mutex); static const char * const addr_src_to_str[] = { "invalid", "hotmod", "hardcoded", "SPMI", "ACPI", "SMBIOS", "PCI", - "device-tree" + "device-tree", "platform" }; const char *ipmi_addr_src_to_str(enum ipmi_addr_src src) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index efc8ee9b5071..55e0c42bee4d 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -89,7 +89,7 @@ enum si_intf_state { #define IPMI_BT_INTMASK_CLEAR_IRQ_BIT 2 #define IPMI_BT_INTMASK_ENABLE_IRQ_BIT 1 -static const char * const si_to_str[] = { "kcs", "smic", "bt" }; +static const char * const si_to_str[] = { "invalid", "kcs", "smic", "bt" }; static int initialized; diff --git a/drivers/char/ipmi/ipmi_si_platform.c b/drivers/char/ipmi/ipmi_si_platform.c index cf5c3e5e72e2..9573f1116450 100644 --- a/drivers/char/ipmi/ipmi_si_platform.c +++ b/drivers/char/ipmi/ipmi_si_platform.c @@ -6,14 +6,13 @@ */ #include #include -#include "ipmi_dmi.h" -#include #include #include #include #include #include #include "ipmi_si.h" +#include "ipmi_dmi.h" #define PFX "ipmi_platform: " @@ -21,19 +20,29 @@ static bool si_tryplatform = true; #ifdef CONFIG_ACPI static bool si_tryacpi = true; #endif +#ifdef CONFIG_OF +static bool si_tryopenfirmware = true; +#endif #ifdef CONFIG_DMI static bool si_trydmi = true; +#else +static bool si_trydmi = false; #endif module_param_named(tryplatform, si_tryplatform, bool, 0); MODULE_PARM_DESC(tryplatform, "Setting this to zero will disable the" " default scan of the interfaces identified via platform" - " interfaces like openfirmware"); + " interfaces besides ACPI, OpenFirmware, and DMI"); #ifdef CONFIG_ACPI module_param_named(tryacpi, si_tryacpi, bool, 0); MODULE_PARM_DESC(tryacpi, "Setting this to zero will disable the" " default scan of the interfaces identified via ACPI"); #endif +#ifdef CONFIG_OF +module_param_named(tryopenfirmware, si_tryopenfirmware, bool, 0); +MODULE_PARM_DESC(tryacpi, "Setting this to zero will disable the" + " default scan of the interfaces identified via OpenFirmware"); +#endif #ifdef CONFIG_DMI module_param_named(trydmi, si_trydmi, bool, 0); MODULE_PARM_DESC(trydmi, 
"Setting this to zero will disable the" @@ -235,7 +244,6 @@ static void spmi_find_bmc(void) } #endif -#if defined(CONFIG_DMI) || defined(CONFIG_ACPI) static struct resource * ipmi_get_info_from_resources(struct platform_device *pdev, struct si_sm_io *io) @@ -271,48 +279,52 @@ ipmi_get_info_from_resources(struct platform_device *pdev, return res; } -#endif - -#ifdef CONFIG_DMI -static int dmi_ipmi_probe(struct platform_device *pdev) +static int platform_ipmi_probe(struct platform_device *pdev) { struct si_sm_io io; - u8 type, slave_addr; + u8 type, slave_addr, addr_source; int rv; - if (!si_trydmi) - return -ENODEV; + rv = device_property_read_u8(&pdev->dev, "addr-source", &addr_source); + if (rv) + addr_source = SI_PLATFORM; + if (addr_source >= SI_LAST) + return -EINVAL; + + if (addr_source == SI_SMBIOS) { + if (!si_trydmi) + return -ENODEV; + } else { + if (!si_tryplatform) + return -ENODEV; + } rv = device_property_read_u8(&pdev->dev, "ipmi-type", &type); if (rv) return -ENODEV; memset(&io, 0, sizeof(io)); - io.addr_source = SI_SMBIOS; - pr_info(PFX "probing via SMBIOS\n"); + io.addr_source = addr_source; + dev_info(&pdev->dev, PFX "probing via %s\n", + ipmi_addr_src_to_str(addr_source)); switch (type) { - case IPMI_DMI_TYPE_KCS: - io.si_type = SI_KCS; - break; - case IPMI_DMI_TYPE_SMIC: - io.si_type = SI_SMIC; - break; - case IPMI_DMI_TYPE_BT: - io.si_type = SI_BT; + case SI_KCS: + case SI_SMIC: + case SI_BT: + io.si_type = type; break; default: + dev_err(&pdev->dev, "ipmi-type property is invalid\n"); return -EINVAL; } - if (!ipmi_get_info_from_resources(pdev, &io)) { - rv = -EINVAL; - goto err_free; - } + if (!ipmi_get_info_from_resources(pdev, &io)) + return -EINVAL; rv = device_property_read_u8(&pdev->dev, "slave-addr", &slave_addr); if (rv) { - dev_warn(&pdev->dev, "device has no slave-addr property"); + dev_warn(&pdev->dev, "device has no slave-addr property\n"); io.slave_addr = 0x20; } else { io.slave_addr = slave_addr; @@ -333,16 +345,7 @@ static int dmi_ipmi_probe(struct platform_device *pdev) ipmi_si_add_smi(&io); return 0; - -err_free: - return rv; } -#else -static int dmi_ipmi_probe(struct platform_device *pdev) -{ - return -ENODEV; -} -#endif /* CONFIG_DMI */ #ifdef CONFIG_OF static const struct of_device_id of_ipmi_match[] = { @@ -366,6 +369,9 @@ static int of_ipmi_probe(struct platform_device *pdev) int ret; int proplen; + if (!si_tryopenfirmware) + return -ENODEV; + dev_info(&pdev->dev, "probing via device tree\n"); match = of_match_device(of_ipmi_match, &pdev->dev); @@ -436,25 +442,12 @@ static int find_slave_address(struct si_sm_io *io, int slave_addr) { #ifdef CONFIG_IPMI_DMI_DECODE if (!slave_addr) { - int type = -1; u32 flags = IORESOURCE_IO; - switch (io->si_type) { - case SI_KCS: - type = IPMI_DMI_TYPE_KCS; - break; - case SI_BT: - type = IPMI_DMI_TYPE_BT; - break; - case SI_SMIC: - type = IPMI_DMI_TYPE_SMIC; - break; - } - if (io->addr_type == IPMI_MEM_ADDR_SPACE) flags = IORESOURCE_MEM; - slave_addr = ipmi_dmi_get_slave_addr(type, flags, + slave_addr = ipmi_dmi_get_slave_addr(io->si_type, flags, io->addr_data); } #endif @@ -563,7 +556,7 @@ static int ipmi_probe(struct platform_device *pdev) if (acpi_ipmi_probe(pdev) == 0) return 0; - return dmi_ipmi_probe(pdev); + return platform_ipmi_probe(pdev); } static int ipmi_remove(struct platform_device *pdev) @@ -583,11 +576,9 @@ struct platform_driver ipmi_platform_driver = { void ipmi_si_platform_init(void) { - if (si_tryplatform) { - int rv = platform_driver_register(&ipmi_platform_driver); - if (rv) - pr_err(PFX 
"Unable to register driver: %d\n", rv); - } + int rv = platform_driver_register(&ipmi_platform_driver); + if (rv) + pr_err(PFX "Unable to register driver: %d\n", rv); #ifdef CONFIG_ACPI if (si_tryacpi) diff --git a/drivers/char/ipmi/ipmi_si_sm.h b/drivers/char/ipmi/ipmi_si_sm.h index fbf5bfccde2e..aa8d88ab4433 100644 --- a/drivers/char/ipmi/ipmi_si_sm.h +++ b/drivers/char/ipmi/ipmi_si_sm.h @@ -43,7 +43,7 @@ struct si_sm_data; enum si_type { - SI_KCS, SI_SMIC, SI_BT + SI_TYPE_INVALID, SI_KCS, SI_SMIC, SI_BT }; /* diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index aadec879d052..466b3a1c0adf 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -53,6 +53,7 @@ #include #include #include +#include "ipmi_si_sm.h" #include "ipmi_dmi.h" #define PFX "ipmi_ssif: " @@ -1482,7 +1483,7 @@ static int find_slave_address(struct i2c_client *client, int slave_addr) #ifdef CONFIG_IPMI_DMI_DECODE if (!slave_addr) slave_addr = ipmi_dmi_get_slave_addr( - IPMI_DMI_TYPE_SSIF, + SI_TYPE_INVALID, i2c_adapter_id(client->adapter), client->addr); #endif @@ -2013,20 +2014,13 @@ static void spmi_find_bmc(void) { } #ifdef CONFIG_DMI static int dmi_ipmi_probe(struct platform_device *pdev) { - u8 type, slave_addr = 0; + u8 slave_addr = 0; u16 i2c_addr; int rv; if (!ssif_trydmi) return -ENODEV; - rv = device_property_read_u8(&pdev->dev, "ipmi-type", &type); - if (rv) - return -ENODEV; - - if (type != IPMI_DMI_TYPE_SSIF) - return -ENODEV; - rv = device_property_read_u16(&pdev->dev, "i2c-addr", &i2c_addr); if (rv) { dev_warn(&pdev->dev, PFX "No i2c-addr property\n"); diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 80fc3f798984..f4ffacf4fe9d 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -277,7 +277,7 @@ int ipmi_validate_addr(struct ipmi_addr *addr, int len); */ enum ipmi_addr_src { SI_INVALID = 0, SI_HOTMOD, SI_HARDCODED, SI_SPMI, SI_ACPI, SI_SMBIOS, - SI_PCI, SI_DEVICETREE, SI_LAST + SI_PCI, SI_DEVICETREE, SI_PLATFORM, SI_LAST }; const char *ipmi_addr_src_to_str(enum ipmi_addr_src src); -- cgit v1.2.3 From 6ade97da601f8af793f6c7a861af754d0f0b6767 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 26 Sep 2017 23:12:28 +0300 Subject: arp: make arp_hdr_len() return unsigned int Negative ARP header length are not a thing. Constify arguments while I'm at it. Space savings: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3) function old new delta arpt_do_table 1163 1160 -3 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 3 ++- include/linux/if_arp.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c99dc59d729b..d2e94b8559f0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2491,7 +2491,8 @@ int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *curr_active_slave, *curr_arp_slave; unsigned char *arp_ptr; __be32 sip, tip; - int alen, is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); + int is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); + unsigned int alen; if (!slave_do_arp_validate(bond, slave)) { if ((slave_do_arp_validate_only(bond) && is_arp) || diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 3355efc89781..6756fea18b69 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -31,7 +31,7 @@ static inline struct arphdr *arp_hdr(const struct sk_buff *skb) return (struct arphdr *)skb_network_header(skb); } -static inline int arp_hdr_len(struct net_device *dev) +static inline unsigned int arp_hdr_len(const struct net_device *dev) { switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) -- cgit v1.2.3 From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:52 -0700 Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info The patch adds name and load_time to struct bpf_prog_aux. They are also exported to bpf_prog_info. The bpf_prog's name is passed by userspace during BPF_PROG_LOAD. The kernel only stores the first (BPF_OBJ_NAME_LEN - 1) bytes, and the name stored in the kernel is always \0 terminated. The kernel will reject a name that contains characters other than isalnum() and '_'. It will also reject a name that is not null terminated. The existing 'user->uid' of the bpf_prog_aux is also exported to the bpf_prog_info as created_by_uid. The existing 'used_maps' of the bpf_prog_aux is exported to the newly added members 'nr_map_ids' and 'map_ids' of the bpf_prog_info. On input, nr_map_ids tells how big the userspace's map_ids buffer is. On output, nr_map_ids tells the exact used_map_cnt, and the kernel copies only as many map ids as the userspace buffer allows. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/syscall.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b672c50f160..33ccc474fb04 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -187,6 +187,8 @@ struct bpf_prog_aux { struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; + u64 load_time; /* ns since boottime */ + u8 name[BPF_OBJ_NAME_LEN]; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e43491ac4823..bd6348269bf5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -175,6 +175,8 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -210,6 +212,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; + __u8 prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -812,6 +815,11 @@ struct bpf_prog_info { __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 25d074920a00..45970df3f820 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL +/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. + * Return 0 on success and < 0 on error. 
+ */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ + const char *end = src + BPF_OBJ_NAME_LEN; + + /* Copy all isalnum() and '_' char */ + while (src < end && *src) { + if (!isalnum(*src) && *src != '_') + return -EINVAL; + *dst++ = *src++; + } + + /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + if (src == end) + return -EINVAL; + + /* '\0' terminates dst */ + *dst = 0; + + return 0; +} + #define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_flags +#define BPF_PROG_LOAD_LAST_FIELD prog_name static int bpf_prog_load(union bpf_attr *attr) { @@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_prog; + prog->aux->load_time = ktime_get_boot_ns(); + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); + if (err) + goto free_prog; + /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) @@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; + info.load_time = prog->aux->load_time; + info.created_by_uid = from_kuid_munged(current_user_ns(), + prog->aux->user->uid); memcpy(info.tag, prog->tag, sizeof(prog->tag)); + memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + + ulen = info.nr_map_ids; + info.nr_map_ids = prog->aux->used_map_cnt; + ulen = min_t(u32, info.nr_map_ids, ulen); + if (ulen) { + u32 *user_map_ids = (u32 *)info.map_ids; + u32 i; + + for (i = 0; i < ulen; i++) + if (put_user(prog->aux->used_maps[i]->id, + &user_map_ids[i])) + return -EFAULT; + } if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; -- cgit v1.2.3 From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:53 -0700 Subject: bpf: Add map_name to bpf_map_info This patch allows userspace to specify a name for a map during BPF_MAP_CREATE. The map's name can later be exported to user space via BPF_OBJ_GET_INFO_BY_FD. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 7 ++++++- 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33ccc474fb04..252f4bc9eb25 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,6 +56,7 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; + u8 name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd6348269bf5..6d2137b4cf38 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -190,6 +190,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). 
*/ + __u8 map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -829,6 +830,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 45970df3f820..11a7f82a55d1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD numa_node +#define BPF_MAP_CREATE_LAST_FIELD map_name /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = bpf_obj_name_cpy(map->name, attr->map_name); + if (err) + goto free_map_nouncharge; + atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + memcpy(info.name, map->name, sizeof(map->name)); if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 1ea6c46a23f1213d1972bfae220db5c165e27bba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 15:59:54 +0200 Subject: sched/fair: Propagate an effective runnable_load_avg The load balancer uses runnable_load_avg as a load indicator. For !cgroup this is: runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq That is, a direct sum over all runnable tasks on that runqueue. This is as opposed to load_avg, which is a sum over all tasks on the runqueue and includes a blocked component. However, in the cgroup case, this comes apart since the group entities are always runnable, even if most of their constituent entities are blocked. Therefore introduce a runnable_weight which for task entities is the same as the regular weight, but for group entities is a fraction of the entity weight and represents the runnable part of the group runqueue. Then propagate this load through the PELT hierarchy to arrive at an effective runnable load average -- which we should not confuse with the canonical runnable load average.
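A small worked example of that fraction may help; the numbers are invented, and the snippet merely restates the update_cfs_group() arithmetic from the diff below:

	/*
	 * Sketch with made-up values: a group runqueue with load_avg = 1024
	 * but runnable_load_avg = 256 has three quarters of its load
	 * blocked. A group entity whose shares resolve to 2048 then gets
	 * runnable = 2048 * 256 / max(1024, 256) = 512, so the load
	 * balancer sees only the runnable quarter of the group's weight.
	 */
	long shares = 2048, grq_load_avg = 1024, grq_runnable_avg = 256;
	long runnable = shares * grq_runnable_avg;

	if (runnable)
		runnable /= max(grq_load_avg, grq_runnable_avg);
	/* runnable == 512 */
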
Suggested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 + kernel/sched/debug.c | 8 ++- kernel/sched/fair.c | 172 +++++++++++++++++++++++++++++++++----------------- kernel/sched/sched.h | 3 +- 4 files changed, 124 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a7df4e558c..bdd6ad6fcce1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -331,9 +331,11 @@ struct load_weight { struct sched_avg { u64 last_update_time; u64 load_sum; + u64 runnable_load_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; + unsigned long runnable_load_avg; unsigned long util_avg; }; @@ -376,6 +378,7 @@ struct sched_statistics { struct sched_entity { /* For load-balancing: */ struct load_weight load; + unsigned long runnable_weight; struct rb_node run_node; struct list_head group_node; unsigned int on_rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2e039a81864c..1ca0130ed4f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P_SCHEDSTAT(se->statistics.wait_count); } P(se->load.weight); + P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); + P(se->avg.runnable_load_avg); #endif #undef PN_SCHEDSTAT @@ -558,10 +560,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); + cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", @@ -1006,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); + P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); + P(se.avg.runnable_load_sum); P(se.avg.util_sum); P(se.avg.load_avg); + P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 086a5d979720..10d2000fca2d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -730,8 +730,9 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = LOAD_AVG_MAX; + sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); + sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX; + /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -2731,25 +2732,35 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SMP /* - * XXX we want to get rid of this helper and use the full load resolution. + * XXX we want to get rid of these helpers and use the full load resolution. 
*/ static inline long se_weight(struct sched_entity *se) { return scale_load_down(se->load.weight); } +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->runnable_load_avg += se->avg.load_avg; - cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; + cfs_rq->runnable_weight += se->runnable_weight; + + cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; + cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; } static inline void dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); + cfs_rq->runnable_weight -= se->runnable_weight; + + sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); + sub_positive(&cfs_rq->avg.runnable_load_sum, + se_runnable(se) * se->avg.runnable_load_sum); } static inline void @@ -2777,7 +2788,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) + unsigned long weight, unsigned long runnable) { if (se->on_rq) { /* commit outstanding execution time */ @@ -2788,11 +2799,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } dequeue_load_avg(cfs_rq, se); + se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, - LOAD_AVG_MAX - 1024 + se->avg.period_contrib); + do { + u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + se->avg.runnable_load_avg = + div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); + } while (0); #endif enqueue_load_avg(cfs_rq, se); @@ -2809,7 +2826,7 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); + reweight_entity(cfs_rq, se, weight, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -2917,31 +2934,45 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct sched_entity *se) +/* + * Recomputes the group entity based on the current state of its group + * runqueue. + */ +static void update_cfs_group(struct sched_entity *se) { - struct cfs_rq *cfs_rq = group_cfs_rq(se); - long shares; + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long shares, runnable; - if (!cfs_rq) + if (!gcfs_rq) return; - if (throttled_hierarchy(cfs_rq)) + if (throttled_hierarchy(gcfs_rq)) return; #ifndef CONFIG_SMP - shares = READ_ONCE(cfs_rq->tg->shares); + runnable = shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else - shares = calc_cfs_shares(cfs_rq); + shares = calc_cfs_shares(gcfs_rq); + /* + * The hierarchical runnable load metric is the proportional part + * of this group's runnable_load_avg / load_avg. + * + * Note: we need to deal with very sporadic 'runnable > load' cases + * due to numerical instability. 
+ */ + runnable = shares * gcfs_rq->avg.runnable_load_avg; + if (runnable) + runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg); #endif - reweight_entity(cfs_rq_of(se), se, shares); + reweight_entity(cfs_rq_of(se), se, shares, runnable); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct sched_entity *se) +static inline void update_cfs_group(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3050,7 +3081,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) */ static __always_inline u32 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ @@ -3067,10 +3098,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods); - } + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -3083,11 +3112,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, sa->period_contrib = delta; contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib * scale_cpu; @@ -3124,7 +3152,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ static __always_inline int ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -3157,8 +3185,8 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * this happens during idle_balance() which calls * update_blocked_averages() */ - if (!weight) - running = 0; + if (!load) + runnable = running = 0; /* * Now we know we crossed measurement unit boundaries. The *_avg @@ -3167,45 +3195,60 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) return 0; return 1; } static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq) +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) { u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; /* * Step 2: update *_avg. 
*/ - sa->load_avg = div_u64(weight * sa->load_sum, divider); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, divider); - } + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); sa->util_avg = sa->util_sum / divider; } /* * sched_entity: * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * * load_sum := runnable_sum * load_avg = se_weight(se) * runnable_avg * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * * cfs_rq: * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runnable_load_avg */ static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3215,10 +3258,13 @@ __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, - cfs_rq->curr == se, NULL)) { + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3230,8 +3276,10 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, cpu, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq)) { - ___update_load_avg(&cfs_rq->avg, 1, cfs_rq); + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); return 1; } @@ -3409,8 +3457,8 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long runnable_sum = gcfs_rq->prop_runnable_sum; - long load_avg; - s64 load_sum; + long runnable_load_avg, load_avg; + s64 runnable_load_sum, load_sum; if (!runnable_sum) return; @@ -3426,9 +3474,15 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf add_positive(&cfs_rq->avg.load_avg, load_avg); add_positive(&cfs_rq->avg.load_sum, load_sum); + runnable_load_sum = (s64)se_runnable(se) * runnable_sum; + runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + + add_positive(&se->avg.runnable_load_sum, runnable_sum); + add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + if (se->on_rq) { - add_positive(&cfs_rq->runnable_load_avg, load_avg); - add_positive(&cfs_rq->runnable_load_sum, load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); } } @@ -3710,7 +3764,7 @@ void remove_entity_load_avg(struct sched_entity *se) static inline unsigned long cfs_rq_runnable_load_avg(struct
cfs_rq *cfs_rq) { - return cfs_rq->runnable_load_avg; + return cfs_rq->avg.runnable_load_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3882,8 +3936,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + update_cfs_group(se); enqueue_runnable_load_avg(cfs_rq, se); - update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -3989,7 +4043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(se); + update_cfs_group(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -4172,7 +4226,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(cfs_rq, curr, UPDATE_TG); - update_cfs_shares(curr); + update_cfs_group(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -5090,7 +5144,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -5149,7 +5203,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -7174,7 +7228,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->runnable_load_sum) + if (cfs_rq->avg.runnable_load_sum) return false; return true; @@ -9692,7 +9746,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(cfs_rq_of(se), se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } rq_unlock_irqrestore(rq, &rf); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5bcb86eb026b..e83d1b8be611 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -418,6 +418,7 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; + unsigned long runnable_weight; unsigned int nr_running, h_nr_running; u64 exec_clock; @@ -443,8 +444,6 @@ struct cfs_rq { * CFS load tracking */ struct sched_avg avg; - u64 runnable_load_sum; - unsigned long runnable_load_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; #endif -- cgit v1.2.3 From a3f5aa907340b5d7b54223ddbaa90410f168864d Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Fri, 14 Jul 2017 09:27:08 -0400 Subject: i40e: Enable VF to negotiate number of allocated queues Currently the PF allocates a default number of queues for each VF, which cannot be changed. This patch enables the VF to request a different number of queues allocated to it. This patch also adds a new virtchnl op and capability flag to facilitate this negotiation. After the PF receives a request message, it will set a requested number of queues for that VF. Then when the VF resets, its VSI will get a new number of queues allocated to it. This is a best effort request and, since we only allocate a guaranteed default number, if the VF tries to ask for more than the guaranteed number, there may not be enough in HW to accommodate it unless other queues for other VFs are freed.
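As a rough sketch of the VF side of this exchange (not part of this patch; struct vf_adapter and vf_send_pf_msg() are hypothetical stand-ins for a VF driver's own adapter structure and mailbox send routine), the request could look like:

/* Hedged illustration only: ask the PF for a new queue count using the
 * virtchnl op added below. The PF replies with the number granted (or
 * the maximum it can support), which takes effect on the next VF reset.
 */
static int vf_request_queues(struct vf_adapter *adapter, u16 num_qps)
{
	struct virtchnl_vf_res_request vfres = {
		.num_queue_pairs = num_qps,
	};

	return vf_send_pf_msg(adapter, VIRTCHNL_OP_REQUEST_QUEUES,
			      (u8 *)&vfres, sizeof(vfres));
}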
It should also be noted that decreasing the number of queues allocated to a VF to below the default will NOT enable the allocation of more than 32 VFs per PF and will not free queues guaranteed to each VF by default. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 75 ++++++++++++++++++++++ drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 1 + include/linux/avf/virtchnl.h | 20 ++++++ 4 files changed, 97 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index b7a539cdca00..439c63cb2a0c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -77,6 +77,7 @@ #define i40e_default_queues_per_vmdq(pf) \ (((pf)->hw_features & I40E_HW_RSS_AQ_CAPABLE) ? 4 : 1) #define I40E_DEFAULT_QUEUES_PER_VF 4 +#define I40E_MAX_VF_QUEUES 16 #define I40E_DEFAULT_QUEUES_PER_TC 1 /* should be a power of 2 */ #define i40e_pf_get_max_q_per_tc(pf) \ (((pf)->hw_features & I40E_HW_128_QP_RSS_CAPABLE) ? 128 : 64) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 4d1e670f490e..a75396c157d9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -815,6 +815,14 @@ static void i40e_free_vf_res(struct i40e_vf *vf) */ clear_bit(I40E_VF_STATE_INIT, &vf->vf_states); + /* It's possible the VF had requested more queues than the default so + * do the accounting here when we're about to free them. + */ + if (vf->num_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) { + pf->queues_left += vf->num_queue_pairs - + I40E_DEFAULT_QUEUES_PER_VF; + } + /* free vsi & disconnect it from the parent uplink */ if (vf->lan_vsi_idx) { i40e_vsi_release(pf->vsi[vf->lan_vsi_idx]); @@ -868,12 +876,27 @@ static int i40e_alloc_vf_res(struct i40e_vf *vf) int total_queue_pairs = 0; int ret; + if (vf->num_req_queues && + vf->num_req_queues <= pf->queues_left + I40E_DEFAULT_QUEUES_PER_VF) + pf->num_vf_qps = vf->num_req_queues; + else + pf->num_vf_qps = I40E_DEFAULT_QUEUES_PER_VF; + /* allocate hw vsi context & associated resources */ ret = i40e_alloc_vsi_res(vf, I40E_VSI_SRIOV); if (ret) goto error_alloc; total_queue_pairs += pf->vsi[vf->lan_vsi_idx]->alloc_queue_pairs; + /* We account for each VF to get a default number of queue pairs. If + * the VF has now requested more, we need to account for that to make + * certain we never request more queues than we actually have left in + * HW. + */ + if (total_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) + pf->queues_left -= + total_queue_pairs - I40E_DEFAULT_QUEUES_PER_VF; + if (vf->trusted) set_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); else @@ -1579,6 +1602,9 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) VIRTCHNL_VF_OFFLOAD_WB_ON_ITR; } + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_REQ_QUEUES) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_REQ_QUEUES; + vfres->num_vsis = num_vsis; vfres->num_queue_pairs = vf->num_queue_pairs; vfres->max_vectors = pf->hw.func_caps.num_msix_vectors_vf; @@ -1986,6 +2012,52 @@ error_param: aq_ret); } +/** + * i40e_vc_request_queues_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * VFs get a default number of queues but can use this message to request a + * different number.
Will respond with either the number requested or the + * maximum we can support. + **/ +static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen) +{ + struct virtchnl_vf_res_request *vfres = + (struct virtchnl_vf_res_request *)msg; + int req_pairs = vfres->num_queue_pairs; + int cur_pairs = vf->num_queue_pairs; + struct i40e_pf *pf = vf->pf; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) + return -EINVAL; + + if (req_pairs <= 0) { + dev_err(&pf->pdev->dev, + "VF %d tried to request %d queues. Ignoring.\n", + vf->vf_id, req_pairs); + } else if (req_pairs > I40E_MAX_VF_QUEUES) { + dev_err(&pf->pdev->dev, + "VF %d tried to request more than %d queues.\n", + vf->vf_id, + I40E_MAX_VF_QUEUES); + vfres->num_queue_pairs = I40E_MAX_VF_QUEUES; + } else if (req_pairs - cur_pairs > pf->queues_left) { + dev_warn(&pf->pdev->dev, + "VF %d requested %d more queues, but only %d left.\n", + vf->vf_id, + req_pairs - cur_pairs, + pf->queues_left); + vfres->num_queue_pairs = pf->queues_left + cur_pairs; + } else { + vf->num_req_queues = req_pairs; + } + + return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, 0, + (u8 *)vfres, sizeof(*vfres)); +} + /** * i40e_vc_get_stats_msg * @vf: pointer to the VF info @@ -2708,6 +2780,9 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: ret = i40e_vc_disable_vlan_stripping(vf, msg, msglen); break; + case VIRTCHNL_OP_REQUEST_QUEUES: + ret = i40e_vc_request_queues_msg(vf, msg, msglen); + break; case VIRTCHNL_OP_UNKNOWN: default: diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 1f4b0c504368..5111d05d5f2f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -97,6 +97,7 @@ struct i40e_vf { u16 lan_vsi_id; /* ID as used by firmware */ u8 num_queue_pairs; /* num of qps assigned to VF vsis */ + u8 num_req_queues; /* num of requested qps */ u64 num_mdd_events; /* num of mdd events detected */ /* num of continuous malformed or invalid msgs detected */ u64 num_invalid_msgs; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2b038442c352..60e5d90cb18a 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -135,6 +135,7 @@ enum virtchnl_ops { VIRTCHNL_OP_SET_RSS_HENA = 26, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28, + VIRTCHNL_OP_REQUEST_QUEUES = 29, }; /* This macro is used to generate a compilation error if a structure * is not exactly the correct length. 
+ * If the PF cannot support the number requested it will respond with the + * maximum number it is able to support; otherwise it will respond with the + * number requested. + */ + +/* VF resource request */ +struct virtchnl_vf_res_request { + u16 num_queue_pairs; +}; + VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_vsi_queue_config_info); /* VIRTCHNL_OP_CONFIG_IRQ_MAP @@ -691,6 +708,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: break; + case VIRTCHNL_OP_REQUEST_QUEUES: + valid_len = sizeof(struct virtchnl_vf_res_request); + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From 2f657a600409f1961d67642fe384a9d4be71d36a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:20 -0400 Subject: net: dsa: change dsa_ptr for a dsa_port With DSA, a master net device (CPU facing interface) has a dsa_ptr pointer to which hangs a dsa_switch_tree. This is not correct because a master interface is wired to a dedicated switch port, and because we can theoretically have several master interfaces pointing to several CPU ports of the same switch fabric. Change the master interface's dsa_ptr for the CPU dsa_port pointer. This is a step towards supporting multiple CPU ports. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/dsa/dsa.c | 6 +++--- net/dsa/dsa2.c | 2 +- net/dsa/dsa_priv.h | 3 ++- net/dsa/legacy.c | 2 +- net/dsa/master.c | 15 +++++---------- 6 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f535779d9dc1..e1d6ef130611 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -55,7 +55,7 @@ struct netpoll_info; struct device; struct phy_device; -struct dsa_switch_tree; +struct dsa_port; /* 802.11 specific */ struct wireless_dev; @@ -1752,7 +1752,7 @@ struct net_device { struct vlan_info __rcu *vlan_info; #endif #if IS_ENABLED(CONFIG_NET_DSA) - struct dsa_switch_tree *dsa_ptr; + struct dsa_port *dsa_ptr; #endif #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 81c852e32821..51ca2a524a27 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -160,12 +160,12 @@ EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct pcpu_sw_netstats *s; struct dsa_slave_priv *p; - if (unlikely(dst == NULL)) { + if (unlikely(!cpu_dp)) { kfree_skb(skb); return 0; } @@ -174,7 +174,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = dst->rcv(skb, dev, pt); + nskb = cpu_dp->rcv(skb, dev, pt); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b71e3bb478e4..62302558f38c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -438,7 +438,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) * sent to the tag format's receive function. 
*/ wmb(); - dst->cpu_dp->netdev->dsa_ptr = dst; + dst->cpu_dp->netdev->dsa_ptr = dst->cpu_dp; err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); if (err) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9397291bb3aa..2850077cc9cc 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -116,7 +116,8 @@ void dsa_master_ethtool_restore(struct net_device *dev); static inline struct net_device *dsa_master_get_slave(struct net_device *dev, int device, int port) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_switch *ds; if (device < 0 || device >= DSA_MAX_SWITCHES) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 96c7e3f8b8bb..71917505a5cc 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -607,7 +607,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, * sent to the tag format's receive function. */ wmb(); - dev->dsa_ptr = dst; + dev->dsa_ptr = dst->cpu_dp; return dsa_master_ethtool_setup(dst->cpu_dp->netdev); } diff --git a/net/dsa/master.c b/net/dsa/master.c index ef15d35f1574..5f3f57e372e0 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -16,8 +16,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int port = cpu_dp->index; @@ -34,8 +33,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, static int dsa_master_get_sset_count(struct net_device *dev, int sset) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int count = 0; @@ -52,8 +50,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset) static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int port = cpu_dp->index; @@ -90,8 +87,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, int dsa_master_ethtool_setup(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch *ds = cpu_dp->ds; struct ethtool_ops *ops; @@ -114,8 +110,7 @@ int dsa_master_ethtool_setup(struct net_device *dev) void dsa_master_ethtool_restore(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; dev->ethtool_ops = cpu_dp->orig_ethtool_ops; cpu_dp->orig_ethtool_ops = NULL; -- cgit v1.2.3 From 19fe5f643f89f29c1a16bc474d91506b0e9a6232 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 1 Oct 2017 17:55:54 -0400 Subject: iomap: Switch from blkno to disk offset Replace iomap->blkno, the sector number, with iomap->addr, the disk offset in bytes. For invalid disk offsets, use the special value IOMAP_NULL_ADDR instead of IOMAP_NULL_BLOCK. 
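For reference, the conversion between the two representations is a plain shift by 9 (512-byte sectors); a minimal sketch of the arithmetic this patch applies throughout (the helper names are illustrative, not part of the patch):

#include <linux/types.h>

/* Old field expressed in the new one: blkno was in 512-byte units. */
static inline u64 iomap_addr_from_blkno(sector_t blkno)
{
	return (u64)blkno << 9;			/* sectors -> bytes */
}

/* Disk sector backing file position 'pos' of a mapped extent. */
static inline sector_t iomap_sector_of(u64 addr, loff_t offset, loff_t pos)
{
	return (addr + pos - offset) >> 9;	/* bytes -> sectors */
}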
This allows iomap to be used for mappings which are not block aligned, such as inline data on ext4. Signed-off-by: Andreas Gruenbacher Signed-off-by: Theodore Ts'o Reviewed-by: Darrick J. Wong # iomap, xfs Reviewed-by: Jan Kara --- fs/buffer.c | 4 ++-- fs/dax.c | 2 +- fs/ext2/inode.c | 4 ++-- fs/ext4/inode.c | 4 ++-- fs/iomap.c | 11 +++++------ fs/nfsd/blocklayout.c | 4 ++-- fs/xfs/xfs_iomap.c | 6 +++--- include/linux/iomap.h | 10 +++++----- 8 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 170df856bdb9..bd4d0923cdce 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1978,8 +1978,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, case IOMAP_MAPPED: if (offset >= i_size_read(inode)) set_buffer_new(bh); - bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) + - ((offset - iomap->offset) >> inode->i_blkbits); + bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> + inode->i_blkbits; set_buffer_mapped(bh); break; } diff --git a/fs/dax.c b/fs/dax.c index f001d8c72a06..f3a44a7c14b3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -938,7 +938,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; } static loff_t diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 4dca6f348714..1b8fc73de4a1 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -820,11 +820,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret == 0) { iomap->type = IOMAP_HOLE; - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->length = 1 << blkbits; } else { iomap->type = IOMAP_MAPPED; - iomap->blkno = (sector_t)bno << (blkbits - 9); + iomap->addr = (u64)bno << blkbits; iomap->length = (u64)ret << blkbits; iomap->flags |= IOMAP_F_MERGED; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 31db875bc7a1..d9e633c12aae 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3472,7 +3472,7 @@ retry: if (ret == 0) { iomap->type = IOMAP_HOLE; - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->length = (u64)map.m_len << blkbits; } else { if (map.m_flags & EXT4_MAP_MAPPED) { @@ -3483,7 +3483,7 @@ retry: WARN_ON_ONCE(1); return -EIO; } - iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); + iomap->addr = (u64)map.m_pblk << blkbits; iomap->length = (u64)map.m_len << blkbits; } diff --git a/fs/iomap.c b/fs/iomap.c index be61cf742b5e..d25db039ee13 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -350,8 +350,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { - sector_t sector = iomap->blkno + - (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); + sector_t sector = (iomap->addr + + (pos & PAGE_MASK) - iomap->offset) >> 9; return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, offset, bytes); @@ -512,9 +512,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, - iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, + iomap->addr != IOMAP_NULL_ADDR ?
iomap->addr : 0, iomap->length, flags); - } static loff_t @@ -823,7 +822,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio = bio_alloc(GFP_KERNEL, 1); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = - iomap->blkno + ((pos - iomap->offset) >> 9); + (iomap->addr + pos - iomap->offset) >> 9; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -902,7 +901,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = - iomap->blkno + ((pos - iomap->offset) >> 9); + (iomap->addr + pos - iomap->offset) >> 9; bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index c862c2489df0..2d1d37b27dc7 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -65,7 +65,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, bex->es = PNFS_BLOCK_READ_DATA; else bex->es = PNFS_BLOCK_READWRITE_DATA; - bex->soff = (iomap.blkno << 9); + bex->soff = iomap.addr; break; case IOMAP_UNWRITTEN: if (seg->iomode & IOMODE_RW) { @@ -78,7 +78,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, } bex->es = PNFS_BLOCK_INVALID_DATA; - bex->soff = (iomap.blkno << 9); + bex->soff = iomap.addr; break; } /*FALLTHRU*/ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f179bdf1644d..9744b4819e0d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -54,13 +54,13 @@ xfs_bmbt_to_iomap( struct xfs_mount *mp = ip->i_mount; if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { - iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); if (imap->br_state == XFS_EXT_UNWRITTEN) iomap->type = IOMAP_UNWRITTEN; else diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f64dc6ce5161..7b8a615fa021 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -15,8 +15,8 @@ struct vm_fault; */ #define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */ #define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ -#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ -#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ +#define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */ +#define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */ /* * Flags for all iomap mappings: @@ -30,12 +30,12 @@ struct vm_fault; #define IOMAP_F_SHARED 0x20 /* block shared with another file */ /* - * Magic value for blkno: + * Magic value for addr: */ -#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */ +#define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ struct iomap { - sector_t blkno; /* 1st sector of mapping, 512b units */ + u64 addr; /* disk offset of mapping, bytes */ loff_t offset; /* file offset of mapping, bytes */ u64 length; /* length of mapping, bytes */ u16 type; /* type of mapping */ -- cgit v1.2.3 From 9ca250a5137f3df7ffac58b49660beadd5926ace Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 1 Oct 2017 17:56:54 -0400 Subject: iomap: Add IOMAP_F_DATA_INLINE flag Add a new IOMAP_F_DATA_INLINE flag to indicate that a mapping is in a disk area 
that contains data as well as metadata. In iomap_fiemap, map this flag to FIEMAP_EXTENT_DATA_INLINE. Signed-off-by: Andreas Gruenbacher Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/iomap.c | 2 ++ include/linux/iomap.h | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap.c b/fs/iomap.c index d25db039ee13..379eb9896093 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -510,6 +510,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; + if (iomap->flags & IOMAP_F_DATA_INLINE) + flags |= FIEMAP_EXTENT_DATA_INLINE; return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 7b8a615fa021..2b0790dbd6ea 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -26,8 +26,9 @@ struct vm_fault; /* * Flags that only need to be reported for IOMAP_REPORT requests: */ -#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ -#define IOMAP_F_SHARED 0x20 /* block shared with another file */ +#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ +#define IOMAP_F_SHARED 0x20 /* block shared with another file */ +#define IOMAP_F_DATA_INLINE 0x40 /* data inline in the inode */ /* * Magic value for addr: -- cgit v1.2.3 From 4f210c2938053e894ee00500dd0661f7a27a6ff4 Mon Sep 17 00:00:00 2001 From: Jaejoong Kim Date: Wed, 20 Sep 2017 18:40:36 +0900 Subject: HID: add comment for power callback in struct hid_ll_driver There is a missing comment in struct hid_ll_driver. So, add it. Signed-off-by: Jaejoong Kim Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ab05a86269dc..7c3d4a17bbde 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -753,6 +753,7 @@ struct hid_driver { * @stop: called on remove * @open: called by input layer on open * @close: called by input layer on close + * @power: request underlying hardware to enter requested power mode * @parse: this method is called only once to parse the device data, * shouldn't allocate anything to not leak memory * @request: send report request to device (e.g. feature report) -- cgit v1.2.3 From 66b1bedf662518e9b6367990a87e9601b35a94c1 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 Sep 2017 14:21:14 +0300 Subject: ieee80211: Add WFA TPC report element OUI type Add Transmit Power Control OUI type definition for WLAN_OUI_MICROSOFT. 
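For context, vendor-specific elements carry a 3-byte OUI followed by a one-byte type; a hedged sketch of how a driver might match the WFA TPC report element with the new define (the helper itself is illustrative, only the two WLAN_OUI_* macros come from the header):

#include <linux/ieee80211.h>

/* 'pos' points at the OUI inside a vendor-specific IE body of 'len' bytes. */
static bool ieee80211_vendor_ie_is_wfa_tpc(const u8 *pos, size_t len)
{
	u32 oui;

	if (len < 4)
		return false;

	oui = (pos[0] << 16) | (pos[1] << 8) | pos[2];
	return oui == WLAN_OUI_MICROSOFT &&
	       pos[3] == WLAN_OUI_TYPE_MICROSOFT_TPC;
}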
Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55a604ad459f..ee6657a0ed69 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2445,6 +2445,7 @@ enum ieee80211_sa_query_action { #define WLAN_OUI_TYPE_MICROSOFT_WPA 1 #define WLAN_OUI_TYPE_MICROSOFT_WMM 2 #define WLAN_OUI_TYPE_MICROSOFT_WPS 4 +#define WLAN_OUI_TYPE_MICROSOFT_TPC 8 /* * WMM/802.11e Tspec Element -- cgit v1.2.3 From f2f2efb807d339513199b1bb771806c90cce83ae Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:28 +0300 Subject: byteorder: Move {cpu_to_be32, be32_to_cpu}_array() from Thunderbolt to core We will be using these when communicating XDomain discovery protocol over Thunderbolt link but they might be useful for other drivers as well. Make them available through byteorder/generic.h. Suggested-by: Andy Shevchenko Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/ctl.c | 14 -------------- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index fb40dd0588b9..e6a4c9458c76 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -289,20 +289,6 @@ static void tb_cfg_print_error(struct tb_ctl *ctl, } } -static void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) -{ - int i; - for (i = 0; i < len; i++) - dst[i] = cpu_to_be32(src[i]); -} - -static void be32_to_cpu_array(u32 *dst, __be32 *src, size_t len) -{ - int i; - for (i = 0; i < len; i++) - dst[i] = be32_to_cpu(src[i]); -} - static __be32 tb_crc(const void *data, size_t len) { return cpu_to_be32(~__crc32c_le(~0, data, len)); diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 89f67c1c3160..805d16654459 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -170,4 +170,20 @@ static inline void be64_add_cpu(__be64 *var, u64 val) *var = cpu_to_be64(be64_to_cpu(*var) + val); } +static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) +{ + int i; + + for (i = 0; i < len; i++) + dst[i] = cpu_to_be32(src[i]); +} + +static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len) +{ + int i; + + for (i = 0; i < len; i++) + dst[i] = be32_to_cpu(src[i]); +} + #endif /* _LINUX_BYTEORDER_GENERIC_H */ -- cgit v1.2.3 From cdae7c07e3e3509eaabc18c1640a55dc5b99c179 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:30 +0300 Subject: thunderbolt: Add support for XDomain properties Thunderbolt XDomain discovery protocol uses directories which contain properties and other directories to exchange information about what capabilities the remote host supports. This also includes identification information like device ID and name. This adds support for parsing and formatting these properties and establishes an API drivers can use in addition to the core Thunderbolt driver. This API is exposed in a new header: include/linux/thunderbolt.h. This code is based on the work done by Amir Levy and Michael Jamet. 
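To make the new API concrete, here is a hedged usage sketch (the key strings and values are invented for illustration; everything else is the API introduced below):

#include <linux/thunderbolt.h>

/* Build a root directory with one immediate value and one string, then
 * pack it into the wire format used by the XDomain protocol.
 */
static ssize_t example_pack_properties(u32 *block, size_t block_len)
{
	struct tb_property_dir *dir;
	ssize_t ret;

	dir = tb_property_create_dir(NULL);	/* NULL UUID == root directory */
	if (!dir)
		return -ENOMEM;

	ret = tb_property_add_immediate(dir, "vendorid", 0x1234);
	if (!ret)
		ret = tb_property_add_text(dir, "deviceid", "Example");
	if (!ret)
		ret = tb_property_format_dir(dir, block, block_len);

	tb_property_free_dir(dir);
	return ret;
}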
Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/Makefile | 2 +- drivers/thunderbolt/property.c | 670 +++++++++++++++++++++++++++++++++++++++++ include/linux/thunderbolt.h | 89 ++++++ 3 files changed, 760 insertions(+), 1 deletion(-) create mode 100644 drivers/thunderbolt/property.c create mode 100644 include/linux/thunderbolt.h (limited to 'include/linux') diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile index 4900febc6c8a..7afd21f5383a 100644 --- a/drivers/thunderbolt/Makefile +++ b/drivers/thunderbolt/Makefile @@ -1,3 +1,3 @@ obj-${CONFIG_THUNDERBOLT} := thunderbolt.o thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o -thunderbolt-objs += domain.o dma_port.o icm.o +thunderbolt-objs += domain.o dma_port.o icm.o property.o diff --git a/drivers/thunderbolt/property.c b/drivers/thunderbolt/property.c new file mode 100644 index 000000000000..8fe913a95b4a --- /dev/null +++ b/drivers/thunderbolt/property.c @@ -0,0 +1,670 @@ +/* + * Thunderbolt XDomain property support + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/uuid.h> +#include <linux/thunderbolt.h> + +struct tb_property_entry { + u32 key_hi; + u32 key_lo; + u16 length; + u8 reserved; + u8 type; + u32 value; +}; + +struct tb_property_rootdir_entry { + u32 magic; + u32 length; + struct tb_property_entry entries[]; +}; + +struct tb_property_dir_entry { + u32 uuid[4]; + struct tb_property_entry entries[]; +}; + +#define TB_PROPERTY_ROOTDIR_MAGIC 0x55584401 + +static struct tb_property_dir *__tb_property_parse_dir(const u32 *block, + size_t block_len, unsigned int dir_offset, size_t dir_len, + bool is_root); + +static inline void parse_dwdata(void *dst, const void *src, size_t dwords) +{ + be32_to_cpu_array(dst, src, dwords); +} + +static inline void format_dwdata(void *dst, const void *src, size_t dwords) +{ + cpu_to_be32_array(dst, src, dwords); +} + +static bool tb_property_entry_valid(const struct tb_property_entry *entry, + size_t block_len) +{ + switch (entry->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + case TB_PROPERTY_TYPE_DATA: + case TB_PROPERTY_TYPE_TEXT: + if (entry->length > block_len) + return false; + if (entry->value + entry->length > block_len) + return false; + break; + + case TB_PROPERTY_TYPE_VALUE: + if (entry->length != 1) + return false; + break; + } + + return true; +} + +static bool tb_property_key_valid(const char *key) +{ + return key && strlen(key) <= TB_PROPERTY_KEY_SIZE; +} + +static struct tb_property * +tb_property_alloc(const char *key, enum tb_property_type type) +{ + struct tb_property *property; + + property = kzalloc(sizeof(*property), GFP_KERNEL); + if (!property) + return NULL; + + strcpy(property->key, key); + property->type = type; + INIT_LIST_HEAD(&property->list); + + return property; +} + +static struct tb_property *tb_property_parse(const u32 *block, size_t block_len, + const struct tb_property_entry *entry) +{ + char key[TB_PROPERTY_KEY_SIZE + 1]; + struct tb_property *property; + struct tb_property_dir *dir; + + if (!tb_property_entry_valid(entry, block_len)) + return NULL; + + parse_dwdata(key, entry, 2); + key[TB_PROPERTY_KEY_SIZE] = '\0'; + + property =
tb_property_alloc(key, entry->type); + if (!property) + return NULL; + + property->length = entry->length; + + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + dir = __tb_property_parse_dir(block, block_len, entry->value, + entry->length, false); + if (!dir) { + kfree(property); + return NULL; + } + property->value.dir = dir; + break; + + case TB_PROPERTY_TYPE_DATA: + property->value.data = kcalloc(property->length, sizeof(u32), + GFP_KERNEL); + if (!property->value.data) { + kfree(property); + return NULL; + } + parse_dwdata(property->value.data, block + entry->value, + entry->length); + break; + + case TB_PROPERTY_TYPE_TEXT: + property->value.text = kcalloc(property->length, sizeof(u32), + GFP_KERNEL); + if (!property->value.text) { + kfree(property); + return NULL; + } + parse_dwdata(property->value.text, block + entry->value, + entry->length); + /* Force null termination */ + property->value.text[property->length * 4 - 1] = '\0'; + break; + + case TB_PROPERTY_TYPE_VALUE: + property->value.immediate = entry->value; + break; + + default: + property->type = TB_PROPERTY_TYPE_UNKNOWN; + break; + } + + return property; +} + +static struct tb_property_dir *__tb_property_parse_dir(const u32 *block, + size_t block_len, unsigned int dir_offset, size_t dir_len, bool is_root) +{ + const struct tb_property_entry *entries; + size_t i, content_len, nentries; + unsigned int content_offset; + struct tb_property_dir *dir; + + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return NULL; + + if (is_root) { + content_offset = dir_offset + 2; + content_len = dir_len; + } else { + dir->uuid = kmemdup(&block[dir_offset], sizeof(*dir->uuid), + GFP_KERNEL); + content_offset = dir_offset + 4; + content_len = dir_len - 4; /* Length includes UUID */ + } + + entries = (const struct tb_property_entry *)&block[content_offset]; + nentries = content_len / (sizeof(*entries) / 4); + + INIT_LIST_HEAD(&dir->properties); + + for (i = 0; i < nentries; i++) { + struct tb_property *property; + + property = tb_property_parse(block, block_len, &entries[i]); + if (!property) { + tb_property_free_dir(dir); + return NULL; + } + + list_add_tail(&property->list, &dir->properties); + } + + return dir; +} + +/** + * tb_property_parse_dir() - Parses properties from given property block + * @block: Property block to parse + * @block_len: Number of dword elements in the property block + * + * This function parses the XDomain properties data block into format that + * can be traversed using the helper functions provided by this module. + * Upon success returns the parsed directory. In case of error returns + * %NULL. The resulting &struct tb_property_dir needs to be released by + * calling tb_property_free_dir() when not needed anymore. + * + * The @block is expected to be root directory. + */ +struct tb_property_dir *tb_property_parse_dir(const u32 *block, + size_t block_len) +{ + const struct tb_property_rootdir_entry *rootdir = + (const struct tb_property_rootdir_entry *)block; + + if (rootdir->magic != TB_PROPERTY_ROOTDIR_MAGIC) + return NULL; + if (rootdir->length > block_len) + return NULL; + + return __tb_property_parse_dir(block, block_len, 0, rootdir->length, + true); +} + +/** + * tb_property_create_dir() - Creates new property directory + * @uuid: UUID used to identify the particular directory + * + * Creates new, empty property directory. If @uuid is %NULL then the + * directory is assumed to be root directory. 
+ */ +struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid) +{ + struct tb_property_dir *dir; + + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return NULL; + + INIT_LIST_HEAD(&dir->properties); + if (uuid) { + dir->uuid = kmemdup(uuid, sizeof(*dir->uuid), GFP_KERNEL); + if (!dir->uuid) { + kfree(dir); + return NULL; + } + } + + return dir; +} +EXPORT_SYMBOL_GPL(tb_property_create_dir); + +static void tb_property_free(struct tb_property *property) +{ + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + tb_property_free_dir(property->value.dir); + break; + + case TB_PROPERTY_TYPE_DATA: + kfree(property->value.data); + break; + + case TB_PROPERTY_TYPE_TEXT: + kfree(property->value.text); + break; + + default: + break; + } + + kfree(property); +} + +/** + * tb_property_free_dir() - Release memory allocated for property directory + * @dir: Directory to release + * + * This will release all the memory the directory occupies including all + * descendants. It is OK to pass %NULL @dir, then the function does + * nothing. + */ +void tb_property_free_dir(struct tb_property_dir *dir) +{ + struct tb_property *property, *tmp; + + if (!dir) + return; + + list_for_each_entry_safe(property, tmp, &dir->properties, list) { + list_del(&property->list); + tb_property_free(property); + } + kfree(dir->uuid); + kfree(dir); +} +EXPORT_SYMBOL_GPL(tb_property_free_dir); + +static size_t tb_property_dir_length(const struct tb_property_dir *dir, + bool recurse, size_t *data_len) +{ + const struct tb_property *property; + size_t len = 0; + + if (dir->uuid) + len += sizeof(*dir->uuid) / 4; + else + len += sizeof(struct tb_property_rootdir_entry) / 4; + + list_for_each_entry(property, &dir->properties, list) { + len += sizeof(struct tb_property_entry) / 4; + + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + if (recurse) { + len += tb_property_dir_length( + property->value.dir, recurse, data_len); + } + /* Reserve dword padding after each directory */ + if (data_len) + *data_len += 1; + break; + + case TB_PROPERTY_TYPE_DATA: + case TB_PROPERTY_TYPE_TEXT: + if (data_len) + *data_len += property->length; + break; + + default: + break; + } + } + + return len; +} + +static ssize_t __tb_property_format_dir(const struct tb_property_dir *dir, + u32 *block, unsigned int start_offset, size_t block_len) +{ + unsigned int data_offset, dir_end; + const struct tb_property *property; + struct tb_property_entry *entry; + size_t dir_len, data_len = 0; + int ret; + + /* + * The structure of property block looks like following. Leaf + * data/text is included right after the directory and each + * directory follows each other (even nested ones). + * + * +----------+ <-- start_offset + * | header | <-- root directory header + * +----------+ --- + * | entry 0 | -^--------------------. + * +----------+ | | + * | entry 1 | -|--------------------|--. + * +----------+ | | | + * | entry 2 | -|-----------------. | | + * +----------+ | | | | + * : : | dir_len | | | + * . . | | | | + * : : | | | | + * +----------+ | | | | + * | entry n | v | | | + * +----------+ <-- data_offset | | | + * | data 0 | <------------------|--' | + * +----------+ | | + * | data 1 | <------------------|-----' + * +----------+ | + * | 00000000 | padding | + * +----------+ <-- dir_end <------' + * | UUID | <-- directory UUID (child directory) + * +----------+ + * | entry 0 | + * +----------+ + * | entry 1 | + * +----------+ + * : : + * . . 
+ * : : + * +----------+ + * | entry n | + * +----------+ + * | data 0 | + * +----------+ + * + * We use dir_end to hold a pointer to the end of the directory. It + * will increase as we add directories and each directory should be + * added starting from previous dir_end. + */ + dir_len = tb_property_dir_length(dir, false, &data_len); + data_offset = start_offset + dir_len; + dir_end = start_offset + data_len + dir_len; + + if (data_offset > dir_end) + return -EINVAL; + if (dir_end > block_len) + return -EINVAL; + + /* Write headers first */ + if (dir->uuid) { + struct tb_property_dir_entry *pe; + + pe = (struct tb_property_dir_entry *)&block[start_offset]; + memcpy(pe->uuid, dir->uuid, sizeof(pe->uuid)); + entry = pe->entries; + } else { + struct tb_property_rootdir_entry *re; + + re = (struct tb_property_rootdir_entry *)&block[start_offset]; + re->magic = TB_PROPERTY_ROOTDIR_MAGIC; + re->length = dir_len - sizeof(*re) / 4; + entry = re->entries; + } + + list_for_each_entry(property, &dir->properties, list) { + const struct tb_property_dir *child; + + format_dwdata(entry, property->key, 2); + entry->type = property->type; + + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + child = property->value.dir; + ret = __tb_property_format_dir(child, block, dir_end, + block_len); + if (ret < 0) + return ret; + entry->length = tb_property_dir_length(child, false, + NULL); + entry->value = dir_end; + dir_end = ret; + break; + + case TB_PROPERTY_TYPE_DATA: + format_dwdata(&block[data_offset], property->value.data, + property->length); + entry->length = property->length; + entry->value = data_offset; + data_offset += entry->length; + break; + + case TB_PROPERTY_TYPE_TEXT: + format_dwdata(&block[data_offset], property->value.text, + property->length); + entry->length = property->length; + entry->value = data_offset; + data_offset += entry->length; + break; + + case TB_PROPERTY_TYPE_VALUE: + entry->length = property->length; + entry->value = property->value.immediate; + break; + + default: + break; + } + + entry++; + } + + return dir_end; +} + +/** + * tb_property_format_dir() - Formats directory to the packed XDomain format + * @dir: Directory to format + * @block: Property block where the packed data is placed + * @block_len: Length of the property block + * + * This function formats the directory to the packed format that can + * then be sent over the thunderbolt fabric to the receiving host. Returns %0 in + * case of success and negative errno on failure. Passing %NULL in @block + * returns number of entries the block takes. + */ +ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block, + size_t block_len) +{ + ssize_t ret; + + if (!block) { + size_t dir_len, data_len = 0; + + dir_len = tb_property_dir_length(dir, true, &data_len); + return dir_len + data_len; + } + + ret = __tb_property_format_dir(dir, block, 0, block_len); + return ret < 0 ?
ret : 0; +} + +/** + * tb_property_add_immediate() - Add immediate property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @value: Immediate value to store with the property + */ +int tb_property_add_immediate(struct tb_property_dir *parent, const char *key, + u32 value) +{ + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_VALUE); + if (!property) + return -ENOMEM; + + property->length = 1; + property->value.immediate = value; + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_immediate); + +/** + * tb_property_add_data() - Adds arbitrary data property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @buf: Data buffer to add + * @buflen: Number of bytes in the data buffer + * + * Function takes a copy of @buf and adds it to the directory. + */ +int tb_property_add_data(struct tb_property_dir *parent, const char *key, + const void *buf, size_t buflen) +{ + /* Need to pad to dword boundary */ + size_t size = round_up(buflen, 4); + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_DATA); + if (!property) + return -ENOMEM; + + property->length = size / 4; + property->value.data = kzalloc(size, GFP_KERNEL); + if (!property->value.data) { + kfree(property); + return -ENOMEM; + } + memcpy(property->value.data, buf, buflen); + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_data); + +/** + * tb_property_add_text() - Adds string property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @text: String to add + * + * Function takes a copy of @text and adds it to the directory. + */ +int tb_property_add_text(struct tb_property_dir *parent, const char *key, + const char *text) +{ + /* Need to pad to dword boundary */ + size_t size = round_up(strlen(text) + 1, 4); + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_TEXT); + if (!property) + return -ENOMEM; + + property->length = size / 4; + property->value.text = kzalloc(size, GFP_KERNEL); + if (!property->value.text) { + kfree(property); + return -ENOMEM; + } + strcpy(property->value.text, text); + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_text); + +/** + * tb_property_add_dir() - Adds a directory to the parent directory + * @parent: Directory to add the property + * @key: Key for the property + * @dir: Directory to add + */ +int tb_property_add_dir(struct tb_property_dir *parent, const char *key, + struct tb_property_dir *dir) +{ + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_DIRECTORY); + if (!property) + return -ENOMEM; + + property->value.dir = dir; + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_dir); + +/** + * tb_property_remove() - Removes property from a parent directory + * @property: Property to remove + * + * Note memory for @property is released as well so it is not allowed to + * touch the object after call to this function.
+ */ +void tb_property_remove(struct tb_property *property) +{ + list_del(&property->list); + kfree(property); +} +EXPORT_SYMBOL_GPL(tb_property_remove); + +/** + * tb_property_find() - Find a property from a directory + * @dir: Directory where the property is searched + * @key: Key to look for + * @type: Type of the property + * + * Finds and returns property from the given directory. Does not recurse + * into sub-directories. Returns %NULL if the property was not found. + */ +struct tb_property *tb_property_find(struct tb_property_dir *dir, + const char *key, enum tb_property_type type) +{ + struct tb_property *property; + + list_for_each_entry(property, &dir->properties, list) { + if (property->type == type && !strcmp(property->key, key)) + return property; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(tb_property_find); + +/** + * tb_property_get_next() - Get next property from directory + * @dir: Directory holding properties + * @prev: Previous property in the directory (%NULL returns the first) + */ +struct tb_property *tb_property_get_next(struct tb_property_dir *dir, + struct tb_property *prev) +{ + if (prev) { + if (list_is_last(&prev->list, &dir->properties)) + return NULL; + return list_next_entry(prev, list); + } + return list_first_entry_or_null(&dir->properties, struct tb_property, + list); +} +EXPORT_SYMBOL_GPL(tb_property_get_next); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h new file mode 100644 index 000000000000..96561c1265ae --- /dev/null +++ b/include/linux/thunderbolt.h @@ -0,0 +1,89 @@ +/* + * Thunderbolt service API + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef THUNDERBOLT_H_ +#define THUNDERBOLT_H_ + +#include <linux/list.h> +#include <linux/uuid.h> + +/** + * struct tb_property_dir - XDomain property directory + * @uuid: Directory UUID or %NULL if root directory + * @properties: List of properties in this directory + * + * User needs to provide serialization if needed. + */ +struct tb_property_dir { + const uuid_t *uuid; + struct list_head properties; +}; + +enum tb_property_type { + TB_PROPERTY_TYPE_UNKNOWN = 0x00, + TB_PROPERTY_TYPE_DIRECTORY = 0x44, + TB_PROPERTY_TYPE_DATA = 0x64, + TB_PROPERTY_TYPE_TEXT = 0x74, + TB_PROPERTY_TYPE_VALUE = 0x76, +}; + +#define TB_PROPERTY_KEY_SIZE 8 + +/** + * struct tb_property - XDomain property + * @list: Used to link properties together in a directory + * @key: Key for the property (always terminated). + * @type: Type of the property + * @length: Length of the property data in dwords + * @value: Property value + * + * Users use @type to determine which field in @value is filled.
+ */ +struct tb_property { + struct list_head list; + char key[TB_PROPERTY_KEY_SIZE + 1]; + enum tb_property_type type; + size_t length; + union { + struct tb_property_dir *dir; + u8 *data; + char *text; + u32 immediate; + } value; +}; + +struct tb_property_dir *tb_property_parse_dir(const u32 *block, + size_t block_len); +ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block, + size_t block_len); +struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid); +void tb_property_free_dir(struct tb_property_dir *dir); +int tb_property_add_immediate(struct tb_property_dir *parent, const char *key, + u32 value); +int tb_property_add_data(struct tb_property_dir *parent, const char *key, + const void *buf, size_t buflen); +int tb_property_add_text(struct tb_property_dir *parent, const char *key, + const char *text); +int tb_property_add_dir(struct tb_property_dir *parent, const char *key, + struct tb_property_dir *dir); +void tb_property_remove(struct tb_property *tb_property); +struct tb_property *tb_property_find(struct tb_property_dir *dir, + const char *key, enum tb_property_type type); +struct tb_property *tb_property_get_next(struct tb_property_dir *dir, + struct tb_property *prev); + +#define tb_property_for_each(dir, property) \ + for (property = tb_property_get_next(dir, NULL); \ + property; \ + property = tb_property_get_next(dir, property)) + +#endif /* THUNDERBOLT_H_ */ -- cgit v1.2.3 From eaf8ff35a345449207ad116e2574c19780ec9a98 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:31 +0300 Subject: thunderbolt: Move enum tb_cfg_pkg_type to thunderbolt.h These will be needed by Thunderbolt services when sending and receiving XDomain control messages. While there change TB_CFG_PKG_PREPARE_TO_SLEEP value to be decimal in order to be consistent with other members. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/ctl.h | 1 + drivers/thunderbolt/tb_msgs.h | 17 ----------------- include/linux/thunderbolt.h | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/ctl.h b/drivers/thunderbolt/ctl.h index 36fd28b1c1c5..d0f21e1e0b8b 100644 --- a/drivers/thunderbolt/ctl.h +++ b/drivers/thunderbolt/ctl.h @@ -8,6 +8,7 @@ #define _TB_CFG #include +#include #include "nhi.h" #include "tb_msgs.h" diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h index f3adf58a40ce..f2b2550cd97c 100644 --- a/drivers/thunderbolt/tb_msgs.h +++ b/drivers/thunderbolt/tb_msgs.h @@ -15,23 +15,6 @@ #include #include -enum tb_cfg_pkg_type { - TB_CFG_PKG_READ = 1, - TB_CFG_PKG_WRITE = 2, - TB_CFG_PKG_ERROR = 3, - TB_CFG_PKG_NOTIFY_ACK = 4, - TB_CFG_PKG_EVENT = 5, - TB_CFG_PKG_XDOMAIN_REQ = 6, - TB_CFG_PKG_XDOMAIN_RESP = 7, - TB_CFG_PKG_OVERRIDE = 8, - TB_CFG_PKG_RESET = 9, - TB_CFG_PKG_ICM_EVENT = 10, - TB_CFG_PKG_ICM_CMD = 11, - TB_CFG_PKG_ICM_RESP = 12, - TB_CFG_PKG_PREPARE_TO_SLEEP = 0xd, - -}; - enum tb_cfg_space { TB_CFG_HOPS = 0, TB_CFG_PORT = 1, diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 96561c1265ae..b512b1e2b4f2 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -1,6 +1,7 @@ /* * Thunderbolt service API * + * Copyright (C) 2014 Andreas Noever * Copyright (C) 2017, Intel Corporation * Authors: Michael Jamet * Mika Westerberg @@ -16,6 +17,22 @@ #include #include +enum tb_cfg_pkg_type { + TB_CFG_PKG_READ = 1, + TB_CFG_PKG_WRITE = 2, + TB_CFG_PKG_ERROR = 3, + TB_CFG_PKG_NOTIFY_ACK = 4, + TB_CFG_PKG_EVENT = 5, + TB_CFG_PKG_XDOMAIN_REQ = 6, + TB_CFG_PKG_XDOMAIN_RESP = 7, + TB_CFG_PKG_OVERRIDE = 8, + TB_CFG_PKG_RESET = 9, + TB_CFG_PKG_ICM_EVENT = 10, + TB_CFG_PKG_ICM_CMD = 11, + TB_CFG_PKG_ICM_RESP = 12, + TB_CFG_PKG_PREPARE_TO_SLEEP = 13, +}; + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory -- cgit v1.2.3 From 9e99b9f4d5c36340dabda6d14053195b2a43796b Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:32 +0300 Subject: thunderbolt: Move thunderbolt domain structure to thunderbolt.h These are needed by Thunderbolt services so move them to thunderbolt.h to make sure they are available outside of drivers/thunderbolt. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/tb.h | 42 ------------------------------------------ include/linux/thunderbolt.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index e0deee4f1eb0..2fefe76621ca 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -39,20 +39,6 @@ struct tb_switch_nvm { bool authenticating; }; -/** - * enum tb_security_level - Thunderbolt security level - * @TB_SECURITY_NONE: No security, legacy mode - * @TB_SECURITY_USER: User approval required at minimum - * @TB_SECURITY_SECURE: One time saved key required at minimum - * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB) - */ -enum tb_security_level { - TB_SECURITY_NONE, - TB_SECURITY_USER, - TB_SECURITY_SECURE, - TB_SECURITY_DPONLY, -}; - #define TB_SWITCH_KEY_SIZE 32 /* Each physical port contains 2 links on modern controllers */ #define TB_SWITCH_LINKS_PER_PHY_PORT 2 @@ -223,33 +209,6 @@ struct tb_cm_ops { int (*disconnect_pcie_paths)(struct tb *tb); }; -/** - * struct tb - main thunderbolt bus structure - * @dev: Domain device - * @lock: Big lock. Must be held when accessing any struct - * tb_switch / struct tb_port. - * @nhi: Pointer to the NHI structure - * @ctl: Control channel for this domain - * @wq: Ordered workqueue for all domain specific work - * @root_switch: Root switch of this domain - * @cm_ops: Connection manager specific operations vector - * @index: Linux assigned domain number - * @security_level: Current security level - * @privdata: Private connection manager specific data - */ -struct tb { - struct device dev; - struct mutex lock; - struct tb_nhi *nhi; - struct tb_ctl *ctl; - struct workqueue_struct *wq; - struct tb_switch *root_switch; - const struct tb_cm_ops *cm_ops; - int index; - enum tb_security_level security_level; - unsigned long privdata[0]; -}; - static inline void *tb_priv(struct tb *tb) { return (void *)tb->privdata; } @@ -368,7 +327,6 @@ static inline int tb_port_write(struct tb_port *port, const void *buffer, struct tb *icm_probe(struct tb_nhi *nhi); struct tb *tb_probe(struct tb_nhi *nhi); -extern struct bus_type tb_bus_type; extern struct device_type tb_domain_type; extern struct device_type tb_switch_type; diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index b512b1e2b4f2..910b1bf92112 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -14,7 +14,9 @@ #ifndef THUNDERBOLT_H_ #define THUNDERBOLT_H_ +#include <linux/device.h> #include <linux/list.h> +#include <linux/mutex.h> #include <linux/uuid.h> enum tb_cfg_pkg_type { @@ -33,6 +35,49 @@ enum tb_cfg_pkg_type { TB_CFG_PKG_PREPARE_TO_SLEEP = 13, }; +/** + * enum tb_security_level - Thunderbolt security level + * @TB_SECURITY_NONE: No security, legacy mode + * @TB_SECURITY_USER: User approval required at minimum + * @TB_SECURITY_SECURE: One time saved key required at minimum + * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB) + */ +enum tb_security_level { + TB_SECURITY_NONE, + TB_SECURITY_USER, + TB_SECURITY_SECURE, + TB_SECURITY_DPONLY, +}; + +/** + * struct tb - main thunderbolt bus structure + * @dev: Domain device + * @lock: Big lock. Must be held when accessing any struct + * tb_switch / struct tb_port.
+ * @nhi: Pointer to the NHI structure + * @ctl: Control channel for this domain + * @wq: Ordered workqueue for all domain specific work + * @root_switch: Root switch of this domain + * @cm_ops: Connection manager specific operations vector + * @index: Linux assigned domain number + * @security_level: Current security level + * @privdata: Private connection manager specific data + */ +struct tb { + struct device dev; + struct mutex lock; + struct tb_nhi *nhi; + struct tb_ctl *ctl; + struct workqueue_struct *wq; + struct tb_switch *root_switch; + const struct tb_cm_ops *cm_ops; + int index; + enum tb_security_level security_level; + unsigned long privdata[0]; +}; + +extern struct bus_type tb_bus_type; + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory -- cgit v1.2.3 From e69b71f8458b78a2ef44e3d07374a8f46e45123d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:33 +0300 Subject: thunderbolt: Move tb_switch_phy_port_from_link() to thunderbolt.h A Thunderbolt service might need to find the physical port from a link the cable is connected to. For instance networking driver uses this information to generate MAC address according the Apple ThunderboltIP protocol. Move this function to thunderbolt.h and rename it to tb_phy_port_from_link() to reflect the fact that it does not take switch as parameter. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/icm.c | 2 +- drivers/thunderbolt/tb.h | 7 ------- include/linux/thunderbolt.h | 7 +++++++ 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index 53250fc057e1..8c22b91ed040 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -89,7 +89,7 @@ static inline struct tb *icm_to_tb(struct icm *icm) static inline u8 phy_port_from_route(u64 route, u8 depth) { - return tb_switch_phy_port_from_link(route >> ((depth - 1) * 8)); + return tb_phy_port_from_link(route >> ((depth - 1) * 8)); } static inline u8 dual_link_from_link(u8 link) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index 2fefe76621ca..ea21d927bd09 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -40,8 +40,6 @@ struct tb_switch_nvm { }; #define TB_SWITCH_KEY_SIZE 32 -/* Each physical port contains 2 links on modern controllers */ -#define TB_SWITCH_LINKS_PER_PHY_PORT 2 /** * struct tb_switch - a thunderbolt switch @@ -367,11 +365,6 @@ struct tb_switch *tb_switch_find_by_link_depth(struct tb *tb, u8 link, u8 depth); struct tb_switch *tb_switch_find_by_uuid(struct tb *tb, const uuid_t *uuid); -static inline unsigned int tb_switch_phy_port_from_link(unsigned int link) -{ - return (link - 1) / TB_SWITCH_LINKS_PER_PHY_PORT; -} - static inline void tb_switch_put(struct tb_switch *sw) { put_device(&sw->dev); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 910b1bf92112..43b8d1e09341 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -78,6 +78,13 @@ struct tb { extern struct bus_type tb_bus_type; +#define TB_LINKS_PER_PHY_PORT 2 + +static inline unsigned int tb_phy_port_from_link(unsigned int link) +{ + return (link - 1) / TB_LINKS_PER_PHY_PORT; +} + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory -- cgit v1.2.3 From 
d1ff70241a275133e1a0258b7c23588b122276c8 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:34 +0300 Subject: thunderbolt: Add support for XDomain discovery protocol When two hosts are connected over a Thunderbolt cable, there is a protocol they can use to communicate capabilities supported by each host. The discovery protocol uses the automatically configured control channel (ring 0) and is built on top of request/response transactions using special XDomain primitives provided by the Thunderbolt base protocol. The capabilities consist of a root directory block of basic properties used for identification of the host, and then there can be zero or more directories each describing a Thunderbolt service and its capabilities. Once both sides have discovered what is supported, the two hosts can set up high-speed DMA paths and transfer data to the other side using whatever protocol was agreed based on the properties. The software protocol used to communicate which DMA paths to enable is service specific. This patch adds support for the XDomain discovery protocol to the Thunderbolt bus. We model each remote host connection as a Linux XDomain device. For each Thunderbolt service found supported on the XDomain device, we create a Linux Thunderbolt service device which Thunderbolt service drivers can then bind to based on the protocol identification information retrieved from the property directory describing the service. This code is based on the work done by Amir Levy and Michael Jamet. Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-bus-thunderbolt | 48 + drivers/thunderbolt/Makefile | 2 +- drivers/thunderbolt/ctl.c | 11 +- drivers/thunderbolt/ctl.h | 2 +- drivers/thunderbolt/domain.c | 197 ++- drivers/thunderbolt/icm.c | 218 +++- drivers/thunderbolt/nhi.h | 2 + drivers/thunderbolt/switch.c | 7 +- drivers/thunderbolt/tb.h | 39 +- drivers/thunderbolt/tb_msgs.h | 123 ++ drivers/thunderbolt/xdomain.c | 1576 +++++++++++++++++++++++ include/linux/mod_devicetable.h | 26 + include/linux/thunderbolt.h | 242 ++++ scripts/mod/devicetable-offsets.c | 7 + scripts/mod/file2alias.c | 25 + 15 files changed, 2507 insertions(+), 18 deletions(-) create mode 100644 drivers/thunderbolt/xdomain.c (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt index 392bef5bd399..93798c02e28b 100644 --- a/Documentation/ABI/testing/sysfs-bus-thunderbolt +++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt @@ -110,3 +110,51 @@ Description: When new NVM image is written to the non-active NVM is directly the status value from the DMA configuration based mailbox before the device is power cycled. Writing 0 here clears the status. + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/key +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the name of the property directory the XDomain + service exposes. This entry describes the protocol in + question.
The following directories are already reserved by + the Apple XDomain specification: + + network: IP/ethernet over Thunderbolt + targetdm: Target disk mode protocol over Thunderbolt + extdisp: External display mode protocol over Thunderbolt + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/modalias +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: Stores the same MODALIAS value emitted by uevent for + the XDomain service. Format: tbsvc:kSpNvNrN + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcid +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the XDomain protocol identifier the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcvers +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the XDomain protocol version the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcrevs +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the XDomain software version the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcstns +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains the XDomain service specific settings as a + bitmask. Format: %x diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile index 7afd21f5383a..f2f0de27252b 100644 --- a/drivers/thunderbolt/Makefile +++ b/drivers/thunderbolt/Makefile @@ -1,3 +1,3 @@ obj-${CONFIG_THUNDERBOLT} := thunderbolt.o thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o -thunderbolt-objs += domain.o dma_port.o icm.o property.o +thunderbolt-objs += domain.o dma_port.o icm.o property.o xdomain.o diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index e6a4c9458c76..46e393c5fd1d 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -368,10 +368,10 @@ static int tb_ctl_tx(struct tb_ctl *ctl, const void *data, size_t len, /** * tb_ctl_handle_event() - acknowledge a plug event, invoke ctl->callback */ -static void tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, +static bool tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, struct ctl_pkg *pkg, size_t size) { - ctl->callback(ctl->callback_data, type, pkg->buffer, size); + return ctl->callback(ctl->callback_data, type, pkg->buffer, size); } static void tb_ctl_rx_submit(struct ctl_pkg *pkg) @@ -444,6 +444,8 @@ static void tb_ctl_rx_callback(struct tb_ring *ring, struct ring_frame *frame, break; case TB_CFG_PKG_EVENT: + case TB_CFG_PKG_XDOMAIN_RESP: + case TB_CFG_PKG_XDOMAIN_REQ: if (*(__be32 *)(pkg->buffer + frame->size) != crc32) { tb_ctl_err(pkg->ctl, "RX: checksum mismatch, dropping packet\n"); @@ -451,8 +453,9 @@ static void tb_ctl_rx_callback(struct tb_ring *ring, struct ring_frame *frame, } /* Fall through */ case TB_CFG_PKG_ICM_EVENT: - tb_ctl_handle_event(pkg->ctl, frame->eof, pkg, frame->size); - goto rx; + if (tb_ctl_handle_event(pkg->ctl, frame->eof, pkg, frame->size)) + goto rx; + break; default: break; diff --git a/drivers/thunderbolt/ctl.h b/drivers/thunderbolt/ctl.h index d0f21e1e0b8b..85c49dd301ea 100644 --- a/drivers/thunderbolt/ctl.h +++ b/drivers/thunderbolt/ctl.h @@ -16,7 +16,7 @@ /* control channel */ struct tb_ctl; -typedef void (*event_cb)(void *data, enum tb_cfg_pkg_type type, +typedef bool (*event_cb)(void *data, enum tb_cfg_pkg_type type, const void *buf, size_t size); struct tb_ctl
*tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data); diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c index 9f2dcd48974d..9b90115319ce 100644 --- a/drivers/thunderbolt/domain.c +++ b/drivers/thunderbolt/domain.c @@ -20,6 +20,98 @@ static DEFINE_IDA(tb_domain_ida); +static bool match_service_id(const struct tb_service_id *id, + const struct tb_service *svc) +{ + if (id->match_flags & TBSVC_MATCH_PROTOCOL_KEY) { + if (strcmp(id->protocol_key, svc->key)) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_ID) { + if (id->protocol_id != svc->prtcid) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) { + if (id->protocol_version != svc->prtcvers) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) { + if (id->protocol_revision != svc->prtcrevs) + return false; + } + + return true; +} + +static const struct tb_service_id *__tb_service_match(struct device *dev, + struct device_driver *drv) +{ + struct tb_service_driver *driver; + const struct tb_service_id *ids; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return NULL; + + driver = container_of(drv, struct tb_service_driver, driver); + if (!driver->id_table) + return NULL; + + for (ids = driver->id_table; ids->match_flags != 0; ids++) { + if (match_service_id(ids, svc)) + return ids; + } + + return NULL; +} + +static int tb_service_match(struct device *dev, struct device_driver *drv) +{ + return !!__tb_service_match(dev, drv); +} + +static int tb_service_probe(struct device *dev) +{ + struct tb_service *svc = tb_to_service(dev); + struct tb_service_driver *driver; + const struct tb_service_id *id; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + id = __tb_service_match(dev, &driver->driver); + + return driver->probe(svc, id); +} + +static int tb_service_remove(struct device *dev) +{ + struct tb_service *svc = tb_to_service(dev); + struct tb_service_driver *driver; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + if (driver->remove) + driver->remove(svc); + + return 0; +} + +static void tb_service_shutdown(struct device *dev) +{ + struct tb_service_driver *driver; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc || !dev->driver) + return; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + if (driver->shutdown) + driver->shutdown(svc); +} + static const char * const tb_security_names[] = { [TB_SECURITY_NONE] = "none", [TB_SECURITY_USER] = "user", @@ -52,6 +144,10 @@ static const struct attribute_group *domain_attr_groups[] = { struct bus_type tb_bus_type = { .name = "thunderbolt", + .match = tb_service_match, + .probe = tb_service_probe, + .remove = tb_service_remove, + .shutdown = tb_service_shutdown, }; static void tb_domain_release(struct device *dev) @@ -128,17 +224,26 @@ err_free: return NULL; } -static void tb_domain_event_cb(void *data, enum tb_cfg_pkg_type type, +static bool tb_domain_event_cb(void *data, enum tb_cfg_pkg_type type, const void *buf, size_t size) { struct tb *tb = data; if (!tb->cm_ops->handle_event) { tb_warn(tb, "domain does not have event handler\n"); - return; + return true; } - tb->cm_ops->handle_event(tb, type, buf, size); + switch (type) { + case TB_CFG_PKG_XDOMAIN_REQ: + case TB_CFG_PKG_XDOMAIN_RESP: + return tb_xdomain_handle_request(tb, type, buf, size); + + default: + tb->cm_ops->handle_event(tb, type, buf, size); + } + + return true; } /** @@ -443,9 +548,92 @@ int 
tb_domain_disconnect_pcie_paths(struct tb *tb) return tb->cm_ops->disconnect_pcie_paths(tb); } +/** + * tb_domain_approve_xdomain_paths() - Enable DMA paths for XDomain + * @tb: Domain enabling the DMA paths + * @xd: XDomain DMA paths are created to + * + * Calls connection manager specific method to enable DMA paths to the + * XDomain in question. + * + * Return: %0 in case of success and negative errno otherwise. In + * particular returns %-ENOTSUPP if the connection manager + * implementation does not support XDomains. + */ +int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + if (!tb->cm_ops->approve_xdomain_paths) + return -ENOTSUPP; + + return tb->cm_ops->approve_xdomain_paths(tb, xd); +} + +/** + * tb_domain_disconnect_xdomain_paths() - Disable DMA paths for XDomain + * @tb: Domain disabling the DMA paths + * @xd: XDomain whose DMA paths are disconnected + * + * Calls connection manager specific method to disconnect DMA paths to + * the XDomain in question. + * + * Return: %0 in case of success and negative errno otherwise. In + * particular returns %-ENOTSUPP if the connection manager + * implementation does not support XDomains. + */ +int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + if (!tb->cm_ops->disconnect_xdomain_paths) + return -ENOTSUPP; + + return tb->cm_ops->disconnect_xdomain_paths(tb, xd); +} + +static int disconnect_xdomain(struct device *dev, void *data) +{ + struct tb_xdomain *xd; + struct tb *tb = data; + int ret = 0; + + xd = tb_to_xdomain(dev); + if (xd && xd->tb == tb) + ret = tb_xdomain_disable_paths(xd); + + return ret; +} + +/** + * tb_domain_disconnect_all_paths() - Disconnect all paths for the domain + * @tb: Domain whose paths are disconnected + * + * This function can be used to disconnect all paths (PCIe, XDomain) for + * example in preparation for host NVM firmware upgrade. After this is + * called the paths cannot be established without resetting the switch. + * + * Return: %0 in case of success and negative errno otherwise.
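
As a usage note, a hypothetical caller of the two wrappers above (the xd->transmit_path and related fields are assumed to have been filled in beforehand, which is what icm_fr_approve_xdomain_paths() in icm.c below expects):

    static int example_enable_paths(struct tb_xdomain *xd)
    {
            int ret = tb_domain_approve_xdomain_paths(xd->tb, xd);

            if (ret == -ENOTSUPP)
                    dev_warn(&xd->dev,
                             "connection manager has no XDomain support\n");
            return ret;
    }
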
+ */ +int tb_domain_disconnect_all_paths(struct tb *tb) +{ + int ret; + + ret = tb_domain_disconnect_pcie_paths(tb); + if (ret) + return ret; + + return bus_for_each_dev(&tb_bus_type, NULL, tb, disconnect_xdomain); +} + int tb_domain_init(void) { - return bus_register(&tb_bus_type); + int ret; + + ret = tb_xdomain_init(); + if (ret) + return ret; + ret = bus_register(&tb_bus_type); + if (ret) + tb_xdomain_exit(); + + return ret; } void tb_domain_exit(void) @@ -453,4 +641,5 @@ void tb_domain_exit(void) bus_unregister(&tb_bus_type); ida_destroy(&tb_domain_ida); tb_switch_exit(); + tb_xdomain_exit(); } diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index 8c22b91ed040..ab02d13f40b7 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -60,6 +60,8 @@ * @get_route: Find a route string for given switch * @device_connected: Handle device connected ICM message * @device_disconnected: Handle device disconnected ICM message + * @xdomain_connected - Handle XDomain connected ICM message + * @xdomain_disconnected - Handle XDomain disconnected ICM message */ struct icm { struct mutex request_lock; @@ -74,6 +76,10 @@ struct icm { const struct icm_pkg_header *hdr); void (*device_disconnected)(struct tb *tb, const struct icm_pkg_header *hdr); + void (*xdomain_connected)(struct tb *tb, + const struct icm_pkg_header *hdr); + void (*xdomain_disconnected)(struct tb *tb, + const struct icm_pkg_header *hdr); }; struct icm_notification { @@ -89,7 +95,10 @@ static inline struct tb *icm_to_tb(struct icm *icm) static inline u8 phy_port_from_route(u64 route, u8 depth) { - return tb_phy_port_from_link(route >> ((depth - 1) * 8)); + u8 link; + + link = depth ? route >> ((depth - 1) * 8) : route; + return tb_phy_port_from_link(link); } static inline u8 dual_link_from_link(u8 link) @@ -320,6 +329,51 @@ static int icm_fr_challenge_switch_key(struct tb *tb, struct tb_switch *sw, return 0; } +static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + struct icm_fr_pkg_approve_xdomain_response reply; + struct icm_fr_pkg_approve_xdomain request; + int ret; + + memset(&request, 0, sizeof(request)); + request.hdr.code = ICM_APPROVE_XDOMAIN; + request.link_info = xd->depth << ICM_LINK_INFO_DEPTH_SHIFT | xd->link; + memcpy(&request.remote_uuid, xd->remote_uuid, sizeof(*xd->remote_uuid)); + + request.transmit_path = xd->transmit_path; + request.transmit_ring = xd->transmit_ring; + request.receive_path = xd->receive_path; + request.receive_ring = xd->receive_ring; + + memset(&reply, 0, sizeof(reply)); + ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply), + 1, ICM_TIMEOUT); + if (ret) + return ret; + + if (reply.hdr.flags & ICM_FLAGS_ERROR) + return -EIO; + + return 0; +} + +static int icm_fr_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + u8 phy_port; + u8 cmd; + + phy_port = tb_phy_port_from_link(xd->link); + if (phy_port == 0) + cmd = NHI_MAILBOX_DISCONNECT_PA; + else + cmd = NHI_MAILBOX_DISCONNECT_PB; + + nhi_mailbox_cmd(tb->nhi, cmd, 1); + usleep_range(10, 50); + nhi_mailbox_cmd(tb->nhi, cmd, 2); + return 0; +} + static void remove_switch(struct tb_switch *sw) { struct tb_switch *parent_sw; @@ -475,6 +529,141 @@ icm_fr_device_disconnected(struct tb *tb, const struct icm_pkg_header *hdr) tb_switch_put(sw); } +static void remove_xdomain(struct tb_xdomain *xd) +{ + struct tb_switch *sw; + + sw = tb_to_switch(xd->dev.parent); + tb_port_at(xd->route, sw)->xdomain = NULL; + tb_xdomain_remove(xd); +} + +static void 
+icm_fr_xdomain_connected(struct tb *tb, const struct icm_pkg_header *hdr) +{ + const struct icm_fr_event_xdomain_connected *pkg = + (const struct icm_fr_event_xdomain_connected *)hdr; + struct tb_xdomain *xd; + struct tb_switch *sw; + u8 link, depth; + bool approved; + u64 route; + + /* + * After NVM upgrade adding root switch device fails because we + * initiated reset. During that time ICM might still send + * XDomain connected message which we ignore here. + */ + if (!tb->root_switch) + return; + + link = pkg->link_info & ICM_LINK_INFO_LINK_MASK; + depth = (pkg->link_info & ICM_LINK_INFO_DEPTH_MASK) >> + ICM_LINK_INFO_DEPTH_SHIFT; + approved = pkg->link_info & ICM_LINK_INFO_APPROVED; + + if (link > ICM_MAX_LINK || depth > ICM_MAX_DEPTH) { + tb_warn(tb, "invalid topology %u.%u, ignoring\n", link, depth); + return; + } + + route = get_route(pkg->local_route_hi, pkg->local_route_lo); + + xd = tb_xdomain_find_by_uuid(tb, &pkg->remote_uuid); + if (xd) { + u8 xd_phy_port, phy_port; + + xd_phy_port = phy_port_from_route(xd->route, xd->depth); + phy_port = phy_port_from_route(route, depth); + + if (xd->depth == depth && xd_phy_port == phy_port) { + xd->link = link; + xd->route = route; + xd->is_unplugged = false; + tb_xdomain_put(xd); + return; + } + + /* + * If we find an existing XDomain connection remove it + * now. We need to go through login handshake and + * everything anyway to be able to re-establish the + * connection. + */ + remove_xdomain(xd); + tb_xdomain_put(xd); + } + + /* + * Look if there already exists an XDomain in the same place + * than the new one and in that case remove it because it is + * most likely another host that got disconnected. + */ + xd = tb_xdomain_find_by_link_depth(tb, link, depth); + if (!xd) { + u8 dual_link; + + dual_link = dual_link_from_link(link); + if (dual_link) + xd = tb_xdomain_find_by_link_depth(tb, dual_link, + depth); + } + if (xd) { + remove_xdomain(xd); + tb_xdomain_put(xd); + } + + /* + * If the user disconnected a switch during suspend and + * connected another host to the same port, remove the switch + * first. + */ + sw = get_switch_at_route(tb->root_switch, route); + if (sw) + remove_switch(sw); + + sw = tb_switch_find_by_link_depth(tb, link, depth); + if (!sw) { + tb_warn(tb, "no switch exists at %u.%u, ignoring\n", link, + depth); + return; + } + + xd = tb_xdomain_alloc(sw->tb, &sw->dev, route, + &pkg->local_uuid, &pkg->remote_uuid); + if (!xd) { + tb_switch_put(sw); + return; + } + + xd->link = link; + xd->depth = depth; + + tb_port_at(route, sw)->xdomain = xd; + + tb_xdomain_add(xd); + tb_switch_put(sw); +} + +static void +icm_fr_xdomain_disconnected(struct tb *tb, const struct icm_pkg_header *hdr) +{ + const struct icm_fr_event_xdomain_disconnected *pkg = + (const struct icm_fr_event_xdomain_disconnected *)hdr; + struct tb_xdomain *xd; + + /* + * If the connection is through one or multiple devices, the + * XDomain device is removed along with them so it is fine if we + * cannot find it here. 
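
A worked example of the route arithmetic used in icm_fr_xdomain_connected() above, with TB_LINKS_PER_PHY_PORT being 2 (values chosen only for illustration):

    u64 route = 0x0301;     /* hop 1 on link 1, hop 2 on link 3 */
    u8 depth = 2;
    u8 link = route >> ((depth - 1) * 8);                /* 0x03 */
    unsigned int phy_port = tb_phy_port_from_link(link); /* (3 - 1) / 2 == 1 */

Links 3 and 4 are the two links of physical port 1, which is why the lookup above retries with dual_link_from_link(link) before giving up.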
+ */ + xd = tb_xdomain_find_by_uuid(tb, &pkg->remote_uuid); + if (xd) { + remove_xdomain(xd); + tb_xdomain_put(xd); + } +} + static struct pci_dev *get_upstream_port(struct pci_dev *pdev) { struct pci_dev *parent; @@ -594,6 +783,12 @@ static void icm_handle_notification(struct work_struct *work) case ICM_EVENT_DEVICE_DISCONNECTED: icm->device_disconnected(tb, n->pkg); break; + case ICM_EVENT_XDOMAIN_CONNECTED: + icm->xdomain_connected(tb, n->pkg); + break; + case ICM_EVENT_XDOMAIN_DISCONNECTED: + icm->xdomain_disconnected(tb, n->pkg); + break; } mutex_unlock(&tb->lock); @@ -927,6 +1122,10 @@ static void icm_unplug_children(struct tb_switch *sw) if (tb_is_upstream_port(port)) continue; + if (port->xdomain) { + port->xdomain->is_unplugged = true; + continue; + } if (!port->remote) continue; @@ -943,6 +1142,13 @@ static void icm_free_unplugged_children(struct tb_switch *sw) if (tb_is_upstream_port(port)) continue; + + if (port->xdomain && port->xdomain->is_unplugged) { + tb_xdomain_remove(port->xdomain); + port->xdomain = NULL; + continue; + } + if (!port->remote) continue; @@ -1009,8 +1215,10 @@ static int icm_start(struct tb *tb) tb->root_switch->no_nvm_upgrade = x86_apple_machine; ret = tb_switch_add(tb->root_switch); - if (ret) + if (ret) { tb_switch_put(tb->root_switch); + tb->root_switch = NULL; + } return ret; } @@ -1042,6 +1250,8 @@ static const struct tb_cm_ops icm_fr_ops = { .add_switch_key = icm_fr_add_switch_key, .challenge_switch_key = icm_fr_challenge_switch_key, .disconnect_pcie_paths = icm_disconnect_pcie_paths, + .approve_xdomain_paths = icm_fr_approve_xdomain_paths, + .disconnect_xdomain_paths = icm_fr_disconnect_xdomain_paths, }; struct tb *icm_probe(struct tb_nhi *nhi) @@ -1064,6 +1274,8 @@ struct tb *icm_probe(struct tb_nhi *nhi) icm->get_route = icm_fr_get_route; icm->device_connected = icm_fr_device_connected; icm->device_disconnected = icm_fr_device_disconnected; + icm->xdomain_connected = icm_fr_xdomain_connected; + icm->xdomain_disconnected = icm_fr_xdomain_disconnected; tb->cm_ops = &icm_fr_ops; break; @@ -1077,6 +1289,8 @@ struct tb *icm_probe(struct tb_nhi *nhi) icm->get_route = icm_ar_get_route; icm->device_connected = icm_fr_device_connected; icm->device_disconnected = icm_fr_device_disconnected; + icm->xdomain_connected = icm_fr_xdomain_connected; + icm->xdomain_disconnected = icm_fr_xdomain_disconnected; tb->cm_ops = &icm_fr_ops; break; } diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 5b5bb2c436be..0e05828983db 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -157,6 +157,8 @@ enum nhi_mailbox_cmd { NHI_MAILBOX_SAVE_DEVS = 0x05, NHI_MAILBOX_DISCONNECT_PCIE_PATHS = 0x06, NHI_MAILBOX_DRV_UNLOADS = 0x07, + NHI_MAILBOX_DISCONNECT_PA = 0x10, + NHI_MAILBOX_DISCONNECT_PB = 0x11, NHI_MAILBOX_ALLOW_ALL_DEVS = 0x23, }; diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 53f40c57df59..dfc357d33e1e 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -171,11 +171,11 @@ static int nvm_authenticate_host(struct tb_switch *sw) /* * Root switch NVM upgrade requires that we disconnect the - * existing PCIe paths first (in case it is not in safe mode + * existing paths first (in case it is not in safe mode * already). 
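
For orientation, the new hooks are plugged into the connection manager operations as shown in the icm_fr_ops update above; the matching tb_cm_ops fields appear in the tb.h hunk below. A connection manager that cannot tunnel XDomain traffic simply leaves them unset, and the tb_domain_*_xdomain_paths() wrappers then return -ENOTSUPP. Condensed sketch:

    static const struct tb_cm_ops example_cm_ops = {
            /* ... existing hooks elided ... */
            .approve_xdomain_paths = icm_fr_approve_xdomain_paths,
            .disconnect_xdomain_paths = icm_fr_disconnect_xdomain_paths,
    };
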
*/ if (!sw->safe_mode) { - ret = tb_domain_disconnect_pcie_paths(sw->tb); + ret = tb_domain_disconnect_all_paths(sw->tb); if (ret) return ret; /* @@ -1363,6 +1363,9 @@ void tb_switch_remove(struct tb_switch *sw) if (sw->ports[i].remote) tb_switch_remove(sw->ports[i].remote->sw); sw->ports[i].remote = NULL; + if (sw->ports[i].xdomain) + tb_xdomain_remove(sw->ports[i].xdomain); + sw->ports[i].xdomain = NULL; } if (!sw->is_unplugged) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index ea21d927bd09..74af9d4929ab 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -9,6 +9,7 @@ #include #include +#include #include #include "tb_regs.h" @@ -109,14 +110,25 @@ struct tb_switch { /** * struct tb_port - a thunderbolt port, part of a tb_switch + * @config: Cached port configuration read from registers + * @sw: Switch the port belongs to + * @remote: Remote port (%NULL if not connected) + * @xdomain: Remote host (%NULL if not connected) + * @cap_phy: Offset, zero if not found + * @port: Port number on switch + * @disabled: Disabled by eeprom + * @dual_link_port: If the switch is connected using two ports, points + * to the other port. + * @link_nr: Is this primary or secondary port on the dual_link. */ struct tb_port { struct tb_regs_port_header config; struct tb_switch *sw; - struct tb_port *remote; /* remote port, NULL if not connected */ - int cap_phy; /* offset, zero if not found */ - u8 port; /* port number on switch */ - bool disabled; /* disabled by eeprom */ + struct tb_port *remote; + struct tb_xdomain *xdomain; + int cap_phy; + u8 port; + bool disabled; struct tb_port *dual_link_port; u8 link_nr:1; }; @@ -189,6 +201,8 @@ struct tb_path { * @add_switch_key: Add key to switch * @challenge_switch_key: Challenge switch using key * @disconnect_pcie_paths: Disconnects PCIe paths before NVM update + * @approve_xdomain_paths: Approve (establish) XDomain DMA paths + * @disconnect_xdomain_paths: Disconnect XDomain DMA paths */ struct tb_cm_ops { int (*driver_ready)(struct tb *tb); @@ -205,6 +219,8 @@ struct tb_cm_ops { int (*challenge_switch_key)(struct tb *tb, struct tb_switch *sw, const u8 *challenge, u8 *response); int (*disconnect_pcie_paths)(struct tb *tb); + int (*approve_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd); + int (*disconnect_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd); }; static inline void *tb_priv(struct tb *tb) @@ -331,6 +347,8 @@ extern struct device_type tb_switch_type; int tb_domain_init(void); void tb_domain_exit(void); void tb_switch_exit(void); +int tb_xdomain_init(void); +void tb_xdomain_exit(void); struct tb *tb_domain_alloc(struct tb_nhi *nhi, size_t privsize); int tb_domain_add(struct tb *tb); @@ -343,6 +361,9 @@ int tb_domain_approve_switch(struct tb *tb, struct tb_switch *sw); int tb_domain_approve_switch_key(struct tb *tb, struct tb_switch *sw); int tb_domain_challenge_switch_key(struct tb *tb, struct tb_switch *sw); int tb_domain_disconnect_pcie_paths(struct tb *tb); +int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd); +int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd); +int tb_domain_disconnect_all_paths(struct tb *tb); static inline void tb_domain_put(struct tb *tb) { @@ -422,4 +443,14 @@ static inline u64 tb_downstream_route(struct tb_port *port) | ((u64) port->port << (port->sw->config.depth * 8)); } +bool tb_xdomain_handle_request(struct tb *tb, enum tb_cfg_pkg_type type, + const void *buf, size_t size); +struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, 
struct device *parent, + u64 route, const uuid_t *local_uuid, + const uuid_t *remote_uuid); +void tb_xdomain_add(struct tb_xdomain *xd); +void tb_xdomain_remove(struct tb_xdomain *xd); +struct tb_xdomain *tb_xdomain_find_by_link_depth(struct tb *tb, u8 link, + u8 depth); + #endif diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h index f2b2550cd97c..b0a092baa605 100644 --- a/drivers/thunderbolt/tb_msgs.h +++ b/drivers/thunderbolt/tb_msgs.h @@ -101,11 +101,14 @@ enum icm_pkg_code { ICM_CHALLENGE_DEVICE = 0x5, ICM_ADD_DEVICE_KEY = 0x6, ICM_GET_ROUTE = 0xa, + ICM_APPROVE_XDOMAIN = 0x10, }; enum icm_event_code { ICM_EVENT_DEVICE_CONNECTED = 3, ICM_EVENT_DEVICE_DISCONNECTED = 4, + ICM_EVENT_XDOMAIN_CONNECTED = 6, + ICM_EVENT_XDOMAIN_DISCONNECTED = 7, }; struct icm_pkg_header { @@ -188,6 +191,25 @@ struct icm_fr_event_device_disconnected { u16 link_info; }; +struct icm_fr_event_xdomain_connected { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + uuid_t local_uuid; + u32 local_route_hi; + u32 local_route_lo; + u32 remote_route_hi; + u32 remote_route_lo; +}; + +struct icm_fr_event_xdomain_disconnected { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; +}; + struct icm_fr_pkg_add_device_key { struct icm_pkg_header hdr; uuid_t ep_uuid; @@ -224,6 +246,28 @@ struct icm_fr_pkg_challenge_device_response { u32 response[8]; }; +struct icm_fr_pkg_approve_xdomain { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + u16 transmit_path; + u16 transmit_ring; + u16 receive_path; + u16 receive_ring; +}; + +struct icm_fr_pkg_approve_xdomain_response { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + u16 transmit_path; + u16 transmit_ring; + u16 receive_path; + u16 receive_ring; +}; + /* Alpine Ridge only messages */ struct icm_ar_pkg_get_route { @@ -240,4 +284,83 @@ struct icm_ar_pkg_get_route_response { u32 route_lo; }; +/* XDomain messages */ + +struct tb_xdomain_header { + u32 route_hi; + u32 route_lo; + u32 length_sn; +}; + +#define TB_XDOMAIN_LENGTH_MASK GENMASK(5, 0) +#define TB_XDOMAIN_SN_MASK GENMASK(28, 27) +#define TB_XDOMAIN_SN_SHIFT 27 + +enum tb_xdp_type { + UUID_REQUEST_OLD = 1, + UUID_RESPONSE = 2, + PROPERTIES_REQUEST, + PROPERTIES_RESPONSE, + PROPERTIES_CHANGED_REQUEST, + PROPERTIES_CHANGED_RESPONSE, + ERROR_RESPONSE, + UUID_REQUEST = 12, +}; + +struct tb_xdp_header { + struct tb_xdomain_header xd_hdr; + uuid_t uuid; + u32 type; +}; + +struct tb_xdp_properties { + struct tb_xdp_header hdr; + uuid_t src_uuid; + uuid_t dst_uuid; + u16 offset; + u16 reserved; +}; + +struct tb_xdp_properties_response { + struct tb_xdp_header hdr; + uuid_t src_uuid; + uuid_t dst_uuid; + u16 offset; + u16 data_length; + u32 generation; + u32 data[0]; +}; + +/* + * Max length of data array single XDomain property response is allowed + * to carry. 
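
A worked example of the length_sn packing defined above: TB_XDOMAIN_SN_MASK covers only bits 28:27, so sequence numbers wrap at four (the request helpers later use retry % 4 for exactly this reason), while the low six bits carry the payload length in dwords:

    u32 length_sn = (2u << TB_XDOMAIN_SN_SHIFT) | 16;   /* seq 2, 16 dwords */
    u8 seq = (length_sn & TB_XDOMAIN_SN_MASK) >> TB_XDOMAIN_SN_SHIFT; /* 2 */
    u32 len = length_sn & TB_XDOMAIN_LENGTH_MASK;                     /* 16 */
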
+ */ +#define TB_XDP_PROPERTIES_MAX_DATA_LENGTH \ + (((256 - 4 - sizeof(struct tb_xdp_properties_response))) / 4) + +/* Maximum size of the total property block in dwords we allow */ +#define TB_XDP_PROPERTIES_MAX_LENGTH 500 + +struct tb_xdp_properties_changed { + struct tb_xdp_header hdr; + uuid_t src_uuid; +}; + +struct tb_xdp_properties_changed_response { + struct tb_xdp_header hdr; +}; + +enum tb_xdp_error { + ERROR_SUCCESS, + ERROR_UNKNOWN_PACKET, + ERROR_UNKNOWN_DOMAIN, + ERROR_NOT_SUPPORTED, + ERROR_NOT_READY, +}; + +struct tb_xdp_error_response { + struct tb_xdp_header hdr; + u32 error; +}; + #endif diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c new file mode 100644 index 000000000000..f2d06f6f7be9 --- /dev/null +++ b/drivers/thunderbolt/xdomain.c @@ -0,0 +1,1576 @@ +/* + * Thunderbolt XDomain discovery protocol support + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include "tb.h" + +#define XDOMAIN_DEFAULT_TIMEOUT 5000 /* ms */ +#define XDOMAIN_PROPERTIES_RETRIES 60 +#define XDOMAIN_PROPERTIES_CHANGED_RETRIES 10 + +struct xdomain_request_work { + struct work_struct work; + struct tb_xdp_header *pkg; + struct tb *tb; +}; + +/* Serializes access to the properties and protocol handlers below */ +static DEFINE_MUTEX(xdomain_lock); + +/* Properties exposed to the remote domains */ +static struct tb_property_dir *xdomain_property_dir; +static u32 *xdomain_property_block; +static u32 xdomain_property_block_len; +static u32 xdomain_property_block_gen; + +/* Additional protocol handlers */ +static LIST_HEAD(protocol_handlers); + +/* UUID for XDomain discovery protocol: b638d70e-42ff-40bb-97c2-90e2c0b2ff07 */ +static const uuid_t tb_xdp_uuid = + UUID_INIT(0xb638d70e, 0x42ff, 0x40bb, + 0x97, 0xc2, 0x90, 0xe2, 0xc0, 0xb2, 0xff, 0x07); + +static bool tb_xdomain_match(const struct tb_cfg_request *req, + const struct ctl_pkg *pkg) +{ + switch (pkg->frame.eof) { + case TB_CFG_PKG_ERROR: + return true; + + case TB_CFG_PKG_XDOMAIN_RESP: { + const struct tb_xdp_header *res_hdr = pkg->buffer; + const struct tb_xdp_header *req_hdr = req->request; + u8 req_seq, res_seq; + + if (pkg->frame.size < req->response_size / 4) + return false; + + /* Make sure route matches */ + if ((res_hdr->xd_hdr.route_hi & ~BIT(31)) != + req_hdr->xd_hdr.route_hi) + return false; + if ((res_hdr->xd_hdr.route_lo) != req_hdr->xd_hdr.route_lo) + return false; + + /* Then check that the sequence number matches */ + res_seq = res_hdr->xd_hdr.length_sn & TB_XDOMAIN_SN_MASK; + res_seq >>= TB_XDOMAIN_SN_SHIFT; + req_seq = req_hdr->xd_hdr.length_sn & TB_XDOMAIN_SN_MASK; + req_seq >>= TB_XDOMAIN_SN_SHIFT; + if (res_seq != req_seq) + return false; + + /* Check that the XDomain protocol matches */ + if (!uuid_equal(&res_hdr->uuid, &req_hdr->uuid)) + return false; + + return true; + } + + default: + return false; + } +} + +static bool tb_xdomain_copy(struct tb_cfg_request *req, + const struct ctl_pkg *pkg) +{ + memcpy(req->response, pkg->buffer, req->response_size); + req->result.err = 0; + return true; +} + +static void response_ready(void *data) +{ + tb_cfg_request_put(data); +} + +static int __tb_xdomain_response(struct tb_ctl *ctl, const void *response, + size_t size, enum tb_cfg_pkg_type type) +{ + 
struct tb_cfg_request *req; + + req = tb_cfg_request_alloc(); + if (!req) + return -ENOMEM; + + req->match = tb_xdomain_match; + req->copy = tb_xdomain_copy; + req->request = response; + req->request_size = size; + req->request_type = type; + + return tb_cfg_request(ctl, req, response_ready, req); +} + +/** + * tb_xdomain_response() - Send an XDomain response message + * @xd: XDomain to send the message + * @response: Response to send + * @size: Size of the response + * @type: PDF type of the response + * + * This can be used to send an XDomain response message to the other + * domain. No response for the message is expected. + * + * Return: %0 in case of success and negative errno in case of failure + */ +int tb_xdomain_response(struct tb_xdomain *xd, const void *response, + size_t size, enum tb_cfg_pkg_type type) +{ + return __tb_xdomain_response(xd->tb->ctl, response, size, type); +} +EXPORT_SYMBOL_GPL(tb_xdomain_response); + +static int __tb_xdomain_request(struct tb_ctl *ctl, const void *request, + size_t request_size, enum tb_cfg_pkg_type request_type, void *response, + size_t response_size, enum tb_cfg_pkg_type response_type, + unsigned int timeout_msec) +{ + struct tb_cfg_request *req; + struct tb_cfg_result res; + + req = tb_cfg_request_alloc(); + if (!req) + return -ENOMEM; + + req->match = tb_xdomain_match; + req->copy = tb_xdomain_copy; + req->request = request; + req->request_size = request_size; + req->request_type = request_type; + req->response = response; + req->response_size = response_size; + req->response_type = response_type; + + res = tb_cfg_request_sync(ctl, req, timeout_msec); + + tb_cfg_request_put(req); + + return res.err == 1 ? -EIO : res.err; +} + +/** + * tb_xdomain_request() - Send an XDomain request + * @xd: XDomain to send the request + * @request: Request to send + * @request_size: Size of the request in bytes + * @request_type: PDF type of the request + * @response: Response is copied here + * @response_size: Expected size of the response in bytes + * @response_type: Expected PDF type of the response + * @timeout_msec: Timeout in milliseconds to wait for the response + * + * This function can be used to send XDomain control channel messages to + * the other domain. The function waits until the response is received + * or the timeout triggers, whichever comes first.
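
A hypothetical service-side call of this helper, assuming request and response structures that embed struct tb_xdp_header first, as the protocol requires (my_req/my_res are made-up names):

    struct my_req req;      /* starts with struct tb_xdp_header */
    struct my_res res;
    int ret;

    ret = tb_xdomain_request(xd, &req, sizeof(req), TB_CFG_PKG_XDOMAIN_REQ,
                             &res, sizeof(res), TB_CFG_PKG_XDOMAIN_RESP,
                             5000 /* ms */);
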
+ * + * Return: %0 in case of success and negative errno in case of failure + */ +int tb_xdomain_request(struct tb_xdomain *xd, const void *request, + size_t request_size, enum tb_cfg_pkg_type request_type, + void *response, size_t response_size, + enum tb_cfg_pkg_type response_type, unsigned int timeout_msec) +{ + return __tb_xdomain_request(xd->tb->ctl, request, request_size, + request_type, response, response_size, + response_type, timeout_msec); +} +EXPORT_SYMBOL_GPL(tb_xdomain_request); + +static inline void tb_xdp_fill_header(struct tb_xdp_header *hdr, u64 route, + u8 sequence, enum tb_xdp_type type, size_t size) +{ + u32 length_sn; + + length_sn = (size - sizeof(hdr->xd_hdr)) / 4; + length_sn |= (sequence << TB_XDOMAIN_SN_SHIFT) & TB_XDOMAIN_SN_MASK; + + hdr->xd_hdr.route_hi = upper_32_bits(route); + hdr->xd_hdr.route_lo = lower_32_bits(route); + hdr->xd_hdr.length_sn = length_sn; + hdr->type = type; + memcpy(&hdr->uuid, &tb_xdp_uuid, sizeof(tb_xdp_uuid)); +} + +static int tb_xdp_handle_error(const struct tb_xdp_header *hdr) +{ + const struct tb_xdp_error_response *error; + + if (hdr->type != ERROR_RESPONSE) + return 0; + + error = (const struct tb_xdp_error_response *)hdr; + + switch (error->error) { + case ERROR_UNKNOWN_PACKET: + case ERROR_UNKNOWN_DOMAIN: + return -EIO; + case ERROR_NOT_SUPPORTED: + return -ENOTSUPP; + case ERROR_NOT_READY: + return -EAGAIN; + default: + break; + } + + return 0; +} + +static int tb_xdp_error_response(struct tb_ctl *ctl, u64 route, u8 sequence, + enum tb_xdp_error error) +{ + struct tb_xdp_error_response res; + + memset(&res, 0, sizeof(res)); + tb_xdp_fill_header(&res.hdr, route, sequence, ERROR_RESPONSE, + sizeof(res)); + res.error = error; + + return __tb_xdomain_response(ctl, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP); +} + +static int tb_xdp_properties_request(struct tb_ctl *ctl, u64 route, + const uuid_t *src_uuid, const uuid_t *dst_uuid, int retry, + u32 **block, u32 *generation) +{ + struct tb_xdp_properties_response *res; + struct tb_xdp_properties req; + u16 data_len, len; + size_t total_size; + u32 *data = NULL; + int ret; + + total_size = sizeof(*res) + TB_XDP_PROPERTIES_MAX_DATA_LENGTH * 4; + res = kzalloc(total_size, GFP_KERNEL); + if (!res) + return -ENOMEM; + + memset(&req, 0, sizeof(req)); + tb_xdp_fill_header(&req.hdr, route, retry % 4, PROPERTIES_REQUEST, + sizeof(req)); + memcpy(&req.src_uuid, src_uuid, sizeof(*src_uuid)); + memcpy(&req.dst_uuid, dst_uuid, sizeof(*dst_uuid)); + + len = 0; + data_len = 0; + + do { + ret = __tb_xdomain_request(ctl, &req, sizeof(req), + TB_CFG_PKG_XDOMAIN_REQ, res, + total_size, TB_CFG_PKG_XDOMAIN_RESP, + XDOMAIN_DEFAULT_TIMEOUT); + if (ret) + goto err; + + ret = tb_xdp_handle_error(&res->hdr); + if (ret) + goto err; + + /* + * Package length includes the whole payload without the + * XDomain header. Validate first that the package is at + * least size of the response structure. + */ + len = res->hdr.xd_hdr.length_sn & TB_XDOMAIN_LENGTH_MASK; + if (len < sizeof(*res) / 4) { + ret = -EINVAL; + goto err; + } + + len += sizeof(res->hdr.xd_hdr) / 4; + len -= sizeof(*res) / 4; + + if (res->offset != req.offset) { + ret = -EINVAL; + goto err; + } + + /* + * First time allocate block that has enough space for + * the whole properties block. 
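
Schematically, the retrieval loop around this point works as follows: the first response supplies the total size in res->data_length, and each pass copies len dwords at req.offset until the whole block has been transferred (condensed from the surrounding code, error handling omitted):

    do {
            /* one XDomain request per chunk; res->offset must echo
             * req.offset or the response is rejected */
            memcpy(data + req.offset, res->data, len * 4);
            req.offset += len;
    } while (!data_len || req.offset < data_len);
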
+ */ + if (!data) { + data_len = res->data_length; + if (data_len > TB_XDP_PROPERTIES_MAX_LENGTH) { + ret = -E2BIG; + goto err; + } + + data = kcalloc(data_len, sizeof(u32), GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto err; + } + } + + memcpy(data + req.offset, res->data, len * 4); + req.offset += len; + } while (!data_len || req.offset < data_len); + + *block = data; + *generation = res->generation; + + kfree(res); + + return data_len; + +err: + kfree(data); + kfree(res); + + return ret; +} + +static int tb_xdp_properties_response(struct tb *tb, struct tb_ctl *ctl, + u64 route, u8 sequence, const uuid_t *src_uuid, + const struct tb_xdp_properties *req) +{ + struct tb_xdp_properties_response *res; + size_t total_size; + u16 len; + int ret; + + /* + * Currently we expect all requests to be directed to us. The + * protocol supports forwarding, though which we might add + * support later on. + */ + if (!uuid_equal(src_uuid, &req->dst_uuid)) { + tb_xdp_error_response(ctl, route, sequence, + ERROR_UNKNOWN_DOMAIN); + return 0; + } + + mutex_lock(&xdomain_lock); + + if (req->offset >= xdomain_property_block_len) { + mutex_unlock(&xdomain_lock); + return -EINVAL; + } + + len = xdomain_property_block_len - req->offset; + len = min_t(u16, len, TB_XDP_PROPERTIES_MAX_DATA_LENGTH); + total_size = sizeof(*res) + len * 4; + + res = kzalloc(total_size, GFP_KERNEL); + if (!res) { + mutex_unlock(&xdomain_lock); + return -ENOMEM; + } + + tb_xdp_fill_header(&res->hdr, route, sequence, PROPERTIES_RESPONSE, + total_size); + res->generation = xdomain_property_block_gen; + res->data_length = xdomain_property_block_len; + res->offset = req->offset; + uuid_copy(&res->src_uuid, src_uuid); + uuid_copy(&res->dst_uuid, &req->src_uuid); + memcpy(res->data, &xdomain_property_block[req->offset], len * 4); + + mutex_unlock(&xdomain_lock); + + ret = __tb_xdomain_response(ctl, res, total_size, + TB_CFG_PKG_XDOMAIN_RESP); + + kfree(res); + return ret; +} + +static int tb_xdp_properties_changed_request(struct tb_ctl *ctl, u64 route, + int retry, const uuid_t *uuid) +{ + struct tb_xdp_properties_changed_response res; + struct tb_xdp_properties_changed req; + int ret; + + memset(&req, 0, sizeof(req)); + tb_xdp_fill_header(&req.hdr, route, retry % 4, + PROPERTIES_CHANGED_REQUEST, sizeof(req)); + uuid_copy(&req.src_uuid, uuid); + + memset(&res, 0, sizeof(res)); + ret = __tb_xdomain_request(ctl, &req, sizeof(req), + TB_CFG_PKG_XDOMAIN_REQ, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP, + XDOMAIN_DEFAULT_TIMEOUT); + if (ret) + return ret; + + return tb_xdp_handle_error(&res.hdr); +} + +static int +tb_xdp_properties_changed_response(struct tb_ctl *ctl, u64 route, u8 sequence) +{ + struct tb_xdp_properties_changed_response res; + + memset(&res, 0, sizeof(res)); + tb_xdp_fill_header(&res.hdr, route, sequence, + PROPERTIES_CHANGED_RESPONSE, sizeof(res)); + return __tb_xdomain_response(ctl, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP); +} + +/** + * tb_register_protocol_handler() - Register protocol handler + * @handler: Handler to register + * + * This allows XDomain service drivers to hook into incoming XDomain + * messages. After this function is called the service driver needs to + * be able to handle calls to callback whenever a package with the + * registered protocol is received. 
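
A registration sketch for the function defined just below (the UUID and names are hypothetical, and the callback prototype is assumed from the handler's declaration in thunderbolt.h; the uuid, callback and list members are the ones used here):

    static const uuid_t my_proto_uuid =
            UUID_INIT(0x12345678, 0xabcd, 0xef01,
                      0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef);

    /* prototype assumed; see struct tb_protocol_handler in thunderbolt.h */
    static int my_proto_callback(const void *buf, size_t size, void *data)
    {
            return 0;       /* packet consumed */
    }

    static struct tb_protocol_handler my_handler = {
            .uuid = &my_proto_uuid,
            .callback = my_proto_callback,
    };

    ret = tb_register_protocol_handler(&my_handler);
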
+ */ +int tb_register_protocol_handler(struct tb_protocol_handler *handler) +{ + if (!handler->uuid || !handler->callback) + return -EINVAL; + if (uuid_equal(handler->uuid, &tb_xdp_uuid)) + return -EINVAL; + + mutex_lock(&xdomain_lock); + list_add_tail(&handler->list, &protocol_handlers); + mutex_unlock(&xdomain_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(tb_register_protocol_handler); + +/** + * tb_unregister_protocol_handler() - Unregister protocol handler + * @handler: Handler to unregister + * + * Removes the previously registered protocol handler. + */ +void tb_unregister_protocol_handler(struct tb_protocol_handler *handler) +{ + mutex_lock(&xdomain_lock); + list_del_init(&handler->list); + mutex_unlock(&xdomain_lock); +} +EXPORT_SYMBOL_GPL(tb_unregister_protocol_handler); + +static void tb_xdp_handle_request(struct work_struct *work) +{ + struct xdomain_request_work *xw = container_of(work, typeof(*xw), work); + const struct tb_xdp_header *pkg = xw->pkg; + const struct tb_xdomain_header *xhdr = &pkg->xd_hdr; + struct tb *tb = xw->tb; + struct tb_ctl *ctl = tb->ctl; + const uuid_t *uuid; + int ret = 0; + u8 sequence; + u64 route; + + route = ((u64)xhdr->route_hi << 32 | xhdr->route_lo) & ~BIT_ULL(63); + sequence = xhdr->length_sn & TB_XDOMAIN_SN_MASK; + sequence >>= TB_XDOMAIN_SN_SHIFT; + + mutex_lock(&tb->lock); + if (tb->root_switch) + uuid = tb->root_switch->uuid; + else + uuid = NULL; + mutex_unlock(&tb->lock); + + if (!uuid) { + tb_xdp_error_response(ctl, route, sequence, ERROR_NOT_READY); + goto out; + } + + switch (pkg->type) { + case PROPERTIES_REQUEST: + ret = tb_xdp_properties_response(tb, ctl, route, sequence, uuid, + (const struct tb_xdp_properties *)pkg); + break; + + case PROPERTIES_CHANGED_REQUEST: { + const struct tb_xdp_properties_changed *xchg = + (const struct tb_xdp_properties_changed *)pkg; + struct tb_xdomain *xd; + + ret = tb_xdp_properties_changed_response(ctl, route, sequence); + + /* + * Since the properties have been changed, let's update + * the xdomain related to this connection as well in + * case there is a change in services it offers. + */ + xd = tb_xdomain_find_by_uuid_locked(tb, &xchg->src_uuid); + if (xd) { + queue_delayed_work(tb->wq, &xd->get_properties_work, + msecs_to_jiffies(50)); + tb_xdomain_put(xd); + } + + break; + } + + default: + break; + } + + if (ret) { + tb_warn(tb, "failed to send XDomain response for %#x\n", + pkg->type); + } + +out: + kfree(xw->pkg); + kfree(xw); +} + +static void +tb_xdp_schedule_request(struct tb *tb, const struct tb_xdp_header *hdr, + size_t size) +{ + struct xdomain_request_work *xw; + + xw = kmalloc(sizeof(*xw), GFP_KERNEL); + if (!xw) + return; + + INIT_WORK(&xw->work, tb_xdp_handle_request); + xw->pkg = kmemdup(hdr, size, GFP_KERNEL); + xw->tb = tb; + + queue_work(tb->wq, &xw->work); +} + +/** + * tb_register_service_driver() - Register XDomain service driver + * @drv: Driver to register + * + * Registers new service driver from @drv to the bus. + */ +int tb_register_service_driver(struct tb_service_driver *drv) +{ + drv->driver.bus = &tb_bus_type; + return driver_register(&drv->driver); +} +EXPORT_SYMBOL_GPL(tb_register_service_driver); + +/** + * tb_unregister_service_driver() - Unregister XDomain service driver + * @xdrv: Driver to unregister + * + * Unregisters XDomain service driver from the bus. 
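
Finally, a skeleton of the kind of service driver these helpers register (all names hypothetical; the id-table fields mirror match_service_id() in domain.c, and the remove() prototype is assumed to return void since tb_service_remove() above ignores it):

    static int my_probe(struct tb_service *svc, const struct tb_service_id *id)
    {
            return 0;       /* bind to the service */
    }

    static void my_remove(struct tb_service *svc)
    {
    }

    static const struct tb_service_id my_ids[] = {
            { .match_flags = TBSVC_MATCH_PROTOCOL_KEY,
              .protocol_key = "network" }, /* e.g. ThunderboltIP, per the ABI doc */
            { },
    };

    static struct tb_service_driver my_driver = {
            .driver = { .name = "my-tb-service" },
            .probe = my_probe,
            .remove = my_remove,
            .id_table = my_ids,
    };

It would be registered with tb_register_service_driver(&my_driver) and matched against the modalias format tbsvc:kSpNvNrN described in the ABI documentation above.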
+ */ +void tb_unregister_service_driver(struct tb_service_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL_GPL(tb_unregister_service_driver); + +static ssize_t key_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + /* + * It should be null terminated but anything else is pretty much + * allowed. + */ + return sprintf(buf, "%*pEp\n", (int)strlen(svc->key), svc->key); +} +static DEVICE_ATTR_RO(key); + +static int get_modalias(struct tb_service *svc, char *buf, size_t size) +{ + return snprintf(buf, size, "tbsvc:k%sp%08Xv%08Xr%08X", svc->key, + svc->prtcid, svc->prtcvers, svc->prtcrevs); +} + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + /* Full buffer size except new line and null termination */ + get_modalias(svc, buf, PAGE_SIZE - 2); + return sprintf(buf, "%s\n", buf); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t prtcid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcid); +} +static DEVICE_ATTR_RO(prtcid); + +static ssize_t prtcvers_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcvers); +} +static DEVICE_ATTR_RO(prtcvers); + +static ssize_t prtcrevs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcrevs); +} +static DEVICE_ATTR_RO(prtcrevs); + +static ssize_t prtcstns_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "0x%08x\n", svc->prtcstns); +} +static DEVICE_ATTR_RO(prtcstns); + +static struct attribute *tb_service_attrs[] = { + &dev_attr_key.attr, + &dev_attr_modalias.attr, + &dev_attr_prtcid.attr, + &dev_attr_prtcvers.attr, + &dev_attr_prtcrevs.attr, + &dev_attr_prtcstns.attr, + NULL, +}; + +static struct attribute_group tb_service_attr_group = { + .attrs = tb_service_attrs, +}; + +static const struct attribute_group *tb_service_attr_groups[] = { + &tb_service_attr_group, + NULL, +}; + +static int tb_service_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + char modalias[64]; + + get_modalias(svc, modalias, sizeof(modalias)); + return add_uevent_var(env, "MODALIAS=%s", modalias); +} + +static void tb_service_release(struct device *dev) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + struct tb_xdomain *xd = tb_service_parent(svc); + + ida_simple_remove(&xd->service_ids, svc->id); + kfree(svc->key); + kfree(svc); +} + +struct device_type tb_service_type = { + .name = "thunderbolt_service", + .groups = tb_service_attr_groups, + .uevent = tb_service_uevent, + .release = tb_service_release, +}; +EXPORT_SYMBOL_GPL(tb_service_type); + +static int remove_missing_service(struct device *dev, void *data) +{ + struct tb_xdomain *xd = data; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return 0; + + if (!tb_property_find(xd->properties, svc->key, + TB_PROPERTY_TYPE_DIRECTORY)) + device_unregister(dev); + + return 
0; +} + +static int find_service(struct device *dev, void *data) +{ + const struct tb_property *p = data; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return 0; + + return !strcmp(svc->key, p->key); +} + +static int populate_service(struct tb_service *svc, + struct tb_property *property) +{ + struct tb_property_dir *dir = property->value.dir; + struct tb_property *p; + + /* Fill in standard properties */ + p = tb_property_find(dir, "prtcid", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcid = p->value.immediate; + p = tb_property_find(dir, "prtcvers", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcvers = p->value.immediate; + p = tb_property_find(dir, "prtcrevs", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcrevs = p->value.immediate; + p = tb_property_find(dir, "prtcstns", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcstns = p->value.immediate; + + svc->key = kstrdup(property->key, GFP_KERNEL); + if (!svc->key) + return -ENOMEM; + + return 0; +} + +static void enumerate_services(struct tb_xdomain *xd) +{ + struct tb_service *svc; + struct tb_property *p; + struct device *dev; + + /* + * First remove all services that are not available anymore in + * the updated property block. + */ + device_for_each_child_reverse(&xd->dev, xd, remove_missing_service); + + /* Then re-enumerate properties creating new services as we go */ + tb_property_for_each(xd->properties, p) { + if (p->type != TB_PROPERTY_TYPE_DIRECTORY) + continue; + + /* If the service exists already we are fine */ + dev = device_find_child(&xd->dev, p, find_service); + if (dev) { + put_device(dev); + continue; + } + + svc = kzalloc(sizeof(*svc), GFP_KERNEL); + if (!svc) + break; + + if (populate_service(svc, p)) { + kfree(svc); + break; + } + + svc->id = ida_simple_get(&xd->service_ids, 0, 0, GFP_KERNEL); + svc->dev.bus = &tb_bus_type; + svc->dev.type = &tb_service_type; + svc->dev.parent = &xd->dev; + dev_set_name(&svc->dev, "%s.%d", dev_name(&xd->dev), svc->id); + + if (device_register(&svc->dev)) { + put_device(&svc->dev); + break; + } + } +} + +static int populate_properties(struct tb_xdomain *xd, + struct tb_property_dir *dir) +{ + const struct tb_property *p; + + /* Required properties */ + p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_VALUE); + if (!p) + return -EINVAL; + xd->device = p->value.immediate; + + p = tb_property_find(dir, "vendorid", TB_PROPERTY_TYPE_VALUE); + if (!p) + return -EINVAL; + xd->vendor = p->value.immediate; + + kfree(xd->device_name); + xd->device_name = NULL; + kfree(xd->vendor_name); + xd->vendor_name = NULL; + + /* Optional properties */ + p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_TEXT); + if (p) + xd->device_name = kstrdup(p->value.text, GFP_KERNEL); + p = tb_property_find(dir, "vendorid", TB_PROPERTY_TYPE_TEXT); + if (p) + xd->vendor_name = kstrdup(p->value.text, GFP_KERNEL); + + return 0; +} + +/* Called with @xd->lock held */ +static void tb_xdomain_restore_paths(struct tb_xdomain *xd) +{ + if (!xd->resume) + return; + + xd->resume = false; + if (xd->transmit_path) { + dev_dbg(&xd->dev, "re-establishing DMA path\n"); + tb_domain_approve_xdomain_paths(xd->tb, xd); + } +} + +static void tb_xdomain_get_properties(struct work_struct *work) +{ + struct tb_xdomain *xd = container_of(work, typeof(*xd), + get_properties_work.work); + struct tb_property_dir *dir; + struct tb *tb = xd->tb; + bool update = false; + u32 *block = NULL; + u32 gen = 0; + int ret; + + ret = tb_xdp_properties_request(tb->ctl, xd->route, xd->local_uuid, + xd->remote_uuid, 
xd->properties_retries,
+					&block, &gen);
+	if (ret < 0) {
+		if (xd->properties_retries-- > 0) {
+			queue_delayed_work(xd->tb->wq, &xd->get_properties_work,
+					   msecs_to_jiffies(1000));
+		} else {
+			/* Give up now */
+			dev_err(&xd->dev,
+				"failed to read XDomain properties from %pUb\n",
+				xd->remote_uuid);
+		}
+		return;
+	}
+
+	xd->properties_retries = XDOMAIN_PROPERTIES_RETRIES;
+
+	mutex_lock(&xd->lock);
+
+	/* Only accept newer generation properties */
+	if (xd->properties && gen <= xd->property_block_gen) {
+		/*
+		 * On resume it is likely that the properties block is
+		 * not changed (unless the other end added or removed
+		 * services). However, we need to make sure the existing
+		 * DMA paths are restored properly.
+		 */
+		tb_xdomain_restore_paths(xd);
+		goto err_free_block;
+	}
+
+	dir = tb_property_parse_dir(block, ret);
+	if (!dir) {
+		dev_err(&xd->dev, "failed to parse XDomain properties\n");
+		goto err_free_block;
+	}
+
+	ret = populate_properties(xd, dir);
+	if (ret) {
+		dev_err(&xd->dev, "missing XDomain properties in response\n");
+		goto err_free_dir;
+	}
+
+	/* Release the existing one */
+	if (xd->properties) {
+		tb_property_free_dir(xd->properties);
+		update = true;
+	}
+
+	xd->properties = dir;
+	xd->property_block_gen = gen;
+
+	tb_xdomain_restore_paths(xd);
+
+	mutex_unlock(&xd->lock);
+
+	kfree(block);
+
+	/*
+	 * Now the device should be ready enough so we can add it to the
+	 * bus and let userspace know about it. If the device is already
+	 * registered, we notify userspace that it has changed.
+	 */
+	if (!update) {
+		if (device_add(&xd->dev)) {
+			dev_err(&xd->dev, "failed to add XDomain device\n");
+			return;
+		}
+	} else {
+		kobject_uevent(&xd->dev.kobj, KOBJ_CHANGE);
+	}
+
+	enumerate_services(xd);
+	return;
+
+err_free_dir:
+	tb_property_free_dir(dir);
+err_free_block:
+	kfree(block);
+	mutex_unlock(&xd->lock);
+}
+
+static void tb_xdomain_properties_changed(struct work_struct *work)
+{
+	struct tb_xdomain *xd = container_of(work, typeof(*xd),
+					     properties_changed_work.work);
+	int ret;
+
+	ret = tb_xdp_properties_changed_request(xd->tb->ctl, xd->route,
+				xd->properties_changed_retries, xd->local_uuid);
+	if (ret) {
+		if (xd->properties_changed_retries-- > 0)
+			queue_delayed_work(xd->tb->wq,
+					   &xd->properties_changed_work,
+					   msecs_to_jiffies(1000));
+		return;
+	}
+
+	xd->properties_changed_retries = XDOMAIN_PROPERTIES_CHANGED_RETRIES;
+}
+
+static ssize_t device_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	return sprintf(buf, "%#x\n", xd->device);
+}
+static DEVICE_ATTR_RO(device);
+
+static ssize_t
+device_name_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+	int ret;
+
+	if (mutex_lock_interruptible(&xd->lock))
+		return -ERESTARTSYS;
+	ret = sprintf(buf, "%s\n", xd->device_name ?
xd->device_name : "");
+	mutex_unlock(&xd->lock);
+
+	return ret;
+}
+static DEVICE_ATTR_RO(device_name);
+
+static ssize_t vendor_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	return sprintf(buf, "%#x\n", xd->vendor);
+}
+static DEVICE_ATTR_RO(vendor);
+
+static ssize_t
+vendor_name_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+	int ret;
+
+	if (mutex_lock_interruptible(&xd->lock))
+		return -ERESTARTSYS;
+	ret = sprintf(buf, "%s\n", xd->vendor_name ? xd->vendor_name : "");
+	mutex_unlock(&xd->lock);
+
+	return ret;
+}
+static DEVICE_ATTR_RO(vendor_name);
+
+static ssize_t unique_id_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	return sprintf(buf, "%pUb\n", xd->remote_uuid);
+}
+static DEVICE_ATTR_RO(unique_id);
+
+static struct attribute *xdomain_attrs[] = {
+	&dev_attr_device.attr,
+	&dev_attr_device_name.attr,
+	&dev_attr_unique_id.attr,
+	&dev_attr_vendor.attr,
+	&dev_attr_vendor_name.attr,
+	NULL,
+};
+
+static struct attribute_group xdomain_attr_group = {
+	.attrs = xdomain_attrs,
+};
+
+static const struct attribute_group *xdomain_attr_groups[] = {
+	&xdomain_attr_group,
+	NULL,
+};
+
+static void tb_xdomain_release(struct device *dev)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	put_device(xd->dev.parent);
+
+	tb_property_free_dir(xd->properties);
+	ida_destroy(&xd->service_ids);
+
+	kfree(xd->local_uuid);
+	kfree(xd->remote_uuid);
+	kfree(xd->device_name);
+	kfree(xd->vendor_name);
+	kfree(xd);
+}
+
+static void start_handshake(struct tb_xdomain *xd)
+{
+	xd->properties_retries = XDOMAIN_PROPERTIES_RETRIES;
+	xd->properties_changed_retries = XDOMAIN_PROPERTIES_CHANGED_RETRIES;
+
+	/* Start exchanging properties with the other host */
+	queue_delayed_work(xd->tb->wq, &xd->properties_changed_work,
+			   msecs_to_jiffies(100));
+	queue_delayed_work(xd->tb->wq, &xd->get_properties_work,
+			   msecs_to_jiffies(1000));
+}
+
+static void stop_handshake(struct tb_xdomain *xd)
+{
+	xd->properties_retries = 0;
+	xd->properties_changed_retries = 0;
+
+	cancel_delayed_work_sync(&xd->get_properties_work);
+	cancel_delayed_work_sync(&xd->properties_changed_work);
+}
+
+static int __maybe_unused tb_xdomain_suspend(struct device *dev)
+{
+	stop_handshake(tb_to_xdomain(dev));
+	return 0;
+}
+
+static int __maybe_unused tb_xdomain_resume(struct device *dev)
+{
+	struct tb_xdomain *xd = tb_to_xdomain(dev);
+
+	/*
+	 * Ask tb_xdomain_get_properties() to restore any existing DMA
+	 * paths after properties are re-read.
+	 */
+	xd->resume = true;
+	start_handshake(xd);
+
+	return 0;
+}
+
+static const struct dev_pm_ops tb_xdomain_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(tb_xdomain_suspend, tb_xdomain_resume)
+};
+
+struct device_type tb_xdomain_type = {
+	.name = "thunderbolt_xdomain",
+	.release = tb_xdomain_release,
+	.pm = &tb_xdomain_pm_ops,
+};
+EXPORT_SYMBOL_GPL(tb_xdomain_type);
+
+/**
+ * tb_xdomain_alloc() - Allocate new XDomain object
+ * @tb: Domain where the XDomain belongs
+ * @parent: Parent device (the switch through which the connection to
+ *	    the other domain is reached)
+ * @route: Route string used to reach the other domain
+ * @local_uuid: Our local domain UUID
+ * @remote_uuid: UUID of the other domain
+ *
+ * Allocates a new XDomain structure and returns a pointer to it.
The + * object must be released by calling tb_xdomain_put(). + */ +struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, struct device *parent, + u64 route, const uuid_t *local_uuid, + const uuid_t *remote_uuid) +{ + struct tb_xdomain *xd; + + xd = kzalloc(sizeof(*xd), GFP_KERNEL); + if (!xd) + return NULL; + + xd->tb = tb; + xd->route = route; + ida_init(&xd->service_ids); + mutex_init(&xd->lock); + INIT_DELAYED_WORK(&xd->get_properties_work, tb_xdomain_get_properties); + INIT_DELAYED_WORK(&xd->properties_changed_work, + tb_xdomain_properties_changed); + + xd->local_uuid = kmemdup(local_uuid, sizeof(uuid_t), GFP_KERNEL); + if (!xd->local_uuid) + goto err_free; + + xd->remote_uuid = kmemdup(remote_uuid, sizeof(uuid_t), GFP_KERNEL); + if (!xd->remote_uuid) + goto err_free_local_uuid; + + device_initialize(&xd->dev); + xd->dev.parent = get_device(parent); + xd->dev.bus = &tb_bus_type; + xd->dev.type = &tb_xdomain_type; + xd->dev.groups = xdomain_attr_groups; + dev_set_name(&xd->dev, "%u-%llx", tb->index, route); + + return xd; + +err_free_local_uuid: + kfree(xd->local_uuid); +err_free: + kfree(xd); + + return NULL; +} + +/** + * tb_xdomain_add() - Add XDomain to the bus + * @xd: XDomain to add + * + * This function starts XDomain discovery protocol handshake and + * eventually adds the XDomain to the bus. After calling this function + * the caller needs to call tb_xdomain_remove() in order to remove and + * release the object regardless whether the handshake succeeded or not. + */ +void tb_xdomain_add(struct tb_xdomain *xd) +{ + /* Start exchanging properties with the other host */ + start_handshake(xd); +} + +static int unregister_service(struct device *dev, void *data) +{ + device_unregister(dev); + return 0; +} + +/** + * tb_xdomain_remove() - Remove XDomain from the bus + * @xd: XDomain to remove + * + * This will stop all ongoing configuration work and remove the XDomain + * along with any services from the bus. When the last reference to @xd + * is released the object will be released as well. + */ +void tb_xdomain_remove(struct tb_xdomain *xd) +{ + stop_handshake(xd); + + device_for_each_child_reverse(&xd->dev, xd, unregister_service); + + if (!device_is_registered(&xd->dev)) + put_device(&xd->dev); + else + device_unregister(&xd->dev); +} + +/** + * tb_xdomain_enable_paths() - Enable DMA paths for XDomain connection + * @xd: XDomain connection + * @transmit_path: HopID of the transmit path the other end is using to + * send packets + * @transmit_ring: DMA ring used to receive packets from the other end + * @receive_path: HopID of the receive path the other end is using to + * receive packets + * @receive_ring: DMA ring used to send packets to the other end + * + * The function enables DMA paths accordingly so that after successful + * return the caller can send and receive packets using high-speed DMA + * path. + * + * Return: %0 in case of success and negative errno in case of error + */ +int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path, + u16 transmit_ring, u16 receive_path, + u16 receive_ring) +{ + int ret; + + mutex_lock(&xd->lock); + + if (xd->transmit_path) { + ret = xd->transmit_path == transmit_path ? 
0 : -EBUSY; + goto exit_unlock; + } + + xd->transmit_path = transmit_path; + xd->transmit_ring = transmit_ring; + xd->receive_path = receive_path; + xd->receive_ring = receive_ring; + + ret = tb_domain_approve_xdomain_paths(xd->tb, xd); + +exit_unlock: + mutex_unlock(&xd->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tb_xdomain_enable_paths); + +/** + * tb_xdomain_disable_paths() - Disable DMA paths for XDomain connection + * @xd: XDomain connection + * + * This does the opposite of tb_xdomain_enable_paths(). After call to + * this the caller is not expected to use the rings anymore. + * + * Return: %0 in case of success and negative errno in case of error + */ +int tb_xdomain_disable_paths(struct tb_xdomain *xd) +{ + int ret = 0; + + mutex_lock(&xd->lock); + if (xd->transmit_path) { + xd->transmit_path = 0; + xd->transmit_ring = 0; + xd->receive_path = 0; + xd->receive_ring = 0; + + ret = tb_domain_disconnect_xdomain_paths(xd->tb, xd); + } + mutex_unlock(&xd->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tb_xdomain_disable_paths); + +struct tb_xdomain_lookup { + const uuid_t *uuid; + u8 link; + u8 depth; +}; + +static struct tb_xdomain *switch_find_xdomain(struct tb_switch *sw, + const struct tb_xdomain_lookup *lookup) +{ + int i; + + for (i = 1; i <= sw->config.max_port_number; i++) { + struct tb_port *port = &sw->ports[i]; + struct tb_xdomain *xd; + + if (tb_is_upstream_port(port)) + continue; + + if (port->xdomain) { + xd = port->xdomain; + + if (lookup->uuid) { + if (uuid_equal(xd->remote_uuid, lookup->uuid)) + return xd; + } else if (lookup->link == xd->link && + lookup->depth == xd->depth) { + return xd; + } + } else if (port->remote) { + xd = switch_find_xdomain(port->remote->sw, lookup); + if (xd) + return xd; + } + } + + return NULL; +} + +/** + * tb_xdomain_find_by_uuid() - Find an XDomain by UUID + * @tb: Domain where the XDomain belongs to + * @uuid: UUID to look for + * + * Finds XDomain by walking through the Thunderbolt topology below @tb. + * The returned XDomain will have its reference count increased so the + * caller needs to call tb_xdomain_put() when it is done with the + * object. + * + * This will find all XDomains including the ones that are not yet added + * to the bus (handshake is still in progress). + * + * The caller needs to hold @tb->lock. + */ +struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid) +{ + struct tb_xdomain_lookup lookup; + struct tb_xdomain *xd; + + memset(&lookup, 0, sizeof(lookup)); + lookup.uuid = uuid; + + xd = switch_find_xdomain(tb->root_switch, &lookup); + if (xd) { + get_device(&xd->dev); + return xd; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(tb_xdomain_find_by_uuid); + +/** + * tb_xdomain_find_by_link_depth() - Find an XDomain by link and depth + * @tb: Domain where the XDomain belongs to + * @link: Root switch link number + * @depth: Depth in the link + * + * Finds XDomain by walking through the Thunderbolt topology below @tb. + * The returned XDomain will have its reference count increased so the + * caller needs to call tb_xdomain_put() when it is done with the + * object. + * + * This will find all XDomains including the ones that are not yet added + * to the bus (handshake is still in progress). + * + * The caller needs to hold @tb->lock. 
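+ *
+ * A minimal calling sketch (illustrative only; assumes @link and @depth
+ * are already known):
+ *
+ *	mutex_lock(&tb->lock);
+ *	xd = tb_xdomain_find_by_link_depth(tb, link, depth);
+ *	mutex_unlock(&tb->lock);
+ *	if (xd) {
+ *		// ... use xd ...
+ *		tb_xdomain_put(xd);
+ *	}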
+ */ +struct tb_xdomain *tb_xdomain_find_by_link_depth(struct tb *tb, u8 link, + u8 depth) +{ + struct tb_xdomain_lookup lookup; + struct tb_xdomain *xd; + + memset(&lookup, 0, sizeof(lookup)); + lookup.link = link; + lookup.depth = depth; + + xd = switch_find_xdomain(tb->root_switch, &lookup); + if (xd) { + get_device(&xd->dev); + return xd; + } + + return NULL; +} + +bool tb_xdomain_handle_request(struct tb *tb, enum tb_cfg_pkg_type type, + const void *buf, size_t size) +{ + const struct tb_protocol_handler *handler, *tmp; + const struct tb_xdp_header *hdr = buf; + unsigned int length; + int ret = 0; + + /* We expect the packet is at least size of the header */ + length = hdr->xd_hdr.length_sn & TB_XDOMAIN_LENGTH_MASK; + if (length != size / 4 - sizeof(hdr->xd_hdr) / 4) + return true; + if (length < sizeof(*hdr) / 4 - sizeof(hdr->xd_hdr) / 4) + return true; + + /* + * Handle XDomain discovery protocol packets directly here. For + * other protocols (based on their UUID) we call registered + * handlers in turn. + */ + if (uuid_equal(&hdr->uuid, &tb_xdp_uuid)) { + if (type == TB_CFG_PKG_XDOMAIN_REQ) { + tb_xdp_schedule_request(tb, hdr, size); + return true; + } + return false; + } + + mutex_lock(&xdomain_lock); + list_for_each_entry_safe(handler, tmp, &protocol_handlers, list) { + if (!uuid_equal(&hdr->uuid, handler->uuid)) + continue; + + mutex_unlock(&xdomain_lock); + ret = handler->callback(buf, size, handler->data); + mutex_lock(&xdomain_lock); + + if (ret) + break; + } + mutex_unlock(&xdomain_lock); + + return ret > 0; +} + +static int rebuild_property_block(void) +{ + u32 *block, len; + int ret; + + ret = tb_property_format_dir(xdomain_property_dir, NULL, 0); + if (ret < 0) + return ret; + + len = ret; + + block = kcalloc(len, sizeof(u32), GFP_KERNEL); + if (!block) + return -ENOMEM; + + ret = tb_property_format_dir(xdomain_property_dir, block, len); + if (ret) { + kfree(block); + return ret; + } + + kfree(xdomain_property_block); + xdomain_property_block = block; + xdomain_property_block_len = len; + xdomain_property_block_gen++; + + return 0; +} + +static int update_xdomain(struct device *dev, void *data) +{ + struct tb_xdomain *xd; + + xd = tb_to_xdomain(dev); + if (xd) { + queue_delayed_work(xd->tb->wq, &xd->properties_changed_work, + msecs_to_jiffies(50)); + } + + return 0; +} + +static void update_all_xdomains(void) +{ + bus_for_each_dev(&tb_bus_type, NULL, NULL, update_xdomain); +} + +static bool remove_directory(const char *key, const struct tb_property_dir *dir) +{ + struct tb_property *p; + + p = tb_property_find(xdomain_property_dir, key, + TB_PROPERTY_TYPE_DIRECTORY); + if (p && p->value.dir == dir) { + tb_property_remove(p); + return true; + } + return false; +} + +/** + * tb_register_property_dir() - Register property directory to the host + * @key: Key (name) of the directory to add + * @dir: Directory to add + * + * Service drivers can use this function to add new property directory + * to the host available properties. The other connected hosts are + * notified so they can re-read properties of this host if they are + * interested. 
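+ *
+ * A minimal sketch of such a registration (illustrative only; the
+ * "network" key and the directory contents are made up for the
+ * example):
+ *
+ *	dir = tb_property_create_dir(NULL);
+ *	if (!dir)
+ *		return -ENOMEM;
+ *	tb_property_add_immediate(dir, "prtcid", 1);
+ *	ret = tb_register_property_dir("network", dir);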
+ * + * Return: %0 on success and negative errno on failure + */ +int tb_register_property_dir(const char *key, struct tb_property_dir *dir) +{ + int ret; + + if (!key || strlen(key) > 8) + return -EINVAL; + + mutex_lock(&xdomain_lock); + if (tb_property_find(xdomain_property_dir, key, + TB_PROPERTY_TYPE_DIRECTORY)) { + ret = -EEXIST; + goto err_unlock; + } + + ret = tb_property_add_dir(xdomain_property_dir, key, dir); + if (ret) + goto err_unlock; + + ret = rebuild_property_block(); + if (ret) { + remove_directory(key, dir); + goto err_unlock; + } + + mutex_unlock(&xdomain_lock); + update_all_xdomains(); + return 0; + +err_unlock: + mutex_unlock(&xdomain_lock); + return ret; +} +EXPORT_SYMBOL_GPL(tb_register_property_dir); + +/** + * tb_unregister_property_dir() - Removes property directory from host + * @key: Key (name) of the directory + * @dir: Directory to remove + * + * This will remove the existing directory from this host and notify the + * connected hosts about the change. + */ +void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir) +{ + int ret = 0; + + mutex_lock(&xdomain_lock); + if (remove_directory(key, dir)) + ret = rebuild_property_block(); + mutex_unlock(&xdomain_lock); + + if (!ret) + update_all_xdomains(); +} +EXPORT_SYMBOL_GPL(tb_unregister_property_dir); + +int tb_xdomain_init(void) +{ + int ret; + + xdomain_property_dir = tb_property_create_dir(NULL); + if (!xdomain_property_dir) + return -ENOMEM; + + /* + * Initialize standard set of properties without any service + * directories. Those will be added by service drivers + * themselves when they are loaded. + */ + tb_property_add_immediate(xdomain_property_dir, "vendorid", + PCI_VENDOR_ID_INTEL); + tb_property_add_text(xdomain_property_dir, "vendorid", "Intel Corp."); + tb_property_add_immediate(xdomain_property_dir, "deviceid", 0x1); + tb_property_add_text(xdomain_property_dir, "deviceid", + utsname()->nodename); + tb_property_add_immediate(xdomain_property_dir, "devicerv", 0x80000100); + + ret = rebuild_property_block(); + if (ret) { + tb_property_free_dir(xdomain_property_dir); + xdomain_property_dir = NULL; + } + + return ret; +} + +void tb_xdomain_exit(void) +{ + kfree(xdomain_property_block); + tb_property_free_dir(xdomain_property_dir); +} diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 694cebb50f72..7625c3b81f84 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -683,5 +683,31 @@ struct fsl_mc_device_id { const char obj_type[16]; }; +/** + * struct tb_service_id - Thunderbolt service identifiers + * @match_flags: Flags used to match the structure + * @protocol_key: Protocol key the service supports + * @protocol_id: Protocol id the service supports + * @protocol_version: Version of the protocol + * @protocol_revision: Revision of the protocol software + * @driver_data: Driver specific data + * + * Thunderbolt XDomain services are exposed as devices where each device + * carries the protocol information the service supports. Thunderbolt + * XDomain service drivers match against that information. 
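+ *
+ * A driver match table built on this structure could look like the
+ * following sketch (the key and id are illustrative only; TB_SERVICE()
+ * is defined in linux/thunderbolt.h):
+ *
+ *	static const struct tb_service_id my_ids[] = {
+ *		{ TB_SERVICE("network", 1) },
+ *		{ },
+ *	};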
+ */
+struct tb_service_id {
+	__u32 match_flags;
+	char protocol_key[8 + 1];
+	__u32 protocol_id;
+	__u32 protocol_version;
+	__u32 protocol_revision;
+	kernel_ulong_t driver_data;
+};
+
+#define TBSVC_MATCH_PROTOCOL_KEY	0x0001
+#define TBSVC_MATCH_PROTOCOL_ID		0x0002
+#define TBSVC_MATCH_PROTOCOL_VERSION	0x0004
+#define TBSVC_MATCH_PROTOCOL_REVISION	0x0008
+
 #endif /* LINUX_MOD_DEVICETABLE_H */

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 43b8d1e09341..18c0e3d5e85c 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 enum tb_cfg_pkg_type {
@@ -77,6 +78,8 @@ struct tb {
 };
 
 extern struct bus_type tb_bus_type;
+extern struct device_type tb_service_type;
+extern struct device_type tb_xdomain_type;
 
 #define TB_LINKS_PER_PHY_PORT	2
 
@@ -155,4 +158,243 @@ struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
 	     property;							\
 	     property = tb_property_get_next(dir, property))
 
+int tb_register_property_dir(const char *key, struct tb_property_dir *dir);
+void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
+
+/**
+ * struct tb_xdomain - Cross-domain (XDomain) connection
+ * @dev: XDomain device
+ * @tb: Pointer to the domain
+ * @remote_uuid: UUID of the remote domain (host)
+ * @local_uuid: Cached local UUID
+ * @route: Route string with which the other domain can be reached
+ * @vendor: Vendor ID of the remote domain
+ * @device: Device ID of the remote domain
+ * @lock: Lock to serialize access to the following fields of this structure
+ * @vendor_name: Name of the vendor (or %NULL if not known)
+ * @device_name: Name of the device (or %NULL if not known)
+ * @is_unplugged: The XDomain is unplugged
+ * @resume: The XDomain is being resumed
+ * @transmit_path: HopID which the remote end expects us to transmit
+ * @transmit_ring: Local ring (hop) where outgoing packets are pushed
+ * @receive_path: HopID which we expect the remote end to transmit
+ * @receive_ring: Local ring (hop) where incoming packets arrive
+ * @service_ids: Used to generate IDs for the services
+ * @properties: Properties exported by the remote domain
+ * @property_block_gen: Generation of @properties
+ * @get_properties_work: Work used to get remote domain properties
+ * @properties_retries: Number of times left to read properties
+ * @properties_changed_work: Work used to notify the remote domain that
+ *			     our properties have changed
+ * @properties_changed_retries: Number of times left to send properties
+ *				changed notification
+ * @link: Root switch link the remote domain is connected to (ICM only)
+ * @depth: Depth in the chain the remote domain is connected at (ICM only)
+ *
+ * This structure represents a connection between two domains (hosts).
+ * Each XDomain contains zero or more services which are exposed as
+ * &struct tb_service objects.
+ *
+ * Service drivers may access this structure if they need to enumerate
+ * non-standard properties but they need to hold @lock when doing so
+ * because properties can be changed asynchronously in response to
+ * changes in the remote domain.
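+ *
+ * A minimal sketch of such an access (illustrative only; "maxframe" is
+ * a made-up property key):
+ *
+ *	mutex_lock(&xd->lock);
+ *	p = tb_property_find(xd->properties, "maxframe",
+ *			     TB_PROPERTY_TYPE_VALUE);
+ *	if (p)
+ *		max_frame = p->value.immediate;
+ *	mutex_unlock(&xd->lock);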
+ */
+struct tb_xdomain {
+	struct device dev;
+	struct tb *tb;
+	uuid_t *remote_uuid;
+	const uuid_t *local_uuid;
+	u64 route;
+	u16 vendor;
+	u16 device;
+	struct mutex lock;
+	const char *vendor_name;
+	const char *device_name;
+	bool is_unplugged;
+	bool resume;
+	u16 transmit_path;
+	u16 transmit_ring;
+	u16 receive_path;
+	u16 receive_ring;
+	struct ida service_ids;
+	struct tb_property_dir *properties;
+	u32 property_block_gen;
+	struct delayed_work get_properties_work;
+	int properties_retries;
+	struct delayed_work properties_changed_work;
+	int properties_changed_retries;
+	u8 link;
+	u8 depth;
+};
+
+int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path,
+			    u16 transmit_ring, u16 receive_path,
+			    u16 receive_ring);
+int tb_xdomain_disable_paths(struct tb_xdomain *xd);
+struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid);
+
+static inline struct tb_xdomain *
+tb_xdomain_find_by_uuid_locked(struct tb *tb, const uuid_t *uuid)
+{
+	struct tb_xdomain *xd;
+
+	mutex_lock(&tb->lock);
+	xd = tb_xdomain_find_by_uuid(tb, uuid);
+	mutex_unlock(&tb->lock);
+
+	return xd;
+}
+
+static inline struct tb_xdomain *tb_xdomain_get(struct tb_xdomain *xd)
+{
+	if (xd)
+		get_device(&xd->dev);
+	return xd;
+}
+
+static inline void tb_xdomain_put(struct tb_xdomain *xd)
+{
+	if (xd)
+		put_device(&xd->dev);
+}
+
+static inline bool tb_is_xdomain(const struct device *dev)
+{
+	return dev->type == &tb_xdomain_type;
+}
+
+static inline struct tb_xdomain *tb_to_xdomain(struct device *dev)
+{
+	if (tb_is_xdomain(dev))
+		return container_of(dev, struct tb_xdomain, dev);
+	return NULL;
+}
+
+int tb_xdomain_response(struct tb_xdomain *xd, const void *response,
+			size_t size, enum tb_cfg_pkg_type type);
+int tb_xdomain_request(struct tb_xdomain *xd, const void *request,
+		       size_t request_size, enum tb_cfg_pkg_type request_type,
+		       void *response, size_t response_size,
+		       enum tb_cfg_pkg_type response_type,
+		       unsigned int timeout_msec);
+
+/**
+ * struct tb_protocol_handler - Protocol specific handler
+ * @uuid: XDomain messages with this UUID are dispatched to this handler
+ * @callback: Callback called with the XDomain message. Returning %1
+ *	      here tells the XDomain core that the message was handled
+ *	      by this handler and should not be forwarded to other
+ *	      handlers.
+ * @data: Data passed with the callback
+ * @list: Handlers are linked using this
+ *
+ * Thunderbolt services can hook into incoming XDomain requests by
+ * registering a protocol handler. The only limitation is that the
+ * XDomain discovery protocol UUID cannot be registered since it is
+ * handled by the core XDomain code.
+ *
+ * The @callback must check that the message is really directed to the
+ * service the driver implements.
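+ *
+ * A minimal sketch of a handler (all my_* names are illustrative):
+ *
+ *	static int my_handle_msg(const void *buf, size_t size, void *data)
+ *	{
+ *		// verify the message is for this service, then consume it
+ *		return 1;
+ *	}
+ *
+ *	static struct tb_protocol_handler my_handler = {
+ *		.uuid = &my_protocol_uuid,
+ *		.callback = my_handle_msg,
+ *	};
+ *
+ *	ret = tb_register_protocol_handler(&my_handler);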
+ */ +struct tb_protocol_handler { + const uuid_t *uuid; + int (*callback)(const void *buf, size_t size, void *data); + void *data; + struct list_head list; +}; + +int tb_register_protocol_handler(struct tb_protocol_handler *handler); +void tb_unregister_protocol_handler(struct tb_protocol_handler *handler); + +/** + * struct tb_service - Thunderbolt service + * @dev: XDomain device + * @id: ID of the service (shown in sysfs) + * @key: Protocol key from the properties directory + * @prtcid: Protocol ID from the properties directory + * @prtcvers: Protocol version from the properties directory + * @prtcrevs: Protocol software revision from the properties directory + * @prtcstns: Protocol settings mask from the properties directory + * + * Each domain exposes set of services it supports as collection of + * properties. For each service there will be one corresponding + * &struct tb_service. Service drivers are bound to these. + */ +struct tb_service { + struct device dev; + int id; + const char *key; + u32 prtcid; + u32 prtcvers; + u32 prtcrevs; + u32 prtcstns; +}; + +static inline struct tb_service *tb_service_get(struct tb_service *svc) +{ + if (svc) + get_device(&svc->dev); + return svc; +} + +static inline void tb_service_put(struct tb_service *svc) +{ + if (svc) + put_device(&svc->dev); +} + +static inline bool tb_is_service(const struct device *dev) +{ + return dev->type == &tb_service_type; +} + +static inline struct tb_service *tb_to_service(struct device *dev) +{ + if (tb_is_service(dev)) + return container_of(dev, struct tb_service, dev); + return NULL; +} + +/** + * tb_service_driver - Thunderbolt service driver + * @driver: Driver structure + * @probe: Called when the driver is probed + * @remove: Called when the driver is removed (optional) + * @shutdown: Called at shutdown time to stop the service (optional) + * @id_table: Table of service identifiers the driver supports + */ +struct tb_service_driver { + struct device_driver driver; + int (*probe)(struct tb_service *svc, const struct tb_service_id *id); + void (*remove)(struct tb_service *svc); + void (*shutdown)(struct tb_service *svc); + const struct tb_service_id *id_table; +}; + +#define TB_SERVICE(key, id) \ + .match_flags = TBSVC_MATCH_PROTOCOL_KEY | \ + TBSVC_MATCH_PROTOCOL_ID, \ + .protocol_key = (key), \ + .protocol_id = (id) + +int tb_register_service_driver(struct tb_service_driver *drv); +void tb_unregister_service_driver(struct tb_service_driver *drv); + +static inline void *tb_service_get_drvdata(const struct tb_service *svc) +{ + return dev_get_drvdata(&svc->dev); +} + +static inline void tb_service_set_drvdata(struct tb_service *svc, void *data) +{ + dev_set_drvdata(&svc->dev, data); +} + +static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) +{ + return tb_to_xdomain(svc->dev.parent); +} + #endif /* THUNDERBOLT_H_ */ diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index e4d90e50f6fe..57263f2f8f2f 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -206,5 +206,12 @@ int main(void) DEVID_FIELD(fsl_mc_device_id, vendor); DEVID_FIELD(fsl_mc_device_id, obj_type); + DEVID(tb_service_id); + DEVID_FIELD(tb_service_id, match_flags); + DEVID_FIELD(tb_service_id, protocol_key); + DEVID_FIELD(tb_service_id, protocol_id); + DEVID_FIELD(tb_service_id, protocol_version); + DEVID_FIELD(tb_service_id, protocol_revision); + return 0; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 29d6699d5a06..6ef6e63f96fd 
100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1301,6 +1301,31 @@ static int do_fsl_mc_entry(const char *filename, void *symval, } ADD_TO_DEVTABLE("fslmc", fsl_mc_device_id, do_fsl_mc_entry); +/* Looks like: tbsvc:kSpNvNrN */ +static int do_tbsvc_entry(const char *filename, void *symval, char *alias) +{ + DEF_FIELD(symval, tb_service_id, match_flags); + DEF_FIELD_ADDR(symval, tb_service_id, protocol_key); + DEF_FIELD(symval, tb_service_id, protocol_id); + DEF_FIELD(symval, tb_service_id, protocol_version); + DEF_FIELD(symval, tb_service_id, protocol_revision); + + strcpy(alias, "tbsvc:"); + if (match_flags & TBSVC_MATCH_PROTOCOL_KEY) + sprintf(alias + strlen(alias), "k%s", *protocol_key); + else + strcat(alias + strlen(alias), "k*"); + ADD(alias, "p", match_flags & TBSVC_MATCH_PROTOCOL_ID, protocol_id); + ADD(alias, "v", match_flags & TBSVC_MATCH_PROTOCOL_VERSION, + protocol_version); + ADD(alias, "r", match_flags & TBSVC_MATCH_PROTOCOL_REVISION, + protocol_revision); + + add_wildcard(alias); + return 1; +} +ADD_TO_DEVTABLE("tbsvc", tb_service_id, do_tbsvc_entry); + /* Does namelen bytes of name exactly match the symbol? */ static bool sym_is(const char *name, unsigned namelen, const char *symbol) { -- cgit v1.2.3 From 3b3d9f4da96493e4f68d0a80ab210763a24f8b33 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:37 +0300 Subject: thunderbolt: Export ring handling functions to modules These are used by Thunderbolt services to send and receive frames over the high-speed DMA rings. We also put the functions to tb_ namespace to make sure we do not collide with others and add missing kernel-doc comments for the exported functions. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/ctl.c | 20 +++--- drivers/thunderbolt/nhi.c | 62 +++++++++++------ drivers/thunderbolt/nhi.h | 147 +---------------------------------------- include/linux/thunderbolt.h | 158 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 211 insertions(+), 176 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index 05400b77dcd7..dd10789e1dbb 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -359,7 +359,7 @@ static int tb_ctl_tx(struct tb_ctl *ctl, const void *data, size_t len, cpu_to_be32_array(pkg->buffer, data, len / 4); *(__be32 *) (pkg->buffer + len) = tb_crc(pkg->buffer, len); - res = ring_tx(ctl->tx, &pkg->frame); + res = tb_ring_tx(ctl->tx, &pkg->frame); if (res) /* ring is stopped */ tb_ctl_pkg_free(pkg); return res; @@ -376,7 +376,7 @@ static bool tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, static void tb_ctl_rx_submit(struct ctl_pkg *pkg) { - ring_rx(pkg->ctl->rx, &pkg->frame); /* + tb_ring_rx(pkg->ctl->rx, &pkg->frame); /* * We ignore failures during stop. 
* All rx packets are referenced * from ctl->rx_packets, so we do @@ -614,11 +614,11 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data) if (!ctl->frame_pool) goto err; - ctl->tx = ring_alloc_tx(nhi, 0, 10, RING_FLAG_NO_SUSPEND); + ctl->tx = tb_ring_alloc_tx(nhi, 0, 10, RING_FLAG_NO_SUSPEND); if (!ctl->tx) goto err; - ctl->rx = ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, + ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, 0xffff); if (!ctl->rx) goto err; @@ -652,9 +652,9 @@ void tb_ctl_free(struct tb_ctl *ctl) return; if (ctl->rx) - ring_free(ctl->rx); + tb_ring_free(ctl->rx); if (ctl->tx) - ring_free(ctl->tx); + tb_ring_free(ctl->tx); /* free RX packets */ for (i = 0; i < TB_CTL_RX_PKG_COUNT; i++) @@ -673,8 +673,8 @@ void tb_ctl_start(struct tb_ctl *ctl) { int i; tb_ctl_info(ctl, "control channel starting...\n"); - ring_start(ctl->tx); /* is used to ack hotplug packets, start first */ - ring_start(ctl->rx); + tb_ring_start(ctl->tx); /* is used to ack hotplug packets, start first */ + tb_ring_start(ctl->rx); for (i = 0; i < TB_CTL_RX_PKG_COUNT; i++) tb_ctl_rx_submit(ctl->rx_packets[i]); @@ -695,8 +695,8 @@ void tb_ctl_stop(struct tb_ctl *ctl) ctl->running = false; mutex_unlock(&ctl->request_queue_lock); - ring_stop(ctl->rx); - ring_stop(ctl->tx); + tb_ring_stop(ctl->rx); + tb_ring_stop(ctl->tx); if (!list_empty(&ctl->request_queue)) tb_ctl_WARN(ctl, "dangling request in request_queue\n"); diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index bebcad3d2c1f..e0a47f7581cb 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -253,7 +253,7 @@ invoke_callback: } } -int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) +int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) { int ret = 0; mutex_lock(&ring->lock); @@ -266,6 +266,7 @@ int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) mutex_unlock(&ring->lock); return ret; } +EXPORT_SYMBOL_GPL(__tb_ring_enqueue); static irqreturn_t ring_msix(int irq, void *data) { @@ -309,9 +310,9 @@ static void ring_release_msix(struct tb_ring *ring) ring->irq = 0; } -static struct tb_ring *ring_alloc(struct tb_nhi *nhi, u32 hop, int size, - bool transmit, unsigned int flags, - u16 sof_mask, u16 eof_mask) +static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, + bool transmit, unsigned int flags, + u16 sof_mask, u16 eof_mask) { struct tb_ring *ring = NULL; dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n", @@ -377,24 +378,42 @@ err: return NULL; } -struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags) +/** + * tb_ring_alloc_tx() - Allocate DMA ring for transmit + * @nhi: Pointer to the NHI the ring is to be allocated + * @hop: HopID (ring) to allocate + * @size: Number of entries in the ring + * @flags: Flags for the ring + */ +struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags) { - return ring_alloc(nhi, hop, size, true, flags, 0, 0); + return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0); } +EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); -struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask) +/** + * tb_ring_alloc_rx() - Allocate DMA ring for receive + * @nhi: Pointer to the NHI the ring is to be allocated + * @hop: HopID (ring) to allocate + * @size: Number of entries in the ring + * @flags: Flags for the ring + * @sof_mask: Mask of PDF values that start a 
frame + * @eof_mask: Mask of PDF values that end a frame + */ +struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags, u16 sof_mask, u16 eof_mask) { - return ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); + return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); } +EXPORT_SYMBOL_GPL(tb_ring_alloc_rx); /** - * ring_start() - enable a ring + * tb_ring_start() - enable a ring * - * Must not be invoked in parallel with ring_stop(). + * Must not be invoked in parallel with tb_ring_stop(). */ -void ring_start(struct tb_ring *ring) +void tb_ring_start(struct tb_ring *ring) { u16 frame_size; u32 flags; @@ -450,21 +469,22 @@ err: mutex_unlock(&ring->lock); mutex_unlock(&ring->nhi->lock); } - +EXPORT_SYMBOL_GPL(tb_ring_start); /** - * ring_stop() - shutdown a ring + * tb_ring_stop() - shutdown a ring * * Must not be invoked from a callback. * - * This method will disable the ring. Further calls to ring_tx/ring_rx will - * return -ESHUTDOWN until ring_stop has been called. + * This method will disable the ring. Further calls to + * tb_ring_tx/tb_ring_rx will return -ESHUTDOWN until ring_stop has been + * called. * * All enqueued frames will be canceled and their callbacks will be executed * with frame->canceled set to true (on the callback thread). This method * returns only after all callback invocations have finished. */ -void ring_stop(struct tb_ring *ring) +void tb_ring_stop(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); mutex_lock(&ring->lock); @@ -497,9 +517,10 @@ err: schedule_work(&ring->work); flush_work(&ring->work); } +EXPORT_SYMBOL_GPL(tb_ring_stop); /* - * ring_free() - free ring + * tb_ring_free() - free ring * * When this method returns all invocations of ring->callback will have * finished. @@ -508,7 +529,7 @@ err: * * Must NOT be called from ring_frame->callback! */ -void ring_free(struct tb_ring *ring) +void tb_ring_free(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); /* @@ -550,6 +571,7 @@ void ring_free(struct tb_ring *ring) mutex_destroy(&ring->lock); kfree(ring); } +EXPORT_SYMBOL_GPL(tb_ring_free); /** * nhi_mailbox_cmd() - Send a command through NHI mailbox diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 4503ddbeccb3..771d09ca5dc5 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -7,152 +7,7 @@ #ifndef DSL3510_H_ #define DSL3510_H_ -#include -#include -#include - -/** - * struct tb_nhi - thunderbolt native host interface - * @lock: Must be held during ring creation/destruction. Is acquired by - * interrupt_work when dispatching interrupts to individual rings. - * @pdev: Pointer to the PCI device - * @iobase: MMIO space of the NHI - * @tx_rings: All Tx rings available on this host controller - * @rx_rings: All Rx rings available on this host controller - * @msix_ida: Used to allocate MSI-X vectors for rings - * @going_away: The host controller device is about to disappear so when - * this flag is set, avoid touching the hardware anymore. - * @interrupt_work: Work scheduled to handle ring interrupt when no - * MSI-X is used. - * @hop_count: Number of rings (end point hops) supported by NHI. 
- */ -struct tb_nhi { - struct mutex lock; - struct pci_dev *pdev; - void __iomem *iobase; - struct tb_ring **tx_rings; - struct tb_ring **rx_rings; - struct ida msix_ida; - bool going_away; - struct work_struct interrupt_work; - u32 hop_count; -}; - -/** - * struct tb_ring - thunderbolt TX or RX ring associated with a NHI - * @lock: Lock serializing actions to this ring. Must be acquired after - * nhi->lock. - * @nhi: Pointer to the native host controller interface - * @size: Size of the ring - * @hop: Hop (DMA channel) associated with this ring - * @head: Head of the ring (write next descriptor here) - * @tail: Tail of the ring (complete next descriptor here) - * @descriptors: Allocated descriptors for this ring - * @queue: Queue holding frames to be transferred over this ring - * @in_flight: Queue holding frames that are currently in flight - * @work: Interrupt work structure - * @is_tx: Is the ring Tx or Rx - * @running: Is the ring running - * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. - * @vector: MSI-X vector number the ring uses (only set if @irq is > 0) - * @flags: Ring specific flags - * @sof_mask: Bit mask used to detect start of frame PDF - * @eof_mask: Bit mask used to detect end of frame PDF - */ -struct tb_ring { - struct mutex lock; - struct tb_nhi *nhi; - int size; - int hop; - int head; - int tail; - struct ring_desc *descriptors; - dma_addr_t descriptors_dma; - struct list_head queue; - struct list_head in_flight; - struct work_struct work; - bool is_tx:1; - bool running:1; - int irq; - u8 vector; - unsigned int flags; - u16 sof_mask; - u16 eof_mask; -}; - -/* Leave ring interrupt enabled on suspend */ -#define RING_FLAG_NO_SUSPEND BIT(0) -/* Configure the ring to be in frame mode */ -#define RING_FLAG_FRAME BIT(1) -/* Enable end-to-end flow control */ -#define RING_FLAG_E2E BIT(2) - -struct ring_frame; -typedef void (*ring_cb)(struct tb_ring*, struct ring_frame*, bool canceled); - -/** - * struct ring_frame - for use with ring_rx/ring_tx - */ -struct ring_frame { - dma_addr_t buffer_phy; - ring_cb callback; - struct list_head list; - u32 size:12; /* TX: in, RX: out*/ - u32 flags:12; /* RX: out */ - u32 eof:4; /* TX:in, RX: out */ - u32 sof:4; /* TX:in, RX: out */ -}; - -#define TB_FRAME_SIZE 0x100 /* minimum size for ring_rx */ - -struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags); -struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask); -void ring_start(struct tb_ring *ring); -void ring_stop(struct tb_ring *ring); -void ring_free(struct tb_ring *ring); - -int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); - -/** - * ring_rx() - enqueue a frame on an RX ring - * - * frame->buffer, frame->buffer_phy and frame->callback have to be set. The - * buffer must contain at least TB_FRAME_SIZE bytes. - * - * frame->callback will be invoked with frame->size, frame->flags, frame->eof, - * frame->sof set once the frame has been received. - * - * If ring_stop is called after the packet has been enqueued frame->callback - * will be called with canceled set to true. - * - * Return: Returns ESHUTDOWN if ring_stop has been called. Zero otherwise. 
- */ -static inline int ring_rx(struct tb_ring *ring, struct ring_frame *frame) -{ - WARN_ON(ring->is_tx); - return __ring_enqueue(ring, frame); -} - -/** - * ring_tx() - enqueue a frame on an TX ring - * - * frame->buffer, frame->buffer_phy, frame->callback, frame->size, frame->eof - * and frame->sof have to be set. - * - * frame->callback will be invoked with once the frame has been transmitted. - * - * If ring_stop is called after the packet has been enqueued frame->callback - * will be called with canceled set to true. - * - * Return: Returns ESHUTDOWN if ring_stop has been called. Zero otherwise. - */ -static inline int ring_tx(struct tb_ring *ring, struct ring_frame *frame) -{ - WARN_ON(!ring->is_tx); - return __ring_enqueue(ring, frame); -} +#include enum nhi_fw_mode { NHI_FW_SAFE_MODE, diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 18c0e3d5e85c..9ddb83ad890f 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -15,10 +15,12 @@ #define THUNDERBOLT_H_ #include +#include #include #include #include #include +#include enum tb_cfg_pkg_type { TB_CFG_PKG_READ = 1, @@ -397,4 +399,160 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) return tb_to_xdomain(svc->dev.parent); } +/** + * struct tb_nhi - thunderbolt native host interface + * @lock: Must be held during ring creation/destruction. Is acquired by + * interrupt_work when dispatching interrupts to individual rings. + * @pdev: Pointer to the PCI device + * @iobase: MMIO space of the NHI + * @tx_rings: All Tx rings available on this host controller + * @rx_rings: All Rx rings available on this host controller + * @msix_ida: Used to allocate MSI-X vectors for rings + * @going_away: The host controller device is about to disappear so when + * this flag is set, avoid touching the hardware anymore. + * @interrupt_work: Work scheduled to handle ring interrupt when no + * MSI-X is used. + * @hop_count: Number of rings (end point hops) supported by NHI. + */ +struct tb_nhi { + struct mutex lock; + struct pci_dev *pdev; + void __iomem *iobase; + struct tb_ring **tx_rings; + struct tb_ring **rx_rings; + struct ida msix_ida; + bool going_away; + struct work_struct interrupt_work; + u32 hop_count; +}; + +/** + * struct tb_ring - thunderbolt TX or RX ring associated with a NHI + * @lock: Lock serializing actions to this ring. Must be acquired after + * nhi->lock. + * @nhi: Pointer to the native host controller interface + * @size: Size of the ring + * @hop: Hop (DMA channel) associated with this ring + * @head: Head of the ring (write next descriptor here) + * @tail: Tail of the ring (complete next descriptor here) + * @descriptors: Allocated descriptors for this ring + * @queue: Queue holding frames to be transferred over this ring + * @in_flight: Queue holding frames that are currently in flight + * @work: Interrupt work structure + * @is_tx: Is the ring Tx or Rx + * @running: Is the ring running + * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. 
+ * @vector: MSI-X vector number the ring uses (only set if @irq is > 0)
+ * @flags: Ring specific flags
+ * @sof_mask: Bit mask used to detect start of frame PDF
+ * @eof_mask: Bit mask used to detect end of frame PDF
+ */
+struct tb_ring {
+	struct mutex lock;
+	struct tb_nhi *nhi;
+	int size;
+	int hop;
+	int head;
+	int tail;
+	struct ring_desc *descriptors;
+	dma_addr_t descriptors_dma;
+	struct list_head queue;
+	struct list_head in_flight;
+	struct work_struct work;
+	bool is_tx:1;
+	bool running:1;
+	int irq;
+	u8 vector;
+	unsigned int flags;
+	u16 sof_mask;
+	u16 eof_mask;
+};
+
+/* Leave ring interrupt enabled on suspend */
+#define RING_FLAG_NO_SUSPEND	BIT(0)
+/* Configure the ring to be in frame mode */
+#define RING_FLAG_FRAME		BIT(1)
+/* Enable end-to-end flow control */
+#define RING_FLAG_E2E		BIT(2)
+
+struct ring_frame;
+typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled);
+
+/**
+ * struct ring_frame - For use with ring_rx/ring_tx
+ * @buffer_phy: DMA mapped address of the frame
+ * @callback: Callback called when the frame is finished
+ * @list: Frame is linked to a queue using this
+ * @size: Size of the frame in bytes (%0 means %4096)
+ * @flags: Flags for the frame (see &enum ring_desc_flags)
+ * @eof: End of frame protocol defined field
+ * @sof: Start of frame protocol defined field
+ */
+struct ring_frame {
+	dma_addr_t buffer_phy;
+	ring_cb callback;
+	struct list_head list;
+	u32 size:12;
+	u32 flags:12;
+	u32 eof:4;
+	u32 sof:4;
+};
+
+/* Minimum size for ring_rx */
+#define TB_FRAME_SIZE		0x100
+
+struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
+				 unsigned int flags);
+struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
+				 unsigned int flags, u16 sof_mask,
+				 u16 eof_mask);
+void tb_ring_start(struct tb_ring *ring);
+void tb_ring_stop(struct tb_ring *ring);
+void tb_ring_free(struct tb_ring *ring);
+
+int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame);
+
+/**
+ * tb_ring_rx() - enqueue a frame on an RX ring
+ * @ring: Ring to enqueue the frame on
+ * @frame: Frame to enqueue
+ *
+ * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The
+ * buffer must contain at least %TB_FRAME_SIZE bytes.
+ *
+ * @frame->callback will be invoked with @frame->size, @frame->flags,
+ * @frame->eof, @frame->sof set once the frame has been received.
+ *
+ * If tb_ring_stop() is called after the packet has been enqueued
+ * @frame->callback will be called with canceled set to true.
+ *
+ * Return: Returns %-ESHUTDOWN if tb_ring_stop() has been called. Zero otherwise.
+ */
+static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame)
+{
+	WARN_ON(ring->is_tx);
+	return __tb_ring_enqueue(ring, frame);
+}
+
+/**
+ * tb_ring_tx() - enqueue a frame on a TX ring
+ * @ring: Ring to enqueue the frame on
+ * @frame: Frame to enqueue
+ *
+ * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size,
+ * @frame->eof and @frame->sof have to be set.
+ *
+ * @frame->callback will be invoked once the frame has been transmitted.
+ *
+ * If tb_ring_stop() is called after the packet has been enqueued
+ * @frame->callback will be called with canceled set to true.
+ *
+ * Return: Returns %-ESHUTDOWN if tb_ring_stop() has been called. Zero otherwise.
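+ *
+ * A minimal usage sketch (illustrative only; assumes the buffer has
+ * already been DMA mapped to my_dma_addr and my_pdf is the protocol
+ * defined PDF value):
+ *
+ *	frame->buffer_phy = my_dma_addr;
+ *	frame->callback = my_tx_callback;
+ *	frame->size = len;
+ *	frame->sof = my_pdf;
+ *	frame->eof = my_pdf;
+ *	ret = tb_ring_tx(ring, frame);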
+ */
+static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame)
+{
+	WARN_ON(!ring->is_tx);
+	return __tb_ring_enqueue(ring, frame);
+}
+
 #endif /* THUNDERBOLT_H_ */
--
cgit v1.2.3


From 2a91ec63f8a11e70d4b958dd4df867fec0247179 Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Mon, 2 Oct 2017 13:38:38 +0300
Subject: thunderbolt: Move ring descriptor flags to thunderbolt.h

A Thunderbolt service driver might need to check if there was an error
with the descriptor when in frame mode. We also add two Rx specific
error flags RING_DESC_CRC_ERROR and RING_DESC_BUFFER_OVERRUN.

Signed-off-by: Mika Westerberg
Reviewed-by: Michael Jamet
Reviewed-by: Yehezkel Bernat
Reviewed-by: Andy Shevchenko
Signed-off-by: David S. Miller
---
 drivers/thunderbolt/nhi_regs.h |  7 -------
 include/linux/thunderbolt.h    | 18 ++++++++++++++++++
 2 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thunderbolt/nhi_regs.h b/drivers/thunderbolt/nhi_regs.h
index 491a4c0c18fc..5ed6934e31e7 100644
--- a/drivers/thunderbolt/nhi_regs.h
+++ b/drivers/thunderbolt/nhi_regs.h
@@ -17,13 +17,6 @@ enum ring_flags {
 	RING_FLAG_ENABLE = 1 << 31,
 };
 
-enum ring_desc_flags {
-	RING_DESC_ISOCH = 0x1, /* TX only? */
-	RING_DESC_COMPLETED = 0x2, /* set by NHI */
-	RING_DESC_POSTED = 0x4, /* always set this */
-	RING_DESC_INTERRUPT = 0x8, /* request an interrupt on completion */
-};
-
 /**
  * struct ring_desc - TX/RX ring entry
  *
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 9ddb83ad890f..e3b9af7be0ad 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -478,6 +478,24 @@ struct tb_ring {
 struct ring_frame;
 typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled);
 
+/**
+ * enum ring_desc_flags - Flags for DMA ring descriptor
+ * %RING_DESC_ISOCH: Enable isochronous DMA (Tx only)
+ * %RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only)
+ * %RING_DESC_COMPLETED: Descriptor completed (set by NHI)
+ * %RING_DESC_POSTED: Always set this
+ * %RING_DESC_BUFFER_OVERRUN: RX buffer overrun
+ * %RING_DESC_INTERRUPT: Request an interrupt on completion
+ */
+enum ring_desc_flags {
+	RING_DESC_ISOCH = 0x1,
+	RING_DESC_CRC_ERROR = 0x1,
+	RING_DESC_COMPLETED = 0x2,
+	RING_DESC_POSTED = 0x4,
+	RING_DESC_BUFFER_OVERRUN = 0x04,
+	RING_DESC_INTERRUPT = 0x8,
+};
+
 /**
  * struct ring_frame - For use with ring_rx/ring_tx
  * @buffer_phy: DMA mapped address of the frame
--
cgit v1.2.3


From 22b7de1000e66d739c431d6be4e7e97c69fa7c98 Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Mon, 2 Oct 2017 13:38:39 +0300
Subject: thunderbolt: Use spinlock in ring serialization

This makes it possible to enqueue frames also from atomic context,
which is needed, for example, when networking packets are sent over a
Thunderbolt cable.

Signed-off-by: Mika Westerberg
Reviewed-by: Michael Jamet
Reviewed-by: Yehezkel Bernat
Reviewed-by: Andy Shevchenko
Signed-off-by: David S.
Miller --- drivers/thunderbolt/nhi.c | 26 ++++++++++++++------------ include/linux/thunderbolt.h | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index e0a47f7581cb..7d1891ec3c47 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -212,8 +212,10 @@ static void ring_work(struct work_struct *work) struct tb_ring *ring = container_of(work, typeof(*ring), work); struct ring_frame *frame; bool canceled = false; + unsigned long flags; LIST_HEAD(done); - mutex_lock(&ring->lock); + + spin_lock_irqsave(&ring->lock, flags); if (!ring->running) { /* Move all frames to done and mark them as canceled. */ @@ -241,7 +243,8 @@ static void ring_work(struct work_struct *work) ring_write_descriptors(ring); invoke_callback: - mutex_unlock(&ring->lock); /* allow callbacks to schedule new work */ + /* allow callbacks to schedule new work */ + spin_unlock_irqrestore(&ring->lock, flags); while (!list_empty(&done)) { frame = list_first_entry(&done, typeof(*frame), list); /* @@ -255,15 +258,17 @@ invoke_callback: int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) { + unsigned long flags; int ret = 0; - mutex_lock(&ring->lock); + + spin_lock_irqsave(&ring->lock, flags); if (ring->running) { list_add_tail(&frame->list, &ring->queue); ring_write_descriptors(ring); } else { ret = -ESHUTDOWN; } - mutex_unlock(&ring->lock); + spin_unlock_irqrestore(&ring->lock, flags); return ret; } EXPORT_SYMBOL_GPL(__tb_ring_enqueue); @@ -338,7 +343,7 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, if (!ring) goto err; - mutex_init(&ring->lock); + spin_lock_init(&ring->lock); INIT_LIST_HEAD(&ring->queue); INIT_LIST_HEAD(&ring->in_flight); INIT_WORK(&ring->work, ring_work); @@ -371,8 +376,6 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, return ring; err: - if (ring) - mutex_destroy(&ring->lock); kfree(ring); mutex_unlock(&nhi->lock); return NULL; @@ -419,7 +422,7 @@ void tb_ring_start(struct tb_ring *ring) u32 flags; mutex_lock(&ring->nhi->lock); - mutex_lock(&ring->lock); + spin_lock_irq(&ring->lock); if (ring->nhi->going_away) goto err; if (ring->running) { @@ -466,7 +469,7 @@ void tb_ring_start(struct tb_ring *ring) ring_interrupt_active(ring, true); ring->running = true; err: - mutex_unlock(&ring->lock); + spin_unlock_irq(&ring->lock); mutex_unlock(&ring->nhi->lock); } EXPORT_SYMBOL_GPL(tb_ring_start); @@ -487,7 +490,7 @@ EXPORT_SYMBOL_GPL(tb_ring_start); void tb_ring_stop(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); - mutex_lock(&ring->lock); + spin_lock_irq(&ring->lock); dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n", RING_TYPE(ring), ring->hop); if (ring->nhi->going_away) @@ -508,7 +511,7 @@ void tb_ring_stop(struct tb_ring *ring) ring->running = false; err: - mutex_unlock(&ring->lock); + spin_unlock_irq(&ring->lock); mutex_unlock(&ring->nhi->lock); /* @@ -568,7 +571,6 @@ void tb_ring_free(struct tb_ring *ring) * to finish before freeing the ring. 
*/ flush_work(&ring->work); - mutex_destroy(&ring->lock); kfree(ring); } EXPORT_SYMBOL_GPL(tb_ring_free); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index e3b9af7be0ad..cf9e42db780f 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -448,7 +448,7 @@ struct tb_nhi { * @eof_mask: Bit mask used to detect end of frame PDF */ struct tb_ring { - struct mutex lock; + spinlock_t lock; struct tb_nhi *nhi; int size; int hop; -- cgit v1.2.3 From 59120e06101db72442acf4c8b364a0c76d8faa68 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:40 +0300 Subject: thunderbolt: Use spinlock in NHI serialization This is needed because ring polling functionality can be called from atomic contexts when networking and other high-speed traffic is transferred over a Thunderbolt cable. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/nhi.c | 75 +++++++++++++++++++++++++-------------------- include/linux/thunderbolt.h | 2 +- 2 files changed, 42 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 7d1891ec3c47..0b3c0640048b 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -327,21 +327,9 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, if (transmit && hop == RING_E2E_UNUSED_HOPID) return NULL; - mutex_lock(&nhi->lock); - if (hop >= nhi->hop_count) { - dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop); - goto err; - } - if (transmit && nhi->tx_rings[hop]) { - dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop); - goto err; - } else if (!transmit && nhi->rx_rings[hop]) { - dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop); - goto err; - } ring = kzalloc(sizeof(*ring), GFP_KERNEL); if (!ring) - goto err; + return NULL; spin_lock_init(&ring->lock); INIT_LIST_HEAD(&ring->queue); @@ -359,25 +347,45 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, ring->tail = 0; ring->running = false; - if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND)) - goto err; - ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev, size * sizeof(*ring->descriptors), &ring->descriptors_dma, GFP_KERNEL | __GFP_ZERO); if (!ring->descriptors) - goto err; + goto err_free_ring; + if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND)) + goto err_free_descs; + + spin_lock_irq(&nhi->lock); + if (hop >= nhi->hop_count) { + dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop); + goto err_release_msix; + } + if (transmit && nhi->tx_rings[hop]) { + dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop); + goto err_release_msix; + } else if (!transmit && nhi->rx_rings[hop]) { + dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop); + goto err_release_msix; + } if (transmit) nhi->tx_rings[hop] = ring; else nhi->rx_rings[hop] = ring; - mutex_unlock(&nhi->lock); + spin_unlock_irq(&nhi->lock); + return ring; -err: +err_release_msix: + spin_unlock_irq(&nhi->lock); + ring_release_msix(ring); +err_free_descs: + dma_free_coherent(&ring->nhi->pdev->dev, + ring->size * sizeof(*ring->descriptors), + ring->descriptors, ring->descriptors_dma); +err_free_ring: kfree(ring); - mutex_unlock(&nhi->lock); + return NULL; } @@ -421,8 +429,8 @@ void tb_ring_start(struct tb_ring *ring) u16 frame_size; u32 flags; - mutex_lock(&ring->nhi->lock); - 
spin_lock_irq(&ring->lock); + spin_lock_irq(&ring->nhi->lock); + spin_lock(&ring->lock); if (ring->nhi->going_away) goto err; if (ring->running) { @@ -469,8 +477,8 @@ void tb_ring_start(struct tb_ring *ring) ring_interrupt_active(ring, true); ring->running = true; err: - spin_unlock_irq(&ring->lock); - mutex_unlock(&ring->nhi->lock); + spin_unlock(&ring->lock); + spin_unlock_irq(&ring->nhi->lock); } EXPORT_SYMBOL_GPL(tb_ring_start); @@ -489,8 +497,8 @@ EXPORT_SYMBOL_GPL(tb_ring_start); */ void tb_ring_stop(struct tb_ring *ring) { - mutex_lock(&ring->nhi->lock); - spin_lock_irq(&ring->lock); + spin_lock_irq(&ring->nhi->lock); + spin_lock(&ring->lock); dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n", RING_TYPE(ring), ring->hop); if (ring->nhi->going_away) @@ -511,8 +519,8 @@ void tb_ring_stop(struct tb_ring *ring) ring->running = false; err: - spin_unlock_irq(&ring->lock); - mutex_unlock(&ring->nhi->lock); + spin_unlock(&ring->lock); + spin_unlock_irq(&ring->nhi->lock); /* * schedule ring->work to invoke callbacks on all remaining frames. @@ -534,7 +542,7 @@ EXPORT_SYMBOL_GPL(tb_ring_stop); */ void tb_ring_free(struct tb_ring *ring) { - mutex_lock(&ring->nhi->lock); + spin_lock_irq(&ring->nhi->lock); /* * Dissociate the ring from the NHI. This also ensures that * nhi_interrupt_work cannot reschedule ring->work. @@ -564,7 +572,7 @@ void tb_ring_free(struct tb_ring *ring) RING_TYPE(ring), ring->hop); - mutex_unlock(&ring->nhi->lock); + spin_unlock_irq(&ring->nhi->lock); /** * ring->work can no longer be scheduled (it is scheduled only * by nhi_interrupt_work, ring_stop and ring_msix). Wait for it @@ -639,7 +647,7 @@ static void nhi_interrupt_work(struct work_struct *work) int type = 0; /* current interrupt type 0: TX, 1: RX, 2: RX overflow */ struct tb_ring *ring; - mutex_lock(&nhi->lock); + spin_lock_irq(&nhi->lock); /* * Starting at REG_RING_NOTIFY_BASE there are three status bitfields @@ -677,7 +685,7 @@ static void nhi_interrupt_work(struct work_struct *work) /* we do not check ring->running, this is done in ring->work */ schedule_work(&ring->work); } - mutex_unlock(&nhi->lock); + spin_unlock_irq(&nhi->lock); } static irqreturn_t nhi_msi(int irq, void *data) @@ -767,7 +775,6 @@ static void nhi_shutdown(struct tb_nhi *nhi) devm_free_irq(&nhi->pdev->dev, nhi->pdev->irq, nhi); flush_work(&nhi->interrupt_work); } - mutex_destroy(&nhi->lock); ida_destroy(&nhi->msix_ida); } @@ -856,7 +863,7 @@ static int nhi_probe(struct pci_dev *pdev, const struct pci_device_id *id) return res; } - mutex_init(&nhi->lock); + spin_lock_init(&nhi->lock); pci_set_master(pdev); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index cf9e42db780f..d59e3f9a35c4 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -415,7 +415,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) * @hop_count: Number of rings (end point hops) supported by NHI. */ struct tb_nhi { - struct mutex lock; + spinlock_t lock; struct pci_dev *pdev; void __iomem *iobase; struct tb_ring **tx_rings; -- cgit v1.2.3 From 4ffe722eefcb07c76701f03e0d759fbaecedf79f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:41 +0300 Subject: thunderbolt: Add polling mode for rings In order to support things like networking over Thunderbolt cable, there needs to be a way to switch the ring to a mode where it can be polled with the interrupt masked. 
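In practice the flow looks roughly like the minimal sketch below, using the interfaces this patch introduces (tb_ring_alloc_rx(), tb_ring_poll(), tb_ring_poll_complete()); struct my_dev, my_handle_frame(), the deferred worker and the hop/size/mask values are illustrative assumptions, not part of the patch:

    static void my_start_poll(void *data)
    {
            struct my_dev *md = data;

            /* The ring interrupt is now masked; drain the completions
             * outside hardirq context.
             */
            schedule_work(&md->poll_work);
    }

    static void my_poll_work(struct work_struct *work)
    {
            struct my_dev *md = container_of(work, struct my_dev, poll_work);
            struct ring_frame *frame;

            /* Fetch completed frames until the ring is drained */
            while ((frame = tb_ring_poll(md->rx_ring)) != NULL)
                    my_handle_frame(md, frame);

            /* Done for now; unmask the ring interrupt again */
            tb_ring_poll_complete(md->rx_ring);
    }

    /* allocation, e.g. in the service driver's probe path */
    md->rx_ring = tb_ring_alloc_rx(nhi, md->hop, 256, RING_FLAG_NO_SUSPEND,
                                   0xffff, 0xffff, my_start_poll, md);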
We implement such a mode so that the caller can allocate a ring by passing a pointer to a function that is then called when an interrupt is triggered. Completed frames can be fetched using tb_ring_poll() and the interrupt can be re-enabled when the caller is finished with polling by using tb_ring_poll_complete(). Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/ctl.c | 2 +- drivers/thunderbolt/nhi.c | 126 ++++++++++++++++++++++++++++++++++++++++---- include/linux/thunderbolt.h | 23 +++++--- 3 files changed, 134 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index dd10789e1dbb..d079dbba2c03 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -619,7 +619,7 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data) goto err; ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, - 0xffff); + 0xffff, NULL, NULL); if (!ctl->rx) goto err; diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 0b3c0640048b..af0a80ddf594 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -252,7 +252,8 @@ invoke_callback: * Do not hold on to it. */ list_del_init(&frame->list); - frame->callback(ring, frame, canceled); + if (frame->callback) + frame->callback(ring, frame, canceled); } } @@ -273,11 +274,106 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) } EXPORT_SYMBOL_GPL(__tb_ring_enqueue); +/** + * tb_ring_poll() - Poll one completed frame from the ring + * @ring: Ring to poll + * + * This function can be called when the @start_poll callback of the @ring + * has been called. It will read one completed frame from the ring and + * return it to the caller. Returns %NULL if there are no more completed + * frames.
+ */ +struct ring_frame *tb_ring_poll(struct tb_ring *ring) +{ + struct ring_frame *frame = NULL; + unsigned long flags; + + spin_lock_irqsave(&ring->lock, flags); + if (!ring->running) + goto unlock; + if (ring_empty(ring)) + goto unlock; + + if (ring->descriptors[ring->tail].flags & RING_DESC_COMPLETED) { + frame = list_first_entry(&ring->in_flight, typeof(*frame), + list); + list_del_init(&frame->list); + + if (!ring->is_tx) { + frame->size = ring->descriptors[ring->tail].length; + frame->eof = ring->descriptors[ring->tail].eof; + frame->sof = ring->descriptors[ring->tail].sof; + frame->flags = ring->descriptors[ring->tail].flags; + } + + ring->tail = (ring->tail + 1) % ring->size; + } + +unlock: + spin_unlock_irqrestore(&ring->lock, flags); + return frame; +} +EXPORT_SYMBOL_GPL(tb_ring_poll); + +static void __ring_interrupt_mask(struct tb_ring *ring, bool mask) +{ + int idx = ring_interrupt_index(ring); + int reg = REG_RING_INTERRUPT_BASE + idx / 32 * 4; + int bit = idx % 32; + u32 val; + + val = ioread32(ring->nhi->iobase + reg); + if (mask) + val &= ~BIT(bit); + else + val |= BIT(bit); + iowrite32(val, ring->nhi->iobase + reg); +} + +/* Both @nhi->lock and @ring->lock should be held */ +static void __ring_interrupt(struct tb_ring *ring) +{ + if (!ring->running) + return; + + if (ring->start_poll) { + __ring_interrupt_mask(ring, false); + ring->start_poll(ring->poll_data); + } else { + schedule_work(&ring->work); + } +} + +/** + * tb_ring_poll_complete() - Re-start interrupt for the ring + * @ring: Ring to re-start the interrupt + * + * This will re-start (unmask) the ring interrupt once the user is done + * with polling. + */ +void tb_ring_poll_complete(struct tb_ring *ring) +{ + unsigned long flags; + + spin_lock_irqsave(&ring->nhi->lock, flags); + spin_lock(&ring->lock); + if (ring->start_poll) + __ring_interrupt_mask(ring, false); + spin_unlock(&ring->lock); + spin_unlock_irqrestore(&ring->nhi->lock, flags); +} +EXPORT_SYMBOL_GPL(tb_ring_poll_complete); + static irqreturn_t ring_msix(int irq, void *data) { struct tb_ring *ring = data; - schedule_work(&ring->work); + spin_lock(&ring->nhi->lock); + spin_lock(&ring->lock); + __ring_interrupt(ring); + spin_unlock(&ring->lock); + spin_unlock(&ring->nhi->lock); + return IRQ_HANDLED; } @@ -317,7 +413,9 @@ static void ring_release_msix(struct tb_ring *ring) static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, bool transmit, unsigned int flags, - u16 sof_mask, u16 eof_mask) + u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), + void *poll_data) { struct tb_ring *ring = NULL; dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n", @@ -346,6 +444,8 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, ring->head = 0; ring->tail = 0; ring->running = false; + ring->start_poll = start_poll; + ring->poll_data = poll_data; ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev, size * sizeof(*ring->descriptors), @@ -399,7 +499,7 @@ err_free_ring: struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags) { - return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0); + return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0, NULL, NULL); } EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); @@ -411,11 +511,17 @@ EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); * @flags: Flags for the ring * @sof_mask: Mask of PDF values that start a frame * @eof_mask: Mask of PDF values that end a frame + * @start_poll: If not %NULL the ring will call this function when an + * interrupt is 
triggered and masked, instead of callback + * in each Rx frame. + * @poll_data: Optional data passed to @start_poll */ struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask) + unsigned int flags, u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), void *poll_data) { - return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); + return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask, + start_poll, poll_data); } EXPORT_SYMBOL_GPL(tb_ring_alloc_rx); @@ -556,6 +662,7 @@ void tb_ring_free(struct tb_ring *ring) dev_WARN(&ring->nhi->pdev->dev, "%s %d still running\n", RING_TYPE(ring), ring->hop); } + spin_unlock_irq(&ring->nhi->lock); ring_release_msix(ring); @@ -572,7 +679,6 @@ void tb_ring_free(struct tb_ring *ring) RING_TYPE(ring), ring->hop); - spin_unlock_irq(&ring->nhi->lock); /** * ring->work can no longer be scheduled (it is scheduled only * by nhi_interrupt_work, ring_stop and ring_msix). Wait for it @@ -682,8 +788,10 @@ static void nhi_interrupt_work(struct work_struct *work) hop); continue; } - /* we do not check ring->running, this is done in ring->work */ - schedule_work(&ring->work); + + spin_lock(&ring->lock); + __ring_interrupt(ring); + spin_unlock(&ring->lock); } spin_unlock_irq(&nhi->lock); } diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index d59e3f9a35c4..36925e3aec7c 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -446,6 +446,9 @@ struct tb_nhi { * @flags: Ring specific flags * @sof_mask: Bit mask used to detect start of frame PDF * @eof_mask: Bit mask used to detect end of frame PDF + * @start_poll: Called when ring interrupt is triggered to start + * polling. Passing %NULL keeps the ring in interrupt mode. + * @poll_data: Data passed to @start_poll */ struct tb_ring { spinlock_t lock; @@ -466,6 +469,8 @@ struct tb_ring { unsigned int flags; u16 sof_mask; u16 eof_mask; + void (*start_poll)(void *data); + void *poll_data; }; /* Leave ring interrupt enabled on suspend */ @@ -499,7 +504,7 @@ enum ring_desc_flags { /** * struct ring_frame - For use with ring_rx/ring_tx * @buffer_phy: DMA mapped address of the frame - * @callback: Callback called when the frame is finished + * @callback: Callback called when the frame is finished (optional) * @list: Frame is linked to a queue using this * @size: Size of the frame in bytes (%0 means %4096) * @flags: Flags for the frame (see &enum ring_desc_flags) @@ -522,8 +527,8 @@ struct ring_frame { struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags); struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, - u16 eof_mask); + unsigned int flags, u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), void *poll_data); void tb_ring_start(struct tb_ring *ring); void tb_ring_stop(struct tb_ring *ring); void tb_ring_free(struct tb_ring *ring); @@ -535,8 +540,8 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); * @ring: Ring to enqueue the frame * @frame: Frame to enqueue * - * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The - * buffer must contain at least %TB_FRAME_SIZE bytes. + * @frame->buffer, @frame->buffer_phy have to be set. The buffer must + * contain at least %TB_FRAME_SIZE bytes. * * @frame->callback will be invoked with @frame->size, @frame->flags, * @frame->eof, @frame->sof set once the frame has been received. 
@@ -557,8 +562,8 @@ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) * @ring: Ring to enqueue the frame * @frame: Frame to enqueue * - * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size, - * @frame->eof and @frame->sof have to be set. + * @frame->buffer, @frame->buffer_phy, @frame->size, @frame->eof and + * @frame->sof have to be set. * * @frame->callback will be invoked once the frame has been transmitted. * @@ -573,4 +578,8 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) return __tb_ring_enqueue(ring, frame); } +/* Used only when the ring is in polling mode */ +struct ring_frame *tb_ring_poll(struct tb_ring *ring); +void tb_ring_poll_complete(struct tb_ring *ring); + #endif /* THUNDERBOLT_H_ */ -- cgit v1.2.3 From 3304559e353f098d7e0ed5ca981e26c406513e12 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:42 +0300 Subject: thunderbolt: Add function to retrieve DMA device for the ring This is needed when Thunderbolt service drivers need to DMA map memory before it is passed down to the ring. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 36925e3aec7c..7b69853188b1 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -19,6 +19,7 @@ #include #include #include +#include <linux/pci.h> #include #include @@ -582,4 +583,16 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) struct ring_frame *tb_ring_poll(struct tb_ring *ring); void tb_ring_poll_complete(struct tb_ring *ring); +/** + * tb_ring_dma_device() - Return device used for DMA mapping + * @ring: Ring whose DMA device is retrieved + * + * Use this function when you are mapping DMA for buffers that are + * passed to the ring for sending/receiving. + */ +static inline struct device *tb_ring_dma_device(struct tb_ring *ring) +{ + return &ring->nhi->pdev->dev; +} + #endif /* THUNDERBOLT_H_ */ -- cgit v1.2.3 From e7d5459dfaf613799915e901189d296bdc7534f9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 26 Sep 2017 17:41:07 +0100 Subject: cpufreq: provide default frequency-invariance setter function Frequency-invariant accounting support based on the ratio of current frequency and maximum supported frequency is an optional feature an arch can implement. Since there are cpufreq drivers (e.g. cpufreq-dt) which can be built for different architectures, a default implementation of the frequency-invariance setter function arch_set_freq_scale() is needed. This default implementation is an empty weak function which will be overridden by a strong function in case the arch provides one. The setter function passes the cpumask of CPUs related to the frequency change (both online and offline), the (new) current frequency and the maximum supported frequency. Signed-off-by: Dietmar Eggemann Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/cpufreq.c | 6 ++++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ea43b147a7fe..41d148af7748 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -161,6 +161,12 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy) } EXPORT_SYMBOL_GPL(get_cpu_idle_time); +__weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, + unsigned long max_freq) +{ +} +EXPORT_SYMBOL_GPL(arch_set_freq_scale); + /* * This is a generic cpufreq init() routine which can be used by cpufreq * drivers of SMP systems. It will do following: diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 537ff842ff73..28734ee185a7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -919,6 +919,9 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) extern unsigned int arch_freq_get_on_cpu(int cpu); +extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, + unsigned long max_freq); + /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; -- cgit v1.2.3 From 0e27c567d1673137b06aa96bb7aef635fb657dee Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 26 Sep 2017 17:41:10 +0100 Subject: drivers base/arch_topology: provide frequency-invariant accounting support Implements the arch-specific (arm and arm64) frequency-invariance setter function arch_set_freq_scale() which provides the following frequency scaling factor: current_freq(cpu) << SCHED_CAPACITY_SHIFT / max_supported_freq(cpu) One possible consumer of the frequency-invariance getter function topology_get_freq_scale() is the Per-Entity Load Tracking (PELT) mechanism of the task scheduler. Allow inlining of topology_get_freq_scale() into the task scheduler fast path (e.g. __update_load_avg_se()) by coding it as a static inline function in the arch topology header file. Signed-off-by: Dietmar Eggemann Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/base/arch_topology.c | 14 ++++++++++++++ include/linux/arch_topology.h | 9 +++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index e9bb368f32b3..416ec2f5211d 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -22,6 +22,20 @@ #include #include +DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, + unsigned long max_freq) +{ + unsigned long scale; + int i; + + scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; + + for_each_cpu(i, cpus) + per_cpu(freq_scale, i) = scale; +} + static DEFINE_MUTEX(cpu_scale_mutex); static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 716ce587247e..f6e490312e4d 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -5,6 +5,7 @@ #define _LINUX_ARCH_TOPOLOGY_H_ #include +#include void topology_normalize_cpu_scale(void); @@ -16,4 +17,12 @@ unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu); void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); +DECLARE_PER_CPU(unsigned long, freq_scale); + +static inline +unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu) +{ + return per_cpu(freq_scale, cpu); +} + #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ -- cgit v1.2.3 From 8216f588b52b61ce36fc0080218e4730435e58b7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 26 Sep 2017 17:41:11 +0100 Subject: drivers base/arch_topology: allow inlining cpu-invariant accounting support Allow inlining of topology_get_cpu_scale() into the task scheduler fast path (e.g. __update_load_avg_se()) by coding it as a static inline function in the arch topology header file. Signed-off-by: Dietmar Eggemann Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/base/arch_topology.c | 7 +------ include/linux/arch_topology.h | 8 +++++++- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 416ec2f5211d..aea0b9d521f6 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -37,12 +37,7 @@ void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, } static DEFINE_MUTEX(cpu_scale_mutex); -static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; - -unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu) -{ - return per_cpu(cpu_scale, cpu); -} +DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) { diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index f6e490312e4d..c189de3ef5df 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -12,8 +12,14 @@ void topology_normalize_cpu_scale(void); struct device_node; bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); +DECLARE_PER_CPU(unsigned long, cpu_scale); + struct sched_domain; -unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu); +static inline +unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); -- cgit v1.2.3 From 7f20333fe76780af19ee0d8d2dc6d04d35aa88d2 Mon Sep 17 00:00:00 2001 From: Al Cooper Date: Fri, 22 Sep 2017 15:33:59 -0400 Subject: soc: brcmstb: Add Product ID and Family ID helper functions Add Product ID and Family ID helper functions for brcmstb soc. Signed-off-by: Al Cooper Acked-by: Florian Fainelli Signed-off-by: Kishon Vijay Abraham I --- drivers/soc/bcm/brcmstb/common.c | 12 ++++++++++++ include/linux/soc/brcmstb/brcmstb.h | 17 +++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/bcm/brcmstb/common.c b/drivers/soc/bcm/brcmstb/common.c index 22e98a90468c..a71730da6385 100644 --- a/drivers/soc/bcm/brcmstb/common.c +++ b/drivers/soc/bcm/brcmstb/common.c @@ -40,6 +40,18 @@ bool soc_is_brcmstb(void) return of_match_node(brcmstb_machine_match, root) != NULL; } +u32 brcmstb_get_family_id(void) +{ + return family_id; +} +EXPORT_SYMBOL(brcmstb_get_family_id); + +u32 brcmstb_get_product_id(void) +{ + return product_id; +} +EXPORT_SYMBOL(brcmstb_get_product_id); + static const struct of_device_id sun_top_ctrl_match[] = { { .compatible = "brcm,bcm7125-sun-top-ctrl", }, { .compatible = "brcm,bcm7346-sun-top-ctrl", }, diff --git a/include/linux/soc/brcmstb/brcmstb.h b/include/linux/soc/brcmstb/brcmstb.h index 337ce414e898..8baafd6908b9 100644 --- a/include/linux/soc/brcmstb/brcmstb.h +++ b/include/linux/soc/brcmstb/brcmstb.h @@ -1,10 +1,27 @@ #ifndef __BRCMSTB_SOC_H #define __BRCMSTB_SOC_H +static inline u32 BRCM_ID(u32 reg) +{ + return reg >> 28 ? reg >> 16 : reg >> 8; +} + +static inline u32 BRCM_REV(u32 reg) +{ + return reg & 0xff; +} + /* * Bus Interface Unit control register setup, must happen early during boot, * before SMP is brought up, called by machine entry point. */ void brcmstb_biuctrl_init(void); +/* + * Helper functions for getting family or product id from the + * SoC driver. 
+ */ +u32 brcmstb_get_family_id(void); +u32 brcmstb_get_product_id(void); + #endif /* __BRCMSTB_SOC_H */ -- cgit v1.2.3 From 640ab98fb3629c0f8417b9b2532eca596495f3bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 05:40:16 -0600 Subject: buffer: have alloc_page_buffers() use __GFP_NOFAIL Instead of adding weird retry logic in that function, utilize __GFP_NOFAIL to ensure that the VM takes care of handling any potential retries appropriately. This means we don't have to call free_more_memory() from here. Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/md/bitmap.c | 2 +- fs/buffer.c | 33 ++++++++++----------------------- fs/ntfs/aops.c | 2 +- fs/ntfs/mft.c | 2 +- include/linux/buffer_head.h | 2 +- 5 files changed, 14 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d2121637b4ab..4d8ed74efadf 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index, pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); + bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false); if (!bh) { ret = -ENOMEM; goto out; diff --git a/fs/buffer.c b/fs/buffer.c index 170df856bdb9..1234ae343aef 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -861,16 +861,19 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry) + bool retry) { struct buffer_head *bh, *head; + gfp_t gfp = GFP_NOFS; long offset; -try_again: + if (retry) + gfp |= __GFP_NOFAIL; + head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = alloc_buffer_head(GFP_NOFS); + bh = alloc_buffer_head(gfp); if (!bh) goto no_grow; @@ -896,23 +899,7 @@ no_grow: } while (head); } - /* - * Return failure for non-async IO requests. Async IO requests - * are not allowed to fail, so we have to wait until buffer heads - * become available. But we don't want tasks sleeping with - * partially complete buffers, so all were released above. - */ - if (!retry) - return NULL; - - /* We're _really_ low on memory. Now we just - * wait for old buffer heads to become free due to - * finishing IO. Since this is an async request and - * the reserve list is empty, we're sure there are - * async buffer heads in use. - */ - free_more_memory(); - goto try_again; + return NULL; } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1021,7 +1008,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, /* * Allocate some buffers for this page */ - bh = alloc_page_buffers(page, size, 0); + bh = alloc_page_buffers(page, size, false); if (!bh) goto failed; @@ -1575,7 +1562,7 @@ void create_empty_buffers(struct page *page, { struct buffer_head *bh, *head, *tail; - head = alloc_page_buffers(page, blocksize, 1); + head = alloc_page_buffers(page, blocksize, true); bh = head; do { bh->b_state |= b_state; @@ -2638,7 +2625,7 @@ int nobh_write_begin(struct address_space *mapping, * Be careful: the buffer linked list is a NULL terminated one, rather * than the circular one we're used to.
*/ - head = alloc_page_buffers(page, blocksize, 0); + head = alloc_page_buffers(page, blocksize, false); if (!head) { ret = -ENOMEM; goto out_release; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index cc91856b5e2d..3a2e509c77c5 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { spin_lock(&mapping->private_lock); if (unlikely(!page_has_buffers(page))) { spin_unlock(&mapping->private_lock); - bh = head = alloc_page_buffers(page, bh_size, 1); + bh = head = alloc_page_buffers(page, bh_size, true); spin_lock(&mapping->private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index b6f402194f02..ee8392aee9f6 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, if (unlikely(!page_has_buffers(page))) { struct buffer_head *tail; - bh = head = alloc_page_buffers(page, blocksize, 1); + bh = head = alloc_page_buffers(page, blocksize, true); do { set_buffer_uptodate(bh); tail = bh; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae555eccf..ae2d25f01b98 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -156,7 +156,7 @@ void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); int try_to_free_buffers(struct page *); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry); + bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 9ba4b2dfafaa711b41cc2102b0e9a529f3981218 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Sep 2017 08:58:25 -0600 Subject: fs: kill 'nr_pages' argument from wakeup_flusher_threads() Everybody is passing in 0 now, let's get rid of the argument. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 9 ++++----- fs/sync.c | 2 +- include/linux/writeback.h | 2 +- mm/vmscan.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 245c430a2e41..bb6148dc6d24 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1947,12 +1947,12 @@ void wb_workfn(struct work_struct *work) } /* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. + * Wakeup the flusher threads to start writeback of all currently dirty pages */ -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) +void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; + long nr_pages; /* * If we are expecting writeback progress we must submit plugged IO. 
@@ -1960,8 +1960,7 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) if (blk_needs_flush_plug(current)) blk_schedule_flush_plug(current); - if (!nr_pages) - nr_pages = get_nr_dirty_pages(); + nr_pages = get_nr_dirty_pages(); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { diff --git a/fs/sync.c b/fs/sync.c index a576aa2e6b09..09f96a18dd93 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -108,7 +108,7 @@ SYSCALL_DEFINE0(sync) { int nowait = 0, wait = 1; - wakeup_flusher_threads(0, WB_REASON_SYNC); + wakeup_flusher_threads(WB_REASON_SYNC); iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d5815794416c..1f9c6db5e29a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,7 +189,7 @@ bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); +void wakeup_flusher_threads(enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 13d711dd8776..42a7fdd52d87 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1867,7 +1867,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * also allow kswapd to start writing pages during reclaim. */ if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(0, WB_REASON_VMSCAN); + wakeup_flusher_threads(WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); } -- cgit v1.2.3 From 47410d88f665486bf91f02242ab5d5692b8887ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:25:03 -0600 Subject: writeback: remove 'range_cyclic' argument for wb_start_writeback() All the callers pass in 'true' for range_cyclic, so kill the argument. 
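Together with the preceding patch, the change at the call sites is purely mechanical; a hedged before/after sketch (the reason codes are taken from the diffs, surrounding code elided):

    /* before */
    wakeup_flusher_threads(0, WB_REASON_VMSCAN);
    wb_start_writeback(wb, nr_pages, true, WB_REASON_LAPTOP_TIMER);

    /* after */
    wakeup_flusher_threads(WB_REASON_VMSCAN);
    wb_start_writeback(wb, nr_pages, WB_REASON_LAPTOP_TIMER);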
Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- include/linux/backing-dev.h | 2 +- mm/page-writeback.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 65e6992d8719..fe555bce886c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -934,7 +934,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason) + enum wb_reason reason) { struct wb_writeback_work *work; @@ -955,7 +955,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; - work->range_cyclic = range_cyclic; + work->range_cyclic = 1; work->reason = reason; work->auto_free = 1; @@ -1971,7 +1971,7 @@ void wakeup_flusher_threads(enum wb_reason reason) list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - true, reason); + reason); } rcu_read_unlock(); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 854e1bdd0b2a..0f63493de9e7 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -39,7 +39,7 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) } void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason); + enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b9c5cbe8eba..dede53355123 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1994,8 +1994,8 @@ void laptop_mode_timer_fn(unsigned long data) rcu_read_lock(); list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) if (wb_has_dirty_io(wb)) - wb_start_writeback(wb, nr_pages, true, - WB_REASON_LAPTOP_TIMER); + wb_start_writeback(wb, nr_pages, + WB_REASON_LAPTOP_TIMER); rcu_read_unlock(); } -- cgit v1.2.3 From 595043e5f9ef1d8263bd9fda215cade489227491 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:26:59 -0600 Subject: writeback: provide a wakeup_flusher_threads_bdi() Similar to wakeup_flusher_threads(), except that we only wake up the flusher threads on the specified backing device. No functional changes in this patch. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 39 +++++++++++++++++++++++++++++---------- include/linux/writeback.h | 2 ++ 2 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fe555bce886c..9f39829459dc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1946,6 +1946,33 @@ void wb_workfn(struct work_struct *work) current->flags &= ~PF_SWAPWRITE; } +/* + * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, + * write back the whole world. 
+ */ +static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + long nr_pages, enum wb_reason reason) +{ + struct bdi_writeback *wb; + + if (!bdi_has_dirty_io(bdi)) + return; + + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) + wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), + reason); +} + +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason) +{ + long nr_pages = get_nr_dirty_pages(); + + rcu_read_lock(); + __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); + rcu_read_unlock(); +} + /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ @@ -1963,16 +1990,8 @@ void wakeup_flusher_threads(enum wb_reason reason) nr_pages = get_nr_dirty_pages(); rcu_read_lock(); - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - struct bdi_writeback *wb; - - if (!bdi_has_dirty_io(bdi)) - continue; - - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - reason); - } + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); rcu_read_unlock(); } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1f9c6db5e29a..9c0091678af4 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -190,6 +190,8 @@ bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ -- cgit v1.2.3 From 9dfb176fae57a1dea68531fd25e867037e4d9bac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:28:55 -0600 Subject: writeback: make wb_start_writeback() static We don't have any callers outside of fs-writeback.c anymore, so make it private.
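Code outside fs-writeback.c that wants to kick writeback for a single device is expected to use the helper added earlier in this series instead; a minimal sketch (the calling function is an illustrative assumption):

    static void my_flush_one_bdi(struct backing_dev_info *bdi)
    {
            /* wakes every bdi_writeback on this bdi and writes back
             * all currently dirty pages
             */
            wakeup_flusher_threads_bdi(bdi, WB_REASON_SYNC);
    }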
Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- include/linux/backing-dev.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9f39829459dc..f1042061eaad 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -933,8 +933,8 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason) +static void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { struct wb_writeback_work *work; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0f63493de9e7..157e950a70dc 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,8 +38,6 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); } -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From aac8d41cd438f25bf3110fc6b98f1d16d7dbc169 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:31:55 -0600 Subject: writeback: only allow one inflight and pending full flush When someone calls wakeup_flusher_threads() or wakeup_flusher_threads_bdi(), they schedule writeback of all dirty pages in the system (or on that bdi). If we are tight on memory, we can get tons of these queued from kswapd/vmscan. This causes (at least) two problems: 1) We consume a ton of memory just allocating writeback work items. We've seen as much as 600 million of these writeback work items pending. That's a lot of memory to pointlessly hold hostage, while the box is under memory pressure. 2) We spend so much time processing these work items, that we introduce a softlockup in writeback processing. This is because each of the writeback work items don't end up doing any work (it's hard when you have millions of identical ones coming in to the flush machinery), so we just sit in a tight loop pulling work items and deleting/freeing them. Fix this by adding a 'start_all' bit to the writeback structure, and set that when someone attempts to flush all dirty pages. The bit is cleared when we start writeback on that work item. If the bit is already set when we attempt to queue !nr_pages writeback, then we simply ignore it. This provides us one full flush in flight, with one pending as well, and makes for more efficient handling of this type of writeback. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 25 +++++++++++++++++++++++++ include/linux/backing-dev-defs.h | 1 + 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 424577152eb5..399619c97567 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,6 +53,7 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ + unsigned int start_all:1; /* nr_pages == 0 (all) writeback */ enum wb_reason reason; /* why was writeback initiated? 
*/ struct list_head list; /* pending work list */ @@ -951,6 +952,20 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) if (!wb_has_dirty_io(wb)) return; + /* + * All callers of this function want to start writeback of all + * dirty pages. Places like vmscan can call this at a very + * high frequency, causing pointless allocations of tons of + * work items and keeping the flusher threads busy retrieving + * that work. Ensure that we only allow one of them pending and + * inflight at a time. It doesn't matter if we race a little + * bit on this, so use the faster separate test/set bit variants. + */ + if (test_bit(WB_start_all, &wb->state)) + return; + + set_bit(WB_start_all, &wb->state); + /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback @@ -958,6 +973,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (!work) { + clear_bit(WB_start_all, &wb->state); trace_writeback_nowork(wb); wb_wakeup(wb); return; @@ -968,6 +984,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) work->range_cyclic = 1; work->reason = reason; work->auto_free = 1; + work->start_all = 1; wb_queue_work(wb, work); } @@ -1821,6 +1838,14 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) list_del_init(&work->list); } spin_unlock_bh(&wb->work_lock); + + /* + * Once we start processing a work item that had !nr_pages, + * clear the wb state bit for that so we can allow more. + */ + if (work && work->start_all) + clear_bit(WB_start_all, &wb->state); + return work; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 866c433e7d32..420de5c7c7f9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -24,6 +24,7 @@ enum wb_state { WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ + WB_start_all, /* nr_pages == 0 (all) work pending */ }; enum wb_congested_state { -- cgit v1.2.3 From abf4bb6b63d0a54266f8e7eff3720c1974063971 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:06 +0200 Subject: skbuff: Add the offload_mr_fwd_mark field Similarly to the offload_fwd_mark field, the offload_mr_fwd_mark field is used to allow partial offloading of MFC multicast routes. Switchdev drivers can offload MFC multicast routes to the hardware by registering to the FIB notification chain. When one of the route output interfaces is not offload-able, i.e. it has a different parent ID, the route cannot be fully offloaded by the hardware. Examples of non-offload-able devices are a management NIC, dummy device, pimreg device, etc. A similar problem exists in the bridge module, as one bridge can hold interfaces with different parent IDs. At the bridge, the problem is solved by the offload_fwd_mark skb field. Currently, when a route cannot go through full offload, the only solution for a switchdev driver is not to offload it at all and let the packet go through the slow path. Using the offload_mr_fwd_mark field, a driver can indicate that a packet was already forwarded by hardware to all the devices with the same parent ID as the input device.
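A hedged sketch of how a switchdev driver's receive path might use the two bits together (the port structure and helper names are illustrative, not taken from an in-tree driver):

    static void my_asic_rx(struct my_port *port, struct sk_buff *skb,
                           bool l2_done, bool mr_done)
    {
    #ifdef CONFIG_NET_SWITCHDEV
            /* hardware already performed L2 forwarding for this packet */
            skb->offload_fwd_mark = l2_done;
            /* hardware already performed multicast routing to all ports
             * sharing the ingress device's parent ID
             */
            skb->offload_mr_fwd_mark = mr_done;
    #endif
            netif_receive_skb(skb);
    }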
Further patches in this patch-set are going to enhance ipmr to skip multicast forwarding to devices with the same parent ID if a packet is marked with that field. The reason why the already existing "offload_fwd_mark" bit cannot be used is that a switchdev driver would want to make the distinction between a packet that has already gone through L2 forwarding but did not go through multicast forwarding, and a packet that has already gone through both L2 and multicast forwarding. For example: when a packet is ingressing from a switchport enslaved to a bridge, which is configured with multicast forwarding, the following scenarios are possible: - The packet can be trapped to the CPU due to an exception during multicast forwarding (for example, MTU error). In that case, it had already gone through L2 forwarding in the hardware, thus a switchdev driver would want to set the skb->offload_fwd_mark and not the skb->offload_mr_fwd_mark. - The packet can also be trapped due to a pimreg/dummy device used as one of the output interfaces. In that case, it can go through both L2 and (partial) multicast forwarding inside the hardware, thus a switchdev driver would want to set both the skb->offload_fwd_mark and skb->offload_mr_fwd_mark. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/linux/mroute.h | 1 + net/ipv4/ipmr.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index b072a84fbe1c..8242d05df35e 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -57,6 +57,7 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) struct vif_device { struct net_device *dev; /* Device we are using */ + struct netdev_phys_item_id dev_parent_id; /* Device parent ID */ unsigned long bytes_in,bytes_out; unsigned long pkt_in,pkt_out; /* Statistics */ unsigned long rate_limit; /* Traffic shaping (NI) */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a844738b38bd..1b161ada7ae6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include #include #include +#include <net/switchdev.h> struct ipmr_rule { struct fib_rule common; @@ -868,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -942,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; -- cgit v1.2.3 From 6c5570016b972d9b1f0f6c2dca9cc0422b1f92bf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 Oct 2017 23:50:05 +0200 Subject: net: core: decouple ifalias get/set from rtnl lock Device alias can be set by either rtnetlink (rtnl is held) or sysfs. rtnetlink holds the rtnl mutex, sysfs acquires it for this purpose. Add an extra mutex for it and use rcu to protect concurrent accesses. This allows the sysfs path to not take rtnl and would later allow us to avoid holding it when dumping ifalias. Based on a suggestion from Eric Dumazet. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 +++++++- net/core/dev.c | 52 +++++++++++++++++++++++++++++++++++------------ net/core/net-sysfs.c | 17 ++++++++-------- net/core/rtnetlink.c | 13 ++++++++++-- 4 files changed, 65 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e1d6ef130611..d04424cfffba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -826,6 +826,11 @@ struct xfrmdev_ops { }; #endif +struct dev_ifalias { + struct rcu_head rcuhead; + char ifalias[]; +}; + /* * This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are @@ -1632,7 +1637,7 @@ enum netdev_priv_flags { struct net_device { char name[IFNAMSIZ]; struct hlist_node name_hlist; - char *ifalias; + struct dev_ifalias __rcu *ifalias; /* * I/O specific fields * FIXME: Merge these and struct ifmap into one @@ -3275,6 +3280,7 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); +int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); int dev_set_mtu(struct net_device *, int); diff --git a/net/core/dev.c b/net/core/dev.c index e350c768d4b5..1770097cfd86 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id); DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +static DEFINE_MUTEX(ifalias_mutex); + /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); @@ -1265,29 +1267,53 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { - char *new_ifalias; - - ASSERT_RTNL(); + struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; - if (!len) { - kfree(dev->ifalias); - dev->ifalias = NULL; - return 0; + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; } - new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!new_ifalias) - return -ENOMEM; - dev->ifalias = new_ifalias; - memcpy(dev->ifalias, alias, len); - dev->ifalias[len] = 0; + mutex_lock(&ifalias_mutex); + rcu_swap_protected(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); return len; } +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @alias: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. + */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} /** * netdev_features_change - device changes features diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 927a6dcbad96..51d5836d8fb9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); ret = dev_set_alias(netdev, buf, count); - rtnl_unlock(); return ret < 0 ? 
ret : len; } @@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); + char tmp[IFALIASZ]; ssize_t ret = 0; - if (!rtnl_trylock()) - return restart_syscall(); - if (netdev->ifalias) - ret = sprintf(buf, "%s\n", netdev->ifalias); - rtnl_unlock(); + ret = dev_get_alias(netdev, tmp, sizeof(tmp)); + if (ret > 0) + ret = sprintf(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); - kfree(dev->ifalias); + /* no need to wait for rcu grace period: + * device is dead and about to be freed. + */ + kfree(rcu_access_pointer(dev->ifalias)); netdev_freemem(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e6955da0d58d..3961f87cdc76 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1366,6 +1366,16 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) return nla_put_u32(skb, IFLA_LINK, ifindex); } +static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, + struct net_device *dev) +{ + char buf[IFALIASZ]; + int ret; + + ret = dev_get_alias(dev, buf, sizeof(buf)); + return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; +} + static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev) { @@ -1425,8 +1435,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || - (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || + nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_changes)) || nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) -- cgit v1.2.3 From eaefd5abf6b095bfc55eb745bdf7c42cf66790eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:42 -0700 Subject: nvme-fc: add uevent for auto-connect To support auto-connecting to FC-NVME devices upon their dynamic appearance, add a uevent that can kick off connection scripts. The uevent is posted against the fc_udev device. The patch set was tested with the following rule to kick an nvme-cli connect-all for the FC initiator and FC target ports. This is just an example for testing and not intended for real-life use. ACTION=="change", SUBSYSTEM=="fc", ENV{FC_EVENT}=="nvmediscovery", \ ENV{NVMEFC_HOST_TRADDR}=="*", ENV{NVMEFC_TRADDR}=="*", \ RUN+="/bin/sh -c '/usr/local/sbin/nvme connect-all --transport=fc --host-traddr=$env{NVMEFC_HOST_TRADDR} --traddr=$env{NVMEFC_TRADDR} >> /tmp/nvme_fc.log'" I will post proposed udev/systemd scripts for possible kernel support.
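On the kernel side, an LLDD can ask the transport to re-post the discovery uevent through the rescan entry point added below; a minimal sketch (the RSCN handler and port structure are illustrative assumptions):

    static void my_lldd_handle_rscn(struct my_fc_port *port)
    {
            /* re-posts the nvmediscovery uevent so that userspace can
             * re-run "nvme connect-all" for this remote port
             */
            if (port->nvme_remoteport)
                    nvme_fc_rescan_remoteport(port->nvme_remoteport);
    }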
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 49 ++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme-fc-driver.h | 2 ++ 2 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index af075e998944..f546eecb1f82 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -452,6 +452,36 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) } EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); +/* + * TRADDR strings, per FC-NVME are fixed format: + * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters + * udev event will only differ by prefix of what field is + * being specified: + * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters + * 19 + 43 + null_fudge = 64 characters + */ +#define FCNVME_TRADDR_LENGTH 64 + +static void +nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, + struct nvme_fc_rport *rport) +{ + char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/ + char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/ + char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL }; + + if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY)) + return; + + snprintf(hostaddr, sizeof(hostaddr), + "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx", + lport->localport.node_name, lport->localport.port_name); + snprintf(tgtaddr, sizeof(tgtaddr), + "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx", + rport->remoteport.node_name, rport->remoteport.port_name); + kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -516,6 +546,8 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, list_add_tail(&newrec->endp_list, &lport->endp_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; return 0; @@ -634,6 +666,23 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) } EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); +/** + * nvme_fc_rescan_remoteport - transport entry point called by an + * LLDD to request a nvme device rescan. + * @remoteport: pointer to the (registered) remote port that is to be + * rescanned. + * + * Returns: N/A + */ +void +nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(remoteport); + + nvme_fc_signal_discovery_scan(rport->lport, rport); +} +EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); + /* *********************** FC-NVME DMA Handling **************************** */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index a726f96010d5..4ea03b9a5c8c 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -446,6 +446,8 @@ int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); +void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); + /* -- cgit v1.2.3 From eb8470db8bc018fc28901e4e3b0f48e33f1ea7df Mon Sep 17 00:00:00 2001 From: Jan Kandziora Date: Wed, 20 Sep 2017 23:52:45 +0200 Subject: wire: export w1_touch_bit The w1_ds28e17 driver from the next part of this patch needs to emit single-bit read timeslots to the DS28E17. 
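
In w1 terms, a read timeslot is simply a touch of a 1 bit. A hedged sketch of such a caller (the xyz_* slave driver and its busy-poll use are invented; the export this patch adds is what makes the call possible from a module):

#include <linux/w1.h>

static int xyz_poll_busy(struct w1_slave *sl)
{
	u8 level;

	mutex_lock(&sl->master->bus_mutex);
	/* writing a 1 releases the bus, so the sampled level shows
	 * whether the slave is still holding the line low */
	level = w1_touch_bit(sl->master, 1);
	mutex_unlock(&sl->master->bus_mutex);

	return level;	/* 0 means the DS28E17-style slave is busy */
}
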
The w1 subsystem already has this function but it is not exported
outside drivers/w1/w1_io.c. This subpatch exports the w1_touch_bit
symbol with EXPORT_SYMBOL_GPL, same as the other exported symbols in
drivers/w1/w1_io.c. It may also be useful later for writing drivers for
other 1-Wire chips which do single-bit communication.

Signed-off-by: Jan Kandziora
Acked-by: Evgeniy Polyakov
Signed-off-by: Greg Kroah-Hartman
---
 drivers/w1/w1_io.c | 3 ++-
 include/linux/w1.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/w1/w1_io.c b/drivers/w1/w1_io.c
index d191e1f80579..075d120e7b88 100644
--- a/drivers/w1/w1_io.c
+++ b/drivers/w1/w1_io.c
@@ -58,7 +58,7 @@ static u8 w1_read_bit(struct w1_master *dev);
  * @dev: the master device
  * @bit: 0 - write a 0, 1 - write a 1 / read the level
  */
-static u8 w1_touch_bit(struct w1_master *dev, int bit)
+u8 w1_touch_bit(struct w1_master *dev, int bit)
 {
 	if (dev->bus_master->touch_bit)
 		return dev->bus_master->touch_bit(dev->bus_master->data, bit);
@@ -69,6 +69,7 @@ static u8 w1_touch_bit(struct w1_master *dev, int bit)
 		return 0;
 	}
 }
+EXPORT_SYMBOL_GPL(w1_touch_bit);

 /**
  * w1_write_bit() - Generates a write-0 or write-1 cycle.
diff --git a/include/linux/w1.h b/include/linux/w1.h
index 5b2972946dda..694101f744c7 100644
--- a/include/linux/w1.h
+++ b/include/linux/w1.h
@@ -293,6 +293,7 @@ void w1_unregister_family(struct w1_family *family);
 			w1_unregister_family)

 u8 w1_triplet(struct w1_master *dev, int bdir);
+u8 w1_touch_bit(struct w1_master *dev, int bit);
 void w1_write_8(struct w1_master *, u8);
 u8 w1_read_8(struct w1_master *);
 int w1_reset_bus(struct w1_master *);
-- cgit v1.2.3


From c2e5df616e1ae6c2a074cb241ebb65a318ebaf7c Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Thu, 21 Sep 2017 20:58:49 -0700
Subject: vmbus: add per-channel sysfs info

This extends the existing vmbus-related sysfs structure to provide
per-channel state information. This is useful when diagnosing issues
with multiple queues in networking and storage.

The existing sysfs only displayed information about the primary
channel. The one place it reported multiple channels was the
channel_vp_mapping file, which violated the sysfs convention of one
value per file.

Signed-off-by: Stephen Hemminger
Signed-off-by: K. Y. Srinivasan
Signed-off-by: Greg Kroah-Hartman
---
 Documentation/ABI/stable/sysfs-bus-vmbus |  49 +++++++++
 drivers/hv/channel_mgmt.c                |  10 +-
 drivers/hv/hyperv_vmbus.h                |   2 +
 drivers/hv/vmbus_drv.c                   | 185 +++++++++++++++++++++++++++++--
 include/linux/hyperv.h                   |   6 +
 5 files changed, 239 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/stable/sysfs-bus-vmbus b/Documentation/ABI/stable/sysfs-bus-vmbus
index 5d0125f7bcaf..0ebd8a1537a0 100644
--- a/Documentation/ABI/stable/sysfs-bus-vmbus
+++ b/Documentation/ABI/stable/sysfs-bus-vmbus
@@ -41,3 +41,52 @@ KernelVersion: 4.5
 Contact: K. Y. Srinivasan
 Description: The 16 bit vendor ID of the device
 Users: tools/hv/lsvmbus and user level RDMA libraries
+
+What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/cpu
+Date: September 2017
+KernelVersion: 4.14
+Contact: Stephen Hemminger
+Description: VCPU the (sub)channel is affinitized to
+Users: tools/hv/lsvmbus and other debugging tools
+
+What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/in_mask
+Date: September 2017
+KernelVersion: 4.14
+Contact: Stephen Hemminger
+Description: Inbound channel signaling state
+Users: Debugging tools
+
+What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/latency
+Date: September 2017
+KernelVersion: 4.14
+Contact: Stephen Hemminger
+Description: Channel signaling latency
+Users: Debugging tools
+
+What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/out_mask
+Date: September 2017
+KernelVersion: 4.14
+Contact: Stephen Hemminger
+Description: Outbound channel signaling state
+Users: Debugging tools
+
+What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/pending
+Date: September 2017
+KernelVersion: 4.14
+Contact: Stephen Hemminger
+Description: Channel interrupt pending state
+Users: Debugging tools
+
+What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/read_avail
+Date: September 2017
+KernelVersion: 4.14
+Contact: Stephen Hemminger
+Description: Bytes available to read
+Users: Debugging tools
+
+What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/write_avail
+Date: September 2017
+KernelVersion: 4.14
+Contact: Stephen Hemminger
+Description: Bytes available to write
+Users: Debugging tools
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 060df71c2e8b..dd2dffe816be 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -350,7 +350,7 @@ static void free_channel(struct vmbus_channel *channel)
 {
 	tasklet_kill(&channel->callback_event);

-	kfree_rcu(channel, rcu);
+	kobject_put(&channel->kobj);
 }

 static void percpu_channel_enq(void *arg)
@@ -513,6 +513,14 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	newchannel->state = CHANNEL_OPEN_STATE;

 	if (!fnew) {
+		struct hv_device *dev
+			= newchannel->primary_channel->device_obj;
+
+		if (vmbus_add_channel_kobj(dev, newchannel)) {
+			atomic_dec(&vmbus_connection.offer_in_progress);
+			goto err_free_chan;
+		}
+
 		if (channel->sc_creation_callback != NULL)
 			channel->sc_creation_callback(newchannel);
 		return;
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 49569f8fe038..de6f01df9592 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -373,6 +373,8 @@ struct hv_device *vmbus_device_create(const uuid_le *type,

 int vmbus_device_register(struct hv_device *child_device_obj);
 void vmbus_device_unregister(struct hv_device *device_obj);
+int vmbus_add_channel_kobj(struct hv_device *device_obj,
+			   struct vmbus_channel *channel);

 struct vmbus_channel *relid2channel(u32 relid);
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index a9d49f6f6501..a209527b09f5 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -107,28 +107,30 @@ static void print_alias_name(struct hv_device *hv_dev, char *alias_name)
 		sprintf(&alias_name[i], "%02x", hv_dev->dev_type.b[i/2]);
 }

-static u8 channel_monitor_group(struct vmbus_channel *channel)
+static u8 channel_monitor_group(const struct vmbus_channel *channel)
 {
 	return (u8)channel->offermsg.monitorid / 32;
 }

-static u8 channel_monitor_offset(struct vmbus_channel *channel)
+static u8 channel_monitor_offset(const struct vmbus_channel *channel)
 {
 	return (u8)channel->offermsg.monitorid % 32;
 }

-static u32 channel_pending(struct vmbus_channel *channel,
-			   struct hv_monitor_page *monitor_page)
+static u32 channel_pending(const struct vmbus_channel *channel,
+
const struct hv_monitor_page *monitor_page) { u8 monitor_group = channel_monitor_group(channel); + return monitor_page->trigger_group[monitor_group].pending; } -static u32 channel_latency(struct vmbus_channel *channel, - struct hv_monitor_page *monitor_page) +static u32 channel_latency(const struct vmbus_channel *channel, + const struct hv_monitor_page *monitor_page) { u8 monitor_group = channel_monitor_group(channel); u8 monitor_offset = channel_monitor_offset(channel); + return monitor_page->latency[monitor_group][monitor_offset]; } @@ -1134,6 +1136,145 @@ void vmbus_driver_unregister(struct hv_driver *hv_driver) } EXPORT_SYMBOL_GPL(vmbus_driver_unregister); + +/* + * Called when last reference to channel is gone. + */ +static void vmbus_chan_release(struct kobject *kobj) +{ + struct vmbus_channel *channel + = container_of(kobj, struct vmbus_channel, kobj); + + kfree_rcu(channel, rcu); +} + +struct vmbus_chan_attribute { + struct attribute attr; + ssize_t (*show)(const struct vmbus_channel *chan, char *buf); + ssize_t (*store)(struct vmbus_channel *chan, + const char *buf, size_t count); +}; +#define VMBUS_CHAN_ATTR(_name, _mode, _show, _store) \ + struct vmbus_chan_attribute chan_attr_##_name \ + = __ATTR(_name, _mode, _show, _store) +#define VMBUS_CHAN_ATTR_RW(_name) \ + struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RW(_name) +#define VMBUS_CHAN_ATTR_RO(_name) \ + struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RO(_name) +#define VMBUS_CHAN_ATTR_WO(_name) \ + struct vmbus_chan_attribute chan_attr_##_name = __ATTR_WO(_name) + +static ssize_t vmbus_chan_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + const struct vmbus_chan_attribute *attribute + = container_of(attr, struct vmbus_chan_attribute, attr); + const struct vmbus_channel *chan + = container_of(kobj, struct vmbus_channel, kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(chan, buf); +} + +static const struct sysfs_ops vmbus_chan_sysfs_ops = { + .show = vmbus_chan_attr_show, +}; + +static ssize_t out_mask_show(const struct vmbus_channel *channel, char *buf) +{ + const struct hv_ring_buffer_info *rbi = &channel->outbound; + + return sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask); +} +VMBUS_CHAN_ATTR_RO(out_mask); + +static ssize_t in_mask_show(const struct vmbus_channel *channel, char *buf) +{ + const struct hv_ring_buffer_info *rbi = &channel->inbound; + + return sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask); +} +VMBUS_CHAN_ATTR_RO(in_mask); + +static ssize_t read_avail_show(const struct vmbus_channel *channel, char *buf) +{ + const struct hv_ring_buffer_info *rbi = &channel->inbound; + + return sprintf(buf, "%u\n", hv_get_bytes_to_read(rbi)); +} +VMBUS_CHAN_ATTR_RO(read_avail); + +static ssize_t write_avail_show(const struct vmbus_channel *channel, char *buf) +{ + const struct hv_ring_buffer_info *rbi = &channel->outbound; + + return sprintf(buf, "%u\n", hv_get_bytes_to_write(rbi)); +} +VMBUS_CHAN_ATTR_RO(write_avail); + +static ssize_t show_target_cpu(const struct vmbus_channel *channel, char *buf) +{ + return sprintf(buf, "%u\n", channel->target_cpu); +} +VMBUS_CHAN_ATTR(cpu, S_IRUGO, show_target_cpu, NULL); + +static ssize_t channel_pending_show(const struct vmbus_channel *channel, + char *buf) +{ + return sprintf(buf, "%d\n", + channel_pending(channel, + vmbus_connection.monitor_pages[1])); +} +VMBUS_CHAN_ATTR(pending, S_IRUGO, channel_pending_show, NULL); + +static ssize_t channel_latency_show(const struct vmbus_channel *channel, + char 
*buf)
+{
+	return sprintf(buf, "%d\n",
+		       channel_latency(channel,
+				       vmbus_connection.monitor_pages[1]));
+}
+VMBUS_CHAN_ATTR(latency, S_IRUGO, channel_latency_show, NULL);
+
+static struct attribute *vmbus_chan_attrs[] = {
+	&chan_attr_out_mask.attr,
+	&chan_attr_in_mask.attr,
+	&chan_attr_read_avail.attr,
+	&chan_attr_write_avail.attr,
+	&chan_attr_cpu.attr,
+	&chan_attr_pending.attr,
+	&chan_attr_latency.attr,
+	NULL
+};
+
+static struct kobj_type vmbus_chan_ktype = {
+	.sysfs_ops = &vmbus_chan_sysfs_ops,
+	.release = vmbus_chan_release,
+	.default_attrs = vmbus_chan_attrs,
+};
+
+/*
+ * vmbus_add_channel_kobj - setup a sub-directory under device/channels
+ */
+int vmbus_add_channel_kobj(struct hv_device *dev, struct vmbus_channel *channel)
+{
+	struct kobject *kobj = &channel->kobj;
+	u32 relid = channel->offermsg.child_relid;
+	int ret;
+
+	kobj->kset = dev->channels_kset;
+	ret = kobject_init_and_add(kobj, &vmbus_chan_ktype, NULL,
+				   "%u", relid);
+	if (ret)
+		return ret;
+
+	kobject_uevent(kobj, KOBJ_ADD);
+
+	return 0;
+}
+
 /*
  * vmbus_device_create - Creates and registers a new child device
  * on the vmbus.
@@ -1165,7 +1306,8 @@ struct hv_device *vmbus_device_create(const uuid_le *type,
  */
 int vmbus_device_register(struct hv_device *child_device_obj)
 {
-	int ret = 0;
+	struct kobject *kobj = &child_device_obj->device.kobj;
+	int ret;

 	dev_set_name(&child_device_obj->device, "%pUl",
 		     child_device_obj->channel->offermsg.offer.if_instance.b);
@@ -1179,13 +1321,32 @@ int vmbus_device_register(struct hv_device *child_device_obj)
 	 * binding...which will eventually call vmbus_match() and vmbus_probe()
 	 */
 	ret = device_register(&child_device_obj->device);
-
-	if (ret)
+	if (ret) {
 		pr_err("Unable to register child device\n");
-	else
-		pr_debug("child device %s registered\n",
-			 dev_name(&child_device_obj->device));
+		return ret;
+	}
+
+	child_device_obj->channels_kset = kset_create_and_add("channels",
+							      NULL, kobj);
+	if (!child_device_obj->channels_kset) {
+		ret = -ENOMEM;
+		goto err_dev_unregister;
+	}
+
+	ret = vmbus_add_channel_kobj(child_device_obj,
+				     child_device_obj->channel);
+	if (ret) {
+		pr_err("Unable to register primary channel\n");
+		goto err_kset_unregister;
+	}
+
+	return 0;
+
+err_kset_unregister:
+	kset_unregister(child_device_obj->channels_kset);
+err_dev_unregister:
+	device_unregister(&child_device_obj->device);

 	return ret;
 }
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index c458d7b7ad19..ef16ee039850 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -828,6 +828,11 @@ struct vmbus_channel {
 	 */
 	struct rcu_head rcu;

+	/*
+	 * For sysfs per-channel properties.
+	 */
+	struct kobject kobj;
+
 	/*
 	 * For performance critical channels (storage, networking
 	 * etc,), Hyper-V has a mechanism to enhance the throughput
@@ -1089,6 +1094,7 @@ struct hv_device {
 	struct device device;

 	struct vmbus_channel *channel;
+	struct kset *channels_kset;
 };
-- cgit v1.2.3


From 667063acb81931e2f8fd0cb91df9fcccad131d9a Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai
Date: Fri, 29 Sep 2017 16:23:01 +0800
Subject: regmap: add iopoll-like polling macro for regmap_field

This patch adds a macro regmap_field_read_poll_timeout that works
similarly to the readx_poll_timeout defined in linux/iopoll.h, except
that it can also return the error value returned by a failed
regmap_field_read.
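
A brief usage sketch (the device, its 0x04 "ready" register, bit position and timings are invented for illustration): poll a one-bit field until it reads 1, sleeping roughly 10-20us between reads and giving up after 1ms.

#include <linux/regmap.h>

static int xyz_wait_ready(struct device *dev, struct regmap *map)
{
	const struct reg_field ready_field = REG_FIELD(0x04, 0, 0);
	struct regmap_field *ready;
	unsigned int val;

	ready = devm_regmap_field_alloc(dev, map, ready_field);
	if (IS_ERR(ready))
		return PTR_ERR(ready);

	/* 0 once val == 1, -ETIMEDOUT on timeout, or the
	 * regmap_field_read() error if a read itself fails */
	return regmap_field_read_poll_timeout(ready, val, val == 1,
					      10, 1000);
}
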
Signed-off-by: Chen-Yu Tsai
Signed-off-by: Mark Brown
---
 include/linux/regmap.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 978abfbac617..93a4663d7acb 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -139,6 +139,45 @@ struct reg_sequence {
 	pollret ?: ((cond) ? 0 : -ETIMEDOUT); \
 })

+/**
+ * regmap_field_read_poll_timeout - Poll until a condition is met or timeout
+ *
+ * @field: Regmap field to read from
+ * @val: Unsigned integer variable to read the value into
+ * @cond: Break condition (usually involving @val)
+ * @sleep_us: Maximum time to sleep between reads in us (0
+ *            tight-loops). Should be less than ~20ms since usleep_range
+ *            is used (see Documentation/timers/timers-howto.txt).
+ * @timeout_us: Timeout in us, 0 means never timeout
+ *
+ * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read
+ * error return value in case of an error read. In the two former cases,
+ * the last read value at @field is stored in @val. Must not be called
+ * from atomic context if sleep_us or timeout_us are used.
+ *
+ * This is modelled after the readx_poll_timeout macros in linux/iopoll.h.
+ */
+#define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \
+({ \
+	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \
+	int pollret; \
+	might_sleep_if(sleep_us); \
+	for (;;) { \
+		pollret = regmap_field_read((field), &(val)); \
+		if (pollret) \
+			break; \
+		if (cond) \
+			break; \
+		if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \
+			pollret = regmap_field_read((field), &(val)); \
+			break; \
+		} \
+		if (sleep_us) \
+			usleep_range((sleep_us >> 2) + 1, sleep_us); \
+	} \
+	pollret ?: ((cond) ? 0 : -ETIMEDOUT); \
+})
+
 #ifdef CONFIG_REGMAP

 enum regmap_endian {
-- cgit v1.2.3


From d81851c1764b26b46670c0b3bd6701308ddaab98 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai
Date: Fri, 29 Sep 2017 11:25:09 +0800
Subject: regulator: axp20x: Add support for AXP813 regulators

The AXP813 PMIC has 7 DC-DC buck regulators, 16 LDOs (including the
fixed RTC LDO and 2 GPIO LDOs), and 1 switchable output. The drive-vbus
feature is also supported.

All the hardware details are very similar to the AXP803, with the
following exceptions:

- Extra DCDC7 buck regulator, with the same range as DCDC6

- SWitch now has a separate supply pin, instead of being chained
  internally from DCDC1

- RTC LDO output voltage is now 1.8V

- FLDO3 is an LDO with switchable supplies, but unconfigurable output
  voltage. The voltage is always half that of its supply.

Support for FLDO3 is currently unimplemented, as it requires runtime
switching of its supplies, something the regulator subsystem does not
support. It is used in neither the reference designs nor any boards
actually produced.
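
As a hedged illustration of the consumer side (not from this patch; the "vdd-cpu" supply name and 0.9V target are invented): once these descriptors register, peripheral drivers consume the AXP813 rails through the ordinary regulator API.

#include <linux/regulator/consumer.h>

static int xyz_enable_cpu_rail(struct device *dev)
{
	struct regulator *reg;
	int ret;

	reg = devm_regulator_get(dev, "vdd-cpu");
	if (IS_ERR(reg))
		return PTR_ERR(reg);

	/* e.g. DCDC2's 0.5-1.3V range covers this request */
	ret = regulator_set_voltage(reg, 900000, 900000);
	if (ret)
		return ret;

	return regulator_enable(reg);
}
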
Signed-off-by: Chen-Yu Tsai Tested-by: Maxime Ripard Acked-by: Maxime Ripard Signed-off-by: Mark Brown --- drivers/regulator/axp20x-regulator.c | 102 +++++++++++++++++++++++++++++++++-- include/linux/mfd/axp20x.h | 3 ++ 2 files changed, 101 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 376a99b7cf5d..e1761df4cbfd 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -244,6 +244,7 @@ static const struct regulator_desc axp22x_drivevbus_regulator = { .ops = &axp20x_ops_sw, }; +/* DCDC ranges shared with AXP813 */ static const struct regulator_linear_range axp803_dcdc234_ranges[] = { REGULATOR_LINEAR_RANGE(500000, 0x0, 0x46, 10000), REGULATOR_LINEAR_RANGE(1220000, 0x47, 0x4b, 20000), @@ -426,6 +427,69 @@ static const struct regulator_desc axp809_regulators[] = { AXP_DESC_SW(AXP809, SW, "sw", "swin", AXP22X_PWR_OUT_CTRL2, BIT(6)), }; +static const struct regulator_desc axp813_regulators[] = { + AXP_DESC(AXP813, DCDC1, "dcdc1", "vin1", 1600, 3400, 100, + AXP803_DCDC1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL1, BIT(0)), + AXP_DESC_RANGES(AXP813, DCDC2, "dcdc2", "vin2", axp803_dcdc234_ranges, + 76, AXP803_DCDC2_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(1)), + AXP_DESC_RANGES(AXP813, DCDC3, "dcdc3", "vin3", axp803_dcdc234_ranges, + 76, AXP803_DCDC3_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(2)), + AXP_DESC_RANGES(AXP813, DCDC4, "dcdc4", "vin4", axp803_dcdc234_ranges, + 76, AXP803_DCDC4_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(3)), + AXP_DESC_RANGES(AXP813, DCDC5, "dcdc5", "vin5", axp803_dcdc5_ranges, + 68, AXP803_DCDC5_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(4)), + AXP_DESC_RANGES(AXP813, DCDC6, "dcdc6", "vin6", axp803_dcdc6_ranges, + 72, AXP803_DCDC6_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(5)), + AXP_DESC_RANGES(AXP813, DCDC7, "dcdc7", "vin7", axp803_dcdc6_ranges, + 72, AXP813_DCDC7_V_OUT, 0x7f, AXP22X_PWR_OUT_CTRL1, + BIT(6)), + AXP_DESC(AXP813, ALDO1, "aldo1", "aldoin", 700, 3300, 100, + AXP22X_ALDO1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL3, BIT(5)), + AXP_DESC(AXP813, ALDO2, "aldo2", "aldoin", 700, 3300, 100, + AXP22X_ALDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL3, BIT(6)), + AXP_DESC(AXP813, ALDO3, "aldo3", "aldoin", 700, 3300, 100, + AXP22X_ALDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL3, BIT(7)), + AXP_DESC(AXP813, DLDO1, "dldo1", "dldoin", 700, 3300, 100, + AXP22X_DLDO1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(3)), + AXP_DESC_RANGES(AXP813, DLDO2, "dldo2", "dldoin", axp803_dldo2_ranges, + 32, AXP22X_DLDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, + BIT(4)), + AXP_DESC(AXP813, DLDO3, "dldo3", "dldoin", 700, 3300, 100, + AXP22X_DLDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(5)), + AXP_DESC(AXP813, DLDO4, "dldo4", "dldoin", 700, 3300, 100, + AXP22X_DLDO4_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(6)), + AXP_DESC(AXP813, ELDO1, "eldo1", "eldoin", 700, 1900, 50, + AXP22X_ELDO1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(0)), + AXP_DESC(AXP813, ELDO2, "eldo2", "eldoin", 700, 1900, 50, + AXP22X_ELDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(1)), + AXP_DESC(AXP813, ELDO3, "eldo3", "eldoin", 700, 1900, 50, + AXP22X_ELDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(2)), + /* to do / check ... 
*/ + AXP_DESC(AXP813, FLDO1, "fldo1", "fldoin", 700, 1450, 50, + AXP803_FLDO1_V_OUT, 0x0f, AXP22X_PWR_OUT_CTRL3, BIT(2)), + AXP_DESC(AXP813, FLDO2, "fldo2", "fldoin", 700, 1450, 50, + AXP803_FLDO2_V_OUT, 0x0f, AXP22X_PWR_OUT_CTRL3, BIT(3)), + /* + * TODO: FLDO3 = {DCDC5, FLDOIN} / 2 + * + * This means FLDO3 effectively switches supplies at runtime, + * something the regulator subsystem does not support. + */ + AXP_DESC_FIXED(AXP813, RTC_LDO, "rtc-ldo", "ips", 1800), + AXP_DESC_IO(AXP813, LDO_IO0, "ldo-io0", "ips", 700, 3300, 100, + AXP22X_LDO_IO0_V_OUT, 0x1f, AXP20X_GPIO0_CTRL, 0x07, + AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), + AXP_DESC_IO(AXP813, LDO_IO1, "ldo-io1", "ips", 700, 3300, 100, + AXP22X_LDO_IO1_V_OUT, 0x1f, AXP20X_GPIO1_CTRL, 0x07, + AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), + AXP_DESC_SW(AXP813, SW, "sw", "swin", AXP22X_PWR_OUT_CTRL2, BIT(7)), +}; + static int axp20x_set_dcdc_freq(struct platform_device *pdev, u32 dcdcfreq) { struct axp20x_dev *axp20x = dev_get_drvdata(pdev->dev.parent); @@ -441,9 +505,10 @@ static int axp20x_set_dcdc_freq(struct platform_device *pdev, u32 dcdcfreq) step = 75; break; case AXP803_ID: + case AXP813_ID: /* - * AXP803 DCDC work frequency setting has the same range and - * step as AXP22X, but at a different register. + * AXP803/AXP813 DCDC work frequency setting has the same + * range and step as AXP22X, but at a different register. * Fall through to the check below. * (See include/linux/mfd/axp20x.h) */ @@ -561,6 +626,14 @@ static int axp20x_set_dcdc_workmode(struct regulator_dev *rdev, int id, u32 work workmode <<= id - AXP803_DCDC1; break; + case AXP813_ID: + if (id < AXP813_DCDC1 || id > AXP813_DCDC7) + return -EINVAL; + + mask = AXP22X_WORKMODE_DCDCX_MASK(id - AXP813_DCDC1); + workmode <<= id - AXP813_DCDC1; + break; + default: /* should not happen */ WARN_ON(1); @@ -579,8 +652,8 @@ static bool axp20x_is_polyphase_slave(struct axp20x_dev *axp20x, int id) u32 reg = 0; /* - * Currently in our supported AXP variants, only AXP803 and AXP806 - * have polyphase regulators. + * Currently in our supported AXP variants, only AXP803, AXP806, + * and AXP813 have polyphase regulators. */ switch (axp20x->variant) { case AXP803_ID: @@ -608,6 +681,17 @@ static bool axp20x_is_polyphase_slave(struct axp20x_dev *axp20x, int id) } break; + case AXP813_ID: + regmap_read(axp20x->regmap, AXP803_POLYPHASE_CTRL, ®); + + switch (id) { + case AXP803_DCDC3: + return !!(reg & BIT(6)); + case AXP803_DCDC6: + return !!(reg & BIT(5)); + } + break; + default: return false; } @@ -656,6 +740,12 @@ static int axp20x_regulator_probe(struct platform_device *pdev) regulators = axp809_regulators; nregulators = AXP809_REG_ID_MAX; break; + case AXP813_ID: + regulators = axp813_regulators; + nregulators = AXP813_REG_ID_MAX; + drivevbus = of_property_read_bool(pdev->dev.parent->of_node, + "x-powers,drive-vbus-en"); + break; default: dev_err(&pdev->dev, "Unsupported AXP variant: %ld\n", axp20x->variant); @@ -677,6 +767,10 @@ static int axp20x_regulator_probe(struct platform_device *pdev) if (axp20x_is_polyphase_slave(axp20x, i)) continue; + /* Support for AXP813's FLDO3 is not implemented */ + if (axp20x->variant == AXP813_ID && i == AXP813_FLDO3) + continue; + /* * Regulators DC1SW and DC5LDO are connected internally, * so we have to handle their supply names separately. 
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index e9c908c4fba8..78dc85365c4f 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -131,6 +131,9 @@ enum axp20x_variants { #define AXP803_DCDC6_V_OUT 0x25 #define AXP803_DCDC_FREQ_CTRL 0x3b +/* Other DCDC regulator control registers are the same as AXP803 */ +#define AXP813_DCDC7_V_OUT 0x26 + /* Interrupt */ #define AXP152_IRQ1_EN 0x40 #define AXP152_IRQ2_EN 0x41 -- cgit v1.2.3 From 85009b4f5f0399669a44f07cb9a5622c0e71d419 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:09:06 -0600 Subject: writeback: eliminate work item allocation in bd_start_writeback() Handle start-all writeback like we do periodic or kupdate style writeback - by marking the bdi_writeback as needing a full flush, and simply waking the thread. This eliminates the need to allocate and queue a specific work item just for this purpose. After this change, we truly only ever have one of them running at any point in time. We mark the need to start all flushes, and the writeback thread will clear it once it has processed the request. Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 71 +++++++++++++++++++--------------------- include/linux/backing-dev-defs.h | 23 +++++++++++++ include/linux/writeback.h | 22 ------------- include/trace/events/writeback.h | 1 - 4 files changed, 57 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 399619c97567..9e24d604c59c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,7 +53,6 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ - unsigned int start_all:1; /* nr_pages == 0 (all) writeback */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -947,8 +946,6 @@ static unsigned long get_nr_dirty_pages(void) static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { - struct wb_writeback_work *work; - if (!wb_has_dirty_io(wb)) return; @@ -958,35 +955,14 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) * high frequency, causing pointless allocations of tons of * work items and keeping the flusher threads busy retrieving * that work. Ensure that we only allow one of them pending and - * inflight at the time. It doesn't matter if we race a little - * bit on this, so use the faster separate test/set bit variants. + * inflight at the time. 
*/ - if (test_bit(WB_start_all, &wb->state)) + if (test_bit(WB_start_all, &wb->state) || + test_and_set_bit(WB_start_all, &wb->state)) return; - set_bit(WB_start_all, &wb->state); - - /* - * This is WB_SYNC_NONE writeback, so if allocation fails just - * wakeup the thread for old dirty data writeback - */ - work = kzalloc(sizeof(*work), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (!work) { - clear_bit(WB_start_all, &wb->state); - trace_writeback_nowork(wb); - wb_wakeup(wb); - return; - } - - work->sync_mode = WB_SYNC_NONE; - work->nr_pages = wb_split_bdi_pages(wb, get_nr_dirty_pages()); - work->range_cyclic = 1; - work->reason = reason; - work->auto_free = 1; - work->start_all = 1; - - wb_queue_work(wb, work); + wb->start_all_reason = reason; + wb_wakeup(wb); } /** @@ -1838,14 +1814,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) list_del_init(&work->list); } spin_unlock_bh(&wb->work_lock); - - /* - * Once we start processing a work item that had !nr_pages, - * clear the wb state bit for that so we can allow more. - */ - if (work && work->start_all) - clear_bit(WB_start_all, &wb->state); - return work; } @@ -1901,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; } +static long wb_check_start_all(struct bdi_writeback *wb) +{ + long nr_pages; + + if (!test_bit(WB_start_all, &wb->state)) + return 0; + + nr_pages = get_nr_dirty_pages(); + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = wb_split_bdi_pages(wb, nr_pages), + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = wb->start_all_reason, + }; + + nr_pages = wb_writeback(wb, &work); + } + + clear_bit(WB_start_all, &wb->state); + return nr_pages; +} + + /* * Retrieve work items and do the writeback they describe */ @@ -1916,6 +1908,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) finish_writeback_work(wb, work); } + /* + * Check for a flush-everything request + */ + wrote += wb_check_start_all(wb); + /* * Check for periodic writeback, kupdated() style */ diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 420de5c7c7f9..b7c7be6f5986 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -44,6 +44,28 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * why some writeback work was initiated + */ +enum wb_reason { + WB_REASON_BACKGROUND, + WB_REASON_VMSCAN, + WB_REASON_SYNC, + WB_REASON_PERIODIC, + WB_REASON_LAPTOP_TIMER, + WB_REASON_FREE_MORE_MEM, + WB_REASON_FS_FREE_SPACE, + /* + * There is no bdi forker thread any more and works are done + * by emergency worker, however, this is TPs userland visible + * and we'll be exposing exactly the same information, + * so it has a mismatch name. + */ + WB_REASON_FORKER_THREAD, + + WB_REASON_MAX, +}; + /* * For cgroup writeback, multiple wb's may map to the same blkcg. 
Those * wb's can operate mostly independently but should share the congested @@ -116,6 +138,7 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + enum wb_reason start_all_reason; spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9c0091678af4..dd1d2c23f743 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -41,28 +41,6 @@ enum writeback_sync_modes { WB_SYNC_ALL, /* Wait on every mapping */ }; -/* - * why some writeback work was initiated - */ -enum wb_reason { - WB_REASON_BACKGROUND, - WB_REASON_VMSCAN, - WB_REASON_SYNC, - WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, - WB_REASON_FREE_MORE_MEM, - WB_REASON_FS_FREE_SPACE, - /* - * There is no bdi forker thread any more and works are done - * by emergency worker, however, this is TPs userland visible - * and we'll be exposing exactly the same information, - * so it has a mismatch name. - */ - WB_REASON_FORKER_THREAD, - - WB_REASON_MAX, -}; - /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 9b57f014d79d..19a0ea08e098 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -286,7 +286,6 @@ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) -DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, -- cgit v1.2.3 From 6cafbe159416822f6d3dfd711bf4c39050c650ba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 20 Jun 2017 10:44:58 -0400 Subject: ftrace: Add a ftrace_free_mem() function for modules to use In order to be able to trace module init functions, the module code needs to tell ftrace what is being freed when the init sections are freed. Use the code that the main init calls to tell ftrace to free the main init sections. This requires passing in a start and end address to free. 
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 14 +++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2e028854bac7..47fc404ad233 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -151,8 +151,10 @@ struct ftrace_ops_hash { }; void ftrace_free_init_mem(void); +void ftrace_free_mem(void *start, void *end); #else static inline void ftrace_free_init_mem(void) { } +static inline void ftrace_free_mem(void *start, void *end) { } #endif /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..84cb5928665a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5868,10 +5868,10 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ -void __init ftrace_free_init_mem(void) +void ftrace_free_mem(void *start_ptr, void *end_ptr) { - unsigned long start = (unsigned long)(&__init_begin); - unsigned long end = (unsigned long)(&__init_end); + unsigned long start = (unsigned long)(start_ptr); + unsigned long end = (unsigned long)(end_ptr); struct ftrace_page **last_pg = &ftrace_pages_start; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -5913,6 +5913,14 @@ void __init ftrace_free_init_mem(void) mutex_unlock(&ftrace_lock); } +void __init ftrace_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + + ftrace_free_mem(start, end); +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; -- cgit v1.2.3 From 5c95878f618cf4f3eb0a4c7ff54a09ca6d4d0426 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:39:45 -0300 Subject: [media] media: rc: gpio-ir-recv: remove gpio_ir_recv_platform_data gpio_ir_recv_platform_data are not used anywhere in kernel tree, so remove it. 
Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 92 +++++++----------------- include/linux/platform_data/media/gpio-ir-recv.h | 23 ------ 2 files changed, 26 insertions(+), 89 deletions(-) delete mode 100644 include/linux/platform_data/media/gpio-ir-recv.h (limited to 'include/linux') diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index b78195f06354..2634b81cbe7e 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -21,7 +21,6 @@ #include #include #include -#include #define GPIO_IR_DEVICE_NAME "gpio_ir_recv" @@ -31,45 +30,6 @@ struct gpio_rc_dev { bool active_low; }; -#ifdef CONFIG_OF -/* - * Translate OpenFirmware node properties into platform_data - */ -static int gpio_ir_recv_get_devtree_pdata(struct device *dev, - struct gpio_ir_recv_platform_data *pdata) -{ - struct device_node *np = dev->of_node; - enum of_gpio_flags flags; - int gpio; - - gpio = of_get_gpio_flags(np, 0, &flags); - if (gpio < 0) { - if (gpio != -EPROBE_DEFER) - dev_err(dev, "Failed to get gpio flags (%d)\n", gpio); - return gpio; - } - - pdata->gpio_nr = gpio; - pdata->active_low = (flags & OF_GPIO_ACTIVE_LOW); - /* probe() takes care of map_name == NULL or allowed_protos == 0 */ - pdata->map_name = of_get_property(np, "linux,rc-map-name", NULL); - pdata->allowed_protos = 0; - - return 0; -} - -static const struct of_device_id gpio_ir_recv_of_match[] = { - { .compatible = "gpio-ir-receiver", }, - { }, -}; -MODULE_DEVICE_TABLE(of, gpio_ir_recv_of_match); - -#else /* !CONFIG_OF */ - -#define gpio_ir_recv_get_devtree_pdata(dev, pdata) (-ENOSYS) - -#endif - static irqreturn_t gpio_ir_recv_irq(int irq, void *dev_id) { struct gpio_rc_dev *gpio_dev = dev_id; @@ -95,32 +55,29 @@ err_get_value: static int gpio_ir_recv_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; struct gpio_rc_dev *gpio_dev; + enum of_gpio_flags flags; struct rc_dev *rcdev; - const struct gpio_ir_recv_platform_data *pdata = dev->platform_data; int rc; - if (pdev->dev.of_node) { - struct gpio_ir_recv_platform_data *dtpdata = - devm_kzalloc(dev, sizeof(*dtpdata), GFP_KERNEL); - if (!dtpdata) - return -ENOMEM; - rc = gpio_ir_recv_get_devtree_pdata(dev, dtpdata); - if (rc) - return rc; - pdata = dtpdata; - } - - if (!pdata) - return -EINVAL; - - if (pdata->gpio_nr < 0) - return -EINVAL; + if (!np) + return -ENODEV; gpio_dev = devm_kzalloc(dev, sizeof(*gpio_dev), GFP_KERNEL); if (!gpio_dev) return -ENOMEM; + rc = of_get_gpio_flags(np, 0, &flags); + if (rc < 0) { + if (rc != -EPROBE_DEFER) + dev_err(dev, "Failed to get gpio flags (%d)\n", rc); + return rc; + } + + gpio_dev->gpio_nr = rc; + gpio_dev->active_low = (flags & OF_GPIO_ACTIVE_LOW); + rcdev = devm_rc_allocate_device(dev, RC_DRIVER_IR_RAW); if (!rcdev) return -ENOMEM; @@ -137,17 +94,14 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) rcdev->min_timeout = 1; rcdev->timeout = IR_DEFAULT_TIMEOUT; rcdev->max_timeout = 10 * IR_DEFAULT_TIMEOUT; - if (pdata->allowed_protos) - rcdev->allowed_protocols = pdata->allowed_protos; - else - rcdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; - rcdev->map_name = pdata->map_name ?: RC_MAP_EMPTY; + rcdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; + rcdev->map_name = of_get_property(np, "linux,rc-map-name", NULL); + if (!rcdev->map_name) + rcdev->map_name = RC_MAP_EMPTY; gpio_dev->rcdev = rcdev; - gpio_dev->gpio_nr = pdata->gpio_nr; - 
gpio_dev->active_low = pdata->active_low; - rc = devm_gpio_request_one(dev, pdata->gpio_nr, GPIOF_DIR_IN, + rc = devm_gpio_request_one(dev, gpio_dev->gpio_nr, GPIOF_DIR_IN, "gpio-ir-recv"); if (rc < 0) return rc; @@ -160,7 +114,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) platform_set_drvdata(pdev, gpio_dev); - return devm_request_irq(dev, gpio_to_irq(pdata->gpio_nr), + return devm_request_irq(dev, gpio_to_irq(gpio_dev->gpio_nr), gpio_ir_recv_irq, IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, "gpio-ir-recv-irq", gpio_dev); @@ -199,6 +153,12 @@ static const struct dev_pm_ops gpio_ir_recv_pm_ops = { }; #endif +static const struct of_device_id gpio_ir_recv_of_match[] = { + { .compatible = "gpio-ir-receiver", }, + { }, +}; +MODULE_DEVICE_TABLE(of, gpio_ir_recv_of_match); + static struct platform_driver gpio_ir_recv_driver = { .probe = gpio_ir_recv_probe, .driver = { diff --git a/include/linux/platform_data/media/gpio-ir-recv.h b/include/linux/platform_data/media/gpio-ir-recv.h deleted file mode 100644 index 0c298f569d5a..000000000000 --- a/include/linux/platform_data/media/gpio-ir-recv.h +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2012, Code Aurora Forum. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __GPIO_IR_RECV_H__ -#define __GPIO_IR_RECV_H__ - -struct gpio_ir_recv_platform_data { - int gpio_nr; - bool active_low; - u64 allowed_protos; - const char *map_name; -}; - -#endif /* __GPIO_IR_RECV_H__ */ -- cgit v1.2.3 From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:21 -0700 Subject: bpf: multi program support for cgroup+bpf introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple bpf programs to a cgroup. The difference between three possible flags for BPF_PROG_ATTACH command: - NONE(default): No further bpf programs allowed in the subtree. - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, the program in this cgroup yields to sub-cgroup program. - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, that cgroup program gets run in addition to the program in this cgroup. NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't change their behavior. It only clarifies the semantics in relation to new flag. Only one program is allowed to be attached to a cgroup with NONE or BPF_F_ALLOW_OVERRIDE flag. Multiple programs are allowed to be attached to a cgroup with BPF_F_ALLOW_MULTI flag. They are executed in FIFO order (those that were attached first, run first) The programs of sub-cgroup are executed first, then programs of this cgroup and then programs of parent cgroup. All eligible programs are executed regardless of return code from earlier programs. To allow efficient execution of multiple programs attached to a cgroup and to avoid penalizing cgroups without any programs attached introduce 'struct bpf_prog_array' which is RCU protected array of pointers to bpf programs. 
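
A hedged sketch of the two new helpers in isolation (the xyz_* names are invented; refcounting on the programs and error paths are elided): build a one-element RCU-managed array and run it the way BPF_PROG_RUN_ARRAY expects, i.e. a NULL-terminated progs[] walked under rcu_read_lock() with the u32 return codes AND-ed together.

#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>

static struct bpf_prog_array __rcu *xyz_array;

static int xyz_attach_one(struct bpf_prog *prog)
{
	struct bpf_prog_array __rcu *arr;

	/* the allocation leaves a trailing NULL slot that ends the walk */
	arr = bpf_prog_array_alloc(1, GFP_KERNEL);
	if (!arr)
		return -ENOMEM;
	rcu_dereference_protected(arr, 1)->progs[0] = prog;

	/* swap in the new array; old one is freed after a grace period */
	bpf_prog_array_free(xchg(&xyz_array, arr));
	return 0;
}

static bool xyz_allowed(struct sk_buff *skb)
{
	/* result is the AND of all program verdicts: any 0 blocks */
	return BPF_PROG_RUN_ARRAY(xyz_array, skb, BPF_PROG_RUN) == 1;
}
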
Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 46 +++-- include/linux/bpf.h | 32 ++++ include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 42 +++- kernel/bpf/cgroup.c | 467 ++++++++++++++++++++++++++++++++------------- kernel/bpf/core.c | 31 +++ kernel/bpf/syscall.c | 37 ++-- kernel/cgroup/cgroup.c | 28 ++- 8 files changed, 516 insertions(+), 169 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d41d40ac3efd..102e56fbb6de 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -14,27 +14,42 @@ struct bpf_sock_ops_kern; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_prog_list { + struct list_head node; + struct bpf_prog *prog; +}; + +struct bpf_prog_array; + struct cgroup_bpf { - /* - * Store two sets of bpf_prog pointers, one for programs that are - * pinned directly to this cgroup, and one for those that are effective - * when this cgroup is accessed. + /* array of effective progs in this cgroup */ + struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + + /* attached progs to this cgroup and attach flags + * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will + * have either zero or one element + * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; - struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE]; - bool disallow_override[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_BPF_ATTACH_TYPE]; + u32 flags[MAX_BPF_ATTACH_TYPE]; + + /* temp storage for effective prog array used by prog_attach/detach */ + struct bpf_prog_array __rcu *inactive; }; void cgroup_bpf_put(struct cgroup *cgrp); -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); +int cgroup_bpf_inherit(struct cgroup *cgrp); -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool overridable); +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); -/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable); +/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} -static inline void cgroup_bpf_inherit(struct cgroup *cgrp, - struct cgroup *parent) {} +static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 252f4bc9eb25..a6964b75f070 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -241,6 +241,38 @@ int 
bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +/* an array of programs to be executed under rcu_lock. + * + * Typical usage: + * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN); + * + * the structure returned by bpf_prog_array_alloc() should be populated + * with program pointers and the last pointer must be NULL. + * The user has to keep refcnt on the program and make sure the program + * is removed from the array before bpf_prog_put(). + * The 'struct bpf_prog_array *' should only be replaced with xchg() + * since other cpus are walking the array of pointers in parallel. + */ +struct bpf_prog_array { + struct rcu_head rcu; + struct bpf_prog *progs[0]; +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); + +#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog **_prog; \ + u32 _ret = 1; \ + rcu_read_lock(); \ + _prog = rcu_dereference(array)->progs; \ + for (; *_prog; _prog++) \ + _ret &= func(*_prog, ctx); \ + rcu_read_unlock(); \ + _ret; \ + }) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/linux/filter.h b/include/linux/filter.h index 911d454af107..2d2db394b0ca 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -481,7 +481,7 @@ struct sk_filter { struct bpf_prog *prog; }; -#define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) +#define BPF_PROG_RUN(filter, ctx) (*(filter)->bpf_func)(ctx, (filter)->insnsi) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6d2137b4cf38..762f74bc6c47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,11 +143,47 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command - * to the given target_fd cgroup the descendent cgroup will be able to - * override effective bpf program that was inherited from this cgroup +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. 
+ * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..6b7500bbdb53 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp) { unsigned int type; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { - struct bpf_prog *prog = cgrp->bpf.prog[type]; - - if (prog) { - bpf_prog_put(prog); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog_list *pl, *tmp; + + list_for_each_entry_safe(pl, tmp, progs, node) { + list_del(&pl->node); + bpf_prog_put(pl->prog); + kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_prog_array_free(cgrp->bpf.effective[type]); + } +} + +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + list_for_each_entry(pl, head, node) { + if (!pl->prog) + continue; + cnt++; } + return cnt; +} + +/* if parent has non-overridable prog attached, + * disallow attaching new programs to the descendent cgroup. + * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, + enum bpf_attach_type type, + u32 new_flags) +{ + struct cgroup *p; + + p = cgroup_parent(cgrp); + if (!p) + return true; + do { + u32 flags = p->bpf.flags[type]; + u32 cnt; + + if (flags & BPF_F_ALLOW_MULTI) + return true; + cnt = prog_list_length(&p->bpf.progs[type]); + WARN_ON_ONCE(cnt > 1); + if (cnt == 1) + return !!(flags & BPF_F_ALLOW_OVERRIDE); + p = cgroup_parent(p); + } while (p); + return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. 
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+				   enum bpf_attach_type type,
+				   struct bpf_prog_array __rcu **array)
+{
+	struct bpf_prog_array __rcu *progs;
+	struct bpf_prog_list *pl;
+	struct cgroup *p = cgrp;
+	int cnt = 0;
+
+	/* count number of effective programs by walking parents */
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			cnt += prog_list_length(&p->bpf.progs[type]);
+		p = cgroup_parent(p);
+	} while (p);
+
+	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+	if (!progs)
+		return -ENOMEM;
+
+	/* populate the array with effective progs */
+	cnt = 0;
+	p = cgrp;
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			list_for_each_entry(pl,
+					    &p->bpf.progs[type], node) {
+				if (!pl->prog)
+					continue;
+				rcu_dereference_protected(progs, 1)->
+					progs[cnt++] = pl->prog;
+			}
+		p = cgroup_parent(p);
+	} while (p);
+
+	*array = progs;
+	return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+				     enum bpf_attach_type type,
+				     struct bpf_prog_array __rcu *array)
+{
+	struct bpf_prog_array __rcu *old_array;
+
+	old_array = xchg(&cgrp->bpf.effective[type], array);
+	/* free prog array after grace period, since __cgroup_bpf_run_*()
+	 * might be still walking the array
+	 */
+	bpf_prog_array_free(old_array);
+}

 /**
  * cgroup_bpf_inherit() - inherit effective programs from parent
  * @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
  */
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
 {
-	unsigned int type;
+/* has to use macro instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define NR ARRAY_SIZE(cgrp->bpf.effective)
+	struct bpf_prog_array __rcu *arrays[NR] = {};
+	int i;

-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
-		struct bpf_prog *e;
+	for (i = 0; i < NR; i++)
+		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

-		e = rcu_dereference_protected(parent->bpf.effective[type],
-					      lockdep_is_held(&cgroup_mutex));
-		rcu_assign_pointer(cgrp->bpf.effective[type], e);
-		cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
-	}
+	for (i = 0; i < NR; i++)
+		if (compute_effective_progs(cgrp, i, &arrays[i]))
+			goto cleanup;
+
+	for (i = 0; i < NR; i++)
+		activate_effective_progs(cgrp, i, arrays[i]);
+
+	return 0;
+cleanup:
+	for (i = 0; i < NR; i++)
+		bpf_prog_array_free(arrays[i]);
+	return -ENOMEM;
 }

+#define BPF_CGROUP_MAX_PROGS 64
+
 /**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
  *                         propagate the change to descendants
  * @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct bpf_prog *old_prog, *effective = NULL; - struct cgroup_subsys_state *pos; - bool overridable = true; - - if (parent) { - overridable = !parent->bpf.disallow_override[type]; - effective = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - } - - if (prog && effective && !overridable) - /* if parent has non-overridable prog attached, disallow - * attaching new programs to descendent cgroup - */ + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + bool pl_was_allocated; + u32 old_flags; + int err; + + if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + /* invalid combination */ + return -EINVAL; + + if (!hierarchy_allows_attach(cgrp, type, flags)) return -EPERM; - if (prog && effective && overridable != new_overridable) - /* if parent has overridable prog attached, only - * allow overridable programs in descendent cgroup + if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + /* Disallow attaching non-overridable on top + * of existing overridable in this cgroup. 
+ * Disallow attaching multi-prog if overridable or none */ return -EPERM; - old_prog = cgrp->bpf.prog[type]; - - if (prog) { - overridable = new_overridable; - effective = prog; - if (old_prog && - cgrp->bpf.disallow_override[type] == new_overridable) - /* disallow attaching non-overridable on top - * of existing overridable in this cgroup - * and vice versa - */ - return -EPERM; + if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + return -E2BIG; + + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + pl->prog = prog; + list_add_tail(&pl->node, progs); + } else { + if (list_empty(progs)) { + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + list_add_tail(&pl->node, progs); + } else { + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl_was_allocated = false; + } + pl->prog = prog; } - if (!prog && !old_prog) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; + old_flags = cgrp->bpf.flags[type]; + cgrp->bpf.flags[type] = flags; - cgrp->bpf.prog[type] = prog; + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); - css_for_each_descendant_pre(pos, &cgrp->self) { - struct cgroup *desc = container_of(pos, struct cgroup, self); - - /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) { - pos = css_rightmost_descendant(pos); - } else { - rcu_assign_pointer(desc->bpf.effective[type], - effective); - desc->bpf.disallow_override[type] = !overridable; - } + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; } - if (prog) - static_branch_inc(&cgroup_bpf_enabled_key); + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + static_branch_inc(&cgroup_bpf_enabled_key); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); + } + return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. 
+ */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 unused_flags) +{ + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + int err; + + if (flags & BPF_F_ALLOW_MULTI) { + if (!prog) + /* to detach MULTI prog the user has to specify valid FD + * of the program to be detached + */ + return -EINVAL; + } else { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + } + + if (flags & BPF_F_ALLOW_MULTI) { + /* find the prog and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog != prog) + continue; + old_prog = prog; + /* mark it deleted, so it's ignored while + * recomputing effective + */ + pl->prog = NULL; + break; + } + if (!old_prog) + return -ENOENT; + } else { + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) + */ + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl->prog = NULL; + } + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* now can actually delete it from this cgroup list */ + list_del(&pl->node); + kfree(pl); + if (list_empty(progs)) + /* last program was detached, reset flags to zero */ + cgrp->bpf.flags[type] = 0; + + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and restore back old_prog */ + pl->prog = old_prog; + return err; } /** @@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type) { - struct bpf_prog *prog; + unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk; struct cgroup *cgrp; - int ret = 0; + int ret; if (!sk || !sk_fullsock(sk)) return 0; - if (sk->sk_family != AF_INET && - sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) { - unsigned int offset = skb->data - skb_network_header(skb); - struct sock *save_sk = skb->sk; - - skb->sk = sk; - __skb_push(skb, offset); - ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; - __skb_pull(skb, offset); - skb->sk = save_sk; - } - - rcu_read_unlock(); - - return ret; + save_sk = skb->sk; + skb->sk = sk; + __skb_push(skb, offset); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + bpf_prog_run_save_cb); + __skb_pull(skb, offset); + skb->sk = save_sk; + return ret == 1 ? 
0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 
@@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 enum bpf_attach_type type)
 {
 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
-
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+ int ret;
 
- rcu_read_unlock();
-
- return ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 enum bpf_attach_type type)
 {
 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
-
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
-
- rcu_read_unlock();
+ int ret;
 
- return ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+ BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..6b49e1991ae7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified by the caller of bpf_prog_array_alloc()
+ * (since the caller requested prog_cnt == 0);
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+ struct bpf_prog_array hdr;
+ struct bpf_prog *null_prog;
+} empty_prog_array = {
+ .null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+ if (prog_cnt)
+ return kzalloc(sizeof(struct bpf_prog_array) +
+ sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ flags);
+
+ return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+ if (!progs ||
+ progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ return;
+ kfree_rcu(progs, rcu);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b927da66f653..51bee695d32c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
 return 0;
 }
 
+#define BPF_F_ATTACH_MASK \
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
 enum bpf_prog_type ptype;
@@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 if (CHECK_ATTR(BPF_PROG_ATTACH))
 return -EINVAL;
 
- if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
 return -EINVAL;
 
 switch (attr->attach_type) {
@@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 return PTR_ERR(cgrp);
 }
 
- ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
- attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
 if (ret)
 bpf_prog_put(prog);
 cgroup_put(cgrp);
@@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
+ enum bpf_prog_type
ptype;
+ struct bpf_prog *prog;
 struct cgroup *cgrp;
 int ret;
 
@@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 switch (attr->attach_type) {
 case BPF_CGROUP_INET_INGRESS:
 case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
 case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
 case BPF_CGROUP_SOCK_OPS:
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
-
- ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
- cgroup_put(cgrp);
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
 break;
 case BPF_SK_SKB_STREAM_PARSER:
 case BPF_SK_SKB_STREAM_VERDICT:
- ret = sockmap_get_from_fd(attr, false);
- break;
+ return sockmap_get_from_fd(attr, false);
 default:
 return -EINVAL;
 }
 
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
 return ret;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..57eb866ae78d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 if (ret)
 goto destroy_root;
 
+ ret = cgroup_bpf_inherit(root_cgrp);
+ WARN_ON_ONCE(ret);
+
 trace_cgroup_setup_root(root);
 
 /*
@@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 cgrp->self.parent = &parent->self;
 cgrp->root = root;
 cgrp->level = level;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_idr_free;
 
 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 if (!cgroup_on_dfl(cgrp))
 cgrp->subtree_control = cgroup_control(cgrp);
 
- if (parent)
- cgroup_bpf_inherit(cgrp, parent);
-
 cgroup_propagate_control(cgrp);
 
 return cgrp;
 
+out_idr_free:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
 out_cancel_ref:
 percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 #endif /* CONFIG_SOCK_CGROUP_DATA */
 
 #ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
 {
- struct cgroup *parent = cgroup_parent(cgrp);
 int ret;
 
 mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
 mutex_unlock(&cgroup_mutex);
 return ret;
 }
--
cgit v1.2.3

From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 2 Oct 2017 22:50:22 -0700
Subject: bpf: introduce BPF_PROG_QUERY command

Introduce the BPF_PROG_QUERY command to retrieve either the set of programs attached to a given cgroup or the set of effective programs that will execute for events within that cgroup.

Signed-off-by: Alexei Starovoitov
Acked-by: Daniel Borkmann
Acked-by: Martin KaFai Lau for cgroup bits
Acked-by: Tejun Heo
Signed-off-by:
David S. Miller --- include/linux/bpf-cgroup.h | 4 ++++ include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/cgroup.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 10 ++++++++++ 7 files changed, 148 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 102e56fbb6de..359b6f5d3d90 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6964b75f070..a67daea731ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -260,6 +260,9 @@ struct bpf_prog_array { struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt); #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 762f74bc6c47..cb2b9f95160a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, }; enum bpf_map_type { @@ -211,6 +212,9 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + #define BPF_OBJ_NAME_LEN 16U union bpf_attr { @@ -289,6 +293,15 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + } query; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6b7500bbdb53..e88abc0865d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -384,6 +384,52 @@ cleanup: return err; } +/* Must be called with cgroup_mutex held to avoid races. 
*/ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6b49e1991ae7..eba966c09053 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 51bee695d32c..0048cb24ba7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1568,6 +1599,9 @@ 
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 case BPF_PROG_DETACH:
 err = bpf_prog_detach(&attr);
 break;
+ case BPF_PROG_QUERY:
+ err = bpf_prog_query(&attr, uattr);
+ break;
 #endif
 case BPF_PROG_TEST_RUN:
 err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 57eb866ae78d..269512b94a94 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 mutex_unlock(&cgroup_mutex);
 return ret;
 }
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
--
cgit v1.2.3

From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel
Date: Tue, 3 Oct 2017 13:53:23 +0200
Subject: dev: advertise the new nsid when the netns iface changes

x-netns interfaces are bound to two netns: the link netns and the upper netns. Usually, interfaces of this kind are created in the link netns and then moved to the upper netns; in the end, the interface is visible only in the upper netns. The link nsid is advertised via netlink in the upper netns, so the user always knows where the link part is. There is no such mechanism in the link netns: when the interface is moved to another netns, the user cannot "follow" it.

This patch adds a new netlink attribute that makes it possible to follow an interface that moves to another netns. When the interface is unregistered, the new nsid is advertised. If the interface is an x-netns interface (i.e. rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed.

CC: Jason A. Donenfeld
Signed-off-by: Nicolas Dichtel
Signed-off-by: David S.
Miller --- include/linux/rtnetlink.h | 4 +++- include/uapi/linux/if_link.h | 1 + net/core/dev.c | 11 ++++++++--- net/core/rtnetlink.c | 31 ++++++++++++++++++++++--------- 4 files changed, 34 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index dea59c8eec54..1251638e60d3 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -17,9 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, - gfp_t flags); + gfp_t flags, int *new_nsid); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ea87bd708ee9..cd580fc0e58f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -158,6 +158,7 @@ enum { IFLA_PAD, IFLA_XDP, IFLA_EVENT, + IFLA_NEW_NETNSID, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 454f05441546..bffc75429184 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -7204,7 +7205,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL); + GFP_KERNEL, NULL); /* * Flush the unicast and multicast chains @@ -8291,7 +8292,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err; + int err, new_nsid; ASSERT_RTNL(); @@ -8347,7 +8348,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) + new_nsid = peernet2id_alloc(dev_net(dev), net); + else + new_nsid = peernet2id(dev_net(dev), net); + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); /* * Flush the unicast and multicast chains diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3fb1ca33cba4..1ee98b1369d5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -915,6 +915,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1384,7 +1385,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event) + u32 event, int *new_nsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1472,6 +1473,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_fill_link_netnsid(skb, dev)) goto nla_put_failure; + if (new_nsid && + nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) + goto nla_put_failure; + if (!(af_spec = nla_nest_start(skb, 
IFLA_AF_SPEC)))
 goto nla_put_failure;
 
@@ -1701,7 +1706,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 NETLINK_CB(cb->skb).portid,
 cb->nlh->nlmsg_seq, 0,
 flags,
- ext_filter_mask, 0);
+ ext_filter_mask, 0, NULL);
 
 if (err < 0) {
 if (likely(skb->len))
@@ -2808,7 +2813,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 return -ENOBUFS;
 
 err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0);
+ nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL);
 if (err < 0) {
 /* -EMSGSIZE implies BUG in if_nlmsg_size */
 WARN_ON(err == -EMSGSIZE);
@@ -2893,7 +2898,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 unsigned int change,
- u32 event, gfp_t flags)
+ u32 event, gfp_t flags, int *new_nsid)
 {
 struct net *net = dev_net(dev);
 struct sk_buff *skb;
@@ -2904,7 +2909,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 if (skb == NULL)
 goto errout;
 
- err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event);
+ err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event,
+ new_nsid);
 if (err < 0) {
 /* -EMSGSIZE implies BUG in if_nlmsg_size() */
 WARN_ON(err == -EMSGSIZE);
@@ -2927,14 +2933,14 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
 
 static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 unsigned int change, u32 event,
- gfp_t flags)
+ gfp_t flags, int *new_nsid)
 {
 struct sk_buff *skb;
 
 if (dev->reg_state != NETREG_REGISTERED)
 return;
 
- skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags);
+ skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid);
 if (skb)
 rtmsg_ifinfo_send(skb, dev, flags);
 }
@@ -2942,10 +2948,17 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
 gfp_t flags)
 {
- rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags);
+ rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL);
 }
 EXPORT_SYMBOL(rtmsg_ifinfo);
 
+void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags, int *new_nsid)
+{
+ rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+ new_nsid);
+}
+
 static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
 struct net_device *dev,
 u8 *addr, u16 vid, u32 pid, u32 seq,
@@ -4321,7 +4334,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 case NETDEV_RESEND_IGMP:
 case NETDEV_CHANGEINFODATA:
 rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
- GFP_KERNEL);
+ GFP_KERNEL, NULL);
 break;
 default:
 break;
--
cgit v1.2.3

From 51d0c04795a4b5d9a188336884887a9d394a94b0 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 4 Oct 2017 17:48:45 -0700
Subject: net: Add extack to netdev_notifier_info

Add netlink_ext_ack to netdev_notifier_info to allow notifier handlers to return errors to userspace. Clean up the initialization in dev.c such that extack is easily added in subsequent patches where relevant. Specifically, remove the init call in call_netdevice_notifiers_info and have callers initialize on stack when info is declared.

Signed-off-by: David Ahern
Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 10 +++++- net/core/dev.c | 79 ++++++++++++++++++++++++++++------------------- 2 files changed, 56 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d04424cfffba..05fcaba4b0d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2309,7 +2309,8 @@ int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); struct netdev_notifier_info { - struct net_device *dev; + struct net_device *dev; + struct netlink_ext_ack *extack; }; struct netdev_notifier_change_info { @@ -2334,6 +2335,7 @@ static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { info->dev = dev; + info->extack = NULL; } static inline struct net_device * @@ -2342,6 +2344,12 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) return info->dev; } +static inline struct netlink_ext_ack * +netdev_notifier_info_to_extack(const struct netdev_notifier_info *info) +{ + return info->extack; +} + int call_netdevice_notifiers(unsigned long val, struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index bffc75429184..e27a6bc0ac4d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -163,7 +163,6 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info); static struct napi_struct *napi_by_id(unsigned int napi_id); @@ -1339,10 +1338,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; - change_info.flags_changed = 0; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } @@ -1563,9 +1563,10 @@ EXPORT_SYMBOL(dev_disable_lro); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - netdev_notifier_info_init(&info, dev); return nb->notifier_call(nb, val, &info); } @@ -1690,11 +1691,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); */ static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info) { ASSERT_RTNL(); - netdev_notifier_info_init(info, dev); return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -1709,9 +1708,11 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - return call_netdevice_notifiers_info(val, dev, &info); + return call_netdevice_notifiers_info(val, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -6278,7 +6279,15 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = 
upper_info, + }; int ret = 0; ASSERT_RTNL(); @@ -6296,12 +6305,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - changeupper_info.upper_dev = upper_dev; - changeupper_info.master = master; - changeupper_info.linking = true; - changeupper_info.upper_info = upper_info; - - ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6312,7 +6316,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6376,20 +6380,24 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; ASSERT_RTNL(); - changeupper_info.upper_dev = upper_dev; changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; - changeupper_info.linking = false; - call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -6405,11 +6413,13 @@ EXPORT_SYMBOL(netdev_upper_dev_unlink); void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { - struct netdev_notifier_bonding_info info; + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); - call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); @@ -6535,11 +6545,13 @@ EXPORT_SYMBOL(dev_get_nest_level); void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { - struct netdev_notifier_changelowerstate_info changelowerstate_info; + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; - call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); @@ -6830,11 +6842,14 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { - struct netdev_notifier_change_info change_info; - - change_info.flags_changed = changes; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, - &change_info.info); + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; + + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } -- cgit v1.2.3 From 33eaf2a6eb48ebf00374aaaf4b1b43f9950dcbe4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:46 -0700 Subject: net: Add 
extack to ndo_add_slave Pass extack to do_set_master and down to ndo_add_slave Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 5 +++-- drivers/net/bonding/bond_options.c | 2 +- drivers/net/team/team.c | 3 ++- drivers/net/vrf.c | 3 ++- include/linux/netdevice.h | 3 ++- include/net/bonding.h | 3 ++- net/batman-adv/soft-interface.c | 3 ++- net/bridge/br_device.c | 3 ++- net/core/rtnetlink.c | 10 ++++++---- 9 files changed, 22 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b19dc033fb36..78feb94a36db 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1328,7 +1328,8 @@ void bond_lower_state_changed(struct slave *slave) } /* enslave device to bond device */ -int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) +int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); const struct net_device_ops *slave_ops = slave_dev->netdev_ops; @@ -3492,7 +3493,7 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd switch (cmd) { case BOND_ENSLAVE_OLD: case SIOCBONDENSLAVE: - res = bond_enslave(bond_dev, slave_dev); + res = bond_enslave(bond_dev, slave_dev, NULL); break; case BOND_RELEASE_OLD: case SIOCBONDRELEASE: diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 5931aa2fe997..8a9b085c2a98 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1383,7 +1383,7 @@ static int bond_option_slaves_set(struct bonding *bond, switch (command[0]) { case '+': netdev_dbg(bond->dev, "Adding slave %s\n", dev->name); - ret = bond_enslave(bond->dev, dev); + ret = bond_enslave(bond->dev, dev, NULL); break; case '-': diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index ae53e899259f..4359d45aa131 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1914,7 +1914,8 @@ static int team_netpoll_setup(struct net_device *dev, } #endif -static int team_add_slave(struct net_device *dev, struct net_device *port_dev) +static int team_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { struct team *team = netdev_priv(dev); int err; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index cc18b7b11612..4a082ef53533 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -788,7 +788,8 @@ err: return ret; } -static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) +static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) return -EINVAL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 05fcaba4b0d9..368a5064a487 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1246,7 +1246,8 @@ struct net_device_ops { u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, - struct net_device *slave_dev); + struct net_device *slave_dev, + struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); netdev_features_t (*ndo_fix_features)(struct net_device *dev, diff --git a/include/net/bonding.h b/include/net/bonding.h index b2e68657a216..2860cc66c2bb 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -596,7 
+596,8 @@ void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); -int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev); +int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c2c986746d0b..e7d5fbb6ad53 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -867,7 +867,8 @@ free_bat_counters: * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, - struct net_device *slave_dev) + struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; struct net *net = dev_net(dev); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f6b6a92f1c48..cb0131d70ab1 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -320,7 +320,8 @@ void br_netpoll_disable(struct net_bridge_port *p) #endif -static int br_add_slave(struct net_device *dev, struct net_device *slave_dev) +static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1ee98b1369d5..c5ee429bcce9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1957,7 +1957,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } -static int do_set_master(struct net_device *dev, int ifindex) +static int do_set_master(struct net_device *dev, int ifindex, + struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; @@ -1982,7 +1983,7 @@ static int do_set_master(struct net_device *dev, int ifindex) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(upper_dev, dev); + err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { @@ -2115,7 +2116,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2753,7 +2754,8 @@ replay: goto out_unregister; } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), + extack); if (err) goto out_unregister; } -- cgit v1.2.3 From 42ab19ee90292993370a30ad242599d75a3b749e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:47 -0700 Subject: net: Add extack to upper device linking Add extack arg to netdev_upper_dev_link and netdev_master_upper_dev_link Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 2 +- drivers/net/macsec.c | 2 +- drivers/net/macvlan.c | 7 ++++--- drivers/net/macvtap.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/usb/qmi_wwan.c | 2 +- drivers/net/vrf.c | 7 ++++--- include/linux/if_macvlan.h | 3 ++- include/linux/netdevice.h | 6 ++++-- net/8021q/vlan.c | 6 +++--- net/8021q/vlan.h | 2 +- net/8021q/vlan_netlink.c | 2 +- net/batman-adv/hard-interface.c | 2 +- net/bridge/br_if.c | 2 +- net/core/dev.c | 15 ++++++++++----- net/openvswitch/vport-netdev.c | 3 ++- 19 files changed, 44 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 78feb94a36db..bc92307c2082 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1217,14 +1217,15 @@ static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond) } } -static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave) +static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, + struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; int err; lag_upper_info.tx_type = bond_lag_tx_type(bond); err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave, - &lag_upper_info); + &lag_upper_info, extack); if (err) return err; rtmsg_ifinfo(RTM_NEWLINK, slave->dev, IFF_SLAVE, GFP_KERNEL); @@ -1710,7 +1711,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, goto err_detach; } - res = bond_master_upper_dev_link(bond, new_slave); + res = bond_master_upper_dev_link(bond, new_slave, extack); if (res) { netdev_dbg(bond_dev, "Error %d calling bond_master_upper_dev_link\n", res); goto err_unregister; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 98f22551eb45..1af326a60cbb 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -178,7 +178,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, if (err) goto err1; - err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL, extack); if (err) goto err2; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index f300ae61c6c6..dfb986421ec6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1748,7 +1748,7 @@ static int netvsc_vf_join(struct net_device *vf_netdev, goto rx_handler_failed; } - ret = netdev_upper_dev_link(vf_netdev, ndev); + ret = netdev_upper_dev_link(vf_netdev, ndev, NULL); if (ret != 0) { netdev_err(vf_netdev, "can not set master device %s (err = %d)\n", diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index c74893c1e620..57c3856bab05 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -584,7 +584,7 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, if (err < 0) goto remove_ida; - err = netdev_upper_dev_link(phy_dev, dev); + err = netdev_upper_dev_link(phy_dev, dev, extack); if (err) { goto unregister_netdev; } diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 98e4deaa3a6a..ccbe4eaffe4d 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3244,7 +3244,7 @@ static int 
macsec_newlink(struct net *net, struct net_device *dev, &macsec_netdev_addr_lock_key, macsec_get_nest_level(dev)); - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 1ffe77e95d46..858bd66511a2 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1344,7 +1344,8 @@ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, } int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; @@ -1433,7 +1434,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; - err = netdev_upper_dev_link(lowerdev, dev); + err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; @@ -1456,7 +1457,7 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - return macvlan_common_newlink(src_net, dev, tb, data); + return macvlan_common_newlink(src_net, dev, tb, data, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index c2d0ea2fb019..f62aea2fcfa9 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -105,7 +105,7 @@ static int macvtap_newlink(struct net *src_net, struct net_device *dev, /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ - err = macvlan_common_newlink(src_net, dev, tb, data); + err = macvlan_common_newlink(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 4359d45aa131..a468439969df 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1112,7 +1112,7 @@ static int team_upper_dev_link(struct team *team, struct team_port *port) lag_upper_info.tx_type = team->mode->lag_tx_type; err = netdev_master_upper_dev_link(port->dev, team->dev, NULL, - &lag_upper_info); + &lag_upper_info, NULL); if (err) return err; port->dev->priv_flags |= IFF_TEAM_PORT; diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 8c3733608271..db7279d5b250 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -221,7 +221,7 @@ static int qmimux_register_device(struct net_device *real_dev, u8 mux_id) /* Account for reference in struct qmimux_priv_priv */ dev_hold(real_dev); - err = netdev_upper_dev_link(real_dev, new_dev); + err = netdev_upper_dev_link(real_dev, new_dev, NULL); if (err) goto out_unregister_netdev; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 4a082ef53533..77d0655a0250 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -764,7 +764,8 @@ static void cycle_netdev(struct net_device *dev) } } -static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) +static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { int ret; @@ -775,7 +776,7 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) return -EOPNOTSUPP; port_dev->priv_flags |= IFF_L3MDEV_SLAVE; - ret = netdev_master_upper_dev_link(port_dev, dev, 
NULL, NULL); + ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; @@ -794,7 +795,7 @@ static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) return -EINVAL; - return do_vrf_add_slave(dev, port_dev); + return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index c9ec1343d187..10e319f41fb1 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -72,7 +72,8 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, extern void macvlan_common_setup(struct net_device *dev); extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]); + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack); extern void macvlan_count_rx(const struct macvlan_dev *vlan, unsigned int len, bool success, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 368a5064a487..31bb3010c69b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3919,10 +3919,12 @@ void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); -int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); +int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, + struct netlink_ext_ack *extack); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info); + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9649579b5b9f..71c3e045505b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -138,7 +138,7 @@ int vlan_check_real_dev(struct net_device *real_dev, return 0; } -int register_vlan_dev(struct net_device *dev) +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; @@ -174,7 +174,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; @@ -270,7 +270,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; - err = register_vlan_dev(new_dev); + err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index df8bd65dd370..94f8eed9f9b3 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -107,7 +107,7 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id); void vlan_setup(struct net_device *dev); -int register_vlan_dev(struct net_device *dev); +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device 
*dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 5e831de3103e..6e7c5a6a7930 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -160,7 +160,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; - return register_vlan_dev(dev); + return register_vlan_dev(dev, extack); } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e348f76ea8c1..f7b413b9297e 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -738,7 +738,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL, NULL); + soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3aef22931ab..0a3fd727048d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -540,7 +540,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, NULL); if (err) goto err5; diff --git a/net/core/dev.c b/net/core/dev.c index e27a6bc0ac4d..fcddccb6be41 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6277,11 +6277,13 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, + .extack = extack, }, .upper_dev = upper_dev, .master = master, @@ -6341,9 +6343,11 @@ rollback: * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -6362,10 +6366,11 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { return __netdev_upper_dev_link(dev, upper_dev, true, - upper_priv, upper_info); + upper_priv, upper_info, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 0389398fa4ab..2e5e7a41d8ef 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -108,7 +108,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL, NULL); + get_dpdev(vport->dp), + NULL, NULL, NULL); if (err) goto error_unlock; -- cgit v1.2.3 From 29cc309d8bf19a36c5196bf626662319af6e3c0b Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Tue, 22 Aug 2017 09:10:11 +0800 Subject: HID: hid-multitouch: forward MSC_TIMESTAMP Computes and forwards the device timestamp according to the specification. 
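The rollover arithmetic the rest of this message describes can be modeled in a few lines of standalone C. The sketch below mirrors the patch's mt_compute_timestamp() under simplifying assumptions: model_timestamp() and its elapsed_us argument are illustrative stand-ins for the driver function and its jiffies-based gap measurement, static locals replace the mt_device fields, and 65535 stands in for the HID field's logical_maximum.

#include <stdio.h>

#define MAX_TIMESTAMP_INTERVAL 1000000	/* microseconds, as in the patch */

/* value is the device scan time in 100us units; elapsed_us is the local
 * time measured since the previous event */
static int model_timestamp(int value, long elapsed_us, int logical_maximum)
{
	static int dev_time, timestamp;
	long delta = value - dev_time;

	dev_time = value;

	if (delta < 0)			/* the 16-bit device counter wrapped */
		delta += logical_maximum;

	delta *= 100;			/* 100us units -> microseconds */

	if (elapsed_us > MAX_TIMESTAMP_INTERVAL)
		timestamp = 0;		/* long silence: resync to 0 */
	else
		timestamp += delta;
	return timestamp;
}

int main(void)
{
	printf("%d\n", model_timestamp(100, 2000000, 65535));	/* 0: resync after idle */
	printf("%d\n", model_timestamp(200, 10000, 65535));	/* 10000: 100 ticks later */
	printf("%d\n", model_timestamp(50, 20000, 65535));	/* 6548500: wrap handled */
	return 0;
}

Note that the resync branch still records dev_time first, so the event after a long gap restarts the timeline at 0 instead of accumulating a bogus delta.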
Many devices use a 16-bit timestamp field, with a resolution of 100us, therefore rolling over very frequently (every 6.5 seconds). To make sure there is no ambiguity, the timestamp reported to the input stack is reset to 0 whenever the time between 2 received events is greater than MAX_TIMESTAMP_INTERVAL (1 second). Signed-off-by: Nicolas Boichat Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-multitouch.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hid.h | 1 + 2 files changed, 43 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 440b999304a5..996bdc9bf0e5 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -136,6 +137,9 @@ struct mt_device { bool serial_maybe; /* need to check for serial protocol */ bool curvalid; /* is the current contact valid? */ unsigned mt_flags; /* flags to pass to input-mt */ + __s32 dev_time; /* the scan time provided by the device */ + unsigned long jiffies; /* the frame's jiffies */ + int timestamp; /* the timestamp to be sent */ }; static void mt_post_parse_default_settings(struct mt_device *td); @@ -177,6 +181,12 @@ static void mt_post_parse(struct mt_device *td); #define MT_DEFAULT_MAXCONTACT 10 #define MT_MAX_MAXCONTACT 250 +/* + * Resync device and local timestamps after that many microseconds without + * receiving data. + */ +#define MAX_TIMESTAMP_INTERVAL 1000000 + #define MT_USB_DEVICE(v, p) HID_DEVICE(BUS_USB, HID_GROUP_MULTITOUCH, v, p) #define MT_BT_DEVICE(v, p) HID_DEVICE(BUS_BLUETOOTH, HID_GROUP_MULTITOUCH, v, p) @@ -583,6 +593,12 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, cls->sn_pressure); mt_store_field(usage, td, hi); return 1; + case HID_DG_SCANTIME: + hid_map_usage(hi, usage, bit, max, + EV_MSC, MSC_TIMESTAMP); + input_set_capability(hi->input, EV_MSC, MSC_TIMESTAMP); + mt_store_field(usage, td, hi); + return 1; case HID_DG_CONTACTCOUNT: /* Ignore if indexes are out of bounds. */ if (field->index >= field->report->maxfield || @@ -718,6 +734,7 @@ static void mt_complete_slot(struct mt_device *td, struct input_dev *input) static void mt_sync_frame(struct mt_device *td, struct input_dev *input) { input_mt_sync_frame(input); + input_event(input, EV_MSC, MSC_TIMESTAMP, td->timestamp); input_sync(input); td->num_received = 0; if (test_bit(MT_IO_FLAGS_ACTIVE_SLOTS, &td->mt_io_flags)) @@ -727,6 +744,28 @@ static void mt_sync_frame(struct mt_device *td, struct input_dev *input) clear_bit(MT_IO_FLAGS_ACTIVE_SLOTS, &td->mt_io_flags); } +static int mt_compute_timestamp(struct mt_device *td, struct hid_field *field, + __s32 value) +{ + long delta = value - td->dev_time; + unsigned long jdelta = jiffies_to_usecs(jiffies - td->jiffies); + + td->jiffies = jiffies; + td->dev_time = value; + + if (delta < 0) + delta += field->logical_maximum; + + /* HID_DG_SCANTIME is expressed in 100us, we want it in us. */ + delta *= 100; + + if (jdelta > MAX_TIMESTAMP_INTERVAL) + /* No data received for a while, resync the timestamp.
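 * (Editor's worked example for the code above: raw scan times 100 then
 * 5100 give delta = 5000 units, i.e. 500000us added to the running
 * timestamp; if jdelta exceeds MAX_TIMESTAMP_INTERVAL, the branch below
 * returns 0 and the timestamp restarts from zero.)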
*/ + return 0; + else + return td->timestamp + delta; +} + static int mt_touch_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value) { @@ -787,6 +826,9 @@ static void mt_process_mt_event(struct hid_device *hid, struct hid_field *field, case HID_DG_HEIGHT: td->curdata.h = value; break; + case HID_DG_SCANTIME: + td->timestamp = mt_compute_timestamp(td, field, value); + break; case HID_DG_CONTACTCOUNT: break; case HID_DG_TOUCH: diff --git a/include/linux/hid.h b/include/linux/hid.h index ab05a86269dc..47dd962d9a7a 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -289,6 +289,7 @@ struct hid_item { #define HID_DG_DEVICEINDEX 0x000d0053 #define HID_DG_CONTACTCOUNT 0x000d0054 #define HID_DG_CONTACTMAX 0x000d0055 +#define HID_DG_SCANTIME 0x000d0056 #define HID_DG_BUTTONTYPE 0x000d0059 #define HID_DG_BARRELSWITCH2 0x000d005a #define HID_DG_TOOLSERIALNUMBER 0x000d005b -- cgit v1.2.3 From 58e1177b4cd10b0d358faf7d7ebb3779f98bc3ea Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:55 -0700 Subject: timer: Convert schedule_timeout() to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new from_timer() helper and passing the timer pointer explicitly. Since this special timer is on the stack, it needs to have a wrapper structure to carry state once .data is eliminated. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: "Rafael J. Wysocki" Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. 
Petersen" Cc: Greg Kroah-Hartman Cc: Stephen Boyd Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-2-git-send-email-keescook@chromium.org --- include/linux/timer.h | 8 ++++++++ kernel/time/timer.c | 26 +++++++++++++++++++------- 2 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 6383c528b148..5ef5c9e41a09 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -179,6 +179,14 @@ static inline void timer_setup(struct timer_list *timer, (TIMER_DATA_TYPE)timer, flags); } +static inline void timer_setup_on_stack(struct timer_list *timer, + void (*callback)(struct timer_list *), + unsigned int flags) +{ + __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback, + (TIMER_DATA_TYPE)timer, flags); +} + #define from_timer(var, callback_timer, timer_fieldname) \ container_of(callback_timer, typeof(*var), timer_fieldname) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f2674a056c26..38613ced2324 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1668,9 +1668,20 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -static void process_timeout(unsigned long __data) +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. + */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) { - wake_up_process((struct task_struct *)__data); + struct process_timer *timeout = from_timer(timeout, t, timer); + + wake_up_process(timeout->task); } /** @@ -1704,7 +1715,7 @@ static void process_timeout(unsigned long __data) */ signed long __sched schedule_timeout(signed long timeout) { - struct timer_list timer; + struct process_timer timer; unsigned long expire; switch (timeout) @@ -1738,13 +1749,14 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false); + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, false); schedule(); - del_singleshot_timer_sync(&timer); + del_singleshot_timer_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); + destroy_timer_on_stack(&timer.timer); timeout = expire - jiffies; -- cgit v1.2.3 From 1d1fe902afb380571105d05d0be3de61b81bc9a8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:56 -0700 Subject: timer: Remove init_timer_pinned_deferrable() in favor of timer_setup() This refactors the only user of init_timer_pinned_deferrable() to use the new timer_setup() and from_timer(). Adds a pointer back to the policy, and drops the definition of init_timer_pinned_deferrable(). Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. 
Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-3-git-send-email-keescook@chromium.org --- drivers/cpufreq/powernv-cpufreq.c | 13 +++++++------ include/linux/timer.h | 2 -- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 3ff5160451b4..b6d7c4c98d0a 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -90,6 +90,7 @@ struct global_pstate_info { int last_gpstate_idx; spinlock_t gpstate_lock; struct timer_list timer; + struct cpufreq_policy *policy; }; static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; @@ -625,10 +626,10 @@ static inline void queue_gpstate_timer(struct global_pstate_info *gpstates) * according quadratic equation. Queues a new timer if it is still not equal * to local pstate */ -void gpstate_timer_handler(unsigned long data) +void gpstate_timer_handler(struct timer_list *t) { - struct cpufreq_policy *policy = (struct cpufreq_policy *)data; - struct global_pstate_info *gpstates = policy->driver_data; + struct global_pstate_info *gpstates = from_timer(gpstates, t, timer); + struct cpufreq_policy *policy = gpstates->policy; int gpstate_idx, lpstate_idx; unsigned long val; unsigned int time_diff = jiffies_to_msecs(jiffies) @@ -800,9 +801,9 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->driver_data = gpstates; /* initialize timer */ - init_timer_pinned_deferrable(&gpstates->timer); - gpstates->timer.data = (unsigned long)policy; - gpstates->timer.function = gpstate_timer_handler; + gpstates->policy = policy; + timer_setup(&gpstates->timer, gpstate_timer_handler, + TIMER_PINNED | TIMER_DEFERRABLE); gpstates->timer.expires = jiffies + msecs_to_jiffies(GPSTATE_TIMER_INTERVAL); spin_lock_init(&gpstates->gpstate_lock); diff --git a/include/linux/timer.h b/include/linux/timer.h index 5ef5c9e41a09..d11e819a86e2 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -132,8 +132,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) -#define init_timer_pinned_deferrable(timer) \ - __init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED) #define init_timer_on_stack(timer) \ __init_timer_on_stack((timer), 0) -- cgit v1.2.3 From 9c6c273aa4248c60569de6ef7e7e9c7bed3cd32e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:57 -0700 Subject: timer: Remove init_timer_on_stack() in favor of timer_setup_on_stack() Remove uses of init_timer_on_stack() with open-coded function and data assignments that could be expressed using timer_setup_on_stack(). 
Several were removed from the stack entirely since there was a one-to-one mapping of parent structure to timer; those are switched to using timer_setup() instead. All related callbacks were adjusted to use from_timer(). Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: Wim Van Sebroeck Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: linux-scsi@vger.kernel.org Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Lai Jiangshan Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-4-git-send-email-keescook@chromium.org --- drivers/base/power/main.c | 8 +++----- drivers/firewire/core-transaction.c | 10 +++++----- drivers/parport/ieee1284.c | 21 +++++++-------------- drivers/s390/char/tape.h | 1 + drivers/s390/char/tape_std.c | 18 ++++++------------ drivers/s390/net/lcs.c | 16 ++++++---------- drivers/s390/net/lcs.h | 1 + drivers/scsi/qla1280.c | 14 +++++--------- drivers/scsi/qla1280.h | 1 + include/linux/parport.h | 1 + include/linux/timer.h | 2 -- 11 files changed, 36 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 770b1539a083..ae47b2ec84b4 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -478,9 +478,9 @@ struct dpm_watchdog { * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore.
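 * (Editor's aside: from_timer() is simply container_of() on the embedded
 * timer_list, per its definition earlier in this series, so the handler
 * below recovers the struct dpm_watchdog that wraps the firing timer.)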
*/ -static void dpm_watchdog_handler(unsigned long data) +static void dpm_watchdog_handler(struct timer_list *t) { - struct dpm_watchdog *wd = (void *)data; + struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL); @@ -500,11 +500,9 @@ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) wd->dev = dev; wd->tsk = current; - init_timer_on_stack(timer); + timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT; - timer->function = dpm_watchdog_handler; - timer->data = (unsigned long)wd; add_timer(timer); } diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index d6a09b9cd8cc..4372f9e4b0da 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -137,9 +137,9 @@ int fw_cancel_transaction(struct fw_card *card, } EXPORT_SYMBOL(fw_cancel_transaction); -static void split_transaction_timeout_callback(unsigned long data) +static void split_transaction_timeout_callback(struct timer_list *timer) { - struct fw_transaction *t = (struct fw_transaction *)data; + struct fw_transaction *t = from_timer(t, timer, split_timeout_timer); struct fw_card *card = t->card; unsigned long flags; @@ -373,8 +373,8 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, t->tlabel = tlabel; t->card = card; t->is_split_transaction = false; - setup_timer(&t->split_timeout_timer, - split_transaction_timeout_callback, (unsigned long)t); + timer_setup(&t->split_timeout_timer, + split_transaction_timeout_callback, 0); t->callback = callback; t->callback_data = callback_data; @@ -423,7 +423,7 @@ int fw_run_transaction(struct fw_card *card, int tcode, int destination_id, struct transaction_callback_data d; struct fw_transaction t; - init_timer_on_stack(&t.split_timeout_timer); + timer_setup_on_stack(&t.split_timeout_timer, NULL, 0); init_completion(&d.done); d.payload = payload; fw_send_request(card, &t, tcode, destination_id, generation, speed, diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c index 74cc6dd982d2..2d1a5c737c6e 100644 --- a/drivers/parport/ieee1284.c +++ b/drivers/parport/ieee1284.c @@ -44,10 +44,11 @@ static void parport_ieee1284_wakeup (struct parport *port) up (&port->physport->ieee1284.irq); } -static struct parport *port_from_cookie[PARPORT_MAX]; -static void timeout_waiting_on_port (unsigned long cookie) +static void timeout_waiting_on_port (struct timer_list *t) { - parport_ieee1284_wakeup (port_from_cookie[cookie % PARPORT_MAX]); + struct parport *port = from_timer(port, t, timer); + + parport_ieee1284_wakeup (port); } /** @@ -69,27 +70,19 @@ static void timeout_waiting_on_port (unsigned long cookie) int parport_wait_event (struct parport *port, signed long timeout) { int ret; - struct timer_list timer; if (!port->physport->cad->timeout) /* Zero timeout is special, and we can't down() the semaphore. 
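 * (Editor's aside: the per-wait timer now lives in struct parport, so
 * timeout_waiting_on_port() recovers the port with from_timer() and the
 * port_from_cookie[] lookup table removed above is no longer needed.)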
*/ return 1; - init_timer_on_stack(&timer); - timer.expires = jiffies + timeout; - timer.function = timeout_waiting_on_port; - port_from_cookie[port->number % PARPORT_MAX] = port; - timer.data = port->number; - - add_timer (&timer); + timer_setup(&port->timer, timeout_waiting_on_port, 0); + mod_timer(&port->timer, jiffies + timeout); ret = down_interruptible (&port->physport->ieee1284.irq); - if (!del_timer_sync(&timer) && !ret) + if (!del_timer_sync(&port->timer) && !ret) /* Timed out. */ ret = 1; - destroy_timer_on_stack(&timer); - return ret; } diff --git a/drivers/s390/char/tape.h b/drivers/s390/char/tape.h index ea664dd4f56d..52fbcd9c3cf8 100644 --- a/drivers/s390/char/tape.h +++ b/drivers/s390/char/tape.h @@ -128,6 +128,7 @@ struct tape_request { int options; /* options for execution. */ int retries; /* retry counter for error recovery. */ int rescnt; /* residual count from devstat. */ + struct timer_list timer; /* timer for std_assign_timeout(). */ /* Callback for delivering final status. */ void (*callback)(struct tape_request *, void *); diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c index 3478e19ae194..cd204abdc0bc 100644 --- a/drivers/s390/char/tape_std.c +++ b/drivers/s390/char/tape_std.c @@ -32,14 +32,12 @@ * tape_std_assign */ static void -tape_std_assign_timeout(unsigned long data) +tape_std_assign_timeout(struct timer_list *t) { - struct tape_request * request; - struct tape_device * device; + struct tape_request * request = from_timer(request, t, timer); + struct tape_device * device = request->device; int rc; - request = (struct tape_request *) data; - device = request->device; BUG_ON(!device); DBF_EVENT(3, "%08x: Assignment timeout. Device busy.\n", @@ -70,16 +68,12 @@ tape_std_assign(struct tape_device *device) * to another host (actually this shouldn't happen but it does). * So we set up a timeout for this call. */ - init_timer_on_stack(&timeout); - timeout.function = tape_std_assign_timeout; - timeout.data = (unsigned long) request; - timeout.expires = jiffies + 2 * HZ; - add_timer(&timeout); + timer_setup(&request->timer, tape_std_assign_timeout, 0); + mod_timer(&request->timer, jiffies + 2 * HZ); rc = tape_do_io_interruptible(device, request); - del_timer_sync(&timeout); - destroy_timer_on_stack(&timeout); + del_timer_sync(&request->timer); if (rc != 0) { DBF_EVENT(3, "%08x: assign failed - device might be busy\n", diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index d01b5c2a7760..21bba406d5be 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -834,9 +834,10 @@ lcs_notify_lancmd_waiters(struct lcs_card *card, struct lcs_cmd *cmd) * Emit buffer of a lan command.
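 * (Editor's aside: the same embedding pattern again - the command timer
 * becomes a member of struct lcs_reply, letting lcs_lancmd_timeout()
 * recover the reply via from_timer() rather than a casted .data value.)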
*/ static void -lcs_lancmd_timeout(unsigned long data) +lcs_lancmd_timeout(struct timer_list *t) { - struct lcs_reply *reply, *list_reply, *r; + struct lcs_reply *reply = from_timer(reply, t, timer); + struct lcs_reply *list_reply, *r; unsigned long flags; LCS_DBF_TEXT(4, trace, "timeout"); @@ -864,7 +865,6 @@ lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer, { struct lcs_reply *reply; struct lcs_cmd *cmd; - struct timer_list timer; unsigned long flags; int rc; @@ -885,14 +885,10 @@ lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer, rc = lcs_ready_buffer(&card->write, buffer); if (rc) return rc; - init_timer_on_stack(&timer); - timer.function = lcs_lancmd_timeout; - timer.data = (unsigned long) reply; - timer.expires = jiffies + HZ*card->lancmd_timeout; - add_timer(&timer); + timer_setup(&reply->timer, lcs_lancmd_timeout, 0); + mod_timer(&reply->timer, jiffies + HZ * card->lancmd_timeout); wait_event(reply->wait_q, reply->received); - del_timer_sync(&timer); - destroy_timer_on_stack(&timer); + del_timer_sync(&reply->timer); LCS_DBF_TEXT_(4, trace, "rc:%d",reply->rc); rc = reply->rc; lcs_put_reply(reply); diff --git a/drivers/s390/net/lcs.h b/drivers/s390/net/lcs.h index 150fcb4cebc3..d44fb8d9378f 100644 --- a/drivers/s390/net/lcs.h +++ b/drivers/s390/net/lcs.h @@ -275,6 +275,7 @@ struct lcs_reply { void (*callback)(struct lcs_card *, struct lcs_cmd *); wait_queue_head_t wait_q; struct lcs_card *card; + struct timer_list timer; int received; int rc; }; diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 8a29fb09db14..390775d5c918 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -758,9 +758,9 @@ enum action { }; -static void qla1280_mailbox_timeout(unsigned long __data) +static void qla1280_mailbox_timeout(struct timer_list *t) { - struct scsi_qla_host *ha = (struct scsi_qla_host *)__data; + struct scsi_qla_host *ha = from_timer(ha, t, mailbox_timer); struct device_reg __iomem *reg; reg = ha->iobase; @@ -2465,7 +2465,6 @@ qla1280_mailbox_command(struct scsi_qla_host *ha, uint8_t mr, uint16_t *mb) uint16_t __iomem *mptr; uint16_t data; DECLARE_COMPLETION_ONSTACK(wait); - struct timer_list timer; ENTER("qla1280_mailbox_command"); @@ -2494,18 +2493,15 @@ qla1280_mailbox_command(struct scsi_qla_host *ha, uint8_t mr, uint16_t *mb) /* Issue set host interrupt command. 
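 * (Editor's aside: likewise, the on-stack mailbox timer moves into
 * struct scsi_qla_host as mailbox_timer, mirroring the tape and lcs
 * conversions above.)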
*/ /* set up a timer just in case we're really jammed */ - init_timer_on_stack(&timer); - timer.expires = jiffies + 20*HZ; - timer.data = (unsigned long)ha; - timer.function = qla1280_mailbox_timeout; - add_timer(&timer); + timer_setup(&ha->mailbox_timer, qla1280_mailbox_timeout, 0); + mod_timer(&ha->mailbox_timer, jiffies + 20 * HZ); spin_unlock_irq(ha->host->host_lock); WRT_REG_WORD(®->host_cmd, HC_SET_HOST_INT); data = qla1280_debounce_register(®->istatus); wait_for_completion(&wait); - del_timer_sync(&timer); + del_timer_sync(&ha->mailbox_timer); spin_lock_irq(ha->host->host_lock); diff --git a/drivers/scsi/qla1280.h b/drivers/scsi/qla1280.h index 834884b9eed5..1522aca2c8c8 100644 --- a/drivers/scsi/qla1280.h +++ b/drivers/scsi/qla1280.h @@ -1055,6 +1055,7 @@ struct scsi_qla_host { struct list_head done_q; /* Done queue */ struct completion *mailbox_wait; + struct timer_list mailbox_timer; volatile struct { uint32_t online:1; /* 0 */ diff --git a/include/linux/parport.h b/include/linux/parport.h index 58e3c64c6b49..397607a0c0eb 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -225,6 +225,7 @@ struct parport { struct pardevice *waittail; struct list_head list; + struct timer_list timer; unsigned int flags; void *sysctl_table; diff --git a/include/linux/timer.h b/include/linux/timer.h index d11e819a86e2..b10c4bdc6fbd 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -132,8 +132,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) -#define init_timer_on_stack(timer) \ - __init_timer_on_stack((timer), 0) #define __setup_timer(_timer, _fn, _data, _flags) \ do { \ -- cgit v1.2.3 From 185981d54a60ae90942c6ba9006b250f3348cef2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:58 -0700 Subject: timer: Remove init_timer_pinned() in favor of timer_setup() This refactors the only users of init_timer_pinned() to use the new timer_setup() and from_timer(). Drops the definition of init_timer_pinned(). Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: David S. Miller # for networking parts Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-5-git-send-email-keescook@chromium.org --- drivers/net/ethernet/tile/tilepro.c | 9 ++++----- include/linux/timer.h | 2 -- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/tile/tilepro.c b/drivers/net/ethernet/tile/tilepro.c index 49ccee4b9aec..56d06282fbde 100644 --- a/drivers/net/ethernet/tile/tilepro.c +++ b/drivers/net/ethernet/tile/tilepro.c @@ -608,9 +608,9 @@ static void tile_net_schedule_egress_timer(struct tile_net_cpu *info) * ISSUE: Maybe instead track number of expected completions, and free * only that many, resetting to zero if "pending" is ever false. */ -static void tile_net_handle_egress_timer(unsigned long arg) +static void tile_net_handle_egress_timer(struct timer_list *t) { - struct tile_net_cpu *info = (struct tile_net_cpu *)arg; + struct tile_net_cpu *info = from_timer(info, t, egress_timer); struct net_device *dev = info->napi.dev; /* The timer is no longer scheduled. */ @@ -1004,9 +1004,8 @@ static void tile_net_register(void *dev_ptr) BUG(); /* Initialize the egress timer. */ - init_timer_pinned(&info->egress_timer); - info->egress_timer.data = (long)info; - info->egress_timer.function = tile_net_handle_egress_timer; + timer_setup(&info->egress_timer, tile_net_handle_egress_timer, + TIMER_PINNED); u64_stats_init(&info->stats.syncp); diff --git a/include/linux/timer.h b/include/linux/timer.h index b10c4bdc6fbd..9da903562ed4 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -128,8 +128,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define init_timer(timer) \ __init_timer((timer), 0) -#define init_timer_pinned(timer) \ - __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) -- cgit v1.2.3 From df7e828c1b699792b2ff26ebcf0a6d1025b2b790 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:59 -0700 Subject: timer: Remove init_timer_deferrable() in favor of timer_setup() This refactors the only users of init_timer_deferrable() to use the new timer_setup() and from_timer(). Removes definition of init_timer_deferrable(). Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: David S. Miller # for networking parts Acked-by: Sebastian Reichel # for drivers/hsi parts Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Oleg Nesterov Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: "Rafael J. Wysocki" Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. 
Petersen" Cc: Greg Kroah-Hartman Cc: linux-wireless@vger.kernel.org Cc: Sebastian Reichel Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-6-git-send-email-keescook@chromium.org --- arch/powerpc/mm/numa.c | 12 +++++------ drivers/hsi/clients/ssi_protocol.c | 32 ++++++++++++++++------------ drivers/net/ethernet/qlogic/qlge/qlge_main.c | 11 ++++------ drivers/net/vxlan.c | 8 +++---- drivers/net/wireless/ath/ath6kl/recovery.c | 9 ++++---- include/linux/timer.h | 2 -- 6 files changed, 34 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b95c584ce19d..f9b6107d6854 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1453,7 +1453,7 @@ static void topology_schedule_update(void) schedule_work(&topology_work); } -static void topology_timer_fn(unsigned long ignored) +static void topology_timer_fn(struct timer_list *unused) { if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask)) topology_schedule_update(); @@ -1463,14 +1463,11 @@ static void topology_timer_fn(unsigned long ignored) reset_topology_timer(); } } -static struct timer_list topology_timer = - TIMER_INITIALIZER(topology_timer_fn, 0, 0); +static struct timer_list topology_timer; static void reset_topology_timer(void) { - topology_timer.data = 0; - topology_timer.expires = jiffies + 60 * HZ; - mod_timer(&topology_timer, topology_timer.expires); + mod_timer(&topology_timer, jiffies + 60 * HZ); } #ifdef CONFIG_SMP @@ -1530,7 +1527,8 @@ int start_topology_update(void) prrn_enabled = 0; vphn_enabled = 1; setup_cpu_associativity_change_counters(); - init_timer_deferrable(&topology_timer); + timer_setup(&topology_timer, topology_timer_fn, + TIMER_DEFERRABLE); reset_topology_timer(); } } diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c index 93d28c0ec8bf..67af03d3aeb3 100644 --- a/drivers/hsi/clients/ssi_protocol.c +++ b/drivers/hsi/clients/ssi_protocol.c @@ -464,10 +464,10 @@ static void ssip_error(struct hsi_client *cl) hsi_async_read(cl, msg); } -static void ssip_keep_alive(unsigned long data) +static void ssip_keep_alive(struct timer_list *t) { - struct hsi_client *cl = (struct hsi_client *)data; - struct ssi_protocol *ssi = hsi_client_drvdata(cl); + struct ssi_protocol *ssi = from_timer(ssi, t, keep_alive); + struct hsi_client *cl = ssi->cl; dev_dbg(&cl->device, "Keep alive kick in: m(%d) r(%d) s(%d)\n", ssi->main_state, ssi->recv_state, ssi->send_state); @@ -490,9 +490,19 @@ static void ssip_keep_alive(unsigned long data) spin_unlock(&ssi->lock); } -static void ssip_wd(unsigned long data) +static void ssip_rx_wd(struct timer_list *t) +{ + struct ssi_protocol *ssi = from_timer(ssi, t, rx_wd); + struct hsi_client *cl = ssi->cl; + + dev_err(&cl->device, "Watchdog trigerred\n"); + ssip_error(cl); +} + +static void ssip_tx_wd(unsigned long data) { - struct hsi_client *cl = (struct hsi_client *)data; + struct ssi_protocol *ssi = from_timer(ssi, t, tx_wd); + struct hsi_client *cl = ssi->cl; dev_err(&cl->device, "Watchdog trigerred\n"); ssip_error(cl); @@ -1084,15 +1094,9 @@ static int ssi_protocol_probe(struct device *dev) } spin_lock_init(&ssi->lock); - init_timer_deferrable(&ssi->rx_wd); - init_timer_deferrable(&ssi->tx_wd); - init_timer(&ssi->keep_alive); - ssi->rx_wd.data = (unsigned long)cl; - ssi->rx_wd.function = ssip_wd; - 
ssi->tx_wd.data = (unsigned long)cl; - ssi->tx_wd.function = ssip_wd; - ssi->keep_alive.data = (unsigned long)cl; - ssi->keep_alive.function = ssip_keep_alive; + timer_setup(&ssi->rx_wd, ssip_rx_wd, TIMER_DEFERRABLE); + timer_setup(&ssi->tx_wd, ssip_tx_wd, TIMER_DEFERRABLE); + timer_setup(&ssi->keep_alive, ssip_keep_alive, 0); INIT_LIST_HEAD(&ssi->txqueue); INIT_LIST_HEAD(&ssi->cmdqueue); atomic_set(&ssi->tx_usecnt, 0); diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 9feec7009443..29fea74bff2e 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -4725,9 +4725,9 @@ static const struct net_device_ops qlge_netdev_ops = { .ndo_vlan_rx_kill_vid = qlge_vlan_rx_kill_vid, }; -static void ql_timer(unsigned long data) +static void ql_timer(struct timer_list *t) { - struct ql_adapter *qdev = (struct ql_adapter *)data; + struct ql_adapter *qdev = from_timer(qdev, t, timer); u32 var = 0; var = ql_read32(qdev, STS); @@ -4806,11 +4806,8 @@ static int qlge_probe(struct pci_dev *pdev, /* Start up the timer to trigger EEH if * the bus goes dead */ - init_timer_deferrable(&qdev->timer); - qdev->timer.data = (unsigned long)qdev; - qdev->timer.function = ql_timer; - qdev->timer.expires = jiffies + (5*HZ); - add_timer(&qdev->timer); + timer_setup(&qdev->timer, ql_timer, TIMER_DEFERRABLE); + mod_timer(&qdev->timer, jiffies + (5*HZ)); ql_link_off(qdev); ql_display_dev_info(ndev); atomic_set(&qdev->lb_count, 0); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index d7c49cf1d5e9..3247d2feda07 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2325,9 +2325,9 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } /* Walk the forwarding table and purge stale entries */ -static void vxlan_cleanup(unsigned long arg) +static void vxlan_cleanup(struct timer_list *t) { - struct vxlan_dev *vxlan = (struct vxlan_dev *) arg; + struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; unsigned int h; @@ -2647,9 +2647,7 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); - init_timer_deferrable(&vxlan->age_timer); - vxlan->age_timer.function = vxlan_cleanup; - vxlan->age_timer.data = (unsigned long) vxlan; + timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; diff --git a/drivers/net/wireless/ath/ath6kl/recovery.c b/drivers/net/wireless/ath/ath6kl/recovery.c index 3a8d5e97dc8e..c09e40c9010f 100644 --- a/drivers/net/wireless/ath/ath6kl/recovery.c +++ b/drivers/net/wireless/ath/ath6kl/recovery.c @@ -60,9 +60,9 @@ void ath6kl_recovery_hb_event(struct ath6kl *ar, u32 cookie) ar->fw_recovery.hb_pending = false; } -static void ath6kl_recovery_hb_timer(unsigned long data) +static void ath6kl_recovery_hb_timer(struct timer_list *t) { - struct ath6kl *ar = (struct ath6kl *) data; + struct ath6kl *ar = from_timer(ar, t, fw_recovery.hb_timer); int err; if (test_bit(RECOVERY_CLEANUP, &ar->flag) || @@ -104,9 +104,8 @@ void ath6kl_recovery_init(struct ath6kl *ar) recovery->seq_num = 0; recovery->hb_misscnt = 0; ar->fw_recovery.hb_pending = false; - ar->fw_recovery.hb_timer.function = ath6kl_recovery_hb_timer; - ar->fw_recovery.hb_timer.data = (unsigned long) ar; - init_timer_deferrable(&ar->fw_recovery.hb_timer); + timer_setup(&ar->fw_recovery.hb_timer, ath6kl_recovery_hb_timer, + TIMER_DEFERRABLE); if (ar->fw_recovery.hb_poll) 
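/* Editor's aside: TIMER_DEFERRABLE means this heartbeat timer will not
 * pull an idle CPU out of sleep; it is serviced on the next
 * non-deferrable wakeup instead. */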
mod_timer(&ar->fw_recovery.hb_timer, jiffies + diff --git a/include/linux/timer.h b/include/linux/timer.h index 9da903562ed4..10cc45ca5803 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -128,8 +128,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define init_timer(timer) \ __init_timer((timer), 0) -#define init_timer_deferrable(timer) \ - __init_timer((timer), TIMER_DEFERRABLE) #define __setup_timer(_timer, _fn, _data, _flags) \ do { \ -- cgit v1.2.3 From 51487d9ed1e386f9f0863bbf385e5da8a586bff8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:01 -0700 Subject: timer: Remove last user of TIMER_INITIALIZER Drops the last user of TIMER_INITIALIZER and adapts timer.h to use the internal version. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: Mark Gross Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: "James E.J. Bottomley" Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-8-git-send-email-keescook@chromium.org --- drivers/char/tlclk.c | 12 +++++------- include/linux/timer.h | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tlclk.c b/drivers/char/tlclk.c index 6210bff46341..8eeb4190207d 100644 --- a/drivers/char/tlclk.c +++ b/drivers/char/tlclk.c @@ -184,9 +184,8 @@ static unsigned int telclk_interrupt; static int int_events; /* Event that generate a interrupt */ static int got_event; /* if events processing have been done */ -static void switchover_timeout(unsigned long data); -static struct timer_list switchover_timer = - TIMER_INITIALIZER(switchover_timeout , 0, 0); +static void switchover_timeout(struct timer_list *t); +static struct timer_list switchover_timer; static unsigned long tlclk_timer_data; static struct tlclk_alarms *alarm_events; @@ -805,7 +804,7 @@ static int __init tlclk_init(void) goto out3; } - init_timer(&switchover_timer); + timer_setup(&switchover_timer, switchover_timeout, 0); ret = misc_register(&tlclk_miscdev); if (ret < 0) { @@ -855,9 +854,9 @@ static void __exit tlclk_cleanup(void) } -static void switchover_timeout(unsigned long data) +static void switchover_timeout(struct timer_list *unused) { - unsigned long flags = *(unsigned long *) data; + unsigned long flags = tlclk_timer_data; if ((flags & 1)) { if ((inb(TLCLK_REG1) & 0x08) != (flags & 0x08)) @@ -922,7 +921,6 @@ static irqreturn_t tlclk_interrupt(int irq, void *dev_id) /* TIMEOUT in ~10ms */ switchover_timer.expires = jiffies + msecs_to_jiffies(10); tlclk_timer_data = inb(TLCLK_REG1); - switchover_timer.data = (unsigned long) &tlclk_timer_data; mod_timer(&switchover_timer, switchover_timer.expires); } else { 
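/* Editor's aside: with the .data cookie gone, switchover_timeout()
 * now reads the module-global tlclk_timer_data that the other branch
 * latches from TLCLK_REG1. */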
got_event = 1; diff --git a/include/linux/timer.h b/include/linux/timer.h index 10cc45ca5803..4f7476e4a727 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -87,7 +87,7 @@ struct timer_list { #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ - TIMER_INITIALIZER(_function, _expires, _data) + __TIMER_INITIALIZER(_function, _expires, _data, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); -- cgit v1.2.3 From fca7ce5b7c6d3eda8c2e011cff98d22e0fbd5097 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:02 -0700 Subject: timer: Remove unused static initializer macros This removes the now unused TIMER_*INITIALIZER macros: TIMER_INITIALIZER TIMER_PINNED_INITIALIZER TIMER_DEFERRED_INITIALIZER TIMER_PINNED_DEFERRED_INITIALIZER Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-9-git-send-email-keescook@chromium.org --- include/linux/timer.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 4f7476e4a727..a33220311361 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -73,18 +73,6 @@ struct timer_list { __FILE__ ":" __stringify(__LINE__)) \ } -#define TIMER_INITIALIZER(_function, _expires, _data) \ - __TIMER_INITIALIZER((_function), (_expires), (_data), 0) - -#define TIMER_PINNED_INITIALIZER(_function, _expires, _data) \ - __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED) - -#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) \ - __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE) - -#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data) \ - __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED) - #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ __TIMER_INITIALIZER(_function, _expires, _data, 0) -- cgit v1.2.3 From 1d27e3e2252ba9d949ca82fbdb73cde102cb2067 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:04 -0700 Subject: timer: Remove expires and data arguments from DEFINE_TIMER Drop the arguments from the macro and adjust all callers with the following script: perl -pi -e 's/DEFINE_TIMER\((.*), 0, 0\);/DEFINE_TIMER($1);/g;' \ $(git grep DEFINE_TIMER | cut -d: -f1 | sort -u | grep -v timer.h) Signed-off-by: Kees Cook Acked-by: Geert Uytterhoeven # for m68k parts Acked-by: Guenter Roeck # for watchdog parts 
Acked-by: David S. Miller # for networking parts Acked-by: Greg Kroah-Hartman Acked-by: Kalle Valo # for wireless parts Acked-by: Arnd Bergmann Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-11-git-send-email-keescook@chromium.org Signed-off-by: Thomas Gleixner --- arch/arm/mach-ixp4xx/dsmg600-setup.c | 2 +- arch/arm/mach-ixp4xx/nas100d-setup.c | 2 +- arch/m68k/amiga/amisound.c | 2 +- arch/m68k/mac/macboing.c | 2 +- arch/mips/mti-malta/malta-display.c | 2 +- arch/parisc/kernel/pdc_cons.c | 2 +- arch/s390/mm/cmm.c | 2 +- drivers/atm/idt77105.c | 4 ++-- drivers/atm/iphase.c | 2 +- drivers/block/ataflop.c | 8 ++++---- drivers/char/dtlk.c | 2 +- drivers/char/hangcheck-timer.c | 2 +- drivers/char/nwbutton.c | 2 +- drivers/char/rtc.c | 2 +- drivers/input/touchscreen/s3c2410_ts.c | 2 +- drivers/net/cris/eth_v10.c | 6 +++--- drivers/net/hamradio/yam.c | 2 +- drivers/net/wireless/atmel/at76c50x-usb.c | 2 +- drivers/staging/speakup/main.c | 2 +- drivers/staging/speakup/synth.c | 2 +- drivers/tty/cyclades.c | 2 +- drivers/tty/isicom.c | 2 +- drivers/tty/moxa.c | 2 +- drivers/tty/rocket.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- drivers/tty/vt/vt.c | 2 +- drivers/watchdog/alim7101_wdt.c | 2 +- drivers/watchdog/machzwd.c | 2 +- drivers/watchdog/mixcomwd.c | 2 +- drivers/watchdog/sbc60xxwdt.c | 2 +- drivers/watchdog/sc520_wdt.c | 2 +- drivers/watchdog/via_wdt.c | 2 +- drivers/watchdog/w83877f_wdt.c | 2 +- drivers/xen/grant-table.c | 2 +- fs/pstore/platform.c | 2 +- include/linux/timer.h | 4 ++-- kernel/irq/spurious.c | 2 +- lib/random32.c | 2 +- net/atm/mpc.c | 2 +- net/decnet/dn_route.c | 2 +- net/ipv6/ip6_flowlabel.c | 2 +- net/netrom/nr_loopback.c | 2 +- security/keys/gc.c | 2 +- sound/oss/midibuf.c | 2 +- sound/oss/soundcard.c | 2 +- sound/oss/sys_timer.c | 2 +- sound/oss/uart6850.c | 2 +- 47 files changed, 54 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c index b3bd0e137f6d..b3689a141ec6 100644 --- a/arch/arm/mach-ixp4xx/dsmg600-setup.c +++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c @@ -174,7 +174,7 @@ static int power_button_countdown; #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */ static void dsmg600_power_handler(unsigned long data); -static DEFINE_TIMER(dsmg600_power_timer, dsmg600_power_handler, 0, 0); +static DEFINE_TIMER(dsmg600_power_timer, dsmg600_power_handler); static void dsmg600_power_handler(unsigned long data) { diff --git a/arch/arm/mach-ixp4xx/nas100d-setup.c b/arch/arm/mach-ixp4xx/nas100d-setup.c index 4e0f762bc651..562d05f9888e 100644 --- 
a/arch/arm/mach-ixp4xx/nas100d-setup.c +++ b/arch/arm/mach-ixp4xx/nas100d-setup.c @@ -197,7 +197,7 @@ static int power_button_countdown; #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */ static void nas100d_power_handler(unsigned long data); -static DEFINE_TIMER(nas100d_power_timer, nas100d_power_handler, 0, 0); +static DEFINE_TIMER(nas100d_power_timer, nas100d_power_handler); static void nas100d_power_handler(unsigned long data) { diff --git a/arch/m68k/amiga/amisound.c b/arch/m68k/amiga/amisound.c index 90a60d758f8b..a23f48181fd6 100644 --- a/arch/m68k/amiga/amisound.c +++ b/arch/m68k/amiga/amisound.c @@ -66,7 +66,7 @@ void __init amiga_init_sound(void) } static void nosound( unsigned long ignored ); -static DEFINE_TIMER(sound_timer, nosound, 0, 0); +static DEFINE_TIMER(sound_timer, nosound); void amiga_mksound( unsigned int hz, unsigned int ticks ) { diff --git a/arch/m68k/mac/macboing.c b/arch/m68k/mac/macboing.c index ffaa1f6439ae..9a52aff183d0 100644 --- a/arch/m68k/mac/macboing.c +++ b/arch/m68k/mac/macboing.c @@ -56,7 +56,7 @@ static void ( *mac_special_bell )( unsigned int, unsigned int, unsigned int ); /* * our timer to start/continue/stop the bell */ -static DEFINE_TIMER(mac_sound_timer, mac_nosound, 0, 0); +static DEFINE_TIMER(mac_sound_timer, mac_nosound); /* * Sort of initialize the sound chip (called from mac_mksound on the first diff --git a/arch/mips/mti-malta/malta-display.c b/arch/mips/mti-malta/malta-display.c index ac813158b9b8..063de44675ce 100644 --- a/arch/mips/mti-malta/malta-display.c +++ b/arch/mips/mti-malta/malta-display.c @@ -37,7 +37,7 @@ void mips_display_message(const char *str) } static void scroll_display_message(unsigned long unused); -static DEFINE_TIMER(mips_scroll_timer, scroll_display_message, 0, 0); +static DEFINE_TIMER(mips_scroll_timer, scroll_display_message); static void scroll_display_message(unsigned long unused) { diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c index 10a5ae9553fd..27a2dd616a7d 100644 --- a/arch/parisc/kernel/pdc_cons.c +++ b/arch/parisc/kernel/pdc_cons.c @@ -92,7 +92,7 @@ static int pdc_console_setup(struct console *co, char *options) #define PDC_CONS_POLL_DELAY (30 * HZ / 1000) static void pdc_console_poll(unsigned long unused); -static DEFINE_TIMER(pdc_console_timer, pdc_console_poll, 0, 0); +static DEFINE_TIMER(pdc_console_timer, pdc_console_poll); static struct tty_port tty_port; static int pdc_console_tty_open(struct tty_struct *tty, struct file *filp) diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 829c63dbc81a..2dbdcd85b68f 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -56,7 +56,7 @@ static DEFINE_SPINLOCK(cmm_lock); static struct task_struct *cmm_thread_ptr; static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait); -static DEFINE_TIMER(cmm_timer, NULL, 0, 0); +static DEFINE_TIMER(cmm_timer, NULL); static void cmm_timer_fn(unsigned long); static void cmm_set_timer(void); diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c index 082aa02abc57..57af9fd198e4 100644 --- a/drivers/atm/idt77105.c +++ b/drivers/atm/idt77105.c @@ -49,8 +49,8 @@ static void idt77105_stats_timer_func(unsigned long); static void idt77105_restart_timer_func(unsigned long); -static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func, 0, 0); -static DEFINE_TIMER(restart_timer, idt77105_restart_timer_func, 0, 0); +static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func); +static DEFINE_TIMER(restart_timer, idt77105_restart_timer_func); static int start_timer = 1; static struct idt77105_priv 
*idt77105_all = NULL; diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index fc72b763fdd7..ad6b582c268e 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -76,7 +76,7 @@ static IADEV *ia_dev[8]; static struct atm_dev *_ia_dev[8]; static int iadev_count; static void ia_led_timer(unsigned long arg); -static DEFINE_TIMER(ia_timer, ia_led_timer, 0, 0); +static DEFINE_TIMER(ia_timer, ia_led_timer); static int IA_TX_BUF = DFL_TX_BUFFERS, IA_TX_BUF_SZ = DFL_TX_BUF_SZ; static int IA_RX_BUF = DFL_RX_BUFFERS, IA_RX_BUF_SZ = DFL_RX_BUF_SZ; static uint IADebugFlag = /* IF_IADBG_ERR | IF_IADBG_CBR| IF_IADBG_INIT_ADAPTER diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 92da886180aa..ae596e55bcb6 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -373,10 +373,10 @@ static void floppy_release(struct gendisk *disk, fmode_t mode); /************************* End of Prototypes **************************/ -static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer, 0, 0); -static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0); -static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0); -static DEFINE_TIMER(fd_timer, check_change, 0, 0); +static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer); +static DEFINE_TIMER(readtrack_timer, fd_readtrack_check); +static DEFINE_TIMER(timeout_timer, fd_times_out); +static DEFINE_TIMER(fd_timer, check_change); static void fd_end_request_cur(blk_status_t err) { diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c index 58471394beb9..1a0385ed6417 100644 --- a/drivers/char/dtlk.c +++ b/drivers/char/dtlk.c @@ -84,7 +84,7 @@ static int dtlk_has_indexing; static unsigned int dtlk_portlist[] = {0x25e, 0x29e, 0x2de, 0x31e, 0x35e, 0x39e, 0}; static wait_queue_head_t dtlk_process_list; -static DEFINE_TIMER(dtlk_timer, dtlk_timer_tick, 0, 0); +static DEFINE_TIMER(dtlk_timer, dtlk_timer_tick); /* prototypes for file_operations struct */ static ssize_t dtlk_read(struct file *, char __user *, diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index 5406b90bf626..5b8db2ed844d 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -124,7 +124,7 @@ static unsigned long long hangcheck_tsc, hangcheck_tsc_margin; static void hangcheck_fire(unsigned long); -static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire, 0, 0); +static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); static void hangcheck_fire(unsigned long data) { diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c index e6d0d271c58c..44006ed9558f 100644 --- a/drivers/char/nwbutton.c +++ b/drivers/char/nwbutton.c @@ -27,7 +27,7 @@ static void button_sequence_finished (unsigned long parameters); static int button_press_count; /* The count of button presses */ /* Times for the end of a sequence */ -static DEFINE_TIMER(button_timer, button_sequence_finished, 0, 0); +static DEFINE_TIMER(button_timer, button_sequence_finished); static DECLARE_WAIT_QUEUE_HEAD(button_wait_queue); /* Used for blocking read */ static char button_output_buffer[32]; /* Stores data to write out of device */ static int bcount; /* The number of bytes in the buffer */ diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index 974d48927b07..616871e68e09 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -137,7 +137,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rtc_wait); #ifdef RTC_IRQ static void rtc_dropped_irq(unsigned long data); -static DEFINE_TIMER(rtc_irq_timer, rtc_dropped_irq, 0, 0); +static DEFINE_TIMER(rtc_irq_timer, 
rtc_dropped_irq); #endif static ssize_t rtc_read(struct file *file, char __user *buf, diff --git a/drivers/input/touchscreen/s3c2410_ts.c b/drivers/input/touchscreen/s3c2410_ts.c index 3b3db8c868e0..d3265b6b58b8 100644 --- a/drivers/input/touchscreen/s3c2410_ts.c +++ b/drivers/input/touchscreen/s3c2410_ts.c @@ -145,7 +145,7 @@ static void touch_timer_fire(unsigned long data) } } -static DEFINE_TIMER(touch_timer, touch_timer_fire, 0, 0); +static DEFINE_TIMER(touch_timer, touch_timer_fire); /** * stylus_irq - touchscreen stylus event interrupt diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c index 017f48cdcab9..1fcc86fa4e05 100644 --- a/drivers/net/cris/eth_v10.c +++ b/drivers/net/cris/eth_v10.c @@ -165,8 +165,8 @@ static unsigned int network_rec_config_shadow = 0; static unsigned int network_tr_ctrl_shadow = 0; /* Network speed indication. */ -static DEFINE_TIMER(speed_timer, NULL, 0, 0); -static DEFINE_TIMER(clear_led_timer, NULL, 0, 0); +static DEFINE_TIMER(speed_timer, NULL); +static DEFINE_TIMER(clear_led_timer, NULL); static int current_speed; /* Speed read from transceiver */ static int current_speed_selection; /* Speed selected by user */ static unsigned long led_next_time; @@ -174,7 +174,7 @@ static int led_active; static int rx_queue_len; /* Duplex */ -static DEFINE_TIMER(duplex_timer, NULL, 0, 0); +static DEFINE_TIMER(duplex_timer, NULL); static int full_duplex; static enum duplex current_duplex; diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 7a7c5224a336..104f71fa9c5e 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -157,7 +157,7 @@ static struct net_device *yam_devs[NR_PORTS]; static struct yam_mcs *yam_data; -static DEFINE_TIMER(yam_timer, NULL, 0, 0); +static DEFINE_TIMER(yam_timer, NULL); /* --------------------------------------------------------------------- */ diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c index 94bf01f8b2a8..ede89d4ffc88 100644 --- a/drivers/net/wireless/atmel/at76c50x-usb.c +++ b/drivers/net/wireless/atmel/at76c50x-usb.c @@ -519,7 +519,7 @@ exit: /* LED trigger */ static int tx_activity; static void at76_ledtrig_tx_timerfunc(unsigned long data); -static DEFINE_TIMER(ledtrig_tx_timer, at76_ledtrig_tx_timerfunc, 0, 0); +static DEFINE_TIMER(ledtrig_tx_timer, at76_ledtrig_tx_timerfunc); DEFINE_LED_TRIGGER(ledtrig_tx); static void at76_ledtrig_tx_timerfunc(unsigned long data) diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c index 56f7be6af1f6..585925bb49a4 100644 --- a/drivers/staging/speakup/main.c +++ b/drivers/staging/speakup/main.c @@ -1165,7 +1165,7 @@ static const int NUM_CTL_LABELS = (MSG_CTL_END - MSG_CTL_START + 1); static void read_all_doc(struct vc_data *vc); static void cursor_done(u_long data); -static DEFINE_TIMER(cursor_timer, cursor_done, 0, 0); +static DEFINE_TIMER(cursor_timer, cursor_done); static void do_handle_shift(struct vc_data *vc, u_char value, char up_flag) { diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c index a1ca68c76579..6ddd3fc3f08d 100644 --- a/drivers/staging/speakup/synth.c +++ b/drivers/staging/speakup/synth.c @@ -158,7 +158,7 @@ static void thread_wake_up(u_long data) wake_up_interruptible_all(&speakup_event); } -static DEFINE_TIMER(thread_timer, thread_wake_up, 0, 0); +static DEFINE_TIMER(thread_timer, thread_wake_up); void synth_start(void) { diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index d272bc4e7fb5..dac8a1a8e4ac 
100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -283,7 +283,7 @@ static void cyz_poll(unsigned long); /* The Cyclades-Z polling cycle is defined by this variable */ static long cyz_polling_cycle = CZ_DEF_POLL; -static DEFINE_TIMER(cyz_timerlist, cyz_poll, 0, 0); +static DEFINE_TIMER(cyz_timerlist, cyz_poll); #else /* CONFIG_CYZ_INTR */ static void cyz_rx_restart(unsigned long); diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c index 61ecdd6b2fc2..40af32108ff5 100644 --- a/drivers/tty/isicom.c +++ b/drivers/tty/isicom.c @@ -177,7 +177,7 @@ static struct tty_driver *isicom_normal; static void isicom_tx(unsigned long _data); static void isicom_start(struct tty_struct *tty); -static DEFINE_TIMER(tx, isicom_tx, 0, 0); +static DEFINE_TIMER(tx, isicom_tx); /* baud index mappings from linux defns to isi */ diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c index 7f3d4cb0341b..93d37655d928 100644 --- a/drivers/tty/moxa.c +++ b/drivers/tty/moxa.c @@ -428,7 +428,7 @@ static const struct tty_port_operations moxa_port_ops = { }; static struct tty_driver *moxaDriver; -static DEFINE_TIMER(moxaTimer, moxa_poll, 0, 0); +static DEFINE_TIMER(moxaTimer, moxa_poll); /* * HW init diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index 20d79a6007d5..aa695fda1084 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -111,7 +111,7 @@ static struct r_port *rp_table[MAX_RP_PORTS]; /* The main repository of static unsigned int xmit_flags[NUM_BOARDS]; /* Bit significant, indicates port had data to transmit. */ /* eg. Bit 0 indicates port 0 has xmit data, ... */ static atomic_t rp_num_ports_open; /* Number of serial ports open */ -static DEFINE_TIMER(rocket_timer, rp_do_poll, 0, 0); +static DEFINE_TIMER(rocket_timer, rp_do_poll); static unsigned long board1; /* ISA addresses, retrieved from rocketport.conf */ static unsigned long board2; diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index f4166263bb3a..f974d6340d04 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -250,7 +250,7 @@ static void kd_nosound(unsigned long ignored) input_handler_for_each_handle(&kbd_handler, &zero, kd_sound_helper); } -static DEFINE_TIMER(kd_mksound_timer, kd_nosound, 0, 0); +static DEFINE_TIMER(kd_mksound_timer, kd_nosound); void kd_mksound(unsigned int hz, unsigned int ticks) { diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 2ebaba16f785..602d71630952 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -228,7 +228,7 @@ static int scrollback_delta; */ int (*console_blank_hook)(int); -static DEFINE_TIMER(console_timer, blank_screen_t, 0, 0); +static DEFINE_TIMER(console_timer, blank_screen_t); static int blank_state; static int blank_timer_expired; enum { diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c index 3c1f6ac68ea9..18e896eeca62 100644 --- a/drivers/watchdog/alim7101_wdt.c +++ b/drivers/watchdog/alim7101_wdt.c @@ -71,7 +71,7 @@ MODULE_PARM_DESC(use_gpio, "Use the gpio watchdog (required by old cobalt boards)."); static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_ping); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c index 9826b59ef734..8a616a57bb90 100644 --- a/drivers/watchdog/machzwd.c +++ b/drivers/watchdog/machzwd.c @@ -127,7 +127,7 @@ static int zf_action = GEN_RESET; static unsigned long 
zf_is_open; static char zf_expect_close; static DEFINE_SPINLOCK(zf_port_lock); -static DEFINE_TIMER(zf_timer, zf_ping, 0, 0); +static DEFINE_TIMER(zf_timer, zf_ping); static unsigned long next_heartbeat; diff --git a/drivers/watchdog/mixcomwd.c b/drivers/watchdog/mixcomwd.c index be86ea359eee..c9e38096ea91 100644 --- a/drivers/watchdog/mixcomwd.c +++ b/drivers/watchdog/mixcomwd.c @@ -105,7 +105,7 @@ static unsigned long mixcomwd_opened; /* long req'd for setbit --RR */ static int watchdog_port; static int mixcomwd_timer_alive; -static DEFINE_TIMER(mixcomwd_timer, mixcomwd_timerfun, 0, 0); +static DEFINE_TIMER(mixcomwd_timer, mixcomwd_timerfun); static char expect_close; static bool nowayout = WATCHDOG_NOWAYOUT; diff --git a/drivers/watchdog/sbc60xxwdt.c b/drivers/watchdog/sbc60xxwdt.c index 2eef58a0cf05..8d589939bc84 100644 --- a/drivers/watchdog/sbc60xxwdt.c +++ b/drivers/watchdog/sbc60xxwdt.c @@ -113,7 +113,7 @@ MODULE_PARM_DESC(nowayout, __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_ping); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; diff --git a/drivers/watchdog/sc520_wdt.c b/drivers/watchdog/sc520_wdt.c index 1cfd3f6a13d5..3e9bbaa37bf4 100644 --- a/drivers/watchdog/sc520_wdt.c +++ b/drivers/watchdog/sc520_wdt.c @@ -124,7 +124,7 @@ MODULE_PARM_DESC(nowayout, static __u16 __iomem *wdtmrctl; static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_ping); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; diff --git a/drivers/watchdog/via_wdt.c b/drivers/watchdog/via_wdt.c index 5f9cbc37520d..ad3c3be13b40 100644 --- a/drivers/watchdog/via_wdt.c +++ b/drivers/watchdog/via_wdt.c @@ -68,7 +68,7 @@ static struct resource wdt_res; static void __iomem *wdt_mem; static unsigned int mmio; static void wdt_timer_tick(unsigned long data); -static DEFINE_TIMER(timer, wdt_timer_tick, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_tick); /* The timer that pings the watchdog */ static unsigned long next_heartbeat; /* the next_heartbeat for the timer */ diff --git a/drivers/watchdog/w83877f_wdt.c b/drivers/watchdog/w83877f_wdt.c index f0483c75ed32..ba6b680af100 100644 --- a/drivers/watchdog/w83877f_wdt.c +++ b/drivers/watchdog/w83877f_wdt.c @@ -98,7 +98,7 @@ MODULE_PARM_DESC(nowayout, __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); static void wdt_timer_ping(unsigned long); -static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0); +static DEFINE_TIMER(timer, wdt_timer_ping); static unsigned long next_heartbeat; static unsigned long wdt_is_open; static char wdt_expect_close; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 2c6a9114d332..a8721d718186 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -305,7 +305,7 @@ struct deferred_entry { }; static LIST_HEAD(deferred_list); static void gnttab_handle_deferred(unsigned long); -static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred); static void gnttab_handle_deferred(unsigned long unused) { diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 2b21d180157c..ec7199e859d2 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -62,7 +62,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " static int 
pstore_new_entry; static void pstore_timefunc(unsigned long); -static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); +static DEFINE_TIMER(pstore_timer, pstore_timefunc); static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); diff --git a/include/linux/timer.h b/include/linux/timer.h index a33220311361..91e5a2cc81b5 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -73,9 +73,9 @@ struct timer_list { __FILE__ ":" __stringify(__LINE__)) \ } -#define DEFINE_TIMER(_name, _function, _expires, _data) \ +#define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ - __TIMER_INITIALIZER(_function, _expires, _data, 0) + __TIMER_INITIALIZER(_function, 0, 0, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 061ba7eed4ed..c805e8691c22 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -20,7 +20,7 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); -static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); static int irq_poll_cpu; static atomic_t irq_poll_active; diff --git a/lib/random32.c b/lib/random32.c index fa594b1140e6..6e91b75c113f 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -214,7 +214,7 @@ core_initcall(prandom_init); static void __prandom_timer(unsigned long dontcare); -static DEFINE_TIMER(seed_timer, __prandom_timer, 0, 0); +static DEFINE_TIMER(seed_timer, __prandom_timer); static void __prandom_timer(unsigned long dontcare) { diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 5677147209e8..63138c8c2269 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -121,7 +121,7 @@ static struct notifier_block mpoa_notifier = { struct mpoa_client *mpcs = NULL; /* FIXME */ static struct atm_mpoa_qos *qos_head = NULL; -static DEFINE_TIMER(mpc_timer, NULL, 0, 0); +static DEFINE_TIMER(mpc_timer, NULL); static struct mpoa_client *find_mpc_by_itfnum(int itf) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 0bd3afd01dd2..6538632fbd03 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -131,7 +131,7 @@ static struct dn_rt_hash_bucket *dn_rt_hash_table; static unsigned int dn_rt_hash_mask; static struct timer_list dn_route_timer; -static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0); +static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush); int decnet_dst_gc_interval = 2; static struct dst_ops dn_dst_ops = { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 8081bafe441b..b39d0908be2e 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -47,7 +47,7 @@ static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(unsigned long dummy); -static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); +static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index 94d4e922af53..989ae647825e 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -18,7 +18,7 @@ static void nr_loopback_timer(unsigned long); static struct sk_buff_head loopback_queue; -static DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0); +static DEFINE_TIMER(loopback_timer, nr_loopback_timer); void __init 
nr_loopback_init(void) { diff --git a/security/keys/gc.c b/security/keys/gc.c index 87cb260e4890..8673f7f58ead 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -30,7 +30,7 @@ DECLARE_WORK(key_gc_work, key_garbage_collector); * Reaper for links from keyrings to dead keys. */ static void key_gc_timer_func(unsigned long); -static DEFINE_TIMER(key_gc_timer, key_gc_timer_func, 0, 0); +static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); static time_t key_gc_next_run = LONG_MAX; static struct key_type *key_gc_dead_keytype; diff --git a/sound/oss/midibuf.c b/sound/oss/midibuf.c index 701c7625c971..1277df815d5b 100644 --- a/sound/oss/midibuf.c +++ b/sound/oss/midibuf.c @@ -52,7 +52,7 @@ static struct midi_parms parms[MAX_MIDI_DEV]; static void midi_poll(unsigned long dummy); -static DEFINE_TIMER(poll_timer, midi_poll, 0, 0); +static DEFINE_TIMER(poll_timer, midi_poll); static volatile int open_devs; static DEFINE_SPINLOCK(lock); diff --git a/sound/oss/soundcard.c b/sound/oss/soundcard.c index b70c7c8f9c5d..4391062e5cfd 100644 --- a/sound/oss/soundcard.c +++ b/sound/oss/soundcard.c @@ -662,7 +662,7 @@ static void do_sequencer_timer(unsigned long dummy) } -static DEFINE_TIMER(seq_timer, do_sequencer_timer, 0, 0); +static DEFINE_TIMER(seq_timer, do_sequencer_timer); void request_sound_timer(int count) { diff --git a/sound/oss/sys_timer.c b/sound/oss/sys_timer.c index d17019d25b99..8a4b5625dba6 100644 --- a/sound/oss/sys_timer.c +++ b/sound/oss/sys_timer.c @@ -28,7 +28,7 @@ static unsigned long prev_event_time; static void poll_def_tmr(unsigned long dummy); static DEFINE_SPINLOCK(lock); -static DEFINE_TIMER(def_tmr, poll_def_tmr, 0, 0); +static DEFINE_TIMER(def_tmr, poll_def_tmr); static unsigned long tmr2ticks(int tmr_value) diff --git a/sound/oss/uart6850.c b/sound/oss/uart6850.c index eda32d7eddbd..a9d3f7568525 100644 --- a/sound/oss/uart6850.c +++ b/sound/oss/uart6850.c @@ -78,7 +78,7 @@ static void (*midi_input_intr) (int dev, unsigned char data); static void poll_uart6850(unsigned long dummy); -static DEFINE_TIMER(uart6850_timer, poll_uart6850, 0, 0); +static DEFINE_TIMER(uart6850_timer, poll_uart6850); static void uart6850_input_loop(void) { -- cgit v1.2.3 From 8ede369b2cccfa585e2969bbed18edc0e2a18c50 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:05 -0700 Subject: timer: Remove expires argument from __TIMER_INITIALIZER() The expires field is normally initialized during the first mod_timer() call. It was unused by all callers, so remove it from the macro. Signed-off-by: Kees Cook Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-12-git-send-email-keescook@chromium.org Signed-off-by: Thomas Gleixner --- include/linux/kthread.h | 2 +- include/linux/timer.h | 5 ++--- include/linux/workqueue.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 82e197eeac91..0d622b350d3f 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -117,7 +117,7 @@ struct kthread_delayed_work { #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn, \ - 0, (unsigned long)&(dwork), \ + (unsigned long)&(dwork), \ TIMER_IRQSAFE), \ } diff --git a/include/linux/timer.h b/include/linux/timer.h index 91e5a2cc81b5..10685c33e679 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -63,10 +63,9 @@ struct timer_list { #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) -#define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ +#define __TIMER_INITIALIZER(_function, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ - .expires = (_expires), \ .data = (_data), \ .flags = (_flags), \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ @@ -75,7 +74,7 @@ struct timer_list { #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ - __TIMER_INITIALIZER(_function, 0, 0, 0) + __TIMER_INITIALIZER(_function, 0, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1c49431f3121..f4960260feaf 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -176,7 +176,7 @@ struct execute_work { #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn, \ - 0, (unsigned long)&(n), \ + (unsigned long)&(n), \ (tflags) | TIMER_IRQSAFE), \ } -- cgit v1.2.3 From fe5c3b69b540e3387223a696f327c1bb8880d1ac Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:06 -0700 Subject: kthread: Convert callback to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch kthread to use from_timer() and pass the timer pointer explicitly. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Petr Mladek Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-13-git-send-email-keescook@chromium.org --- include/linux/kthread.h | 10 +++++----- kernel/kthread.c | 10 ++++------ 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0d622b350d3f..35cbe3b0ce5b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -75,7 +75,7 @@ extern int tsk_fork_get_node(struct task_struct *tsk); */ struct kthread_work; typedef void (*kthread_work_func_t)(struct kthread_work *work); -void kthread_delayed_work_timer_fn(unsigned long __data); +void kthread_delayed_work_timer_fn(struct timer_list *t); enum { KTW_FREEZABLE = 1 << 0, /* freeze during suspend */ @@ -116,8 +116,8 @@ struct kthread_delayed_work { #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ - .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn, \ - (unsigned long)&(dwork), \ + .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\ + (TIMER_DATA_TYPE)&(dwork.timer), \ TIMER_IRQSAFE), \ } @@ -164,8 +164,8 @@ extern void __kthread_init_worker(struct kthread_worker *worker, do { \ kthread_init_work(&(dwork)->work, (fn)); \ __setup_timer(&(dwork)->timer, \ - kthread_delayed_work_timer_fn, \ - (unsigned long)(dwork), \ + (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\ + (TIMER_DATA_TYPE)&(dwork)->timer, \ TIMER_IRQSAFE); \ } while (0) diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c19edf82427..ba3992c8c375 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -798,15 +798,14 @@ EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. - * @__data: pointer to the data associated with the timer + * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ -void kthread_delayed_work_timer_fn(unsigned long __data) +void kthread_delayed_work_timer_fn(struct timer_list *t) { - struct kthread_delayed_work *dwork = - (struct kthread_delayed_work *)__data; + struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; @@ -837,8 +836,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for -- cgit v1.2.3 From 8c20feb60604d91a29cd7fef8ac758bd92d9fd2c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:07 -0700 Subject: workqueue: Convert callback to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch workqueue to use from_timer() and pass the timer pointer explicitly. 
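For reference, the from_timer() pattern used by this series looks roughly like the sketch below. The structure and names are illustrative only, not taken from the patches; from_timer() is a container_of() wrapper that recovers the object embedding the timer from the timer_list pointer now handed to the callback.

#include <linux/timer.h>

struct my_work {
	struct timer_list timer;	/* timer embedded in its owning object */
	int pending;
};

/* Old-style callback: receives an opaque unsigned long set at init time. */
static void my_timer_fn_old(unsigned long data)
{
	struct my_work *w = (struct my_work *)data;

	w->pending = 0;
}

/* New-style callback: receives the expired timer itself and uses
 * from_timer() to map it back to the embedding structure.
 */
static void my_timer_fn(struct timer_list *t)
{
	struct my_work *w = from_timer(w, t, timer);

	w->pending = 0;
}

During the transition the new-style callback is still registered through the old interface, which is why the hunks above and below cast it with (TIMER_FUNC_TYPE) and pass the timer's own address as the data argument.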
Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-14-git-send-email-keescook@chromium.org --- include/linux/workqueue.h | 15 ++++++++------- kernel/workqueue.c | 7 +++---- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f4960260feaf..f3c47a05fd06 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -17,7 +17,7 @@ struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); -void delayed_work_timer_fn(unsigned long __data); +void delayed_work_timer_fn(struct timer_list *t); /* * The first word is the work queue pointer and the flags rolled into @@ -175,8 +175,8 @@ struct execute_work { #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ - .timer = __TIMER_INITIALIZER(delayed_work_timer_fn, \ - (unsigned long)&(n), \ + .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)delayed_work_timer_fn,\ + (TIMER_DATA_TYPE)&(n.timer), \ (tflags) | TIMER_IRQSAFE), \ } @@ -241,8 +241,9 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ - __setup_timer(&(_work)->timer, delayed_work_timer_fn, \ - (unsigned long)(_work), \ + __setup_timer(&(_work)->timer, \ + (TIMER_FUNC_TYPE)delayed_work_timer_fn, \ + (TIMER_DATA_TYPE)&(_work)->timer, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) @@ -250,8 +251,8 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __setup_timer_on_stack(&(_work)->timer, \ - delayed_work_timer_fn, \ - (unsigned long)(_work), \ + (TIMER_FUNC_TYPE)delayed_work_timer_fn,\ + (TIMER_DATA_TYPE)&(_work)->timer,\ (_tflags) | TIMER_IRQSAFE); \ } while (0) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a5361fc6215d..c77fdf6bf24f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1492,9 +1492,9 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); -void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(struct timer_list *t) { - struct delayed_work *dwork = (struct delayed_work *)__data; + struct delayed_work *dwork = from_timer(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); @@ -1508,8 +1508,7 @@ static void 
__queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); -- cgit v1.2.3 From a060c2104ef83e62346b7e893947a940471c0d7c Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Tue, 26 Sep 2017 12:22:54 +0200 Subject: of/pci: Add of_pci_dma_range_parser_init() for dma-ranges parsing support Several host bridge drivers duplicate of_pci_range_parser_init() in order to parse their dma-ranges property. Provide of_pci_dma_range_parser_init() for that use case. Signed-off-by: Marc Gonzalez Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring Reviewed-by: Linus Walleij --- drivers/of/address.c | 19 ++++++++++++++++--- include/linux/of_address.h | 10 +++++++++- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/address.c b/drivers/of/address.c index 792722e7d458..fa6cabfc3cb9 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -232,8 +232,8 @@ int of_pci_address_to_resource(struct device_node *dev, int bar, } EXPORT_SYMBOL_GPL(of_pci_address_to_resource); -int of_pci_range_parser_init(struct of_pci_range_parser *parser, - struct device_node *node) +static int parser_init(struct of_pci_range_parser *parser, + struct device_node *node, const char *name) { const int na = 3, ns = 2; int rlen; @@ -242,7 +242,7 @@ int of_pci_range_parser_init(struct of_pci_range_parser *parser, parser->pna = of_n_addr_cells(node); parser->np = parser->pna + na + ns; - parser->range = of_get_property(node, "ranges", &rlen); + parser->range = of_get_property(node, name, &rlen); if (parser->range == NULL) return -ENOENT; @@ -250,8 +250,21 @@ int of_pci_range_parser_init(struct of_pci_range_parser *parser, return 0; } + +int of_pci_range_parser_init(struct of_pci_range_parser *parser, + struct device_node *node) +{ + return parser_init(parser, node, "ranges"); +} EXPORT_SYMBOL_GPL(of_pci_range_parser_init); +int of_pci_dma_range_parser_init(struct of_pci_range_parser *parser, + struct device_node *node) +{ + return parser_init(parser, node, "dma-ranges"); +} +EXPORT_SYMBOL_GPL(of_pci_dma_range_parser_init); + struct of_pci_range *of_pci_range_parser_one(struct of_pci_range_parser *parser, struct of_pci_range *range) { diff --git a/include/linux/of_address.h b/include/linux/of_address.h index 37864734ca50..8beed2de98e9 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -49,6 +49,8 @@ extern const __be32 *of_get_address(struct device_node *dev, int index, extern int of_pci_range_parser_init(struct of_pci_range_parser *parser, struct device_node *node); +extern int of_pci_dma_range_parser_init(struct of_pci_range_parser *parser, + struct device_node *node); extern struct of_pci_range *of_pci_range_parser_one( struct of_pci_range_parser *parser, struct of_pci_range *range); @@ -85,7 +87,13 @@ static inline const __be32 *of_get_address(struct device_node *dev, int index, static inline int of_pci_range_parser_init(struct of_pci_range_parser *parser, struct device_node *node) { - return -1; + return -ENOSYS; +} + +static inline int of_pci_dma_range_parser_init(struct of_pci_range_parser *parser, + struct device_node *node) +{ + return -ENOSYS; } static inline struct of_pci_range *of_pci_range_parser_one( -- cgit v1.2.3 From 
753f612471819d3b3abba8c520eb3ce8f9d00fa5 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Tue, 26 Sep 2017 12:53:23 -0500 Subject: PCI: Remove reset argument from pci_iov_{add,remove}_virtfn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "reset" argument passed to pci_iov_add_virtfn() and pci_iov_remove_virtfn() is always zero since 46cb7b1bd86f ("PCI: Remove unused SR-IOV VF Migration support"). Remove the argument together with the associated code. Signed-off-by: Jan H. Schönherr Signed-off-by: Bjorn Helgaas Acked-by: Russell Currey --- arch/powerpc/kernel/eeh_driver.c | 4 ++-- drivers/pci/iov.c | 18 +++++------------- include/linux/pci.h | 8 ++++---- 3 files changed, 11 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 8b840191df59..4e1b433f6cb5 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -441,7 +441,7 @@ static void *eeh_add_virt_device(void *data, void *userdata) } #ifdef CONFIG_PPC_POWERNV - pci_iov_add_virtfn(edev->physfn, pdn->vf_index, 0); + pci_iov_add_virtfn(edev->physfn, pdn->vf_index); #endif return NULL; } @@ -499,7 +499,7 @@ static void *eeh_rmv_device(void *data, void *userdata) #ifdef CONFIG_PPC_POWERNV struct pci_dn *pdn = eeh_dev_to_pdn(edev); - pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0); + pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); edev->pdev = NULL; /* diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index ac41c8be9200..996bf3b3cb70 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -113,7 +113,7 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno) return dev->sriov->barsz[resno - PCI_IOV_RESOURCES]; } -int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset) +int pci_iov_add_virtfn(struct pci_dev *dev, int id) { int i; int rc = -ENOMEM; @@ -157,9 +157,6 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset) BUG_ON(rc); } - if (reset) - __pci_reset_function(virtfn); - pci_device_add(virtfn, virtfn->bus); pci_bus_add_device(virtfn); @@ -187,7 +184,7 @@ failed: return rc; } -void pci_iov_remove_virtfn(struct pci_dev *dev, int id, int reset) +void pci_iov_remove_virtfn(struct pci_dev *dev, int id) { char buf[VIRTFN_ID_LEN]; struct pci_dev *virtfn; @@ -198,11 +195,6 @@ void pci_iov_remove_virtfn(struct pci_dev *dev, int id, int reset) if (!virtfn) return; - if (reset) { - device_release_driver(&virtfn->dev); - __pci_reset_function(virtfn); - } - sprintf(buf, "virtfn%u", id); sysfs_remove_link(&dev->dev.kobj, buf); /* @@ -317,7 +309,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) pci_cfg_access_unlock(dev); for (i = 0; i < initial; i++) { - rc = pci_iov_add_virtfn(dev, i, 0); + rc = pci_iov_add_virtfn(dev, i); if (rc) goto failed; } @@ -329,7 +321,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) failed: while (i--) - pci_iov_remove_virtfn(dev, i, 0); + pci_iov_remove_virtfn(dev, i); err_pcibios: iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); @@ -356,7 +348,7 @@ static void sriov_disable(struct pci_dev *dev) return; for (i = 0; i < iov->num_VFs; i++) - pci_iov_remove_virtfn(dev, i, 0); + pci_iov_remove_virtfn(dev, i); iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); pci_cfg_access_lock(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index f4f8ee5a7362..75ac2af89e87 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1958,8 +1958,8 @@ int 
pci_iov_virtfn_devfn(struct pci_dev *dev, int id); int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); void pci_disable_sriov(struct pci_dev *dev); -int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset); -void pci_iov_remove_virtfn(struct pci_dev *dev, int id, int reset); +int pci_iov_add_virtfn(struct pci_dev *dev, int id); +void pci_iov_remove_virtfn(struct pci_dev *dev, int id); int pci_num_vf(struct pci_dev *dev); int pci_vfs_assigned(struct pci_dev *dev); int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs); @@ -1976,12 +1976,12 @@ static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id) } static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) { return -ENODEV; } -static inline int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset) +static inline int pci_iov_add_virtfn(struct pci_dev *dev, int id) { return -ENOSYS; } static inline void pci_iov_remove_virtfn(struct pci_dev *dev, - int id, int reset) { } + int id) { } static inline void pci_disable_sriov(struct pci_dev *dev) { } static inline int pci_num_vf(struct pci_dev *dev) { return 0; } static inline int pci_vfs_assigned(struct pci_dev *dev) -- cgit v1.2.3 From 79e699b648b93f76d0f6e692499d5c6a2295ef05 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Wed, 6 Sep 2017 01:21:23 +0200 Subject: PCI: Remove unused function __pci_reset_function() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The last caller of __pci_reset_function() has been removed. Remove the function as well. Signed-off-by: Jan H. Schönherr Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 35 +++-------------------------------- include/linux/pci.h | 1 - 2 files changed, 3 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6078dfc11b11..f0d68066c726 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4145,35 +4145,6 @@ static void pci_dev_restore(struct pci_dev *dev) err_handler->reset_done(dev); } -/** - * __pci_reset_function - reset a PCI device function - * @dev: PCI device to reset - * - * Some devices allow an individual function to be reset without affecting - * other functions in the same device. The PCI device must be responsive - * to PCI config space in order to use this function. - * - * The device function is presumed to be unused when this function is called. - * Resetting the device will make the contents of PCI configuration space - * random, so any caller of this must be prepared to reinitialise the - * device including MSI, bus mastering, BARs, decoding IO and memory spaces, - * etc. - * - * Returns 0 if the device function was successfully reset or negative if the - * device doesn't support resetting a single function. - */ -int __pci_reset_function(struct pci_dev *dev) -{ - int ret; - - pci_dev_lock(dev); - ret = __pci_reset_function_locked(dev); - pci_dev_unlock(dev); - - return ret; -} -EXPORT_SYMBOL_GPL(__pci_reset_function); - /** * __pci_reset_function_locked - reset a PCI device function while holding * the @dev mutex lock. @@ -4264,8 +4235,8 @@ int pci_probe_reset_function(struct pci_dev *dev) * * This function does not just reset the PCI portion of a device, but * clears all the state associated with the device. This function differs - * from __pci_reset_function in that it saves and restores device state - * over the reset. + * from __pci_reset_function_locked() in that it saves and restores device state + * over the reset and takes the PCI device lock. 
* * Returns 0 if the device function was successfully reset or negative if the * device doesn't support resetting a single function. @@ -4300,7 +4271,7 @@ EXPORT_SYMBOL_GPL(pci_reset_function); * * This function does not just reset the PCI portion of a device, but * clears all the state associated with the device. This function differs - * from __pci_reset_function() in that it saves and restores device state + * from __pci_reset_function_locked() in that it saves and restores device state * over the reset. It also differs from pci_reset_function() in that it * requires the PCI device lock to be held. * diff --git a/include/linux/pci.h b/include/linux/pci.h index 75ac2af89e87..fb29d964a057 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1089,7 +1089,6 @@ int pcie_set_mps(struct pci_dev *dev, int mps); int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void pcie_flr(struct pci_dev *dev); -int __pci_reset_function(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); int pci_reset_function_locked(struct pci_dev *dev); -- cgit v1.2.3 From 5fdee2127faa77c9c91862ad5e001dfab7013e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 21:22:52 +0200 Subject: block: remove QUEUE_FLAG_STACKABLE We already have a queue_is_rq_based helper to check if a request_queue is request based, so we can remove the flag for it. Acked-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/elevator.c | 2 +- drivers/md/dm-rq.c | 2 +- drivers/md/dm-table.c | 15 +-------------- drivers/md/dm.c | 11 ----------- include/linux/blkdev.h | 5 ----- 6 files changed, 3 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 980e73095643..7f4a1ba532af 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(STACKABLE), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(DISCARD), diff --git a/block/elevator.c b/block/elevator.c index 153926a90901..7ae50eb2732b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1118,7 +1118,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!blk_queue_stackable(q)) + if (!queue_is_rq_based(q)) return sprintf(name, "none\n"); if (!q->elevator) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106ff..9d32f25489c2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return blk_queue_stackable(md->queue); + return queue_is_rq_based(md->queue); } static void dm_old_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ef7b8f201f73..75281828f2cb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1000,7 +1000,7 @@ verify_rq_based: list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - if (!blk_queue_stackable(q)) { + if (!queue_is_rq_based(q)) { DMERR("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct 
request_queue *q, */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. - */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5..8d07ad61221c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,17 +1612,6 @@ static void dm_wq_work(struct work_struct *work); void dm_init_md_queue(struct mapped_device *md) { - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device may not have been decided yet. - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - /* * Initialize data that will only be used by a non-blk-mq DM queue * - must do so here (in alloc_dev callchain) before queue is used diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02fa42d24b52..9fb71fc7d0e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ @@ -633,12 +632,10 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) @@ -722,8 +719,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_stackable(q) \ - test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) -- cgit v1.2.3 From 3e234289f86b12985ef8909cd34525fcb66c4efb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 18:00:22 -0500 Subject: ftrace: Allow module init functions to be traced Allow for module init sections to be traced as well as core kernel init sections. 
Now that function filters for modules can be stored and applied when the modules are loaded, it makes sense to be able to trace them. Cc: Jessica Yu Cc: Rusty Russell Signed-off-by: Steven Rostedt (VMware) --- include/linux/init.h | 4 +--- kernel/module.c | 2 ++ kernel/trace/ftrace.c | 6 ++++-- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 94769d687cf0..a779c1816437 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -39,7 +39,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold __inittrace __latent_entropy +#define __init __section(.init.text) __cold __latent_entropy #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) @@ -68,10 +68,8 @@ #ifdef MODULE #define __exitused -#define __inittrace notrace #else #define __exitused __used -#define __inittrace #endif #define __exit __section(.exit.text) __exitused __cold notrace diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..58bca427ac3f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3473,6 +3473,8 @@ static noinline int do_init_module(struct module *mod) if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) async_synchronize_full(); + ftrace_free_mem(mod->init_layout.base, mod->init_layout.base + + mod->init_layout.size); mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84cb5928665a..d7297e866e4a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5752,7 +5752,8 @@ void ftrace_release_mod(struct module *mod) last_pg = &ftrace_pages_start; for (pg = ftrace_pages_start; pg; pg = *last_pg) { rec = &pg->records[0]; - if (within_module_core(rec->ip, mod)) { + if (within_module_core(rec->ip, mod) || + within_module_init(rec->ip, mod)) { /* * As core pages are first, the first * page should never be a module page. @@ -5821,7 +5822,8 @@ void ftrace_module_enable(struct module *mod) * not part of this module, then skip this pg, * which the "break" will do. */ - if (!within_module_core(rec->ip, mod)) + if (!within_module_core(rec->ip, mod) && + !within_module_init(rec->ip, mod)) break; cnt = 0; -- cgit v1.2.3 From aba4b5c22cbac296f4081a0476d0c55828f135b4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 08:35:38 -0400 Subject: ftrace: Save module init functions kallsyms symbols for tracing If function tracing is active when the module init functions are freed, then store them to be referenced by kallsyms. As module init functions can now be traced on module load but their symbols are freed afterwards, the trace output was useless: ># echo ':mod:snd_seq' > set_ftrace_filter ># echo function > current_tracer ># modprobe snd_seq ># cat trace # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | modprobe-2786 [000] .... 3189.037874: 0xffffffffa0860000 <-do_one_initcall modprobe-2786 [000] .... 3189.037876: 0xffffffffa086004d <-0xffffffffa086000f modprobe-2786 [000] .... 3189.037876: 0xffffffffa086010d <-0xffffffffa0860018 modprobe-2786 [000] .... 3189.037877: 0xffffffffa086011a <-0xffffffffa0860021 modprobe-2786 [000] .... 3189.037877: 0xffffffffa0860080 <-0xffffffffa086002a modprobe-2786 [000] .... 
3189.039523: 0xffffffffa0860400 <-0xffffffffa0860033 modprobe-2786 [000] .... 3189.039523: 0xffffffffa086038a <-0xffffffffa086041c modprobe-2786 [000] .... 3189.039591: 0xffffffffa086038a <-0xffffffffa0860436 modprobe-2786 [000] .... 3189.039657: 0xffffffffa086038a <-0xffffffffa0860450 modprobe-2786 [000] .... 3189.039719: 0xffffffffa0860127 <-0xffffffffa086003c modprobe-2786 [000] .... 3189.039742: snd_seq_create_kernel_client <-0xffffffffa08601f6 When the output is shown, the kallsyms for the module init functions have already been freed, and the output of the trace can not convert them to their function names. Now this looks like this: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | modprobe-2463 [002] .... 174.243237: alsa_seq_init <-do_one_initcall modprobe-2463 [002] .... 174.243239: client_init_data <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_sequencer_memory_init <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_seq_queues_init <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_sequencer_device_init <-alsa_seq_init modprobe-2463 [002] .... 174.244860: snd_seq_info_init <-alsa_seq_init modprobe-2463 [002] .... 174.244861: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.244936: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.245003: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.245072: snd_seq_system_client_init <-alsa_seq_init modprobe-2463 [002] .... 174.245094: snd_seq_create_kernel_client <-snd_seq_system_client_init Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 20 ++++++- kernel/kallsyms.c | 5 ++ kernel/module.c | 2 +- kernel/trace/ftrace.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 168 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 47fc404ad233..202b40784c4e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -51,6 +51,21 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ + defined(CONFIG_DYNAMIC_FTRACE) +const char * +ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym); +#else +static inline const char * +ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + return NULL; +} +#endif + + #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -151,10 +166,10 @@ struct ftrace_ops_hash { }; void ftrace_free_init_mem(void); -void ftrace_free_mem(void *start, void *end); +void ftrace_free_mem(struct module *mod, void *start, void *end); #else static inline void ftrace_free_init_mem(void) { } -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } #endif /* @@ -272,6 +287,7 @@ static inline int ftrace_nr_registered_ops(void) static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } +static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_STACK_TRACER diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 127e7cfafa55..976ecb9275d9 
100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr, if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); + + if (!ret) + ret = ftrace_mod_address_lookup(addr, symbolsize, + offset, modname, namebuf); return ret; } diff --git a/kernel/module.c b/kernel/module.c index 58bca427ac3f..279a469dc375 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3473,7 +3473,7 @@ static noinline int do_init_module(struct module *mod) if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) async_synchronize_full(); - ftrace_free_mem(mod->init_layout.base, mod->init_layout.base + + ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + mod->init_layout.size); mutex_lock(&module_mutex); /* Drop initial reference. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d7297e866e4a..86dbbfb353db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5675,6 +5675,21 @@ static int ftrace_process_locs(struct module *mod, return ret; } +struct ftrace_mod_func { + struct list_head list; + char *name; + unsigned long ip; + unsigned int size; +}; + +struct ftrace_mod_map { + struct list_head list; + struct module *mod; + unsigned long start_addr; + unsigned long end_addr; + struct list_head funcs; +}; + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -5868,9 +5883,123 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } + +static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, + struct dyn_ftrace *rec) +{ + struct ftrace_mod_func *mod_func; + unsigned long symsize; + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str); + if (!ret) + return; + + mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL); + if (!mod_func) + return; + + mod_func->name = kstrdup(str, GFP_KERNEL); + if (!mod_func->name) { + kfree(mod_func); + return; + } + + mod_func->ip = rec->ip - offset; + mod_func->size = symsize; + + list_add_rcu(&mod_func->list, &mod_map->funcs); +} + +static LIST_HEAD(ftrace_mod_maps); + +static struct ftrace_mod_map * +allocate_ftrace_mod_map(struct module *mod, + unsigned long start, unsigned long end) +{ + struct ftrace_mod_map *mod_map; + + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); + if (!mod_map) + return NULL; + + mod_map->mod = mod; + mod_map->start_addr = start; + mod_map->end_addr = end; + + INIT_LIST_HEAD_RCU(&mod_map->funcs); + + list_add_rcu(&mod_map->list, &ftrace_mod_maps); + + return mod_map; +} + +static const char * +ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, + unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + struct ftrace_mod_func *found_func = NULL; + struct ftrace_mod_func *mod_func; + + list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) { + if (addr >= mod_func->ip && + addr < mod_func->ip + mod_func->size) { + found_func = mod_func; + break; + } + } + + if (found_func) { + if (size) + *size = found_func->size; + if (off) + *off = addr - found_func->ip; + if (sym) + strlcpy(sym, found_func->name, KSYM_NAME_LEN); + + return found_func->name; + } + + return NULL; +} + +const char * +ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char 
**modname, char *sym) +{ + struct ftrace_mod_map *mod_map; + const char *ret = NULL; + + preempt_disable(); + list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { + ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); + if (ret) { + if (modname) + *modname = mod_map->mod->name; + break; + } + } + preempt_enable(); + + return ret; +} + +#else +static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, + struct dyn_ftrace *rec) { } +static inline struct ftrace_mod_map * +allocate_ftrace_mod_map(struct module *mod, + unsigned long start, unsigned long end) +{ + return NULL; +} #endif /* CONFIG_MODULES */ -void ftrace_free_mem(void *start_ptr, void *end_ptr) +void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) { unsigned long start = (unsigned long)(start_ptr); unsigned long end = (unsigned long)(end_ptr); @@ -5878,6 +6007,7 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; + struct ftrace_mod_map *mod_map = NULL; int order; key.ip = start; @@ -5885,6 +6015,14 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) mutex_lock(&ftrace_lock); + /* + * If we are freeing module init memory, then check if + * any tracer is active. If so, we need to save a mapping of + * the module functions being freed with the address. + */ + if (mod && ftrace_ops_list != &ftrace_list_end) + mod_map = allocate_ftrace_mod_map(mod, start, end); + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { if (end < pg->records[0].ip || start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) @@ -5895,6 +6033,10 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) ftrace_cmp_recs); if (!rec) continue; + + if (mod_map) + save_ftrace_mod_rec(mod_map, rec); + pg->index--; ftrace_update_tot_cnt--; if (!pg->index) { @@ -5920,7 +6062,7 @@ void __init ftrace_free_init_mem(void) void *start = (void *)(&__init_begin); void *end = (void *)(&__init_end); - ftrace_free_mem(start, end); + ftrace_free_mem(NULL, start, end); } void __init ftrace_init(void) -- cgit v1.2.3 From 6171a0310a06a7a0cb83713fa7068bdd4192de19 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 6 Sep 2017 08:40:41 -0400 Subject: ftrace/kallsyms: Have /proc/kallsyms show saved mod init functions If a module is loaded while tracing is enabled, then there's a possibility that the module init functions were traced. These functions have their name and address stored by ftrace such that it can translate the function address that is written into the buffer into a human readable function name. As userspace tools may be doing the same, they need a way to map function names to their address as well. This is done through reading /proc/kallsyms. 
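As a rough userspace illustration (not part of this patch), an address seen in the trace buffer can be resolved by scanning /proc/kallsyms, whose lines have the form "<address> <type> <name> [module]". The helper below is a hypothetical sketch that does an exact-address match to keep things short; real tools typically sort the symbols and pick the nearest one at or below the address.

#include <stdio.h>

/* Hypothetical helper: scan /proc/kallsyms for an exact address match
 * and copy the symbol name into 'name'.  Returns 0 on success.
 */
static int kallsyms_resolve(unsigned long long target, char *name, size_t len)
{
	char line[512], type, sym[256];
	unsigned long long addr;
	FILE *f = fopen("/proc/kallsyms", "r");

	if (!f)
		return -1;

	while (fgets(line, sizeof(line), f)) {
		/* each line: "<hex address> <type char> <symbol> [module]" */
		if (sscanf(line, "%llx %c %255s", &addr, &type, sym) != 3)
			continue;
		if (addr == target) {
			snprintf(name, len, "%s", sym);
			fclose(f);
			return 0;
		}
	}
	fclose(f);
	return -1;
}

With this change, such a scan keeps working for already-freed module init functions, because ftrace feeds its saved symbols into the /proc/kallsyms iterator, as the diff below shows.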
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 9 +++++++++ kernel/kallsyms.c | 38 ++++++++++++++++++++++++++++++++------ kernel/trace/ftrace.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 202b40784c4e..346f8294e40a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -56,6 +56,9 @@ struct ftrace_hash; const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym); +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported); #else static inline const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, @@ -63,6 +66,12 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, { return NULL; } +static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported) +{ + return -1; +} #endif diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 976ecb9275d9..1966fe1c2b57 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -479,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol); struct kallsym_iter { loff_t pos; loff_t pos_mod_end; + loff_t pos_ftrace_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -501,11 +502,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) +{ + int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + if (ret < 0) { + iter->pos_ftrace_mod_end = iter->pos; + return 0; + } + + return 1; +} + static int get_ksymbol_bpf(struct kallsym_iter *iter) { iter->module_name[0] = '\0'; iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_mod_end, + return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, iter->name) < 0 ? 
0 : 1; } @@ -530,20 +545,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; - if (new_pos == 0) + if (new_pos == 0) { iter->pos_mod_end = 0; + iter->pos_ftrace_mod_end = 0; + } } static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; - if (iter->pos_mod_end > 0 && - iter->pos_mod_end < iter->pos) + if (iter->pos_ftrace_mod_end > 0 && + iter->pos_ftrace_mod_end < iter->pos) return get_ksymbol_bpf(iter); - if (!get_ksymbol_mod(iter)) - return get_ksymbol_bpf(iter); + if (iter->pos_mod_end > 0 && + iter->pos_mod_end < iter->pos) { + if (!get_ksymbol_ftrace_mod(iter)) + return get_ksymbol_bpf(iter); + return 1; + } + + if (!get_ksymbol_mod(iter)) { + if (!get_ksymbol_ftrace_mod(iter)) + return get_ksymbol_bpf(iter); + } return 1; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a5824408bed9..9e99bd55732e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5689,6 +5689,7 @@ struct ftrace_mod_map { unsigned long start_addr; unsigned long end_addr; struct list_head funcs; + unsigned int num_funcs; }; #ifdef CONFIG_MODULES @@ -5940,6 +5941,8 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, mod_func->ip = rec->ip - offset; mod_func->size = symsize; + mod_map->num_funcs++; + list_add_rcu(&mod_func->list, &mod_map->funcs); } @@ -5956,6 +5959,7 @@ allocate_ftrace_mod_map(struct module *mod, mod_map->mod = mod; mod_map->start_addr = start; mod_map->end_addr = end; + mod_map->num_funcs = 0; INIT_LIST_HEAD_RCU(&mod_map->funcs); @@ -6016,6 +6020,42 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, return ret; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported) +{ + struct ftrace_mod_map *mod_map; + struct ftrace_mod_func *mod_func; + + preempt_disable(); + list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { + + if (symnum >= mod_map->num_funcs) { + symnum -= mod_map->num_funcs; + continue; + } + + list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) { + if (symnum > 1) { + symnum--; + continue; + } + + *value = mod_func->ip; + *type = 'T'; + strlcpy(name, mod_func->name, KSYM_NAME_LEN); + strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); + *exported = 1; + preempt_enable(); + return 0; + } + WARN_ON(1); + break; + } + preempt_enable(); + return -ERANGE; +} + #else static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, struct dyn_ftrace *rec) { } -- cgit v1.2.3 From e2080072ed2d98a55ae69d95dea60ff7a17cddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Oct 2017 12:59:58 -0700 Subject: tcp: new list for sent but unacked skbs for RACK recovery This patch adds a new queue (list) that tracks the sent but not yet acked or SACKed skbs for a TCP connection. The list is chronologically ordered by skb->skb_mstamp (the head is the oldest sent skb). This list will be used to optimize TCP RACK recovery, which checks an skb's timestamp to judge if it has been lost and needs to be retransmitted. Since the TCP write queue is ordered by sequence instead of sent time, RACK has to scan over the write queue to catch all eligible packets to detect lost retransmission, and iterate through SACKed skbs repeatedly. Special care for rare events: 1. TCP repair fakes skb transmission so the send queue needs to be adjusted 2. SACK reneging would require re-inserting SACKed skbs into the send queue.
For now I believe it's not worth the complexity to make RACK work perfectly on SACK reneging, so we do nothing here. 3. Fast Open: currently for non-TFO, send-queue correctly queues the pure SYN packet. For TFO which queues a pure SYN and then a data packet, send-queue only queues the data packet but not the pure SYN due to the structure of TFO code. This is okay because the SYN receiver would never respond with a SACK on a missing SYN (i.e. SYN is never fast-retransmitted by SACK/RACK). In order to not grow sk_buff, we use a union for the new list and _skb_refdst/destructor fields. This is a bit complicated because we need to make sure _skb_refdst and destructor are properly zeroed before skb is cloned/copied at transmit, and before being freed. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++-- include/linux/tcp.h | 1 + include/net/tcp.h | 24 +++++++++++++++++++++++- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 9 +++++++-- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 42 +++++++++++++++++++++++++++++++----------- 7 files changed, 74 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ada821466e88..01a985937867 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t; * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function + * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on @@ -686,8 +687,14 @@ struct sk_buff { */ char cb[48] __aligned(8); - unsigned long _skb_refdst; - void (*destructor)(struct sk_buff *skb); + union { + struct { + unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); + }; + struct list_head tcp_tsorted_anchor; + }; + #ifdef CONFIG_XFRM struct sec_path *sp; #endif diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4aa40ef02d32..1d2c44e09e31 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -191,6 +191,7 @@ struct tcp_sock { u32 tsoffset; /* timestamp offset */ struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ + struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 426c2e986016..3b16f353b539 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1589,14 +1589,34 @@ enum tcp_chrono { void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); +/* This helper is needed, because skb->tcp_tsorted_anchor uses + * the same memory storage than skb->destructor/_skb_refdst + */ +static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) +{ + skb->destructor = NULL; + skb->_skb_refdst = 0UL; +} + +#define tcp_skb_tsorted_save(skb) { \ + unsigned long _save = skb->_skb_refdst; \ + skb->_skb_refdst = 0UL; + +#define tcp_skb_tsorted_restore(skb) \ + skb->_skb_refdst = _save; \ +} + /* write queue abstraction */ static inline void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - while ((skb =
__skb_dequeue(&sk->sk_write_queue)) != NULL) + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); sk_wmem_free_skb(sk, skb); + } + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); } @@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new, static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { + list_del(&skb->tcp_tsorted_anchor); + tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c115e37ca608..8cf742fd4f99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -881,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5b8d61846c2..fb0d7ed84b94 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. If our packet diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..2341b9f857b6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc9e46a5369..8162e2880178 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
@@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + if (!err) - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -3023,6 +3037,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ @@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk) } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_unlink_write_queue(skb, sk); __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); -- cgit v1.2.3 From ec154bf56b276a0bb36079a5d22a267b5f417801 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 6 Oct 2017 15:00:53 +0200 Subject: iommu/vt-d: Don't register bus-notifier under dmar_global_lock The notifier function will take the dmar_global_lock too, so lockdep complains about inverse locking order when the notifier is registered under the dmar_global_lock. 
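The deadlock being avoided is a plain ABBA inversion: the old code took the notifier chain's rwsem (inside bus_register_notifier()) while already holding dmar_global_lock, whereas the notifier callback takes dmar_global_lock while the notifier core holds the chain's rwsem. A schematic sketch of that pattern, with invented lock names standing in for the real ones:

#include <linux/rwsem.h>

static DECLARE_RWSEM(lock_a);	/* stands in for dmar_global_lock */
static DECLARE_RWSEM(lock_b);	/* stands in for the notifier chain's rwsem */

static void register_path(void)	/* old code: notifier registered under A */
{
	down_write(&lock_a);
	down_write(&lock_b);	/* order A -> B */
	up_write(&lock_b);
	up_write(&lock_a);
}

static void notifier_path(void)	/* callback runs with the chain lock held */
{
	down_write(&lock_b);
	down_write(&lock_a);	/* order B -> A: lockdep flags the cycle */
	up_write(&lock_a);
	up_write(&lock_b);
}

Registering the notifier outside dmar_global_lock removes the A -> B leg, so only one lock order remains.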
Reported-by: Jan Kiszka Fixes: 59ce0515cdaf ('iommu/vt-d: Update DRHD/RMRR/ATSR device scope caches when PCI hotplug happens') Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 7 +++++-- drivers/iommu/intel-iommu.c | 10 ++++++++++ include/linux/dmar.h | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 57c920c1372d..1ea7cd537873 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -801,13 +801,16 @@ int __init dmar_dev_scope_init(void) dmar_free_pci_notify_info(info); } } - - bus_register_notifier(&pci_bus_type, &dmar_pci_bus_nb); } return dmar_dev_scope_status; } +void dmar_register_bus_notifier(void) +{ + bus_register_notifier(&pci_bus_type, &dmar_pci_bus_nb); +} + int __init dmar_table_init(void) { diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 6784a05dd6b2..934cef924461 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4752,6 +4752,16 @@ int __init intel_iommu_init(void) goto out_free_dmar; } + up_write(&dmar_global_lock); + + /* + * The bus notifier takes the dmar_global_lock, so lockdep will + * complain later when we register it under the lock. + */ + dmar_register_bus_notifier(); + + down_write(&dmar_global_lock); + if (no_iommu || dmar_disabled) { /* * We exit the function here to ensure IOMMU's remapping and diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e8ffba1052d3..e2433bc50210 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -112,6 +112,7 @@ static inline bool dmar_rcu_check(void) extern int dmar_table_init(void); extern int dmar_dev_scope_init(void); +extern void dmar_register_bus_notifier(void); extern int dmar_parse_dev_scope(void *start, void *end, int *cnt, struct dmar_dev_scope **devices, u16 segment); extern void *dmar_alloc_dev_scope(void *start, void *end, int *cnt); -- cgit v1.2.3 From 775d3a35dc3e13de55ec0e061c59e36faa7dd7f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Oct 2017 08:15:15 -0600 Subject: backing-dev: kill unused pdflush_proc_obsolete() After commit b35bd0d9f8a8, pdflush_proc_obsolete() is no longer used. Kill the function and declaration. 
Reported-by: Rakesh Pandit Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 -- mm/backing-dev.c | 20 -------------------- 2 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 157e950a70dc..872afa41abc2 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -172,8 +172,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e19606bb41a0..74b52dfd5852 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1072,23 +1072,3 @@ out: return ret; } EXPORT_SYMBOL(wait_iff_congested); - -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - char kbuf[] = "0\n"; - - if (*ppos || *lenp < sizeof(kbuf)) { - *lenp = 0; - return 0; - } - - if (copy_to_user(buffer, kbuf, sizeof(kbuf))) - return -EFAULT; - pr_warn_once("%s exported in /proc is scheduled for removal\n", - table->procname); - - *lenp = 2; - *ppos += *lenp; - return 2; -} -- cgit v1.2.3 From 878e832ade6fc315e6ea59d95824bbb0430c6e8d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 6 Oct 2017 11:19:24 -0400 Subject: acct.h: get rid of detritus unused forward declarations of structs Signed-off-by: Al Viro --- include/linux/acct.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acct.h b/include/linux/acct.h index dccc2d4fe7de..3912a45b8c89 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -19,9 +19,6 @@ #ifdef CONFIG_BSD_PROCESS_ACCT -struct vfsmount; -struct super_block; -struct pacct_struct; struct pid_namespace; extern int acct_parm[]; /* for sysctl */ extern void acct_collect(long exitcode, int group_dead); -- cgit v1.2.3 From 55edd1dad96b760c87f5b9e2c461ba551a625f44 Mon Sep 17 00:00:00 2001 From: David Lin Date: Wed, 13 Sep 2017 10:53:58 -0700 Subject: leds: Replace flags bit shift with BIT() macros This is for readability as well as to avoid checkpatch warnings when adding new bit flag information in the future. 
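For context, BIT() was defined in <linux/bitops.h> at the time as a shift on an unsigned long, so the conversion below is value-preserving (modulo type width). A small before/after sketch, with the flag name invented for illustration:

#define BIT(nr)			(1UL << (nr))	/* as defined by the kernel */

/* Before: raw shift, easy to mistype and flagged by checkpatch. */
#define LED_EXAMPLE_FLAG_OLD	(1 << 5)

/* After: same bit, clearer intent, unsigned long width. */
#define LED_EXAMPLE_FLAG_NEW	BIT(5)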
Signed-off-by: David Lin Signed-off-by: Jacek Anaszewski --- include/linux/leds.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index bf6db4fe895b..5579c64c8fd6 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -40,16 +40,16 @@ struct led_classdev { int flags; /* Lower 16 bits reflect status */ -#define LED_SUSPENDED (1 << 0) -#define LED_UNREGISTERING (1 << 1) +#define LED_SUSPENDED BIT(0) +#define LED_UNREGISTERING BIT(1) /* Upper 16 bits reflect control information */ -#define LED_CORE_SUSPENDRESUME (1 << 16) -#define LED_SYSFS_DISABLE (1 << 17) -#define LED_DEV_CAP_FLASH (1 << 18) -#define LED_HW_PLUGGABLE (1 << 19) -#define LED_PANIC_INDICATOR (1 << 20) -#define LED_BRIGHT_HW_CHANGED (1 << 21) -#define LED_RETAIN_AT_SHUTDOWN (1 << 22) +#define LED_CORE_SUSPENDRESUME BIT(16) +#define LED_SYSFS_DISABLE BIT(17) +#define LED_DEV_CAP_FLASH BIT(18) +#define LED_HW_PLUGGABLE BIT(19) +#define LED_PANIC_INDICATOR BIT(20) +#define LED_BRIGHT_HW_CHANGED BIT(21) +#define LED_RETAIN_AT_SHUTDOWN BIT(22) /* set_brightness_work / blink_timer flags, atomic, private. */ unsigned long work_flags; -- cgit v1.2.3 From 18a4c0eab2623cc95be98a1e6af1ad18e7695977 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:21 -0700 Subject: net: add rb_to_skb() and other rb tree helpers Generalize the private netem_rb_to_skb() helper. The TCP rtx queue will soon be converted to an rb-tree, so we will need skb_rbtree_walk() helpers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++++++++++ net/ipv4/tcp_fastopen.c | 8 +++----- net/ipv4/tcp_input.c | 33 ++++++++++++--------------------- net/sched/sch_netem.c | 14 ++++---------- 4 files changed, 37 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 01a985937867..03634ec2f918 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) return __skb_grow(skb, len); } +#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) +#define skb_rb_first(root) rb_to_skb(rb_first(root)) +#define skb_rb_last(root) rb_to_skb(rb_last(root)) +#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) +#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) +#define skb_rbtree_walk(skb, root) \ + for (skb = skb_rb_first(root); skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from(skb) \ + for (; skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from_safe(skb, tmp) \ + for (; tmp = skb ?
skb_rb_next(skb) : NULL, (skb != NULL); \ + skb = tmp) + #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 29fff14d5a53..7ee4aadcdd71 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -465,17 +465,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node *p; - struct sk_buff *skb; struct dst_entry *dst; + struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { - p = rb_first(&tp->out_of_order_queue); - if (p && !rb_next(p)) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); + if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb0d7ed84b94..90afe4143596 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4335,7 +4335,7 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_first(&tp->out_of_order_queue); while (p) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4399,7 +4399,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node **p, *q, *parent; + struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; @@ -4458,7 +4458,7 @@ coalesce_done: parent = NULL; while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; @@ -4503,9 +4503,7 @@ insert: merge_right: /* Remove other segments covered by skb. */ - while ((q = rb_next(&skb->rbnode)) != NULL) { - skb1 = rb_entry(q, struct sk_buff, rbnode); - + while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4520,7 +4518,7 @@ merge_right: tcp_drop(sk, skb1); } /* If there is no skb after us, we are the last_skb ! */ - if (!q) + if (!skb1) tp->ooo_last_skb = skb; add_sack: @@ -4706,7 +4704,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; - return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); + return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4735,7 +4733,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else @@ -4854,26 +4852,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; - struct rb_node *p; u32 start, end; - p = rb_first(&tp->out_of_order_queue); - skb = rb_entry_safe(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { - p = rb_last(&tp->out_of_order_queue); - /* Note: This is possible p is NULL here. We do not - * use rb_entry_safe(), as ooo_last_skb is valid only - * if rbtree is not empty. 
- */ - tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; for (head = skb;;) { - skb = tcp_skb_next(skb, NULL); + skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. @@ -4916,14 +4907,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); + tcp_drop(sk, rb_to_skb(node)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; node = prev; } while (node); - tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + tp->ooo_last_skb = rb_to_skb(prev); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5a4f10080290..db0228a65e8c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -148,12 +148,6 @@ struct netem_skb_cb { psched_time_t time_to_send; }; - -static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) -{ - return rb_entry(rb, struct sk_buff, rbnode); -} - static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ @@ -364,7 +358,7 @@ static void tfifo_reset(struct Qdisc *sch) struct rb_node *p = rb_first(&q->t_root); while (p) { - struct sk_buff *skb = netem_rb_to_skb(p); + struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); @@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) struct sk_buff *skb; parent = *p; - skb = netem_rb_to_skb(parent); + skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else @@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *t_skb; struct netem_skb_cb *t_last; - t_skb = netem_rb_to_skb(rb_last(&q->t_root)); + t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || t_last->time_to_send > last->time_to_send) { @@ -617,7 +611,7 @@ deliver: if (p) { psched_time_t time_to_send; - skb = netem_rb_to_skb(p); + skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; -- cgit v1.2.3 From cf5868c8a22dc2854b96e9569064bb92365549ca Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 8 Sep 2017 20:57:10 +0200 Subject: padata: ensure the reorder timer callback runs on the correct CPU The reorder timer function runs on the CPU where the timer interrupt was handled, which is not necessarily one of the CPUs of the 'pcpu' CPU mask set. Ensure the padata_reorder() callback runs on the correct CPU, which is one in the 'pcpu' CPU mask set and, preferably, the next expected one. Do so by comparing the current CPU with the expected target CPU. If they match, call padata_reorder() right away. If they differ, schedule a work item on the target CPU that does the padata_reorder() call for us.
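Distilled to its core (with invented names; the real code below additionally derives the target CPU from pd->processed), the pattern is: compare the current CPU to the target, run inline on a match, otherwise bounce through a work item bound to the target CPU:

#include <linux/smp.h>
#include <linux/workqueue.h>

/* Sketch only: run 'cb' on 'target_cpu', inline if we are already there,
 * otherwise via a work item queued on that CPU.
 */
static void run_on_target_cpu(int target_cpu, struct workqueue_struct *wq,
			      struct work_struct *work, void (*cb)(void *),
			      void *arg)
{
	int cpu = get_cpu();	/* disable preemption while we compare */

	if (cpu == target_cpu)
		cb(arg);	/* already on the right CPU */
	else
		queue_work_on(target_cpu, wq, work);	/* defer to a worker there */

	put_cpu();
}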
Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- include/linux/padata.h | 2 ++ kernel/padata.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index 2f9c1f93b1ce..5c0175bbc179 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -85,6 +85,7 @@ struct padata_serial_queue { * @swork: work struct for serialization. * @pd: Backpointer to the internal control structure. * @work: work struct for parallelization. + * @reorder_work: work struct for reordering. * @num_obj: Number of objects that are processed by this cpu. * @cpu_index: Index of the cpu. */ @@ -93,6 +94,7 @@ struct padata_parallel_queue { struct padata_list reorder; struct parallel_data *pd; struct work_struct work; + struct work_struct reorder_work; atomic_t num_obj; int cpu_index; }; diff --git a/kernel/padata.c b/kernel/padata.c index 1b9b4bac4a9b..b4066147bce4 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -275,11 +275,51 @@ static void padata_reorder(struct parallel_data *pd) return; } +static void invoke_padata_reorder(struct work_struct *work) +{ + struct padata_parallel_queue *pqueue; + struct parallel_data *pd; + + local_bh_disable(); + pqueue = container_of(work, struct padata_parallel_queue, reorder_work); + pd = pqueue->pd; + padata_reorder(pd); + local_bh_enable(); +} + static void padata_reorder_timer(unsigned long arg) { struct parallel_data *pd = (struct parallel_data *)arg; + unsigned int weight; + int target_cpu, cpu; - padata_reorder(pd); + cpu = get_cpu(); + + /* We don't lock pd here to not interfere with parallel processing + * padata_reorder() calls on other CPUs. We just need any CPU out of + * the cpumask.pcpu set. It would be nice if it's the right one but + * it doesn't matter if we're off to the next one by using an outdated + * pd->processed value. + */ + weight = cpumask_weight(pd->cpumask.pcpu); + target_cpu = padata_index_to_cpu(pd, pd->processed % weight); + + /* ensure to call the reorder callback on the correct CPU */ + if (cpu != target_cpu) { + struct padata_parallel_queue *pqueue; + struct padata_instance *pinst; + + /* The timer function is serialized wrt itself -- no locking + * needed. + */ + pinst = pd->pinst; + pqueue = per_cpu_ptr(pd->pqueue, target_cpu); + queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work); + } else { + padata_reorder(pd); + } + + put_cpu(); } static void padata_serial_worker(struct work_struct *serial_work) @@ -399,6 +439,7 @@ static void padata_init_pqueues(struct parallel_data *pd) __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); + INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder); atomic_set(&pqueue->num_obj, 0); } } -- cgit v1.2.3 From 350ef88e7e922354f82a931897ad4a4ce6c686ff Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 8 Sep 2017 20:57:11 +0200 Subject: padata: ensure padata_do_serial() runs on the correct CPU If the algorithm we're parallelizing is asynchronous we might change CPUs between padata_do_parallel() and padata_do_serial(). However, we don't expect this to happen as we need to enqueue the padata object into the per-cpu reorder queue we took it from, i.e. the same-cpu's parallel queue. Ensure we're not switching CPUs for a given padata object by tracking the CPU within the padata object. 
If the serial callback gets called on the wrong CPU, defer invoking padata_reorder() via a kernel worker on the CPU we're expected to run on. Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- include/linux/padata.h | 2 ++ kernel/padata.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index 5c0175bbc179..5d13d25da2c8 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -37,6 +37,7 @@ * @list: List entry, to attach to the padata lists. * @pd: Pointer to the internal control structure. * @cb_cpu: Callback cpu for serializatioon. + * @cpu: Cpu for parallelization. * @seq_nr: Sequence number of the parallelized data object. * @info: Used to pass information from the parallel to the serial function. * @parallel: Parallel execution function. @@ -46,6 +47,7 @@ struct padata_priv { struct list_head list; struct parallel_data *pd; int cb_cpu; + int cpu; int info; void (*parallel)(struct padata_priv *padata); void (*serial)(struct padata_priv *padata); diff --git a/kernel/padata.c b/kernel/padata.c index b4066147bce4..f262c9a4e70a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->cb_cpu = cb_cpu; target_cpu = padata_cpu_hash(pd); + padata->cpu = target_cpu; queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -363,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata) int cpu; struct padata_parallel_queue *pqueue; struct parallel_data *pd; + int reorder_via_wq = 0; pd = padata->pd; cpu = get_cpu(); + + /* We need to run on the same CPU padata_do_parallel(.., padata, ..) + * was called on -- or, at least, enqueue the padata object into the + * correct per-cpu queue. + */ + if (cpu != padata->cpu) { + reorder_via_wq = 1; + cpu = padata->cpu; + } + pqueue = per_cpu_ptr(pd->pqueue, cpu); spin_lock(&pqueue->reorder.lock); @@ -376,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata) put_cpu(); - padata_reorder(pd); + /* If we're running on the wrong CPU, call padata_reorder() via a + * kernel worker. + */ + if (reorder_via_wq) + queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work); + else + padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); -- cgit v1.2.3 From 703321b60b605b1f381f5dcf0bca42703b912e3e Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 6 Oct 2017 23:18:13 +0100 Subject: mm/shmem: introduce shmem_file_setup_with_mnt We are planning to use our own tmpfs mnt in i915 in place of the shm_mnt, such that we can control the mount options, in particular huge=, which we require to support huge-gtt-pages. So rather than roll our own version of __shmem_file_setup, it would be preferred if we could just give shmem our mnt, and let it do the rest. Signed-off-by: Matthew Auld Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: linux-mm@kvack.org Acked-by: Andrew Morton Acked-by: Kirill A. 
Shutemov Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20171006145041.21673-2-matthew.auld@intel.com Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20171006221833.32439-1-chris@chris-wilson.co.uk --- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 30 ++++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index b6c3540e07bc..0937d9a7d8fb 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -53,6 +53,8 @@ extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); +extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, + const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); diff --git a/mm/shmem.c b/mm/shmem.c index 07a1d22807be..3229d27503ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4183,7 +4183,7 @@ static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; -static struct file *__shmem_file_setup(const char *name, loff_t size, +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags, unsigned int i_flags) { struct file *res; @@ -4192,8 +4192,8 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, struct super_block *sb; struct qstr this; - if (IS_ERR(shm_mnt)) - return ERR_CAST(shm_mnt); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); @@ -4205,8 +4205,8 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, this.name = name; this.len = strlen(name); this.hash = 0; /* will go */ - sb = shm_mnt->mnt_sb; - path.mnt = mntget(shm_mnt); + sb = mnt->mnt_sb; + path.mnt = mntget(mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; @@ -4251,7 +4251,7 @@ put_path: */ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) { - return __shmem_file_setup(name, size, flags, S_PRIVATE); + return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } /** @@ -4262,10 +4262,24 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon */ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { - return __shmem_file_setup(name, size, flags, 0); + return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); +/** + * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs + * @mnt: the tmpfs mount where the file will be created + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long flags) +{ + return __shmem_file_setup(mnt, name, size, flags, 0); +} +EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); + /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap_pgoff @@ -4281,7 +4295,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its 
mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE); + file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); if (IS_ERR(file)) return PTR_ERR(file); -- cgit v1.2.3 From f3d0d8d938b4d71be1f7fb003ca2ac9250b3be4d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 24 Sep 2017 19:39:12 +0200 Subject: mtd: nand: gpio: Convert to use GPIO descriptors There is exactly one board in the kernel that defines platform data for the GPIO NAND driver. Use the feature to provide a lookup table for the GPIOs in the board file so we can convert the driver as a whole to just use GPIO descriptors. After this we can cut the use of <linux/gpio.h> and use the GPIO descriptor management from <linux/gpio/consumer.h> alone to grab and use the GPIOs used in the driver. I also created a local struct device *dev in the probe() function because I was getting annoyed with all the &pdev->dev dereferencing. Cc: arm@kernel.org Cc: Mike Rapoport Cc: Frans Klaver Cc: Gerhard Sittig Cc: Jamie Iles Signed-off-by: Linus Walleij Reviewed-by: Marek Vasut Acked-by: Jamie Iles Acked-by: Olof Johansson Acked-by: Robert Jarzmik Signed-off-by: Boris Brezillon --- arch/arm/mach-pxa/cm-x255.c | 19 ++++--- drivers/mtd/nand/gpio.c | 112 +++++++++++++++++++++--------------------- include/linux/mtd/nand-gpio.h | 5 -- 3 files changed, 70 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-pxa/cm-x255.c b/arch/arm/mach-pxa/cm-x255.c index b592f79a1742..fa8e7dd4d898 100644 --- a/arch/arm/mach-pxa/cm-x255.c +++ b/arch/arm/mach-pxa/cm-x255.c @@ -14,7 +14,7 @@ #include #include #include - +#include #include #include @@ -176,6 +176,17 @@ static inline void cmx255_init_nor(void) {} #endif #if defined(CONFIG_MTD_NAND_GPIO) || defined(CONFIG_MTD_NAND_GPIO_MODULE) + +static struct gpiod_lookup_table cmx255_nand_gpiod_table = { + .dev_id = "gpio-nand", + .table = { + GPIO_LOOKUP("gpio-pxa", GPIO_NAND_CS, "nce", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-pxa", GPIO_NAND_CLE, "cle", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-pxa", GPIO_NAND_ALE, "ale", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-pxa", GPIO_NAND_RB, "rdy", GPIO_ACTIVE_HIGH), + }, +}; + static struct resource cmx255_nand_resource[] = { [0] = { .start = PXA_CS1_PHYS, @@ -198,11 +209,6 @@ static struct mtd_partition cmx255_nand_parts[] = { }; static struct gpio_nand_platdata cmx255_nand_platdata = { - .gpio_nce = GPIO_NAND_CS, - .gpio_cle = GPIO_NAND_CLE, - .gpio_ale = GPIO_NAND_ALE, - .gpio_rdy = GPIO_NAND_RB, - .gpio_nwp = -1, .parts = cmx255_nand_parts, .num_parts = ARRAY_SIZE(cmx255_nand_parts), .chip_delay = 25, @@ -220,6 +226,7 @@ static struct platform_device cmx255_nand = { static void __init cmx255_init_nand(void) { + gpiod_add_lookup_table(&cmx255_nand_gpiod_table); platform_device_register(&cmx255_nand); } #else diff --git a/drivers/mtd/nand/gpio.c b/drivers/mtd/nand/gpio.c index fd3648952b5a..484f7fbc3f7d 100644 --- a/drivers/mtd/nand/gpio.c +++ b/drivers/mtd/nand/gpio.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -31,12 +31,16 @@ #include #include #include -#include struct gpiomtd { void __iomem *io_sync; struct nand_chip nand_chip; struct gpio_nand_platdata plat; + struct gpio_desc *nce; /* Optional chip enable */ + struct gpio_desc *cle; + struct gpio_desc *ale; + struct gpio_desc *rdy; + struct gpio_desc *nwp; /* Optional write protection */ }; static inline struct gpiomtd *gpio_nand_getpriv(struct
mtd_info *mtd) @@ -78,11 +82,10 @@ static void gpio_nand_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl) gpio_nand_dosync(gpiomtd); if (ctrl & NAND_CTRL_CHANGE) { - if (gpio_is_valid(gpiomtd->plat.gpio_nce)) - gpio_set_value(gpiomtd->plat.gpio_nce, - !(ctrl & NAND_NCE)); - gpio_set_value(gpiomtd->plat.gpio_cle, !!(ctrl & NAND_CLE)); - gpio_set_value(gpiomtd->plat.gpio_ale, !!(ctrl & NAND_ALE)); + if (gpiomtd->nce) + gpiod_set_value(gpiomtd->nce, !(ctrl & NAND_NCE)); + gpiod_set_value(gpiomtd->cle, !!(ctrl & NAND_CLE)); + gpiod_set_value(gpiomtd->ale, !!(ctrl & NAND_ALE)); gpio_nand_dosync(gpiomtd); } if (cmd == NAND_CMD_NONE) @@ -96,7 +99,7 @@ static int gpio_nand_devready(struct mtd_info *mtd) { struct gpiomtd *gpiomtd = gpio_nand_getpriv(mtd); - return gpio_get_value(gpiomtd->plat.gpio_rdy); + return gpiod_get_value(gpiomtd->rdy); } #ifdef CONFIG_OF @@ -123,12 +126,6 @@ static int gpio_nand_get_config_of(const struct device *dev, } } - plat->gpio_rdy = of_get_gpio(dev->of_node, 0); - plat->gpio_nce = of_get_gpio(dev->of_node, 1); - plat->gpio_ale = of_get_gpio(dev->of_node, 2); - plat->gpio_cle = of_get_gpio(dev->of_node, 3); - plat->gpio_nwp = of_get_gpio(dev->of_node, 4); - if (!of_property_read_u32(dev->of_node, "chip-delay", &val)) plat->chip_delay = val; @@ -201,10 +198,11 @@ static int gpio_nand_remove(struct platform_device *pdev) nand_release(nand_to_mtd(&gpiomtd->nand_chip)); - if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) - gpio_set_value(gpiomtd->plat.gpio_nwp, 0); - if (gpio_is_valid(gpiomtd->plat.gpio_nce)) - gpio_set_value(gpiomtd->plat.gpio_nce, 1); + /* Enable write protection and disable the chip */ + if (gpiomtd->nwp && !IS_ERR(gpiomtd->nwp)) + gpiod_set_value(gpiomtd->nwp, 0); + if (gpiomtd->nce && !IS_ERR(gpiomtd->nce)) + gpiod_set_value(gpiomtd->nce, 0); return 0; } @@ -215,66 +213,66 @@ static int gpio_nand_probe(struct platform_device *pdev) struct nand_chip *chip; struct mtd_info *mtd; struct resource *res; + struct device *dev = &pdev->dev; int ret = 0; - if (!pdev->dev.of_node && !dev_get_platdata(&pdev->dev)) + if (!dev->of_node && !dev_get_platdata(dev)) return -EINVAL; - gpiomtd = devm_kzalloc(&pdev->dev, sizeof(*gpiomtd), GFP_KERNEL); + gpiomtd = devm_kzalloc(dev, sizeof(*gpiomtd), GFP_KERNEL); if (!gpiomtd) return -ENOMEM; chip = &gpiomtd->nand_chip; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - chip->IO_ADDR_R = devm_ioremap_resource(&pdev->dev, res); + chip->IO_ADDR_R = devm_ioremap_resource(dev, res); if (IS_ERR(chip->IO_ADDR_R)) return PTR_ERR(chip->IO_ADDR_R); res = gpio_nand_get_io_sync(pdev); if (res) { - gpiomtd->io_sync = devm_ioremap_resource(&pdev->dev, res); + gpiomtd->io_sync = devm_ioremap_resource(dev, res); if (IS_ERR(gpiomtd->io_sync)) return PTR_ERR(gpiomtd->io_sync); } - ret = gpio_nand_get_config(&pdev->dev, &gpiomtd->plat); + ret = gpio_nand_get_config(dev, &gpiomtd->plat); if (ret) return ret; - if (gpio_is_valid(gpiomtd->plat.gpio_nce)) { - ret = devm_gpio_request(&pdev->dev, gpiomtd->plat.gpio_nce, - "NAND NCE"); - if (ret) - return ret; - gpio_direction_output(gpiomtd->plat.gpio_nce, 1); + /* Just enable the chip */ + gpiomtd->nce = devm_gpiod_get_optional(dev, "nce", GPIOD_OUT_HIGH); + if (IS_ERR(gpiomtd->nce)) + return PTR_ERR(gpiomtd->nce); + + /* We disable write protection once we know probe() will succeed */ + gpiomtd->nwp = devm_gpiod_get_optional(dev, "nwp", GPIOD_OUT_LOW); + if (IS_ERR(gpiomtd->nwp)) { + ret = PTR_ERR(gpiomtd->nwp); + goto out_ce; } - if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) { - ret = 
devm_gpio_request(&pdev->dev, gpiomtd->plat.gpio_nwp, - "NAND NWP"); - if (ret) - return ret; + gpiomtd->nwp = devm_gpiod_get(dev, "ale", GPIOD_OUT_LOW); + if (IS_ERR(gpiomtd->nwp)) { + ret = PTR_ERR(gpiomtd->nwp); + goto out_ce; } - ret = devm_gpio_request(&pdev->dev, gpiomtd->plat.gpio_ale, "NAND ALE"); - if (ret) - return ret; - gpio_direction_output(gpiomtd->plat.gpio_ale, 0); + gpiomtd->cle = devm_gpiod_get(dev, "cle", GPIOD_OUT_LOW); + if (IS_ERR(gpiomtd->cle)) { + ret = PTR_ERR(gpiomtd->cle); + goto out_ce; + } - ret = devm_gpio_request(&pdev->dev, gpiomtd->plat.gpio_cle, "NAND CLE"); - if (ret) - return ret; - gpio_direction_output(gpiomtd->plat.gpio_cle, 0); - - if (gpio_is_valid(gpiomtd->plat.gpio_rdy)) { - ret = devm_gpio_request(&pdev->dev, gpiomtd->plat.gpio_rdy, - "NAND RDY"); - if (ret) - return ret; - gpio_direction_input(gpiomtd->plat.gpio_rdy); - chip->dev_ready = gpio_nand_devready; + gpiomtd->rdy = devm_gpiod_get_optional(dev, "rdy", GPIOD_IN); + if (IS_ERR(gpiomtd->rdy)) { + ret = PTR_ERR(gpiomtd->rdy); + goto out_ce; } + /* Using RDY pin */ + if (gpiomtd->rdy) + chip->dev_ready = gpio_nand_devready; nand_set_flash_node(chip, pdev->dev.of_node); chip->IO_ADDR_W = chip->IO_ADDR_R; @@ -285,12 +283,13 @@ static int gpio_nand_probe(struct platform_device *pdev) chip->cmd_ctrl = gpio_nand_cmd_ctrl; mtd = nand_to_mtd(chip); - mtd->dev.parent = &pdev->dev; + mtd->dev.parent = dev; platform_set_drvdata(pdev, gpiomtd); - if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) - gpio_direction_output(gpiomtd->plat.gpio_nwp, 1); + /* Disable write protection, if wired up */ + if (gpiomtd->nwp && !IS_ERR(gpiomtd->nwp)) + gpiod_direction_output(gpiomtd->nwp, 1); ret = nand_scan(mtd, 1); if (ret) @@ -305,8 +304,11 @@ static int gpio_nand_probe(struct platform_device *pdev) return 0; err_wp: - if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) - gpio_set_value(gpiomtd->plat.gpio_nwp, 0); + if (gpiomtd->nwp && !IS_ERR(gpiomtd->nwp)) + gpiod_set_value(gpiomtd->nwp, 0); +out_ce: + if (gpiomtd->nce && !IS_ERR(gpiomtd->nce)) + gpiod_set_value(gpiomtd->nce, 0); return ret; } diff --git a/include/linux/mtd/nand-gpio.h b/include/linux/mtd/nand-gpio.h index be4f45d89be2..98f71908212d 100644 --- a/include/linux/mtd/nand-gpio.h +++ b/include/linux/mtd/nand-gpio.h @@ -4,11 +4,6 @@ #include struct gpio_nand_platdata { - int gpio_nce; - int gpio_nwp; - int gpio_cle; - int gpio_ale; - int gpio_rdy; void (*adjust_parts)(struct gpio_nand_platdata *, size_t); struct mtd_partition *parts; unsigned int num_parts; -- cgit v1.2.3 From a1c4d24e02d093efd84cbde417051d2e767fa8fa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Sep 2017 08:43:38 -0700 Subject: linux/log2.h: fix kernel-doc notation Fix kernel-doc: - Add kernel-doc notation to some functions. - Fix kernel-doc notation in function parameters. Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- include/linux/log2.h | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/log2.h b/include/linux/log2.h index c373295f359f..41a1ae010993 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -37,19 +37,23 @@ int __ilog2_u64(u64 n) } #endif -/* - * Determine whether some value is a power of two, where zero is +/** + * is_power_of_2() - check if a value is a power of two + * @n: the value to check + * + * Determine whether some value is a power of two, where zero is * *not* considered a power of two. 
+ * Return: true if @n is a power of 2, otherwise false. */ - static inline __attribute__((const)) bool is_power_of_2(unsigned long n) { return (n != 0 && ((n & (n - 1)) == 0)); } -/* - * round up to nearest power of two +/** + * __roundup_pow_of_two() - round up to nearest power of two + * @n: value to round up */ static inline __attribute__((const)) unsigned long __roundup_pow_of_two(unsigned long n) @@ -57,8 +61,9 @@ unsigned long __roundup_pow_of_two(unsigned long n) return 1UL << fls_long(n - 1); } -/* - * round down to nearest power of two +/** + * __rounddown_pow_of_two() - round down to nearest power of two + * @n: value to round down */ static inline __attribute__((const)) unsigned long __rounddown_pow_of_two(unsigned long n) @@ -67,12 +72,12 @@ unsigned long __rounddown_pow_of_two(unsigned long n) } /** - * ilog2 - log of base 2 of 32-bit or a 64-bit unsigned value - * @n - parameter + * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value + * @n: parameter * * constant-capable log of base 2 calculation * - this can be used to initialise global variables from constant data, hence - * the massive ternary operator construction + * the massive ternary operator construction * * selects the appropriately-sized optimised version depending on sizeof(n) */ @@ -150,7 +155,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n) /** * roundup_pow_of_two - round the given value up to nearest power of two - * @n - parameter + * @n: parameter * * round the given value up to the nearest power of two * - the result is undefined when n == 0 @@ -167,7 +172,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n) /** * rounddown_pow_of_two - round the given value down to nearest power of two - * @n - parameter + * @n: parameter * * round the given value down to the nearest power of two * - the result is undefined when n == 0 @@ -180,6 +185,12 @@ unsigned long __rounddown_pow_of_two(unsigned long n) __rounddown_pow_of_two(n) \ ) +static inline __attribute_const__ +int __order_base_2(unsigned long n) +{ + return n > 1 ? ilog2(n - 1) + 1 : 0; +} + /** * order_base_2 - calculate the (rounded up) base 2 order of the argument * @n: parameter @@ -193,13 +204,6 @@ unsigned long __rounddown_pow_of_two(unsigned long n) * ob2(5) = 3 * ... and so on. */ - -static inline __attribute_const__ -int __order_base_2(unsigned long n) -{ - return n > 1 ? ilog2(n - 1) + 1 : 0; -} - #define order_base_2(n) \ ( \ __builtin_constant_p(n) ? ( \ -- cgit v1.2.3 From 078843f75d234123e654b992d97e8ae8a744ba23 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Sep 2017 08:43:45 -0700 Subject: math64: add missing kernel-doc notation Add missing kernel-doc notation (function parameters) for several div() functions. Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- include/linux/math64.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index 80690c96c734..8dd2488bf289 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -11,6 +11,11 @@ /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 32bit divisor + * @remainder: pointer to unsigned 32bit remainder + * + * Return: sets ``*remainder``, then returns dividend / divisor * * This is commonly provided by 32bit archs to provide an optimized 64bit * divide. 
@@ -23,6 +28,11 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) /** * div_s64_rem - signed 64bit divide with 32bit divisor with remainder + * @dividend: signed 64bit dividend + * @divisor: signed 32bit divisor + * @remainder: pointer to signed 32bit remainder + * + * Return: sets ``*remainder``, then returns dividend / divisor */ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) { @@ -32,6 +42,11 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) /** * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 64bit divisor + * @remainder: pointer to unsigned 64bit remainder + * + * Return: sets ``*remainder``, then returns dividend / divisor */ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { @@ -41,6 +56,10 @@ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) /** * div64_u64 - unsigned 64bit divide with 64bit divisor + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 64bit divisor + * + * Return: dividend / divisor */ static inline u64 div64_u64(u64 dividend, u64 divisor) { @@ -49,6 +68,10 @@ static inline u64 div64_u64(u64 dividend, u64 divisor) /** * div64_s64 - signed 64bit divide with 64bit divisor + * @dividend: signed 64bit dividend + * @divisor: signed 64bit divisor + * + * Return: dividend / divisor */ static inline s64 div64_s64(s64 dividend, s64 divisor) { @@ -88,6 +111,8 @@ extern s64 div64_s64(s64 dividend, s64 divisor); /** * div_u64 - unsigned 64bit divide with 32bit divisor + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 32bit divisor * * This is the most common 64bit divide and should be used if possible, * as many 32bit archs can optimize this variant better than a full 64bit @@ -103,6 +128,8 @@ static inline u64 div_u64(u64 dividend, u32 divisor) /** * div_s64 - signed 64bit divide with 32bit divisor + * @dividend: signed 64bit dividend + * @divisor: signed 32bit divisor */ #ifndef div_s64 static inline s64 div_s64(s64 dividend, s32 divisor) -- cgit v1.2.3 From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: bpf: perf event change needed for subsequent bpf helpers This patch does not impact existing functionalities. It contains the changes in perf event area needed for subsequent bpf_perf_event_read_value and bpf_perf_prog_read_value helpers. Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. 
Miller --- include/linux/perf_event.h | 7 +++++-- kernel/bpf/arraymap.c | 2 +- kernel/events/core.c | 15 +++++++++++++-- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8e22f24ded6a..79b18a20cf5d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -806,6 +806,7 @@ struct perf_output_handle { struct bpf_perf_event_data_kern { struct pt_regs *regs; struct perf_sample_data *data; + struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF @@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value); +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline int perf_event_read_local(struct perf_event *event, u64 *value) +static inline int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { return -EINVAL; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..68d866628be0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..902149f05381 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). 
*/ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..95888ae6c263 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi -- cgit v1.2.3 From 64237470ddf97b63155fbd272c9e743e01d5f514 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 01:37:29 +0800 Subject: net: phonet: mark header_ops as const Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- include/linux/if_phonet.h | 2 +- net/phonet/af_phonet.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_phonet.h b/include/linux/if_phonet.h index bbcdb0a767d8..a118ee4a8428 100644 --- a/include/linux/if_phonet.h +++ b/include/linux/if_phonet.h @@ -10,5 +10,5 @@ #include -extern struct header_ops phonet_header_ops; +extern const struct header_ops phonet_header_ops; #endif diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index f925753668a7..b12142e55d19 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) return 1; } -struct header_ops phonet_header_ops = { +const struct header_ops phonet_header_ops = { .create = pn_header_create, .parse = pn_header_parse, }; -- cgit v1.2.3 From 067cae47771c864604969fd902efe10916e0d79c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:12 -0700 Subject: bpf: Use char in prog and map name Instead of u8, use char for the prog and map name. This avoids the userspace tools getting a compiler signedness warning. The bpf_prog_aux, bpf_map, bpf_attr, bpf_prog_info and bpf_map_info are changed. Signed-off-by: Martin KaFai Lau Cc: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 4 ++-- include/uapi/linux/bpf.h | 8 ++++---- tools/include/uapi/linux/bpf.h | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a67daea731ab..bc7da2ddfcaf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ @@ -189,7 +189,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5bbbec17aa5a..6db9e1d679cd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,7 +230,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -253,7 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -888,7 +888,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -898,7 +898,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0894fd20b12b..fb4fb81ce5b0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -230,7 +230,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -253,7 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -871,7 +871,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -881,7 +881,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.2.3 From 821f1b21cabb46827ce39ddf82e2789680b5042a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 6 Oct 2017 22:12:37 -0700 Subject: bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood This patch adds a new bridge port flag BR_NEIGH_SUPPRESS to suppress arp and nd flood on bridge ports. 
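As a usage sketch (not part of this patch; addattr8() is the usual iproute2 libnetlink helper and its use here is an assumption), userspace would enable the flag per port with a u8 netlink attribute, or through the matching brport sysfs file added below:

	/* inside the IFLA_PROTINFO nest of an RTM_SETLINK request for the port */
	addattr8(&req.n, sizeof(req), IFLA_BRPORT_NEIGH_SUPPRESS, 1);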
It implements rfc7432, section 10 (https://tools.ietf.org/html/rfc7432#section-10) for Ethernet VPN deployments. It is similar to the existing BR_PROXYARP* flags but has a few semantic differences to conform to the EVPN standard. Unlike the existing flags, this new flag suppresses flooding of all neighbor discovery packets (arp and nd) to tunnel ports. It supports both vlan filtering and non-vlan filtering bridges. In the case of EVPN, it is mainly used to avoid flooding arp and nd packets to tunnel ports like vxlan. This patch adds netlink and sysfs support to set this bridge port flag. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/Makefile | 2 +- net/bridge/br_arp_nd_proxy.c | 32 ++++++++++++++++++++++++++++++++ net/bridge/br_forward.c | 2 +- net/bridge/br_if.c | 5 +++++ net/bridge/br_netlink.c | 10 +++++++++- net/bridge/br_private.h | 2 ++ net/bridge/br_sysfs_if.c | 2 ++ 9 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 net/bridge/br_arp_nd_proxy.c (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3cd18ac0697f..316ee113a220 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -49,6 +49,7 @@ struct br_ip_list { #define BR_MULTICAST_TO_UNICAST BIT(12) #define BR_VLAN_TUNNEL BIT(13) #define BR_BCAST_FLOOD BIT(14) +#define BR_NEIGH_SUPPRESS BIT(15) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cd580fc0e58f..b037e0ab1975 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -327,6 +327,7 @@ enum { IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 40b1ede527ca..4aee55fdcc92 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ br_stp_if.o br_stp_timer.o br_netlink.o \ - br_netlink_tunnel.o + br_netlink_tunnel.o br_arp_nd_proxy.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c new file mode 100644 index 000000000000..f889ad5f0048 --- /dev/null +++ b/net/bridge/br_arp_nd_proxy.c @@ -0,0 +1,32 @@ +/* + * Handle bridge arp/nd proxy/suppress + * + * Copyright (C) 2017 Cumulus Networks + * Copyright (c) 2017 Roopa Prabhu + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include "br_private.h" + +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) +{ + struct net_bridge_port *p; + bool neigh_suppress = false; + + list_for_each_entry(p, &br->port_list, list) { + if (p->flags & BR_NEIGH_SUPPRESS) { + neigh_suppress = true; + break; + } + } + + br->neigh_suppress_enabled = neigh_suppress; +} diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 48fb17417fac..b4eed113d2ec 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -204,7 +204,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; - if ((p->flags & BR_PROXYARP_WIFI) && + if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) && BR_INPUT_SKB_CB(skb)->proxyarp_replied) continue; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 59a74a414e20..ae38547bbf91 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -310,6 +310,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } + br_recalculate_neigh_suppress_enabled(br); + br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); @@ -660,4 +662,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) if (mask & BR_AUTO_MASK) nbp_update_port_count(br); + + if (mask & BR_NEIGH_SUPPRESS) + br_recalculate_neigh_suppress_enabled(br); } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index dea88a255d26..f0e82682e071 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -138,6 +138,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -210,7 +211,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & BR_VLAN_TUNNEL)) || - nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask)) + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || + nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, + !!(p->flags & BR_NEIGH_SUPPRESS))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -785,6 +788,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) p->group_fwd_mask = fwd_mask; } + err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, + BR_NEIGH_SUPPRESS); + if (err) + return err; + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ab4df24f7bba..00fa371b1fb2 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -404,6 +404,7 @@ struct net_bridge { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + bool neigh_suppress_enabled; }; struct br_input_skb_cb { @@ -1139,4 +1140,5 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb) } #endif /* CONFIG_NET_SWITCHDEV */ +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); #endif diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 9110d5e56085..0a1fa9ccd8b7 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c 
@@ -191,6 +191,7 @@ BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); +BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -241,6 +242,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_flood, &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, + &brport_attr_neigh_suppress, NULL }; -- cgit v1.2.3 From f8e0731db4a02c8c5624e1dce6dc983210a51f64 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 15 Sep 2017 11:53:07 +0200 Subject: dma-buf/dma-fence: fix dma_fence_get_rcu_safe v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When dma_fence_get_rcu() fails to acquire a reference it doesn't necessarily mean that there is no fence at all. It usually means that the fence was replaced by a new one, and in this situation we certainly want to have the new one as the result and *NOT* NULL. v2: Keep extra check after dma_fence_get_rcu(). Signed-off-by: Christian König Cc: Chris Wilson Cc: Daniel Vetter Cc: Sumit Semwal Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Reviewed-by: Maarten Lankhorst Signed-off-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/1505469187-3565-1-git-send-email-deathsimple@vodafone.de --- include/linux/dma-fence.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 171895072435..ca974224d92e 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -248,9 +248,12 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) struct dma_fence *fence; fence = rcu_dereference(*fencep); - if (!fence || !dma_fence_get_rcu(fence)) + if (!fence) return NULL; + if (!dma_fence_get_rcu(fence)) + continue; + /* The atomic_inc_not_zero() inside dma_fence_get_rcu() * provides a full memory barrier upon success (such as now). * This is paired with the write barrier from assigning -- cgit v1.2.3 From 0912bda436388a02c72164b4b490b578e64c012e Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 9 Oct 2017 11:15:32 +0200 Subject: net: bridge: Export bridge multicast router state Add an access function that, given a bridge netdevice, returns whether the bridge device is currently an mrouter or not. The function uses the already existing br_multicast_is_router function to check that. This function is needed in order to allow ports that join an already existing bridge to know the current mrouter state of the bridge device. Together with the bridge device mrouter ports switchdev notifications, it is possible to have full offloading of the semantics of the bridge device mcast router state. Due to the fact that the bridge multicast router status can change in the packet RX path, take the multicast_router bridge spinlock to protect the read. Signed-off-by: Yotam Gigi Reviewed-by: Nogah Frankel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 5 +++++ net/bridge/br_multicast.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 316ee113a220..02639ebea2f0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -64,6 +64,7 @@ int br_multicast_list_adjacent(struct net_device *dev, bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto); bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); +bool br_multicast_router(const struct net_device *dev); #else static inline int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) @@ -84,6 +85,10 @@ static inline bool br_multicast_enabled(const struct net_device *dev) { return false; } +static inline bool br_multicast_router(const struct net_device *dev) +{ + return false; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index bd50550dd4ca..7947e0436e18 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2216,6 +2216,18 @@ bool br_multicast_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_multicast_enabled); +bool br_multicast_router(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + bool is_router; + + spin_lock_bh(&br->multicast_lock); + is_router = br_multicast_is_router(br); + spin_unlock_bh(&br->multicast_lock); + return is_router; +} +EXPORT_SYMBOL_GPL(br_multicast_router); + int br_multicast_set_querier(struct net_bridge *br, unsigned long val) { unsigned long max_delay; -- cgit v1.2.3 From ed468ebee04ffba0231a8f50616bdb250752a891 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:44 +0300 Subject: qed: Add ll2 ability of opening a secondary queue When more than one ll2 queue that is not an OOO queue is opened, the ll2 code does not have enough information to determine whether a queue is the main one or not, so a new field is added to the acquire input data to let the caller control whether the queue is the main queue or a secondary queue. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 7 ++++++- drivers/net/ethernet/qlogic/qed/qed_ll2.h | 1 + include/linux/qed/qed_ll2_if.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 75af40a7690a..3c695da890df 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -894,7 +894,7 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn, p_ramrod->drop_ttl0_flg = p_ll2_conn->input.rx_drop_ttl0_flg; p_ramrod->inner_vlan_removal_en = p_ll2_conn->input.rx_vlan_removal_en; p_ramrod->queue_id = p_ll2_conn->queue_id; - p_ramrod->main_func_queue = (conn_type == QED_LL2_TYPE_OOO) ? 0 : 1; + p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0; if ((IS_MF_DEFAULT(p_hwfn) || IS_MF_SI(p_hwfn)) && p_ramrod->main_func_queue && (conn_type != QED_LL2_TYPE_ROCE) && @@ -1265,6 +1265,11 @@ int qed_ll2_acquire_connection(void *cxt, struct qed_ll2_acquire_data *data) p_ll2_info->tx_dest = (data->input.tx_dest == QED_LL2_TX_DEST_NW) ?
CORE_TX_DEST_NW : CORE_TX_DEST_LB; + if (data->input.conn_type == QED_LL2_TYPE_OOO || + data->input.secondary_queue) + p_ll2_info->main_func_queue = false; + else + p_ll2_info->main_func_queue = true; /* Correct maximum number of Tx BDs */ p_tx_max = &p_ll2_info->input.tx_max_bds_per_packet; diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h index 9bdd08f15c79..f65817012e97 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -121,6 +121,7 @@ struct qed_ll2_info { bool b_active; enum core_tx_dest tx_dest; u8 tx_stats_en; + bool main_func_queue; struct qed_ll2_rx_queue rx_queue; struct qed_ll2_tx_queue tx_queue; struct qed_ll2_cbs cbs; diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 89fa0bbd54f3..d7cca590b743 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -171,6 +171,7 @@ struct qed_ll2_acquire_data_inputs { enum qed_ll2_tx_dest tx_dest; enum qed_ll2_error_handle ai_err_packet_too_big; enum qed_ll2_error_handle ai_err_no_buf; + bool secondary_queue; u8 gsi_enable; }; -- cgit v1.2.3 From 77caa792f5d8e4ecc88eb1cf4b9c478c07e0ec57 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:45 +0300 Subject: qed: Add ll2 option for dropping a tx packet The option of sending a packet on the ll2 and dropping it exists in hardware and was not used until now, thus not exposed. The iWARP unaligned MPA flow requires this functionality for flushing the tx queue. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 16 ++++++++++++++-- include/linux/qed/qed_ll2_if.h | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 3c695da890df..ad67d36956e8 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -1597,8 +1597,20 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, roce_flavor = (pkt->qed_roce_flavor == QED_LL2_ROCE) ? CORE_ROCE : CORE_RROCE; - tx_dest = (pkt->tx_dest == QED_LL2_TX_DEST_NW) ? CORE_TX_DEST_NW - : CORE_TX_DEST_LB; + switch (pkt->tx_dest) { + case QED_LL2_TX_DEST_NW: + tx_dest = CORE_TX_DEST_NW; + break; + case QED_LL2_TX_DEST_LB: + tx_dest = CORE_TX_DEST_LB; + break; + case QED_LL2_TX_DEST_DROP: + tx_dest = CORE_TX_DEST_DROP; + break; + default: + tx_dest = CORE_TX_DEST_LB; + break; + } start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan); diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index d7cca590b743..95fdf02a3bbe 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -64,6 +64,7 @@ enum qed_ll2_roce_flavor_type { enum qed_ll2_tx_dest { QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */ QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */ + QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */ QED_LL2_TX_DEST_MAX }; -- cgit v1.2.3 From 6f34a284f36399501fcc034dc4522a2d8d9fa6c9 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:48 +0300 Subject: qed: Add LL2 slowpath handling For iWARP unaligned MPA flow, a slowpath event of flushing an MPA connection that entered an unaligned state is required. 
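Concretely, the new hook lets the ll2 consumer pre-register a handler for that event; a minimal sketch (the handler name and body are illustrative, not part of this patch):

	static void qed_iwarp_ll2_slowpath(void *cxt, u8 connection_handle,
					   u32 opaque_data_0, u32 opaque_data_1)
	{
		/* flush the unaligned MPA connection identified by the
		 * opaque data echoed back with the ramrod completion
		 */
	}

	cbs.slowpath_cb = qed_iwarp_ll2_slowpath;	/* registered alongside the rx/tx callbacks */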
The flush ramrod is received on the ll2 queue, and a pre-registered callback function is called to handle the flush event. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 40 +++++++++++++++++++++++++++++-- include/linux/qed/qed_ll2_if.h | 5 ++++ 2 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 8eb9645c880d..047f556ca62e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -422,6 +422,41 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn, data->u.placement_offset = p_cqe->rx_cqe_fp.placement_offset; } +static int +qed_ll2_handle_slowpath(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn, + union core_rx_cqe_union *p_cqe, + unsigned long *p_lock_flags) +{ + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + struct core_rx_slow_path_cqe *sp_cqe; + + sp_cqe = &p_cqe->rx_cqe_sp; + if (sp_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH) { + DP_NOTICE(p_hwfn, + "LL2 - unexpected Rx CQE slowpath ramrod_cmd_id:%d\n", + sp_cqe->ramrod_cmd_id); + return -EINVAL; + } + + if (!p_ll2_conn->cbs.slowpath_cb) { + DP_NOTICE(p_hwfn, + "LL2 - received RX_QUEUE_FLUSH but no callback was provided\n"); + return -EINVAL; + } + + spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags); + + p_ll2_conn->cbs.slowpath_cb(p_ll2_conn->cbs.cookie, + p_ll2_conn->my_id, + le32_to_cpu(sp_cqe->opaque_data.data[0]), + le32_to_cpu(sp_cqe->opaque_data.data[1])); + + spin_lock_irqsave(&p_rx->lock, *p_lock_flags); + + return 0; +} + static int qed_ll2_rxq_handle_completion(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_conn, @@ -495,8 +530,8 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) switch (cqe->rx_cqe_sp.type) { case CORE_RX_CQE_TYPE_SLOW_PATH: - DP_NOTICE(p_hwfn, "LL2 - unexpected Rx CQE slowpath\n"); - rc = -EINVAL; + rc = qed_ll2_handle_slowpath(p_hwfn, p_ll2_conn, + cqe, &flags); break; case CORE_RX_CQE_TYPE_GSI_OFFLOAD: case CORE_RX_CQE_TYPE_REGULAR: @@ -1214,6 +1249,7 @@ qed_ll2_set_cbs(struct qed_ll2_info *p_ll2_info, const struct qed_ll2_cbs *cbs) p_ll2_info->cbs.rx_release_cb = cbs->rx_release_cb; p_ll2_info->cbs.tx_comp_cb = cbs->tx_comp_cb; p_ll2_info->cbs.tx_release_cb = cbs->tx_release_cb; + p_ll2_info->cbs.slowpath_cb = cbs->slowpath_cb; p_ll2_info->cbs.cookie = cbs->cookie; return 0; diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 95fdf02a3bbe..e755954d85fd 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -151,11 +151,16 @@ void (*qed_ll2_release_tx_packet_cb)(void *cxt, dma_addr_t first_frag_addr, bool b_last_fragment, bool b_last_packet); +typedef +void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle, + u32 opaque_data_0, u32 opaque_data_1); + struct qed_ll2_cbs { qed_ll2_complete_rx_packet_cb rx_comp_cb; qed_ll2_release_rx_packet_cb rx_release_cb; qed_ll2_complete_tx_packet_cb tx_comp_cb; qed_ll2_release_tx_packet_cb tx_release_cb; + qed_ll2_slowpath_cb slowpath_cb; void *cookie; }; -- cgit v1.2.3 From e72a060151e5bb673af24993665e270fc4f674a7 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 30 Aug 2017 13:50:39 +0200 Subject: iio: st_sensors: add register mask for status register Introduce register mask for data-ready status register since pressure sensors (e.g. 
LPS22HB) export just two channels (BIT(0) and BIT(1)), while BIT(2) is marked reserved, yet in st_sensors_new_samples_available() the value read from the status register is masked using 0x7. Moreover, do not mask the status register using active_scan_mask, since the status value is now properly masked, and if the result is not zero the interrupt has to be consumed by the driver. This fixes an issue on the LPS25H and LPS331AP, where the channel definition is swapped with respect to the status register. Furthermore, that change makes it possible to properly support new devices (e.g. the LIS2DW12) that report just the ZYXDA (data-ready) field in the status register to figure out whether the interrupt has been generated by the device. Fixes: 97865fe41322 (iio: st_sensors: verify interrupt event to status) Signed-off-by: Lorenzo Bianconi Reviewed-by: Linus Walleij Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_core.c | 35 +++++++++++++++++----- drivers/iio/common/st_sensors/st_sensors_core.c | 2 +- drivers/iio/common/st_sensors/st_sensors_trigger.c | 16 +++------- drivers/iio/gyro/st_gyro_core.c | 15 ++++++++-- drivers/iio/magnetometer/st_magn_core.c | 10 +++++-- drivers/iio/pressure/st_pressure_core.c | 15 ++++++++-- include/linux/iio/common/st_sensors.h | 7 +++-- 7 files changed, 70 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index 1a2e54ff473a..140ba26f6131 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -164,7 +164,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = 0x00, .addr_ihl = 0x25, .mask_ihl = 0x02, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .sim = { .addr = 0x23, @@ -236,7 +239,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_ihl = 0x80, .addr_od = 0x22, .mask_od = 0x40, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .sim = { .addr = 0x23, @@ -318,7 +324,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = 0x00, .addr_ihl = 0x23, .mask_ihl = 0x40, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, .ig1 = { .en_addr = 0x23, .en_mask = 0x08, @@ -389,7 +398,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .drdy_irq = { .addr = 0x21, .mask_int1 = 0x04, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .sim = { .addr = 0x21, @@ -451,7 +463,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_ihl = 0x80, .addr_od = 0x22, .mask_od = 0x40, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .sim = { .addr = 0x21, @@ -569,7 +584,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .drdy_irq = { .addr = 0x21, .mask_int1 = 0x04, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .sim = { .addr = 0x21, @@ -640,7 +658,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int2 = 0x00, .addr_ihl = 0x25, .mask_ihl = 0x02, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = 
ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .sim = { .addr = 0x23, diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index 02e833b14db0..34115f05d5c4 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -470,7 +470,7 @@ int st_sensors_set_dataready_irq(struct iio_dev *indio_dev, bool enable) * different one. Take into account irq status register * to understand if irq trigger can be properly supported */ - if (sdata->sensor_settings->drdy_irq.addr_stat_drdy) + if (sdata->sensor_settings->drdy_irq.stat_drdy.addr) sdata->hw_irq_trigger = enable; return 0; } diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c index fa73e6795359..fdcc5a891958 100644 --- a/drivers/iio/common/st_sensors/st_sensors_trigger.c +++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c @@ -31,7 +31,7 @@ static int st_sensors_new_samples_available(struct iio_dev *indio_dev, int ret; /* How would I know if I can't check it? */ - if (!sdata->sensor_settings->drdy_irq.addr_stat_drdy) + if (!sdata->sensor_settings->drdy_irq.stat_drdy.addr) return -EINVAL; /* No scan mask, no interrupt */ @@ -39,23 +39,15 @@ static int st_sensors_new_samples_available(struct iio_dev *indio_dev, return 0; ret = sdata->tf->read_byte(&sdata->tb, sdata->dev, - sdata->sensor_settings->drdy_irq.addr_stat_drdy, + sdata->sensor_settings->drdy_irq.stat_drdy.addr, &status); if (ret < 0) { dev_err(sdata->dev, "error checking samples available\n"); return ret; } - /* - * the lower bits of .active_scan_mask[0] is directly mapped - * to the channels on the sensor: either bit 0 for - * one-dimensional sensors, or e.g. x,y,z for accelerometers, - * gyroscopes or magnetometers. No sensor use more than 3 - * channels, so cut the other status bits here. - */ - status &= 0x07; - if (status & (u8)indio_dev->active_scan_mask[0]) + if (status & sdata->sensor_settings->drdy_irq.stat_drdy.mask) return 1; return 0; @@ -212,7 +204,7 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev, * it was "our" interrupt. */ if (sdata->int_pin_open_drain && - sdata->sensor_settings->drdy_irq.addr_stat_drdy) + sdata->sensor_settings->drdy_irq.stat_drdy.addr) irq_trig |= IRQF_SHARED; err = request_threaded_irq(sdata->get_irq_data_ready(indio_dev), diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c index 4cf85aa01dde..22c0c1732996 100644 --- a/drivers/iio/gyro/st_gyro_core.c +++ b/drivers/iio/gyro/st_gyro_core.c @@ -118,7 +118,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { * drain settings, but only for INT1 and not * for the DRDY line on INT2. */ - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .multi_read_bit = true, .bootime = 2, @@ -188,7 +191,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { * drain settings, but only for INT1 and not * for the DRDY line on INT2. */ - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .multi_read_bit = true, .bootime = 2, @@ -253,7 +259,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { * drain settings, but only for INT1 and not * for the DRDY line on INT2. 
*/ - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .multi_read_bit = true, .bootime = 2, diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c index 703de313c418..ace72c57f53c 100644 --- a/drivers/iio/magnetometer/st_magn_core.c +++ b/drivers/iio/magnetometer/st_magn_core.c @@ -317,7 +317,10 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = { }, .drdy_irq = { /* drdy line is routed drdy pin */ - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x07, + }, }, .multi_read_bit = true, .bootime = 2, @@ -361,7 +364,10 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = { .drdy_irq = { .addr = 0x62, .mask_int1 = 0x01, - .addr_stat_drdy = 0x67, + .stat_drdy = { + .addr = 0x67, + .mask = 0x07, + }, }, .multi_read_bit = false, .bootime = 2, diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index 86120715913b..5f8358e23f5b 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -287,7 +287,10 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask_ihl = 0x80, .addr_od = 0x22, .mask_od = 0x40, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x03, + }, }, .multi_read_bit = true, .bootime = 2, @@ -395,7 +398,10 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask_ihl = 0x80, .addr_od = 0x22, .mask_od = 0x40, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x03, + }, }, .multi_read_bit = true, .bootime = 2, @@ -456,7 +462,10 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask_ihl = 0x80, .addr_od = 0x12, .mask_od = 0x40, - .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + .stat_drdy = { + .addr = ST_SENSORS_DEFAULT_STAT_ADDR, + .mask = 0x03, + }, }, .multi_read_bit = false, .bootime = 2, diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 7b0fa8b5c120..ce0ef1c0a30a 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -139,7 +139,7 @@ struct st_sensor_das { * @mask_ihl: mask to enable/disable active low on the INT lines. * @addr_od: address to enable/disable Open Drain on the INT lines. * @mask_od: mask to enable/disable Open Drain on the INT lines. - * @addr_stat_drdy: address to read status of DRDY (data ready) interrupt + * struct stat_drdy - status register of DRDY (data ready) interrupt. * struct ig1 - represents the Interrupt Generator 1 of sensors. * @en_addr: address of the enable ig1 register. * @en_mask: mask to write the on/off value for enable. @@ -152,7 +152,10 @@ struct st_sensor_data_ready_irq { u8 mask_ihl; u8 addr_od; u8 mask_od; - u8 addr_stat_drdy; + struct { + u8 addr; + u8 mask; + } stat_drdy; struct { u8 en_addr; u8 en_mask; -- cgit v1.2.3 From 75d4c6d2e15d4455dfd5995c27e6e6ad6f214e39 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 30 Aug 2017 13:50:40 +0200 Subject: iio: st_sensors: decouple irq1 configuration parameters from the irq2 ones Separate the data-ready configuration parameters for the INT1 and INT2 pins in the st_sensor_data_ready_irq data structure. That change will be used to properly support the LIS2DW12 accel sensor. 
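With the reworked layout, an entry that routes data-ready to both pins declares each pin separately, along these lines (a sketch; the addresses and masks mirror the LIS3DH-style entries in the diff below and are illustrative, not taken from a new datasheet):

	.drdy_irq = {
		.int1 = {
			.addr = 0x22,
			.mask = 0x02,
		},
		.int2 = {
			.addr = 0x22,
			.mask = 0x10,
		},
		.addr_ihl = 0x22,
		.mask_ihl = 0x80,
	},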
Signed-off-by: Lorenzo Bianconi Reviewed-by: Linus Walleij Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_core.c | 66 ++++++++++++++++--------- drivers/iio/common/st_sensors/st_sensors_core.c | 28 ++++++----- drivers/iio/gyro/st_gyro_core.c | 18 ++++--- drivers/iio/magnetometer/st_magn_core.c | 6 ++- drivers/iio/pressure/st_pressure_core.c | 31 +++++++----- include/linux/iio/common/st_sensors.h | 16 +++--- 6 files changed, 104 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index 140ba26f6131..a6eac959dfe8 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -159,9 +159,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask = 0x80, }, .drdy_irq = { - .addr = 0x22, - .mask_int1 = 0x10, - .mask_int2 = 0x00, + .int1 = { + .addr = 0x22, + .mask = 0x10, + }, .addr_ihl = 0x25, .mask_ihl = 0x02, .stat_drdy = { @@ -232,9 +233,14 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask = 0x80, }, .drdy_irq = { - .addr = 0x22, - .mask_int1 = 0x02, - .mask_int2 = 0x10, + .int1 = { + .addr = 0x22, + .mask = 0x02, + }, + .int2 = { + .addr = 0x22, + .mask = 0x10, + }, .addr_ihl = 0x22, .mask_ihl = 0x80, .addr_od = 0x22, @@ -319,9 +325,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask = 0x08, }, .drdy_irq = { - .addr = 0x23, - .mask_int1 = 0x80, - .mask_int2 = 0x00, + .int1 = { + .addr = 0x23, + .mask = 0x80, + }, .addr_ihl = 0x23, .mask_ihl = 0x40, .stat_drdy = { @@ -396,8 +403,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask = 0x01, }, .drdy_irq = { - .addr = 0x21, - .mask_int1 = 0x04, + .int1 = { + .addr = 0x21, + .mask = 0x04, + }, .stat_drdy = { .addr = ST_SENSORS_DEFAULT_STAT_ADDR, .mask = 0x07, @@ -456,9 +465,14 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { }, }, .drdy_irq = { - .addr = 0x22, - .mask_int1 = 0x04, - .mask_int2 = 0x20, + .int1 = { + .addr = 0x22, + .mask = 0x04, + }, + .int2 = { + .addr = 0x22, + .mask = 0x20, + }, .addr_ihl = 0x22, .mask_ihl = 0x80, .addr_od = 0x22, @@ -528,9 +542,14 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask = 0x80, }, .drdy_irq = { - .addr = 0x22, - .mask_int1 = 0x02, - .mask_int2 = 0x10, + .int1 = { + .addr = 0x22, + .mask = 0x02, + }, + .int2 = { + .addr = 0x22, + .mask = 0x10, + }, .addr_ihl = 0x22, .mask_ihl = 0x80, }, @@ -582,8 +601,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .bdu = { }, .drdy_irq = { - .addr = 0x21, - .mask_int1 = 0x04, + .int1 = { + .addr = 0x21, + .mask = 0x04, + }, .stat_drdy = { .addr = ST_SENSORS_DEFAULT_STAT_ADDR, .mask = 0x07, @@ -653,9 +674,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { }, }, .drdy_irq = { - .addr = 0x22, - .mask_int1 = 0x10, - .mask_int2 = 0x00, + .int1 = { + .addr = 0x22, + .mask = 0x10, + }, .addr_ihl = 0x25, .mask_ihl = 0x02, .stat_drdy = { diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index 34115f05d5c4..af702fa8fa84 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -283,7 +283,8 @@ static int st_sensors_set_drdy_int_pin(struct iio_dev *indio_dev, struct st_sensor_data *sdata = iio_priv(indio_dev); /* Sensor does not support interrupts */ - if (sdata->sensor_settings->drdy_irq.addr == 0) { + if 
(!sdata->sensor_settings->drdy_irq.int1.addr && + !sdata->sensor_settings->drdy_irq.int2.addr) { if (pdata->drdy_int_pin) dev_info(&indio_dev->dev, "DRDY on pin INT%d specified, but sensor " @@ -294,7 +295,7 @@ static int st_sensors_set_drdy_int_pin(struct iio_dev *indio_dev, switch (pdata->drdy_int_pin) { case 1: - if (sdata->sensor_settings->drdy_irq.mask_int1 == 0) { + if (!sdata->sensor_settings->drdy_irq.int1.mask) { dev_err(&indio_dev->dev, "DRDY on INT1 not available.\n"); return -EINVAL; @@ -302,7 +303,7 @@ static int st_sensors_set_drdy_int_pin(struct iio_dev *indio_dev, sdata->drdy_int_pin = 1; break; case 2: - if (sdata->sensor_settings->drdy_irq.mask_int2 == 0) { + if (!sdata->sensor_settings->drdy_irq.int2.mask) { dev_err(&indio_dev->dev, "DRDY on INT2 not available.\n"); return -EINVAL; @@ -460,10 +461,11 @@ EXPORT_SYMBOL(st_sensors_init_sensor); int st_sensors_set_dataready_irq(struct iio_dev *indio_dev, bool enable) { int err; - u8 drdy_mask; + u8 drdy_addr, drdy_mask; struct st_sensor_data *sdata = iio_priv(indio_dev); - if (!sdata->sensor_settings->drdy_irq.addr) { + if (!sdata->sensor_settings->drdy_irq.int1.addr && + !sdata->sensor_settings->drdy_irq.int2.addr) { /* * there are some devices (e.g. LIS3MDL) where drdy line is * routed to a given pin and it is not possible to select a @@ -485,18 +487,20 @@ int st_sensors_set_dataready_irq(struct iio_dev *indio_dev, bool enable) goto st_accel_set_dataready_irq_error; } - if (sdata->drdy_int_pin == 1) - drdy_mask = sdata->sensor_settings->drdy_irq.mask_int1; - else - drdy_mask = sdata->sensor_settings->drdy_irq.mask_int2; + if (sdata->drdy_int_pin == 1) { + drdy_addr = sdata->sensor_settings->drdy_irq.int1.addr; + drdy_mask = sdata->sensor_settings->drdy_irq.int1.mask; + } else { + drdy_addr = sdata->sensor_settings->drdy_irq.int2.addr; + drdy_mask = sdata->sensor_settings->drdy_irq.int2.mask; + } /* Flag to the poll function that the hardware trigger is in use */ sdata->hw_irq_trigger = enable; /* Enable/Disable the interrupt generator for data ready. 
*/ - err = st_sensors_write_data_with_mask(indio_dev, - sdata->sensor_settings->drdy_irq.addr, - drdy_mask, (int)enable); + err = st_sensors_write_data_with_mask(indio_dev, drdy_addr, + drdy_mask, (int)enable); st_accel_set_dataready_irq_error: return err; diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c index 22c0c1732996..46991f7131d1 100644 --- a/drivers/iio/gyro/st_gyro_core.c +++ b/drivers/iio/gyro/st_gyro_core.c @@ -111,8 +111,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { .mask = 0x80, }, .drdy_irq = { - .addr = 0x22, - .mask_int2 = 0x08, + .int2 = { + .addr = 0x22, + .mask = 0x08, + }, /* * The sensor has IHL (active low) and open * drain settings, but only for INT1 and not @@ -184,8 +186,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { .mask = 0x80, }, .drdy_irq = { - .addr = 0x22, - .mask_int2 = 0x08, + .int2 = { + .addr = 0x22, + .mask = 0x08, + }, /* * The sensor has IHL (active low) and open * drain settings, but only for INT1 and not @@ -252,8 +256,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { .mask = 0x80, }, .drdy_irq = { - .addr = 0x22, - .mask_int2 = 0x08, + .int2 = { + .addr = 0x22, + .mask = 0x08, + }, /* * The sensor has IHL (active low) and open * drain settings, but only for INT1 and not diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c index ace72c57f53c..8745686f233c 100644 --- a/drivers/iio/magnetometer/st_magn_core.c +++ b/drivers/iio/magnetometer/st_magn_core.c @@ -362,8 +362,10 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = { .mask = 0x10, }, .drdy_irq = { - .addr = 0x62, - .mask_int1 = 0x01, + .int1 = { + .addr = 0x62, + .mask = 0x01, + }, .stat_drdy = { .addr = 0x67, .mask = 0x07, diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index 5f8358e23f5b..b8890e37a2d2 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -280,9 +280,14 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask = 0x04, }, .drdy_irq = { - .addr = 0x22, - .mask_int1 = 0x04, - .mask_int2 = 0x20, + .int1 = { + .addr = 0x22, + .mask = 0x04, + }, + .int2 = { + .addr = 0x22, + .mask = 0x20, + }, .addr_ihl = 0x22, .mask_ihl = 0x80, .addr_od = 0x22, @@ -338,9 +343,6 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .addr = 0x20, .mask = 0x04, }, - .drdy_irq = { - .addr = 0, - }, .multi_read_bit = true, .bootime = 2, }, @@ -391,9 +393,10 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask = 0x04, }, .drdy_irq = { - .addr = 0x23, - .mask_int1 = 0x01, - .mask_int2 = 0x00, + .int1 = { + .addr = 0x23, + .mask = 0x01, + }, .addr_ihl = 0x22, .mask_ihl = 0x80, .addr_od = 0x22, @@ -455,9 +458,10 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .mask = 0x02, }, .drdy_irq = { - .addr = 0x12, - .mask_int1 = 0x04, - .mask_int2 = 0x00, + .int1 = { + .addr = 0x12, + .mask = 0x04, + }, .addr_ihl = 0x12, .mask_ihl = 0x80, .addr_od = 0x12, @@ -614,7 +618,8 @@ int st_press_common_probe(struct iio_dev *indio_dev) press_data->odr = press_data->sensor_settings->odr.odr_avl[0].hz; /* Some devices don't support a data ready pin. 
*/ - if (!pdata && press_data->sensor_settings->drdy_irq.addr) + if (!pdata && (press_data->sensor_settings->drdy_irq.int1.addr || + press_data->sensor_settings->drdy_irq.int2.addr)) pdata = (struct st_sensors_platform_data *)&default_press_pdata; err = st_sensors_init_sensor(indio_dev, press_data->dev->platform_data); diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index ce0ef1c0a30a..e6c646d5d6d4 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -132,9 +132,8 @@ struct st_sensor_das { /** * struct st_sensor_data_ready_irq - ST sensor device data-ready interrupt - * @addr: address of the register. - * @mask_int1: mask to enable/disable IRQ on INT1 pin. - * @mask_int2: mask to enable/disable IRQ on INT2 pin. + * struct int1 - data-ready configuration register for INT1 pin. + * struct int2 - data-ready configuration register for INT2 pin. * @addr_ihl: address to enable/disable active low on the INT lines. * @mask_ihl: mask to enable/disable active low on the INT lines. * @addr_od: address to enable/disable Open Drain on the INT lines. @@ -145,9 +144,14 @@ struct st_sensor_das { * @en_mask: mask to write the on/off value for enable. */ struct st_sensor_data_ready_irq { - u8 addr; - u8 mask_int1; - u8 mask_int2; + struct { + u8 addr; + u8 mask; + } int1; + struct { + u8 addr; + u8 mask; + } int2; u8 addr_ihl; u8 mask_ihl; u8 addr_od; -- cgit v1.2.3 From 6733bab7bc09b67668028dab562caea1b4ff3c69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Aug 2017 10:59:16 -0700 Subject: irq_work: Map irq_work_queue_on() to irq_work_queue() in !SMP Commit 478850160636 ("irq_work: Implement remote queueing") provides irq_work_queue_on() only for SMP builds. However, providing it simplifies code that submits irq_work to lists of CPUs, eliminating the !SMP special cases. This commit therefore maps irq_work_queue_on() to irq_work_queue() in !SMP builds, while still validating the specified CPU. Signed-off-by: Paul E. McKenney --- include/linux/irq_work.h | 3 --- kernel/irq_work.c | 9 ++++++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 47b9ebd4a74f..d8c9876d5da4 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -33,10 +33,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } bool irq_work_queue(struct irq_work *work); - -#ifdef CONFIG_SMP bool irq_work_queue_on(struct irq_work *work, int cpu); -#endif void irq_work_tick(void); void irq_work_sync(struct irq_work *work); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index bcf107ce0854..9f20f6c72579 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void) */ } -#ifdef CONFIG_SMP /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. 
@@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); +#ifdef CONFIG_SMP + /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); @@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) arch_send_call_function_single_ipi(cpu); +#else /* #ifdef CONFIG_SMP */ + irq_work_queue(work); +#endif /* #else #ifdef CONFIG_SMP */ + return true; } -EXPORT_SYMBOL_GPL(irq_work_queue_on); -#endif /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) -- cgit v1.2.3 From 8055af0a4fddb45a8cd925fb9bc71f4b52628c9a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 6 Oct 2017 09:08:34 +0200 Subject: ACPI / PM: Remove stale function header acpi_dev_pm_get_node() isn't used or implemented, so remove it. Signed-off-by: Ulf Hansson Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 502af53ec012..3b89b4fe6812 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -868,17 +868,12 @@ int acpi_dev_runtime_suspend(struct device *dev); int acpi_dev_runtime_resume(struct device *dev); int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); -struct acpi_device *acpi_dev_pm_get_node(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); #else static inline int acpi_dev_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_dev_runtime_resume(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; } -static inline struct acpi_device *acpi_dev_pm_get_node(struct device *dev) -{ - return NULL; -} static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) { return -ENODEV; -- cgit v1.2.3 From cf4c950b87ee2f547ad3abd3aca6ae3f3eb3443f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 14:30:52 -0700 Subject: once: switch to new jump label API Switch the DO_ONCE() macro from the deprecated jump label API to the new one. The new one is more readable, and for DO_ONCE() it also makes the generated code more icache-friendly: now the one-time initialization code is placed out-of-line at the jump target, rather than at the inline fallthrough case. Acked-by: Hannes Frederic Sowa Signed-off-by: Eric Biggers Signed-off-by: David S. Miller --- include/linux/once.h | 6 +++--- lib/once.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index 9c98aaa87cbc..724724918e8b 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -5,7 +5,7 @@ #include bool __do_once_start(bool *done, unsigned long *flags); -void __do_once_done(bool *done, struct static_key *once_key, +void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags); /* Call a function exactly once. 
The idea of DO_ONCE() is to perform @@ -38,8 +38,8 @@ void __do_once_done(bool *done, struct static_key *once_key, ({ \ bool ___ret = false; \ static bool ___done = false; \ - static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \ - if (static_key_true(&___once_key)) { \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ ___ret = __do_once_start(&___done, &___flags); \ if (unlikely(___ret)) { \ diff --git a/lib/once.c b/lib/once.c index 05c8604627eb..831c5a6b0bb2 100644 --- a/lib/once.c +++ b/lib/once.c @@ -5,7 +5,7 @@ struct once_work { struct work_struct work; - struct static_key *key; + struct static_key_true *key; }; static void once_deferred(struct work_struct *w) @@ -14,11 +14,11 @@ static void once_deferred(struct work_struct *w) work = container_of(w, struct once_work, work); BUG_ON(!static_key_enabled(work->key)); - static_key_slow_dec(work->key); + static_branch_disable(work->key); kfree(work); } -static void once_disable_jump(struct static_key *key) +static void once_disable_jump(struct static_key_true *key) { struct once_work *w; @@ -51,7 +51,7 @@ bool __do_once_start(bool *done, unsigned long *flags) } EXPORT_SYMBOL(__do_once_start); -void __do_once_done(bool *done, struct static_key *once_key, +void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags) __releases(once_lock) { -- cgit v1.2.3 From 1d48b080bcce0a5e7d7aa2dbcdb35deefc188c3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Sep 2017 13:50:16 +0200 Subject: sched/debug: Rename task-state printing helpers Steve requested better names for the new task-state helper functions. So introduce the concept of task-state index for the printing and rename __get_task_state() to task_state_index() and __task_state_to_char() to task_index_to_char(). 
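With the new names, a tracer stores the compact index when recording and maps it back to the familiar one-letter code when printing; a minimal sketch of the two call sites:

	/* at sched_switch time: store the compact task-state index */
	entry->prev_state = task_state_index(prev);

	/* when printing: map the index back to 'R', 'S', 'D', ... */
	S = task_index_to_char(field->prev_state);

task_state_to_char() remains the composition of the two helpers.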
Requested-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929115016.pzlqc7ss3ccystyg@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- fs/proc/array.c | 2 +- include/linux/sched.h | 6 +++--- include/trace/events/sched.h | 2 +- kernel/trace/trace_output.c | 12 ++++++------ kernel/trace/trace_sched_wakeup.c | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index 77a8eacbe032..1b5406ca8e4c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -137,7 +137,7 @@ static const char * const task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); - return task_state_array[__get_task_state(tsk)]; + return task_state_array[task_state_index(tsk)]; } static inline int get_task_umask(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index bdd6ad6fcce1..33a01f4deb00 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1248,7 +1248,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) -static inline unsigned int __get_task_state(struct task_struct *tsk) +static inline unsigned int task_state_index(struct task_struct *tsk) { unsigned int tsk_state = READ_ONCE(tsk->state); unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; @@ -1261,7 +1261,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) return fls(state); } -static inline char __task_state_to_char(unsigned int state) +static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; @@ -1272,7 +1272,7 @@ static inline char __task_state_to_char(unsigned int state) static inline char task_state_to_char(struct task_struct *tsk) { - return __task_state_to_char(__get_task_state(tsk)); + return task_index_to_char(task_state_index(tsk)); } /** diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 3c8b7f625670..fab74a12ca0f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -117,7 +117,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * if (preempt) return TASK_STATE_MAX; - return __get_task_state(p); + return task_state_index(p); } #endif /* CREATE_TRACE_POINTS */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c738e764e2a5..90db994ac900 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = __task_state_to_char(field->next_state); - S = __task_state_to_char(field->prev_state); + T = task_index_to_char(field->next_state); + S = task_index_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = __task_state_to_char(field->prev_state); - T = __task_state_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d 
%d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = __task_state_to_char(field->prev_state); - T = __task_state_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0c331978b1a6..500f370d3bb1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = __get_task_state(prev); + entry->prev_state = task_state_index(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = __get_task_state(next); + entry->next_state = task_state_index(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = __get_task_state(curr); + entry->prev_state = task_state_index(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = __get_task_state(wakee); + entry->next_state = task_state_index(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) -- cgit v1.2.3 From 799ba82de01e7543f6b2042e1a739f3a20255f23 Mon Sep 17 00:00:00 2001 From: luca abeni Date: Thu, 7 Sep 2017 12:09:31 +0200 Subject: sched/deadline: Use C bitfields for the state flags Ask the compiler to use a single bit for storing true / false values, instead of wasting the size of a whole int value. Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected Assembly (similar to the Assembly code generated when explicitly accessing the bits with bitmasks, "&" and "|"). Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-5-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 33a01f4deb00..0f897dfc195e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -474,10 +474,10 @@ struct sched_dl_entity { * conditions between the inactive timer handler and the wakeup * code. */ - int dl_throttled; - int dl_boosted; - int dl_yielded; - int dl_non_contending; + int dl_throttled : 1; + int dl_boosted : 1; + int dl_yielded : 1; + int dl_non_contending : 1; /* * Bandwidth enforcement timer. Each -deadline task has its -- cgit v1.2.3 From ff0d4a9dc16b1f4c954f6407c233ab848bdfe8b0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 4 Oct 2017 17:49:00 +0200 Subject: sched/rt: Add a helper to test for a RT task This helper returns true if a task has elevated priority which is true for RT tasks (SCHED_RR and SCHED_FIFO) and also for SCHED_DEADLINE. 
A task which runs at RT priority due to PI-boosting is not considered as one with elevated priority. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Jens Axboe Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171004154901.26904-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/sched/rt.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index f93329aba31a..133001627ba1 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -17,6 +17,17 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } +static inline bool task_is_realtime(struct task_struct *tsk) +{ + int policy = tsk->policy; + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; + if (policy == SCHED_DEADLINE) + return true; + return false; +} + #ifdef CONFIG_RT_MUTEXES /* * Must hold either p->pi_lock or task_rq(p)->lock. -- cgit v1.2.3 From 36436440cd19f59f5be12a1b181d299af2725140 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 4 Oct 2017 17:49:01 +0200 Subject: block/ioprio: Use a helper to check for RT prio A side-effect to the old code is that now SCHED_DEADLINE is also recognized. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Jens Axboe Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171004154901.26904-2-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/ioprio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 8c1239020d79..2f19aab84a4a 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -2,6 +2,7 @@ #define IOPRIO_H #include +#include #include /* @@ -62,7 +63,7 @@ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; - else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) + else if (task_is_realtime(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; -- cgit v1.2.3 From 76f8507f7a6442215df19de74f07eabca2462f1e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 29 Sep 2017 19:06:38 +0300 Subject: locking/rwsem: Add down_read_killable() Similar to down_read() and down_write_killable(), add killable version of down_read(), based on __down_read_killable() function, added in previous patches. 
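A minimal usage sketch (illustrative only, not taken from this patch),
where 'sem' is a struct rw_semaphore owned by the caller:

	int ret = down_read_killable(&sem);
	if (ret)
		return ret;	/* -EINTR: killed while waiting */
	/* ... read-side critical section ... */
	up_read(&sem);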
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670119884.23930.2585570605960763239.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- include/asm-generic/rwsem.h | 10 ++++++++++ include/linux/rwsem.h | 1 + kernel/locking/rwsem.c | 16 ++++++++++++++++ 3 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index 6c6a2141f271..b2d68d2d198e 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -37,6 +37,16 @@ static inline void __down_read(struct rw_semaphore *sem) rwsem_down_read_failed(sem); } +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + } + + return 0; +} + static inline int __down_read_trylock(struct rw_semaphore *sem) { long tmp; diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 0ad7318ff299..6ac8ee5f15dd 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -111,6 +111,7 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) * lock for reading */ extern void down_read(struct rw_semaphore *sem); +extern int __must_check down_read_killable(struct rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 4d48b1c4870d..e53f7746d9fd 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,6 +28,22 @@ void __sched down_read(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read); +int __sched down_read_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_reader_owned(sem); + return 0; +} + +EXPORT_SYMBOL(down_read_killable); + /* * trylock for reading -- returns 1 if successful, 0 if contention */ -- cgit v1.2.3 From a8a217c22116eff6c120d753c9934089fb229af0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 19:25:27 +0100 Subject: locking/core: Remove {read,spin,write}_can_lock() Outside of the locking code itself, {read,spin,write}_can_lock() have no users in tree. Apparmor (the last remaining user of write_can_lock()) got moved over to lockdep by the previous patch. This patch removes the use of {read,spin,write}_can_lock() from the BUILD_LOCK_OPS macro, deferring to the trylock operation for testing the lock status, and subsequently removes the unused macros altogether. They aren't guaranteed to work in a concurrent environment and can give incorrect results in the case of qrwlock. 
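The removed probes invited check-then-act races; a sketch of the broken
pattern (illustrative only):

	if (write_can_lock(&lock))	/* answer can be stale immediately */
		write_lock(&lock);	/* may still block */

Any decision based on such a probe can be invalidated before it is acted
on, and for qrwlock the probe could misreport even the instantaneous
state, so write_trylock()/read_trylock() are the only meaningful tests.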
Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507055129-12300-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/spinlock.h | 10 ---------- arch/arc/include/asm/spinlock.h | 3 --- arch/arm/include/asm/spinlock.h | 6 ------ arch/blackfin/include/asm/spinlock.h | 10 ---------- arch/hexagon/include/asm/spinlock.h | 10 ---------- arch/ia64/include/asm/spinlock.h | 3 --- arch/m32r/include/asm/spinlock.h | 12 ------------ arch/metag/include/asm/spinlock_lnkget.h | 30 ------------------------------ arch/metag/include/asm/spinlock_lock1.h | 20 -------------------- arch/mn10300/include/asm/spinlock.h | 12 ------------ arch/parisc/include/asm/spinlock.h | 18 ------------------ arch/powerpc/include/asm/spinlock.h | 3 --- arch/s390/include/asm/spinlock.h | 12 ------------ arch/sh/include/asm/spinlock-cas.h | 12 ------------ arch/sh/include/asm/spinlock-llsc.h | 12 ------------ arch/sparc/include/asm/spinlock_32.h | 3 --- arch/tile/include/asm/spinlock_32.h | 16 ---------------- arch/tile/include/asm/spinlock_64.h | 18 ------------------ arch/xtensa/include/asm/spinlock.h | 2 -- include/asm-generic/qrwlock.h | 20 -------------------- include/linux/rwlock.h | 3 --- include/linux/spinlock.h | 11 ----------- include/linux/spinlock_up.h | 3 --- kernel/locking/spinlock.c | 6 ++---- 24 files changed, 2 insertions(+), 253 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index 718ac0b64adf..7bff6316b8bb 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -54,16 +54,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) /***********************************************************/ -static inline int arch_read_can_lock(arch_rwlock_t *lock) -{ - return (lock->lock & 1) == 0; -} - -static inline int arch_write_can_lock(arch_rwlock_t *lock) -{ - return lock->lock == 0; -} - static inline void arch_read_lock(arch_rwlock_t *lock) { long regx; diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index 47efc8451b70..ce9bfcf1d870 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -410,9 +410,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #endif -#define arch_read_can_lock(x) ((x)->counter > 0) -#define arch_write_can_lock(x) ((x)->counter == __ARCH_RW_LOCK_UNLOCKED__) - #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index c030143c18c6..f5223260c73e 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -193,9 +193,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) dsb_sev(); } -/* write_can_lock - would write_trylock() succeed? */ -#define arch_write_can_lock(x) (ACCESS_ONCE((x)->lock) == 0) - /* * Read locks are a bit more hairy: * - Exclusively load the lock value. @@ -273,9 +270,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) } } -/* read_can_lock - would read_trylock() succeed? 
*/ -#define arch_read_can_lock(x) (ACCESS_ONCE((x)->lock) < 0x80000000) - #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index f6431439d15d..607ef98c0f6c 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -48,16 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __raw_spin_unlock_asm(&lock->lock); } -static inline int arch_read_can_lock(arch_rwlock_t *rw) -{ - return __raw_uncached_fetch_asm(&rw->lock) > 0; -} - -static inline int arch_write_can_lock(arch_rwlock_t *rw) -{ - return __raw_uncached_fetch_asm(&rw->lock) == RW_LOCK_BIAS; -} - static inline void arch_read_lock(arch_rwlock_t *rw) { __raw_read_lock_asm(&rw->lock); diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index 53a8d5885887..9f9414b9c303 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -86,16 +86,6 @@ static inline int arch_read_trylock(arch_rwlock_t *lock) return temp; } -static inline int arch_read_can_lock(arch_rwlock_t *rwlock) -{ - return rwlock->lock == 0; -} - -static inline int arch_write_can_lock(arch_rwlock_t *rwlock) -{ - return rwlock->lock == 0; -} - /* Stuffs a -1 in the lock value? */ static inline void arch_write_lock(arch_rwlock_t *lock) { diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index df2c121164b8..c728dda59bd2 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -127,9 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -#define arch_read_can_lock(rw) (*(volatile int *)(rw) >= 0) -#define arch_write_can_lock(rw) (*(volatile int *)(rw) == 0) - #ifdef ASM_SUPPORTED static __always_inline void diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index a56825592b90..002601371b2f 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -137,18 +137,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) * semaphore.h for details. -ben */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((int)(x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - static inline void arch_read_lock(arch_rwlock_t *rw) { unsigned long tmp0, tmp1; diff --git a/arch/metag/include/asm/spinlock_lnkget.h b/arch/metag/include/asm/spinlock_lnkget.h index ad8436feed8d..6a932a952d53 100644 --- a/arch/metag/include/asm/spinlock_lnkget.h +++ b/arch/metag/include/asm/spinlock_lnkget.h @@ -136,21 +136,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) : "memory"); } -/* write_can_lock - would write_trylock() succeed? */ -static inline int arch_write_can_lock(arch_rwlock_t *rw) -{ - int ret; - - asm volatile ("LNKGETD %0, [%1]\n" - "CMP %0, #0\n" - "MOV %0, #1\n" - "XORNZ %0, %0, %0\n" - : "=&d" (ret) - : "da" (&rw->lock) - : "cc"); - return ret; -} - /* * Read locks are a bit more hairy: * - Exclusively load the lock value. @@ -224,21 +209,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) return tmp; } -/* read_can_lock - would read_trylock() succeed? 
*/ -static inline int arch_read_can_lock(arch_rwlock_t *rw) -{ - int tmp; - - asm volatile ("LNKGETD %0, [%1]\n" - "CMP %0, %2\n" - "MOV %0, #1\n" - "XORZ %0, %0, %0\n" - : "=&d" (tmp) - : "da" (&rw->lock), "bd" (0x80000000) - : "cc"); - return tmp; -} - #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/metag/include/asm/spinlock_lock1.h b/arch/metag/include/asm/spinlock_lock1.h index c630444cffe9..8ae12bfc8ad8 100644 --- a/arch/metag/include/asm/spinlock_lock1.h +++ b/arch/metag/include/asm/spinlock_lock1.h @@ -104,16 +104,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) rw->lock = 0; } -/* write_can_lock - would write_trylock() succeed? */ -static inline int arch_write_can_lock(arch_rwlock_t *rw) -{ - unsigned int ret; - - barrier(); - ret = rw->lock; - return (ret == 0); -} - /* * Read locks are a bit more hairy: * - Exclusively load the lock value. @@ -171,14 +161,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) return (ret < 0x80000000); } -/* read_can_lock - would read_trylock() succeed? */ -static inline int arch_read_can_lock(arch_rwlock_t *rw) -{ - unsigned int ret; - - barrier(); - ret = rw->lock; - return (ret < 0x80000000); -} - #endif /* __ASM_SPINLOCK_LOCK1_H */ diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index fe413b41df6c..54f75dac8094 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -98,18 +98,6 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lock, * read-locks. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((int)(x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - /* * On mn10300, we implement read-write locks as a 32-bit counter * with the high bit (sign) being the "contended" bit. diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 55bfe4affca3..136e1c9bb8a9 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -168,24 +168,6 @@ static __inline__ int arch_write_trylock(arch_rwlock_t *rw) return result; } -/* - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -static __inline__ int arch_read_can_lock(arch_rwlock_t *rw) -{ - return rw->counter >= 0; -} - -/* - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -static __inline__ int arch_write_can_lock(arch_rwlock_t *rw) -{ - return !rw->counter; -} - #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index edbe571bcc54..d83f4f755ad8 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -181,9 +181,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) * read-locks. 
*/ -#define arch_read_can_lock(rw) ((rw)->lock >= 0) -#define arch_write_can_lock(rw) (!(rw)->lock) - #ifdef CONFIG_PPC64 #define __DO_SIGN_EXTEND "extsw %0,%0\n" #define WRLOCK_TOKEN LOCK_TOKEN /* it's negative */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 8182b521c42f..dc9c58ed9e4c 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -110,18 +110,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) * read-locks. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((int)(x)->lock >= 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == 0) - extern int _raw_read_trylock_retry(arch_rwlock_t *lp); extern int _raw_write_trylock_retry(arch_rwlock_t *lp); diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index 5ed7dbbd94ff..315467834521 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -53,18 +53,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) * read-locks. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - static inline void arch_read_lock(arch_rwlock_t *rw) { unsigned old; diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index f77263aae760..06be4a55f3c4 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -89,18 +89,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) * read-locks. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_read_can_lock(x) ((x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - static inline void arch_read_lock(arch_rwlock_t *rw) { unsigned long tmp; diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 67345b2dc408..76986b8f7dad 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -190,9 +190,6 @@ static inline int __arch_read_trylock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -#define arch_read_can_lock(rw) (!((rw)->lock & 0xff)) -#define arch_write_can_lock(rw) (!(rw)->lock) - #endif /* !(__ASSEMBLY__) */ #endif /* __SPARC_SPINLOCK_H */ diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index cba8ba9b8da6..91d05f21cba9 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -79,22 +79,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define _RD_COUNT_SHIFT 24 #define _RD_COUNT_WIDTH 8 -/** - * arch_read_can_lock() - would read_trylock() succeed? - */ -static inline int arch_read_can_lock(arch_rwlock_t *rwlock) -{ - return (rwlock->lock << _RD_COUNT_WIDTH) == 0; -} - -/** - * arch_write_can_lock() - would write_trylock() succeed? 
- */ -static inline int arch_write_can_lock(arch_rwlock_t *rwlock) -{ - return rwlock->lock == 0; -} - /** * arch_read_lock() - acquire a read lock. */ diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h index 9a2c2d605752..c802f48badf4 100644 --- a/arch/tile/include/asm/spinlock_64.h +++ b/arch/tile/include/asm/spinlock_64.h @@ -93,24 +93,6 @@ static inline int arch_write_val_locked(int val) return val < 0; /* Optimize "val & __WRITE_LOCK_BIT". */ } -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -static inline int arch_read_can_lock(arch_rwlock_t *rw) -{ - return !arch_write_val_locked(rw->lock); -} - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -static inline int arch_write_can_lock(arch_rwlock_t *rw) -{ - return rw->lock == 0; -} - extern void __read_lock_failed(arch_rwlock_t *rw); static inline void arch_read_lock(arch_rwlock_t *rw) diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index 3bb49681ee24..d005af51e2e1 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -97,8 +97,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) * 0x80000000 one writer owns the rwlock, no other writers, no readers */ -#define arch_write_can_lock(x) ((x)->lock == 0) - static inline void arch_write_lock(arch_rwlock_t *rw) { unsigned long tmp; diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 7d026bf27713..50925327b0a8 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -52,24 +52,6 @@ extern void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts); extern void queued_write_lock_slowpath(struct qrwlock *lock); -/** - * queued_read_can_lock- would read_trylock() succeed? - * @lock: Pointer to queue rwlock structure - */ -static inline int queued_read_can_lock(struct qrwlock *lock) -{ - return !(atomic_read(&lock->cnts) & _QW_WMASK); -} - -/** - * queued_write_can_lock- would write_trylock() succeed? - * @lock: Pointer to queue rwlock structure - */ -static inline int queued_write_can_lock(struct qrwlock *lock) -{ - return !atomic_read(&lock->cnts); -} - /** * queued_read_trylock - try to acquire read lock of a queue rwlock * @lock : Pointer to queue rwlock structure @@ -169,8 +151,6 @@ static inline void queued_write_unlock(struct qrwlock *lock) * Remapping rwlock architecture specific functions to the corresponding * queue rwlock functions. */ -#define arch_read_can_lock(l) queued_read_can_lock(l) -#define arch_write_can_lock(l) queued_write_can_lock(l) #define arch_read_lock(l) queued_read_lock(l) #define arch_write_lock(l) queued_write_lock(l) #define arch_read_trylock(l) queued_read_trylock(l) diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index bc2994ed66e1..766c5ca5cbd1 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -50,9 +50,6 @@ do { \ # define do_raw_write_unlock(rwlock) do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) #endif -#define read_can_lock(rwlock) arch_read_can_lock(&(rwlock)->raw_lock) -#define write_can_lock(rwlock) arch_write_can_lock(&(rwlock)->raw_lock) - /* * Define the various rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. 
The various diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 69e079c5ff98..1e3e48041800 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -278,12 +278,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) 1 : ({ local_irq_restore(flags); 0; }); \ }) -/** - * raw_spin_can_lock - would raw_spin_trylock() succeed? - * @lock: the spinlock in question. - */ -#define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock)) - /* Include rwlock functions */ #include @@ -396,11 +390,6 @@ static __always_inline int spin_is_contended(spinlock_t *lock) return raw_spin_is_contended(&lock->rlock); } -static __always_inline int spin_can_lock(spinlock_t *lock) -{ - return raw_spin_can_lock(&lock->rlock); -} - #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) /* diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 612fb530af41..901cf8f44388 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -77,7 +77,4 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define arch_spin_is_contended(lock) (((void)(lock), 0)) -#define arch_read_can_lock(lock) (((void)(lock), 1)) -#define arch_write_can_lock(lock) (((void)(lock), 1)) - #endif /* __LINUX_SPINLOCK_UP_H */ diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..8fd48b5552a7 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -32,8 +32,6 @@ * include/linux/spinlock_api_smp.h */ #else -#define raw_read_can_lock(l) read_can_lock(l) -#define raw_write_can_lock(l) write_can_lock(l) /* * Some architectures can relax in favour of the CPU owning the lock. @@ -68,7 +66,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + while ((lock)->break_lock) \ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ @@ -88,7 +86,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + while ((lock)->break_lock) \ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ -- cgit v1.2.3 From a4c1887d4c1462b0ec5a8989f8ba3cdd9057a299 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 19:25:29 +0100 Subject: locking/arch: Remove dummy arch_{read,spin,write}_lock_flags() implementations The arch_{read,spin,write}_lock_flags() macros are simply mapped to the non-flags versions by the majority of architectures, so do this in core code and remove the dummy implementations. Also remove the implementation in spinlock_up.h, since all callers of do_raw_spin_lock_flags() call local_irq_save(flags) anyway. 
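The core-code fallback follows the usual opt-in pattern, as the hunks
below show: map to the non-flags version unless the architecture has
already defined the macro itself, e.g.:

	#ifndef arch_spin_lock_flags
	#define arch_spin_lock_flags(lock, flags)	arch_spin_lock(lock)
	#endif

Architectures with a genuinely flags-aware variant (ia64, mn10300,
parisc, powerpc, s390) keep their implementation and add the matching
'#define arch_spin_lock_flags arch_spin_lock_flags' marker.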
Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507055129-12300-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/spinlock.h | 4 ---- arch/arc/include/asm/spinlock.h | 4 ---- arch/arm/include/asm/spinlock.h | 5 ----- arch/arm64/include/asm/spinlock.h | 5 ----- arch/blackfin/include/asm/spinlock.h | 6 ------ arch/hexagon/include/asm/spinlock.h | 5 ----- arch/ia64/include/asm/spinlock.h | 5 +++-- arch/m32r/include/asm/spinlock.h | 4 ---- arch/metag/include/asm/spinlock.h | 5 ----- arch/metag/include/asm/spinlock_lnkget.h | 3 --- arch/mips/include/asm/spinlock.h | 3 --- arch/mn10300/include/asm/spinlock.h | 4 +--- arch/parisc/include/asm/spinlock.h | 4 +--- arch/powerpc/include/asm/spinlock.h | 4 +--- arch/s390/include/asm/spinlock.h | 4 +--- arch/sh/include/asm/spinlock-cas.h | 4 ---- arch/sh/include/asm/spinlock-llsc.h | 4 ---- arch/sparc/include/asm/spinlock_32.h | 4 ---- arch/sparc/include/asm/spinlock_64.h | 3 --- arch/tile/include/asm/spinlock_32.h | 6 ------ arch/tile/include/asm/spinlock_64.h | 6 ------ arch/x86/include/asm/spinlock.h | 3 --- arch/xtensa/include/asm/spinlock.h | 5 ----- include/asm-generic/qspinlock.h | 1 - include/linux/rwlock.h | 9 +++++++++ include/linux/spinlock.h | 4 ++++ include/linux/spinlock_up.h | 8 -------- 27 files changed, 20 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index 7bff6316b8bb..3e2b4a05cb0f 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -13,7 +13,6 @@ * We make no fairness assumptions. They have a cost. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_spin_is_locked(x) ((x)->lock != 0) static inline int arch_spin_value_unlocked(arch_spinlock_t lock) @@ -160,7 +159,4 @@ static inline void arch_write_unlock(arch_rwlock_t * lock) lock->lock = 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ALPHA_SPINLOCK_H */ diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index f85bb585cdfc..2ba04a7db621 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -14,7 +14,6 @@ #include #define arch_spin_is_locked(x) ((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #ifdef CONFIG_ARC_HAS_LLSC @@ -410,7 +409,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #endif -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index d40a28fcbc62..daa87212c9a1 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -52,8 +52,6 @@ static inline void dsb_sev(void) * memory. 
*/ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - static inline void arch_spin_lock(arch_spinlock_t *lock) { unsigned long tmp; @@ -270,7 +268,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) } } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index 1504f2b95c57..aa51a38e46e4 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -27,8 +27,6 @@ * instructions. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - static inline void arch_spin_lock(arch_spinlock_t *lock) { unsigned int tmp; @@ -303,9 +301,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) /* read_can_lock - would read_trylock() succeed? */ #define arch_read_can_lock(x) ((x)->lock < 0x80000000) -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index 3885d12d9939..839d1441af3a 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -36,8 +36,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) __raw_spin_lock_asm(&lock->lock); } -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - static inline int arch_spin_trylock(arch_spinlock_t *lock) { return __raw_spin_trylock_asm(&lock->lock); @@ -53,8 +51,6 @@ static inline void arch_read_lock(arch_rwlock_t *rw) __raw_read_lock_asm(&rw->lock); } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) - static inline int arch_read_trylock(arch_rwlock_t *rw) { return __raw_read_trylock_asm(&rw->lock); @@ -70,8 +66,6 @@ static inline void arch_write_lock(arch_rwlock_t *rw) __raw_write_lock_asm(&rw->lock); } -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - static inline int arch_write_trylock(arch_rwlock_t *rw) { return __raw_write_trylock_asm(&rw->lock); diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index 9f9414b9c303..48020863f53a 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -167,11 +167,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock) /* * SMP spinlocks are intended to allow only a single CPU at the lock */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - #define arch_spin_is_locked(x) ((x)->lock != 0) -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index ed1e6212e9de..35b31884863b 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -126,6 +126,7 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, { arch_spin_lock(lock); } +#define arch_spin_lock_flags arch_spin_lock_flags #ifdef ASM_SUPPORTED @@ -153,6 +154,7 @@ arch_read_lock_flags(arch_rwlock_t *lock, unsigned long flags) : "p6", "p7", "r2", "memory"); } +#define arch_read_lock_flags arch_read_lock_flags #define arch_read_lock(lock) arch_read_lock_flags(lock, 0) #else /* !ASM_SUPPORTED */ @@ -205,6 +207,7 @@ arch_write_lock_flags(arch_rwlock_t *lock, unsigned long 
flags) : "ar.ccv", "p6", "p7", "r2", "r29", "memory"); } +#define arch_write_lock_flags arch_write_lock_flags #define arch_write_lock(rw) arch_write_lock_flags(rw, 0) #define arch_write_trylock(rw) \ @@ -228,8 +231,6 @@ static inline void arch_write_unlock(arch_rwlock_t *x) #else /* !ASM_SUPPORTED */ -#define arch_write_lock_flags(l, flags) arch_write_lock(l) - #define arch_write_lock(l) \ ({ \ __u64 ia64_val, ia64_set_val = ia64_dep_mi(-1, 0, 31, 1); \ diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 6809a9bbd169..882203db8723 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -28,7 +28,6 @@ */ #define arch_spin_is_locked(x) (*(volatile int *)(&(x)->slock) <= 0) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) /** * arch_spin_trylock - Try spin lock and return a result @@ -305,7 +304,4 @@ static inline int arch_write_trylock(arch_rwlock_t *lock) return 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_M32R_SPINLOCK_H */ diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h index b5b4174cde5e..80e3e59172f2 100644 --- a/arch/metag/include/asm/spinlock.h +++ b/arch/metag/include/asm/spinlock.h @@ -15,9 +15,4 @@ * locked. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/metag/include/asm/spinlock_lnkget.h b/arch/metag/include/asm/spinlock_lnkget.h index d5c334ddfd62..5708ac0a9d09 100644 --- a/arch/metag/include/asm/spinlock_lnkget.h +++ b/arch/metag/include/asm/spinlock_lnkget.h @@ -209,7 +209,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) return tmp; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_LNKGET_H */ diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 4260d3f80d3a..ee81297d9117 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -13,7 +13,4 @@ #include #include -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_SPINLOCK_H */ diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 54f75dac8094..879cd0df53ba 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -84,6 +84,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lock, : "d" (flags), "a"(&lock->slock), "i"(EPSW_IE | MN10300_CLI_LEVEL) : "memory", "cc"); } +#define arch_spin_lock_flags arch_spin_lock_flags #ifdef __KERNEL__ @@ -171,9 +172,6 @@ static inline int arch_write_trylock(arch_rwlock_t *lock) return 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 136e1c9bb8a9..d66d7b1efc4e 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -31,6 +31,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t 
*x, cpu_relax(); mb(); } +#define arch_spin_lock_flags arch_spin_lock_flags static inline void arch_spin_unlock(arch_spinlock_t *x) { @@ -168,7 +169,4 @@ static __inline__ int arch_write_trylock(arch_rwlock_t *rw) return result; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index d83f4f755ad8..b9ebc3085fb7 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -161,6 +161,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) local_irq_restore(flags_dis); } } +#define arch_spin_lock_flags arch_spin_lock_flags static inline void arch_spin_unlock(arch_spinlock_t *lock) { @@ -299,9 +300,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) rw->lock = 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #define arch_spin_relax(lock) __spin_yield(lock) #define arch_read_relax(lock) __rw_yield(lock) #define arch_write_relax(lock) __rw_yield(lock) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 4eca60cc81e4..9fa855f91e55 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -81,6 +81,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lp, if (!arch_spin_trylock_once(lp)) arch_spin_lock_wait_flags(lp, flags); } +#define arch_spin_lock_flags arch_spin_lock_flags static inline int arch_spin_trylock(arch_spinlock_t *lp) { @@ -114,9 +115,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) extern int _raw_read_trylock_retry(arch_rwlock_t *lp); extern int _raw_write_trylock_retry(arch_rwlock_t *lp); -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - static inline int arch_read_trylock_once(arch_rwlock_t *rw) { int old = ACCESS_ONCE(rw->lock); diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index 295993c2598e..270ee4d3e25b 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -27,7 +27,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new */ #define arch_spin_is_locked(x) ((x)->lock <= 0) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) { @@ -90,7 +89,4 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) return __sl_cas(&rw->lock, RW_LOCK_BIAS, 0) == RW_LOCK_BIAS; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SH_SPINLOCK_CAS_H */ diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index a6f9edd15317..715595de286a 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -19,7 +19,6 @@ */ #define arch_spin_is_locked(x) ((x)->lock <= 0) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) /* * Simple spin lock operations. 
There are two variants, one clears IRQ's @@ -197,7 +196,4 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) return (oldval > (RW_LOCK_BIAS - 1)); } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* __ASM_SH_SPINLOCK_LLSC_H */ diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 9d9129efd5d6..12bf857b471e 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -182,10 +182,6 @@ static inline int __arch_read_trylock(arch_rwlock_t *rw) res; \ }) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -#define arch_read_lock_flags(rw, flags) arch_read_lock(rw) -#define arch_write_lock_flags(rw, flags) arch_write_lock(rw) - #endif /* !(__ASSEMBLY__) */ #endif /* __SPARC_SPINLOCK_H */ diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index 3b67705e1b74..99b6e1c4f630 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -13,9 +13,6 @@ #include #include -#define arch_read_lock_flags(p, f) arch_read_lock(p) -#define arch_write_lock_flags(p, f) arch_write_lock(p) - #endif /* !(__ASSEMBLY__) */ #endif /* !(__SPARC64_SPINLOCK_H) */ diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index 91d05f21cba9..fb5313d77315 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -51,9 +51,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) void arch_spin_lock(arch_spinlock_t *lock); -/* We cannot take an interrupt after getting a ticket, so don't enable them. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - int arch_spin_trylock(arch_spinlock_t *lock); static inline void arch_spin_unlock(arch_spinlock_t *lock) @@ -109,7 +106,4 @@ void arch_read_unlock(arch_rwlock_t *rwlock); */ void arch_write_unlock(arch_rwlock_t *rwlock); -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_TILE_SPINLOCK_32_H */ diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h index c802f48badf4..5b616ef642a8 100644 --- a/arch/tile/include/asm/spinlock_64.h +++ b/arch/tile/include/asm/spinlock_64.h @@ -75,9 +75,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) /* Try to get the lock, and return whether we succeeded. */ int arch_spin_trylock(arch_spinlock_t *lock); -/* We cannot take an interrupt after getting a ticket, so don't enable them. */ -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - /* * Read-write spinlocks, allowing multiple readers * but only one writer. 
@@ -138,7 +135,4 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) return 0; } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_TILE_SPINLOCK_64_H */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index a558c187f20c..c6a6adf0a5c5 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -41,7 +41,4 @@ #include -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index d005af51e2e1..c6e1290dcbb7 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -33,8 +33,6 @@ #define arch_spin_is_locked(x) ((x)->slock != 0) -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) - static inline void arch_spin_lock(arch_spinlock_t *lock) { unsigned long tmp; @@ -198,7 +196,4 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) : "memory"); } -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - #endif /* _XTENSA_SPINLOCK_H */ diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 66260777d644..b37b4ad7eb94 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -121,6 +121,5 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock) #define arch_spin_lock(l) queued_spin_lock(l) #define arch_spin_trylock(l) queued_spin_trylock(l) #define arch_spin_unlock(l) queued_spin_unlock(l) -#define arch_spin_lock_flags(l, f) queued_spin_lock(l) #endif /* __ASM_GENERIC_QSPINLOCK_H */ diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 766c5ca5cbd1..3dcd617e65ae 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -38,6 +38,15 @@ do { \ extern int do_raw_write_trylock(rwlock_t *lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else + +#ifndef arch_read_lock_flags +# define arch_read_lock_flags(lock, flags) arch_read_lock(lock) +#endif + +#ifndef arch_write_lock_flags +# define arch_write_lock_flags(lock, flags) arch_write_lock(lock) +#endif + # define do_raw_read_lock(rwlock) do {__acquire(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) # define do_raw_read_lock_flags(lock, flags) \ do {__acquire(lock); arch_read_lock_flags(&(lock)->raw_lock, *(flags)); } while (0) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 1e3e48041800..4e202b00dd66 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -165,6 +165,10 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) arch_spin_lock(&lock->raw_lock); } +#ifndef arch_spin_lock_flags +#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) +#endif + static inline void do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock) { diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 901cf8f44388..0ac9112c1bbe 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -32,14 +32,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) barrier(); } -static inline void -arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - local_irq_save(flags); - lock->slock = 0; - barrier(); -} - static inline int 
arch_spin_trylock(arch_spinlock_t *lock)
{
	char oldval = lock->slock;
-- cgit v1.2.3


From de8cd83e91bc3ee212b3e6ec6e4283af9e4ab269 Mon Sep 17 00:00:00 2001
From: Steve Grubb
Date: Mon, 2 Oct 2017 20:21:39 -0400
Subject: audit: Record fanotify access control decisions

The fanotify interface allows user space daemons to make access control
decisions. Under common criteria requirements, we need to optionally
record decisions based on policy. This patch adds a bit mask, FAN_AUDIT,
that a user space daemon can 'or' into the response decision, which tells
the kernel that it made a decision and that the decision should be
recorded.

It would be used something like this in user space code:

  response.response = FAN_DENY | FAN_AUDIT;
  write(fd, &response, sizeof(struct fanotify_response));

When the syscall ends, the audit system will record the decision as an
AUDIT_FANOTIFY auxiliary record to denote that this event occurred as the
result of an access control decision from fanotify rather than DAC or MAC
policy.

A sample event looks like this:

type=PATH msg=audit(1504310584.332:290): item=0 name="./evil-ls"
inode=1319561 dev=fc:03 mode=0100755 ouid=1000 ogid=1000 rdev=00:00
obj=unconfined_u:object_r:user_home_t:s0 nametype=NORMAL
type=CWD msg=audit(1504310584.332:290): cwd="/home/sgrubb"
type=SYSCALL msg=audit(1504310584.332:290): arch=c000003e syscall=2
success=no exit=-1 a0=32cb3fca90 a1=0 a2=43 a3=8 items=1 ppid=901 pid=959
auid=1000 uid=1000 gid=1000 euid=1000 suid=1000 fsuid=1000 egid=1000
sgid=1000 fsgid=1000 tty=pts1 ses=3 comm="bash" exe="/usr/bin/bash"
subj=unconfined_u:unconfined_r:unconfined_t: s0-s0:c0.c1023 key=(null)
type=FANOTIFY msg=audit(1504310584.332:290): resp=2

Prior to using the audit flag, the developer needs to call
fanotify_init(), or'ing in FAN_ENABLE_AUDIT to ensure that the kernel
supports auditing. The calling process must also have the
CAP_AUDIT_WRITE capability.
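A fuller daemon-side sketch (illustrative only; the event-reading loop
and error handling are omitted, and 'metadata' stands for the
fanotify_event_metadata being answered):

	int fd = fanotify_init(FAN_CLOEXEC | FAN_CLASS_CONTENT |
			       FAN_ENABLE_AUDIT, O_RDONLY | O_LARGEFILE);
	if (fd < 0)
		/* EINVAL can mean the kernel lacks FAN_ENABLE_AUDIT support */
		err(1, "fanotify_init");

	struct fanotify_response response = {
		.fd = metadata->fd,
		.response = FAN_DENY | FAN_AUDIT,
	};
	write(fd, &response, sizeof(response));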
Signed-off-by: sgrubb Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 8 +++++++- fs/notify/fanotify/fanotify_user.c | 16 +++++++++++++++- fs/notify/fdinfo.c | 3 +++ include/linux/audit.h | 10 ++++++++++ include/linux/fsnotify_backend.h | 1 + include/uapi/linux/audit.h | 1 + include/uapi/linux/fanotify.h | 3 +++ kernel/auditsc.c | 6 ++++++ 8 files changed, 46 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 2fa99aeaa095..1968d21a3f37 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "fanotify.h" @@ -78,7 +79,7 @@ static int fanotify_get_response(struct fsnotify_group *group, fsnotify_finish_user_wait(iter_info); out: /* userspace responded, convert to something usable */ - switch (event->response) { + switch (event->response & ~FAN_AUDIT) { case FAN_ALLOW: ret = 0; break; @@ -86,6 +87,11 @@ out: default: ret = -EPERM; } + + /* Check if the response should be audited */ + if (event->response & FAN_AUDIT) + audit_fanotify(event->response & ~FAN_AUDIT); + event->response = 0; pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 907a481ac781..0455ea729384 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -179,7 +179,7 @@ static int process_access_response(struct fsnotify_group *group, * userspace can send a valid response or we will clean it up after the * timeout */ - switch (response) { + switch (response & ~FAN_AUDIT) { case FAN_ALLOW: case FAN_DENY: break; @@ -190,6 +190,9 @@ static int process_access_response(struct fsnotify_group *group, if (fd < 0) return -EINVAL; + if ((response & FAN_AUDIT) && !group->fanotify_data.audit) + return -EINVAL; + event = dequeue_event(group, fd); if (!event) return -ENOENT; @@ -721,7 +724,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; +#ifdef CONFIG_AUDITSYSCALL + if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT)) +#else if (flags & ~FAN_ALL_INIT_FLAGS) +#endif return -EINVAL; if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) @@ -805,6 +812,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; } + if (flags & FAN_ENABLE_AUDIT) { + fd = -EPERM; + if (!capable(CAP_AUDIT_WRITE)) + goto out_destroy_group; + group->fanotify_data.audit = true; + } + fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); if (fd < 0) goto out_destroy_group; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index dd63aa9a6f9a..645ab561e790 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -156,6 +156,9 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f) if (group->fanotify_data.max_marks == UINT_MAX) flags |= FAN_UNLIMITED_MARKS; + if (group->fanotify_data.audit) + flags |= FAN_ENABLE_AUDIT; + seq_printf(m, "fanotify flags:%x event-flags:%x\n", flags, group->fanotify_data.f_flags); diff --git a/include/linux/audit.h b/include/linux/audit.h index cb708eb8accc..d66220dac364 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -356,6 +356,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern 
void __audit_mmap_fd(int fd, int flags); extern void __audit_log_kern_module(char *name); +extern void __audit_fanotify(unsigned int response); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -452,6 +453,12 @@ static inline void audit_log_kern_module(char *name) __audit_log_kern_module(name); } +static inline void audit_fanotify(unsigned int response) +{ + if (!audit_dummy_context()) + __audit_fanotify(response); +} + extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ @@ -568,6 +575,9 @@ static inline void audit_log_kern_module(char *name) { } +static inline void audit_fanotify(unsigned int response) +{ } + static inline void audit_ptrace(struct task_struct *t) { } #define audit_n_rules 0 diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c6c69318752b..4a474f972910 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -190,6 +190,7 @@ struct fsnotify_group { int f_flags; unsigned int max_marks; struct user_struct *user; + bool audit; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 0714a66f0e0c..221f8b7f01b2 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -112,6 +112,7 @@ #define AUDIT_FEATURE_CHANGE 1328 /* audit log listing feature changes */ #define AUDIT_REPLACE 1329 /* Replace auditd if this packet unanswerd */ #define AUDIT_KERN_MODULE 1330 /* Kernel Module events */ +#define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 030508d195d3..5dda19a9a947 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -35,6 +35,7 @@ #define FAN_UNLIMITED_QUEUE 0x00000010 #define FAN_UNLIMITED_MARKS 0x00000020 +#define FAN_ENABLE_AUDIT 0x00000040 #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \ FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE |\ @@ -99,6 +100,8 @@ struct fanotify_response { /* Legit userspace responses to a _PERM event */ #define FAN_ALLOW 0x01 #define FAN_DENY 0x02 +#define FAN_AUDIT 0x10 /* Bit mask to create audit record for result */ + /* No fd set in event */ #define FAN_NOFD -1 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ecc23e25c9eb..9c723e978245 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2390,6 +2390,12 @@ void __audit_log_kern_module(char *name) context->type = AUDIT_KERN_MODULE; } +void __audit_fanotify(unsigned int response) +{ + audit_log(current->audit_context, GFP_KERNEL, + AUDIT_FANOTIFY, "resp=%u", response); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; -- cgit v1.2.3 From 8264c3214f28b52b399d9e03bfa7feec275a0d71 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Mon, 9 Oct 2017 13:34:41 +0300 Subject: writeback: merge try_to_writeback_inodes_sb_nr() into caller Since commit 925a6efb8ff0c ("Btrfs: stop using try_to_writeback_inodes_sb_nr to flush delalloc") this function hasn't been used outside so stop exporting it. In addition we merge it into try_to_writeback_inodes_sb() which is the only caller. Also change return type of try_to_writeback_inodes_sb to void as the only user ext4 doesn't care. 
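The surviving call site then reduces to a bare call with nothing to
check, along these lines (the reason value shown is the one ext4 passes;
illustrative):

	try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);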
Reviewed-by: Jan Kara Signed-off-by: Rakesh Pandit Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 30 ++++++------------------------ include/linux/writeback.h | 4 +--- 2 files changed, 7 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9e24d604c59c..08f5debd07d1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2376,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) EXPORT_SYMBOL(writeback_inodes_sb); /** - * try_to_writeback_inodes_sb_nr - try to start writeback if none underway + * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock - * @nr: the number of pages to write - * @reason: the reason of writeback + * @reason: reason why some writeback work was initiated * - * Invoke writeback_inodes_sb_nr if no writeback is currently underway. - * Returns 1 if writeback was started, 0 if not. + * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. */ -bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, - enum wb_reason reason) +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) - return false; + return; - __writeback_inodes_sb_nr(sb, nr, reason, true); + __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); - return true; -} -EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); - -/** - * try_to_writeback_inodes_sb - try to start writeback if none underway - * @sb: the superblock - * @reason: reason why some writeback work was initiated - * - * Implement by try_to_writeback_inodes_sb_nr() - * Returns 1 if writeback was started, 0 if not. - */ -bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) -{ - return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index dd1d2c23f743..e15ec14085ad 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -163,9 +163,7 @@ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, -- cgit v1.2.3 From 46dde01f6bab35d99af111fcc02ca3ee1146050f Mon Sep 17 00:00:00 2001 From: Kamal Dasu Date: Tue, 22 Aug 2017 16:45:21 -0400 Subject: mtd: spi-nor: add spi_nor_init() function This patch extracts some chunks from spi_nor_init_params() and spi_nor_scan() and moves them into a new spi_nor_init() function. spi_nor_init() groups all the SPI flash commands that must be sent to the SPI flash memory before performing any runtime operation (Fast Read, Page Program, Sector Erase, ...). Hence spi_nor_init(): 1) removes the flash protection if applicable for certain vendors. 2) sets the Quad Enable bit, if needed, before using Quad SPI protocols.
3) makes the memory enter its (stateful) 4-byte address mode, if needed, for SPI flash memories larger than 128 Mbits that do not support the 4-byte address instruction set. spi_nor_scan() now ends by calling spi_nor_init() once the probe phase has completed. Further patches could also use spi_nor_init() to implement the mtd->_resume() handler for the spi-nor framework; a sketch of such a handler follows.
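(Illustration, not part of the original patch: a minimal sketch of what such a resume hook inside the spi-nor core could look like, simply re-running the one-time init sequence after a power cycle. The function name and its wiring into mtd->_resume are assumptions here, not code from this commit; mtd_to_spi_nor() is the container_of helper the core uses internally.)

	/* Hypothetical resume hook: re-apply block unprotect, Quad Enable and
	 * 4-byte address mode, all of which a power loss during suspend can
	 * reset to the chip's default state. */
	static void example_spi_nor_resume(struct mtd_info *mtd)
	{
		struct spi_nor *nor = mtd_to_spi_nor(mtd);
		int ret;

		ret = spi_nor_init(nor);
		if (ret)
			dev_err(nor->dev, "resume() failed\n");
	}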
Signed-off-by: Kamal Dasu Signed-off-by: Cyrille Pitchen --- drivers/mtd/spi-nor/spi-nor.c | 56 +++++++++++++++++++++++++++++-------------- include/linux/mtd/spi-nor.h | 10 ++++++++ 2 files changed, 48 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 33f6fc1b0eab..04751a73c3fe 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -2629,14 +2629,44 @@ static int spi_nor_setup(struct spi_nor *nor, const struct flash_info *info, /* Enable Quad I/O if needed. */ enable_quad_io = (spi_nor_get_protocol_width(nor->read_proto) == 4 || spi_nor_get_protocol_width(nor->write_proto) == 4); - if (enable_quad_io && params->quad_enable) { - err = params->quad_enable(nor); + if (enable_quad_io && params->quad_enable) + nor->quad_enable = params->quad_enable; + else + nor->quad_enable = NULL; + + return 0; +} + +static int spi_nor_init(struct spi_nor *nor) +{ + int err; + + /* + * Atmel, SST, Intel/Numonyx, and others serial NOR tend to power up + * with the software protection bits set + */ + if (JEDEC_MFR(nor->info) == SNOR_MFR_ATMEL || + JEDEC_MFR(nor->info) == SNOR_MFR_INTEL || + JEDEC_MFR(nor->info) == SNOR_MFR_SST || + nor->info->flags & SPI_NOR_HAS_LOCK) { + write_enable(nor); + write_sr(nor, 0); + spi_nor_wait_till_ready(nor); + } + + if (nor->quad_enable) { + err = nor->quad_enable(nor); if (err) { dev_err(nor->dev, "quad mode not supported\n"); return err; } } + if ((nor->addr_width == 4) && + (JEDEC_MFR(nor->info) != SNOR_MFR_SPANSION) && + !(nor->info->flags & SPI_NOR_4B_OPCODES)) + set_4byte(nor, nor->info, 1); + return 0; } @@ -2707,20 +2737,6 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, if (ret) return ret; - /* - * Atmel, SST, Intel/Numonyx, and others serial NOR tend to power up - * with the software protection bits set - */ - - if (JEDEC_MFR(info) == SNOR_MFR_ATMEL || - JEDEC_MFR(info) == SNOR_MFR_INTEL || - JEDEC_MFR(info) == SNOR_MFR_SST || - info->flags & SPI_NOR_HAS_LOCK) { - write_enable(nor); - write_sr(nor, 0); - spi_nor_wait_till_ready(nor); - } - if (!mtd->name) mtd->name = dev_name(dev); mtd->priv = nor; @@ -2803,8 +2819,6 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, if (JEDEC_MFR(info) == SNOR_MFR_SPANSION || info->flags & SPI_NOR_4B_OPCODES) spi_nor_set_4byte_opcodes(nor, info); - else - set_4byte(nor, info, 1); } else { nor->addr_width = 3; } @@ -2821,6 +2835,12 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, return ret; } + /* Send all the required SPI flash commands to initialize device */ + nor->info = info; + ret = spi_nor_init(nor); + if (ret) + return ret; + dev_info(dev, "%s (%lld Kbytes)\n", info->name, (long long)mtd->size >> 10); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 1f0a7fc7772f..d0c66a0975cf 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -231,11 +231,18 @@ enum spi_nor_option_flags { SNOR_F_USE_CLSR = BIT(5), }; +/** + * struct flash_info - Forward declaration of a structure used internally by + * spi_nor_scan() + */ +struct flash_info; + /** * struct spi_nor - Structure for defining a the SPI NOR layer * @mtd: point to a mtd_info structure * @lock: the lock for the read/write/erase/lock/unlock operations * @dev: point to a spi device, or a spi nor controller device. + * @info: spi-nor part JDEC MFR id and other info * @page_size: the page size of the SPI NOR * @addr_width: number of address bytes * @erase_opcode: the opcode for erasing a sector @@ -262,6 +269,7 @@ enum spi_nor_option_flags { * @flash_lock: [FLASH-SPECIFIC] lock a region of the SPI NOR * @flash_unlock: [FLASH-SPECIFIC] unlock a region of the SPI NOR * @flash_is_locked: [FLASH-SPECIFIC] check if a region of the SPI NOR is + * @quad_enable: [FLASH-SPECIFIC] enables SPI NOR quad mode * completely locked * @priv: the private data */ @@ -269,6 +277,7 @@ struct spi_nor { struct mtd_info mtd; struct mutex lock; struct device *dev; + const struct flash_info *info; u32 page_size; u8 addr_width; u8 erase_opcode; @@ -296,6 +305,7 @@ struct spi_nor { int (*flash_lock)(struct spi_nor *nor, loff_t ofs, uint64_t len); int (*flash_unlock)(struct spi_nor *nor, loff_t ofs, uint64_t len); int (*flash_is_locked)(struct spi_nor *nor, loff_t ofs, uint64_t len); + int (*quad_enable)(struct spi_nor *nor); void *priv; }; -- cgit v1.2.3 From e7bf8249e8f1bac64885eeccb55bcf6111901a81 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:10 -0700 Subject: bpf: encapsulate verifier log state into a structure Put the loose log_* variables into a structure. This will make it simpler to remove the global verifier state in following patches. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 13 ++++++++++ kernel/bpf/verifier.c | 57 +++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b8d200f60a40..163541ba70d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,6 +115,19 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +struct bpf_verifer_log { + u32 level; + char *kbuf; + char __user *ubuf; + u32 len_used; + u32 len_total; +}; + +static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) +{ + return log->len_used >= log->len_total - 1; +} + struct bpf_verifier_env; struct bpf_ext_analyzer_ops { int (*insn_hook)(struct bpf_verifier_env *env, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6352a88ca6d1..e53458b02249 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,8 +156,7 @@ struct bpf_call_arg_meta { /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ -static u32 log_level, log_size, log_len; -static char *log_buf; +static struct bpf_verifer_log verifier_log; static DEFINE_MUTEX(bpf_verifier_lock); @@ -167,13 +166,15 @@ static DEFINE_MUTEX(bpf_verifier_lock); */ static __printf(1, 2) void verbose(const char *fmt, ...)
{ + struct bpf_verifer_log *log = &verifier_log; va_list args; - if (log_level == 0 || log_len >= log_size - 1) + if (!log->level || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + log->len_used += vscnprintf(log->kbuf + log->len_used, + log->len_total - log->len_used, fmt, args); va_end(args); } @@ -886,7 +887,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (log_level) + if (verifier_log.level) print_verifier_state(state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a @@ -2956,7 +2957,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } - if (log_level) + if (verifier_log.level) print_verifier_state(this_branch); return 0; } @@ -3712,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (log_level) { + if (verifier_log.level) { if (do_print_state) verbose("\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); @@ -3725,8 +3726,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (log_level > 1 || (log_level && do_print_state)) { - if (log_level > 1) + if (verifier_log.level > 1 || + (verifier_log.level && do_print_state)) { + if (verifier_log.level > 1) verbose("%d:", insn_idx); else verbose("\nfrom %d to %d:", @@ -3735,7 +3737,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (log_level) { + if (verifier_log.level) { verbose("%d: ", insn_idx); print_bpf_insn(env, insn); } @@ -4389,7 +4391,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - char __user *log_ubuf = NULL; + struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; int ret = -EINVAL; @@ -4414,23 +4416,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log_level = attr->log_level; - log_ubuf = (char __user *) (unsigned long) attr->log_buf; - log_size = attr->log_size; - log_len = 0; + log->level = attr->log_level; + log->ubuf = (char __user *) (unsigned long) attr->log_buf; + log->len_total = attr->log_size; + log->len_used = 0; ret = -EINVAL; - /* log_* values have to be sane */ - if (log_size < 128 || log_size > UINT_MAX >> 8 || - log_level == 0 || log_ubuf == NULL) + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) goto err_unlock; ret = -ENOMEM; - log_buf = vmalloc(log_size); - if (!log_buf) + log->kbuf = vmalloc(log->len_total); + if (!log->kbuf) goto err_unlock; } else { - log_level = 0; + log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4467,15 +4469,16 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log_level && log_len >= log_size - 1) { - BUG_ON(log_len >= log_size); + if (log->level && bpf_verifier_log_full(log)) { + BUG_ON(log->len_used >= log->len_total); /* verifier log exceeded user supplied buffer */ ret = -ENOSPC; /* fall through to return what was recorded */ } /* copy verifier log back to user space including trailing zero */ 
- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + if (log->level && copy_to_user(log->ubuf, log->kbuf, + log->len_used + 1) != 0) { ret = -EFAULT; goto free_log_buf; } @@ -4502,8 +4505,8 @@ skip_full_check: } free_log_buf: - if (log_level) - vfree(log_buf); + if (log->level) + vfree(log->kbuf); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ @@ -4540,7 +4543,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - log_level = 0; + verifier_log.level = 0; env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) -- cgit v1.2.3 From 61bd5218eef349fcacc4976a251bc83a4748b4af Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:11 -0700 Subject: bpf: move global verifier log into verifier environment The biggest piece of global state protected by the verifier lock is the verifier_log. Move that log to struct bpf_verifier_env. struct bpf_verifier_env now has to be passed to all invocations of verbose(). Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 2 + kernel/bpf/verifier.c | 491 +++++++++++++++++++++++-------------------- 2 files changed, 261 insertions(+), 232 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 163541ba70d9..5ddb9a626a51 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -152,6 +152,8 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + + struct bpf_verifer_log log; }; int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e53458b02249..a352f93cd4b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,20 +153,16 @@ struct bpf_call_arg_meta { int access_size; }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static struct bpf_verifer_log verifier_log; - static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...)
{ - struct bpf_verifer_log *log = &verifier_log; + struct bpf_verifer_log *log = &env->log; va_list args; if (!log->level || bpf_verifier_log_full(log)) @@ -214,7 +210,8 @@ static const char *func_id_name(int id) return "unknown"; } -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *state) { struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -225,21 +222,21 @@ static void print_verifier_state(struct bpf_verifier_state *state) t = reg->type; if (t == NOT_INIT) continue; - verbose(" R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose("%lld", reg->var_off.value + reg->off); + verbose(env, "%lld", reg->var_off.value + reg->off); } else { - verbose("(id=%d", reg->id); + verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(",off=%d", reg->off); + verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(",r=%d", reg->range); + verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(",ks=%d,vs=%d", + verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -247,38 +244,38 @@ static void print_verifier_state(struct bpf_verifier_state *state) * could be a pointer whose offset is too big * for reg->off */ - verbose(",imm=%llx", reg->var_off.value); + verbose(env, ",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", + verbose(env, ",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", + verbose(env, ",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(",umin_value=%llu", + verbose(env, ",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", + verbose(env, ",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + verbose(env, ",var_off=%s", tn_buf); } } - verbose(")"); + verbose(env, ")"); } } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, + verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } - verbose("\n"); + verbose(env, "\n"); } static const char *const bpf_class_string[] = { @@ -333,15 +330,15 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(const struct bpf_verifier_env *env, +static void print_bpf_end_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { - verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", insn->imm, insn->dst_reg); } -static void print_bpf_insn(const struct bpf_verifier_env *env, +static void print_bpf_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -349,23 +346,23 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose("BUG_alu64_%02x\n", insn->code); + verbose(env, "BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose("(%02x) r%d = %s-r%d\n", + verbose(env, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) %sr%d %s %sr%d\n", + verbose(env, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose("(%02x) %sr%d %s %s%d\n", + verbose(env, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -374,46 +371,46 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose("BUG_%02x\n", insn->code); + verbose(env, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_st_%02x\n", insn->code); + verbose(env, "BUG_st_%02x\n", insn->code); return; } - verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_ldx_%02x\n", insn->code); + verbose(env, "BUG_ldx_%02x\n", insn->code); return; } - verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose("(%02x) r0 = *(%s *)skb[%d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -428,36 +425,37 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (map_ptr && !env->allow_ptr_leaks) imm = 0; - verbose("(%02x) r%d = 0x%llx\n", insn->code, + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, insn->dst_reg, (unsigned long long)imm); } else { - verbose("BUG_ld_%02x\n", insn->code); + verbose(env, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %s#%d\n", 
insn->code, + verbose(env, "(%02x) call %s#%d\n", insn->code, func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose("(%02x) goto pc%+d\n", + verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose("(%02x) exit\n", insn->code); + verbose(env, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) if r%d %s r%d goto pc%+d\n", + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); } } @@ -496,7 +494,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose("BPF program is too complex\n"); + verbose(env, "BPF program is too complex\n"); goto err; } return &elem->st; @@ -534,10 +532,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_known_zero(regs, %u)\n", regno); + verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -647,10 +646,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_unknown(regs, %u)\n", regno); + verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -665,10 +665,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_not_init(regs, %u)\n", regno); + verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -677,22 +678,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(regs, i); + mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(regs, BPF_REG_FP); + mark_reg_known_zero(env, regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = 
PTR_TO_CTX; - mark_reg_known_zero(regs, BPF_REG_1); + mark_reg_known_zero(env, regs, BPF_REG_1); } enum reg_arg_type { @@ -726,26 +728,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *regs = env->cur_state.regs; if (regno >= MAX_BPF_REG) { - verbose("R%d is invalid\n", regno); + verbose(env, "R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); + verbose(env, "R%d !read_ok\n", regno); return -EACCES; } mark_reg_read(&env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose("frame pointer is read only\n"); + verbose(env, "frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(regs, regno); + mark_reg_unknown(env, regs, regno); } return 0; } @@ -770,7 +772,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_state *state, int off, +static int check_stack_write(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; @@ -783,7 +786,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } @@ -818,7 +821,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo } } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -828,12 +832,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, if (slot_type[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { if (slot_type[i] != STACK_SPILL) { - verbose("corrupted spill memory\n"); + verbose(env, "corrupted spill memory\n"); return -EACCES; } } @@ -849,14 +853,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } else { for (i = 0; i < size; i++) { if (slot_type[i] != STACK_MISC) { - verbose("invalid read from stack off %d+%d size %d\n", + verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -868,7 +872,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_map *map = env->cur_state.regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -887,8 +891,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 
regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (verifier_log.level) - print_verifier_state(state); + if (env->log.level) + print_verifier_state(env, state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our @@ -896,13 +900,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * will have a set floor within our range. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", regno); + verbose(env, "R%d min value is outside of the array range\n", + regno); return err; } @@ -911,13 +916,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) - verbose("R%d max value is outside of the array range\n", regno); + verbose(env, "R%d max value is outside of the array range\n", + regno); return err; } @@ -956,7 +962,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -979,13 +985,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe. 
*/ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_packet_access(env, regno, off, size); if (err) { - verbose("R%d offset is outside of the packet\n", regno); + verbose(env, "R%d offset is outside of the packet\n", regno); return err; } return err; @@ -1021,7 +1027,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return 0; } - verbose("invalid bpf_context access off=%d size=%d\n", off, size); + verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -1039,7 +1045,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1064,7 +1071,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + verbose(env, + "misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1072,7 +1080,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1087,7 +1096,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned %saccess off %s+%d+%d size %d\n", + verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -1108,7 +1117,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, /* Special case, because of NET_IP_ALIGN. Given metadata sits * right in front, treat it the very same way. 
*/ - return check_pkt_ptr_alignment(reg, off, size, strict); + return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1121,7 +1130,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, default: break; } - return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); + return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, + strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -1153,20 +1163,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into map\n", value_regno); + verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into ctx\n", value_regno); + verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } /* ctx accesses must be at a fixed offset, so that we can @@ -1176,7 +1186,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s off=%d size=%d", + verbose(env, + "variable ctx access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } @@ -1188,9 +1199,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); + mark_reg_known_zero(env, state->regs, + value_regno); state->regs[value_regno].id = 0; state->regs[value_regno].off = 0; state->regs[value_regno].range = 0; @@ -1206,13 +1218,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); + verbose(env, "invalid stack off=%d size=%d\n", off, + size); return -EACCES; } @@ -1223,29 +1236,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); + verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; } - err = check_stack_write(state, off, size, value_regno); + err = check_stack_write(env, state, off, size, + value_regno); } else { - err = check_stack_read(state, off, size, value_regno); + err = check_stack_read(env, state, off, size, + value_regno); } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose("cannot write into packet\n"); + verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into packet\n", value_regno); + verbose(env, "R%d leaks addr into packet\n", + value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else { - verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[reg->type]); + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str[reg->type]); return -EACCES; } @@ -1265,7 +1281,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose("BPF_XADD uses reserved fields\n"); + verbose(env, "BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -1280,7 +1296,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d leaks addr into mem\n", insn->src_reg); + verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } @@ -1321,7 +1337,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, register_is_null(regs[regno])) return 0; - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; @@ -1332,13 +1348,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); - verbose("invalid variable stack read R%d var_off=%s\n", + verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } off = regs[regno].off + 
regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { - verbose("invalid stack type R%d off=%d access_size=%d\n", + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } @@ -1354,7 +1370,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { - verbose("invalid indirect read from stack off %d+%d size %d\n", + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } @@ -1397,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose("R%d leaks addr into helper function\n", regno); + verbose(env, "R%d leaks addr into helper function\n", + regno); return -EACCES; } return 0; @@ -1405,7 +1422,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose("helper access to the packet is not allowed\n"); + verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } @@ -1443,7 +1460,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { - verbose("unsupported arg_type %d\n", arg_type); + verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -1461,7 +1478,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->key\n"); + verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1477,7 +1494,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->value\n"); + verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1497,7 +1514,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_SIZE cannot be first argument\n"); + verbose(env, + "ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1514,7 +1532,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } @@ -1528,7 +1546,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } @@ -1539,12 +1557,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; err_type: - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int 
check_map_func_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, int func_id) { if (!map) return 0; @@ -1632,7 +1651,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %s#%d\n", + verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1666,7 +1685,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) for (i = 0; i < MAX_BPF_REG; i++) if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(regs, i); + mark_reg_unknown(env, regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) @@ -1688,7 +1707,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } @@ -1696,13 +1716,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "unknown func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose("cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1716,7 +1737,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } @@ -1749,14 +1770,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1764,14 +1785,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(regs, BPF_REG_0); + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose("kernel subsystem misconfigured verifier\n"); + verbose(env, + "kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -1782,12 +1804,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) else if (insn_aux->map_ptr != meta.map_ptr) insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose("unknown return type %d 
of func %s#%d\n", + verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - err = check_map_func_compatibility(meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; @@ -1846,39 +1868,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad sbounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad ubounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad ubounds\n"); return -EINVAL; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (!env->allow_ptr_leaks) - verbose("R%d 32-bit pointer arithmetic prohibited\n", + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", dst); return -EACCES; } @@ -1943,7 +1968,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract pointer from scalar\n", + verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -1953,7 +1978,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, */ if (ptr_reg->type == PTR_TO_STACK) { if (!env->allow_ptr_leaks) - verbose("R%d subtraction from stack pointer prohibited\n", + verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -2008,13 +2033,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * ptr &= ~3 which would reduce min_value by 3.) */ if (!env->allow_ptr_leaks) - verbose("R%d bitwise operator %s on pointer prohibited\n", + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic with %s operator prohibited\n", + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -2180,7 +2205,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. 
*/ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -2208,7 +2233,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ @@ -2236,7 +2261,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -2268,12 +2293,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. */ if (!env->allow_ptr_leaks) { - verbose("R%d pointer %s pointer prohibited\n", + verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); return 0; } else { /* scalar += pointer @@ -2325,13 +2350,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: no src_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2349,14 +2374,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose("BPF_NEG uses reserved fields\n"); + verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose("BPF_END uses reserved fields\n"); + verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -2367,7 +2392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", + verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -2381,7 +2406,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -2391,7 +2416,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -2411,11 +2436,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d partial copy of pointer\n", + verbose(env, 
+ "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); @@ -2430,14 +2456,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose("invalid BPF_ALU opcode %x\n", opcode); + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -2446,7 +2472,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -2458,7 +2484,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose("div by zero\n"); + verbose(env, "div by zero\n"); return -EINVAL; } @@ -2467,7 +2493,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose("invalid shift %d\n", insn->imm); + verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -2820,13 +2846,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; if (opcode > BPF_JSLE) { - verbose("invalid BPF_JMP opcode %x\n", opcode); + verbose(env, "invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -2836,13 +2862,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer comparison prohibited\n", + verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -2954,11 +2980,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, find_good_pkt_pointers(this_branch, ®s[insn->src_reg], PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + verbose(env, "R%d pointer comparison prohibited\n", + insn->dst_reg); return -EACCES; } - if (verifier_log.level) - print_verifier_state(this_branch); + if (env->log.level) + print_verifier_state(env, this_branch); return 0; } @@ -2977,11 +3004,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose("invalid BPF_LD_IMM insn\n"); + verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose("BPF_LD_IMM64 uses reserved fields\n"); + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -3039,14 +3066,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) int i, err; if 
(!may_access_skb(env->prog->type)) { - verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); + verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -3056,7 +3083,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (regs[BPF_REG_6].type != PTR_TO_CTX) { - verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose(env, + "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -3069,7 +3097,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -3077,7 +3105,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); return 0; } @@ -3097,22 +3125,22 @@ static int check_return_code(struct bpf_verifier_env *env) reg = &env->cur_state.regs[BPF_REG_0]; if (reg->type != SCALAR_VALUE) { - verbose("At program exit the register R0 is not a known value (%s)\n", + verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); return -EINVAL; } if (!tnum_in(range, reg->var_off)) { - verbose("At program exit the register R0 "); + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("has value %s", tn_buf); + verbose(env, "has value %s", tn_buf); } else { - verbose("has unknown scalar value"); + verbose(env, "has unknown scalar value"); } - verbose(" should have been 0 or 1\n"); + verbose(env, " should have been 0 or 1\n"); return -EINVAL; } return 0; @@ -3178,7 +3206,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose("jump out of range from insn %d to %d\n", t, w); + verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -3195,13 +3223,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose("back-edge from insn %d to %d\n", t, w); + verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose("insn state internal bug\n"); + verbose(env, "insn state internal bug\n"); return -EFAULT; } return 0; @@ -3295,7 +3323,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose("pop stack internal bug\n"); + verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -3304,7 +3332,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose("unreachable insn %d\n", i); + verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -3685,7 +3713,7 @@ 
static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + init_reg_state(env, regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3694,7 +3722,7 @@ static int do_check(struct bpf_verifier_env *env) int err; if (insn_idx >= insn_cnt) { - verbose("invalid insn idx %d insn_cnt %d\n", + verbose(env, "invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } @@ -3703,7 +3731,8 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Processed %d insn\n", + verbose(env, + "BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -3713,12 +3742,12 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (verifier_log.level) { + if (env->log.level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", + verbose(env, "\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else - verbose("%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", insn_idx); } goto process_bpf_exit; } @@ -3726,19 +3755,18 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (verifier_log.level > 1 || - (verifier_log.level && do_print_state)) { - if (verifier_log.level > 1) - verbose("%d:", insn_idx); + if (env->log.level > 1 || (env->log.level && do_print_state)) { + if (env->log.level > 1) + verbose(env, "%d:", insn_idx); else - verbose("\nfrom %d to %d:", + verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + print_verifier_state(env, &env->cur_state); do_print_state = false; } - if (verifier_log.level) { - verbose("%d: ", insn_idx); + if (env->log.level) { + verbose(env, "%d: ", insn_idx); print_bpf_insn(env, insn); } @@ -3795,7 +3823,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. 
*/ - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -3835,14 +3863,14 @@ static int do_check(struct bpf_verifier_env *env) } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX)) { - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose("BPF_ST uses reserved fields\n"); + verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -3865,7 +3893,7 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_CALL uses reserved fields\n"); + verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -3878,7 +3906,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_JA uses reserved fields\n"); + verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -3890,7 +3918,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_EXIT uses reserved fields\n"); + verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -3905,7 +3933,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose("R0 leaks addr as return value\n"); + verbose(env, "R0 leaks addr as return value\n"); return -EACCES; } @@ -3940,19 +3968,19 @@ process_bpf_exit: insn_idx++; } else { - verbose("invalid BPF_LD mode\n"); + verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose("unknown insn class %d\n", class); + verbose(env, "unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } - verbose("processed %d insns, stack depth %d\n", - insn_processed, env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth %d\n", insn_processed, + env->prog->aux->stack_depth); return 0; } @@ -3964,7 +3992,8 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, struct bpf_prog *prog) { @@ -3975,12 +4004,12 @@ static int check_map_prog_compatibility(struct bpf_map *map, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose("perf_event programs can only use preallocated hash map\n"); + verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose("perf_event programs can only use preallocated inner hash map\n"); + verbose(env, "perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } @@ -4003,14 +4032,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose("BPF_LDX uses reserved fields\n"); + verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM 
&& BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose("BPF_STX uses reserved fields\n"); + verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } @@ -4021,7 +4050,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose("invalid bpf_ld_imm64 insn\n"); + verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -4030,19 +4059,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) goto next_insn; if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose("unrecognized bpf_ld_imm64 insn\n"); + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose("fd %d is not pointing to valid bpf_map\n", + verbose(env, "fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(map, env->prog); + err = check_map_prog_compatibility(env, map, env->prog); if (err) { fdput(f); return err; @@ -4164,7 +4194,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4212,7 +4242,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose("bpf verifier narrow ctx access misconfigured\n"); + verbose(env, "bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } @@ -4231,7 +4261,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4313,7 +4343,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4357,7 +4387,8 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, + "kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -4391,8 +4422,8 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; + struct bpf_verifer_log *log; int ret = -EINVAL; /* 'struct bpf_verifier_env' can be global, but since it's not small, @@ -4401,6 +4432,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -4419,7 +4451,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->level = attr->log_level; log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - log->len_used = 0; ret = -EINVAL; /* log attributes have to be sane */ @@ -4431,8 +4462,6 @@ int 
bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->kbuf = vmalloc(log->len_total); if (!log->kbuf) goto err_unlock; - } else { - log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4543,8 +4572,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - verifier_log.level = 0; - env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; -- cgit v1.2.3 From a2a7d5701052542cd2260e7659b12443e0a74733 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:15 -0700 Subject: bpf: write back the verifier log buffer as it gets filled Verifier log buffer can be quite large (up to 16MB currently). As Eric Dumazet points out if we allow multiple verification requests to proceed simultaneously, malicious user may use the verifier as a way of allocating large amounts of unswappable memory to OOM the host. Switch to a strategy of allocating a smaller buffer (1024B) and writing it out into the user buffer after every print. While at it remove the old BUG_ON(). This is in preparation of the global verifier lock removal. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 4 +++- kernel/bpf/verifier.c | 41 +++++++++++++++++++---------------------- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5ddb9a626a51..f00ef751c1c5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,9 +115,11 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +#define BPF_VERIFIER_TMP_LOG_SIZE 1024 + struct bpf_verifer_log { u32 level; - char *kbuf; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; char __user *ubuf; u32 len_used; u32 len_total; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 274c6582ec39..2cdbcc4f8f6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -165,15 +165,26 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, const char *fmt, ...) 
{ struct bpf_verifer_log *log = &env->log; + unsigned int n; va_list args; - if (!log->level || bpf_verifier_log_full(log)) + if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log->len_used += vscnprintf(log->kbuf + log->len_used, - log->len_total - log->len_used, fmt, args); + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); va_end(args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; } static bool type_is_pkt_pointer(enum bpf_reg_type type) @@ -4263,11 +4274,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || !log->level || !log->ubuf) goto err_unlock; - - ret = -ENOMEM; - log->kbuf = vmalloc(log->len_total); - if (!log->kbuf) - goto err_unlock; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4304,18 +4310,11 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log->level && bpf_verifier_log_full(log)) { - BUG_ON(log->len_used >= log->len_total); - /* verifier log exceeded user supplied buffer */ + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; - /* fall through to return what was recorded */ - } - - /* copy verifier log back to user space including trailing zero */ - if (log->level && copy_to_user(log->ubuf, log->kbuf, - log->len_used + 1) != 0) { + if (log->level && !log->ubuf) { ret = -EFAULT; - goto free_log_buf; + goto err_release_maps; } if (ret == 0 && env->used_map_cnt) { @@ -4326,7 +4325,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto free_log_buf; + goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -4339,9 +4338,7 @@ skip_full_check: convert_pseudo_ld_imm64(env); } -free_log_buf: - if (log->level) - vfree(log->kbuf); +err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. -- cgit v1.2.3 From a542f9a04d30570d367770e34f2c5d0c1d313337 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 17 Sep 2017 18:17:09 +0200 Subject: iio: st_sensors: split open-drain parameters for irq1 and irq2 Define st_sensor_int_drdy structure in st_sensor_data_ready_irq in order to contain irq line parameters of the device. Moreover separate data-ready open-drain configuration parameters for INT1 and INT2 pins in st_sensor_data_ready_irq data structure. That change will be used to properly support LIS3DHH accel sensor. 
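For illustration, an entry in a sensor's settings table can then describe open-drain support per interrupt pin along these lines (a sketch mirroring the register layout used by the entries updated below):

	.drdy_irq = {
		.int1 = {
			.addr = 0x22,		/* drdy enable register, INT1 */
			.mask = 0x04,
			.addr_od = 0x22,	/* open drain enable register, INT1 */
			.mask_od = 0x40,
		},
		.int2 = {
			.addr = 0x22,		/* drdy enable register, INT2 */
			.mask = 0x20,
			.addr_od = 0x22,	/* open drain enable register, INT2 */
			.mask_od = 0x40,
		},
	},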
Signed-off-by: Lorenzo Bianconi Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_core.c | 18 ++++++++++------ drivers/iio/common/st_sensors/st_sensors_core.c | 21 ++++++++++++++----- drivers/iio/pressure/st_pressure_core.c | 14 +++++++------ include/linux/iio/common/st_sensors.h | 28 ++++++++++++++----------- 4 files changed, 52 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index 731ec3a4c82b..c88db25a1aff 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -236,15 +236,17 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .int1 = { .addr = 0x22, .mask = 0x02, + .addr_od = 0x22, + .mask_od = 0x40, }, .int2 = { .addr = 0x22, .mask = 0x10, + .addr_od = 0x22, + .mask_od = 0x40, }, .addr_ihl = 0x22, .mask_ihl = 0x80, - .addr_od = 0x22, - .mask_od = 0x40, .stat_drdy = { .addr = ST_SENSORS_DEFAULT_STAT_ADDR, .mask = 0x07, @@ -468,15 +470,17 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .int1 = { .addr = 0x22, .mask = 0x04, + .addr_od = 0x22, + .mask_od = 0x40, }, .int2 = { .addr = 0x22, .mask = 0x20, + .addr_od = 0x22, + .mask_od = 0x40, }, .addr_ihl = 0x22, .mask_ihl = 0x80, - .addr_od = 0x22, - .mask_od = 0x40, .stat_drdy = { .addr = ST_SENSORS_DEFAULT_STAT_ADDR, .mask = 0x07, @@ -750,15 +754,17 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .int1 = { .addr = 0x23, .mask = 0x01, + .addr_od = 0x22, + .mask_od = 0x20, }, .int2 = { .addr = 0x24, .mask = 0x01, + .addr_od = 0x22, + .mask_od = 0x20, }, .addr_ihl = 0x22, .mask_ihl = 0x08, - .addr_od = 0x22, - .mask_od = 0x20, .stat_drdy = { .addr = ST_SENSORS_DEFAULT_STAT_ADDR, .mask = 0x01, diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index 6657160b5a73..40dfdfc0906b 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -319,7 +319,8 @@ static int st_sensors_set_drdy_int_pin(struct iio_dev *indio_dev, } if (pdata->open_drain) { - if (!sdata->sensor_settings->drdy_irq.addr_od) + if (!sdata->sensor_settings->drdy_irq.int1.addr_od && + !sdata->sensor_settings->drdy_irq.int2.addr_od) dev_err(&indio_dev->dev, "open drain requested but unsupported.\n"); else @@ -446,11 +447,21 @@ int st_sensors_init_sensor(struct iio_dev *indio_dev, } if (sdata->int_pin_open_drain) { + u8 addr, mask; + + if (sdata->drdy_int_pin == 1) { + addr = sdata->sensor_settings->drdy_irq.int1.addr_od; + mask = sdata->sensor_settings->drdy_irq.int1.mask_od; + } else { + addr = sdata->sensor_settings->drdy_irq.int2.addr_od; + mask = sdata->sensor_settings->drdy_irq.int2.mask_od; + } + dev_info(&indio_dev->dev, - "set interrupt line to open drain mode\n"); - err = st_sensors_write_data_with_mask(indio_dev, - sdata->sensor_settings->drdy_irq.addr_od, - sdata->sensor_settings->drdy_irq.mask_od, 1); + "set interrupt line to open drain mode on pin %d\n", + sdata->drdy_int_pin); + err = st_sensors_write_data_with_mask(indio_dev, addr, + mask, 1); if (err < 0) return err; } diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c index 15ad6054d9f6..349e5c713c03 100644 --- a/drivers/iio/pressure/st_pressure_core.c +++ b/drivers/iio/pressure/st_pressure_core.c @@ -283,15 +283,17 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .int1 = { .addr = 0x22, .mask = 0x04, + .addr_od = 
0x22, + .mask_od = 0x40, }, .int2 = { .addr = 0x22, .mask = 0x20, + .addr_od = 0x22, + .mask_od = 0x40, }, .addr_ihl = 0x22, .mask_ihl = 0x80, - .addr_od = 0x22, - .mask_od = 0x40, .stat_drdy = { .addr = ST_SENSORS_DEFAULT_STAT_ADDR, .mask = 0x03, @@ -404,11 +406,11 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .int1 = { .addr = 0x23, .mask = 0x01, + .addr_od = 0x22, + .mask_od = 0x40, }, .addr_ihl = 0x22, .mask_ihl = 0x80, - .addr_od = 0x22, - .mask_od = 0x40, .stat_drdy = { .addr = ST_SENSORS_DEFAULT_STAT_ADDR, .mask = 0x03, @@ -473,11 +475,11 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { .int1 = { .addr = 0x12, .mask = 0x04, + .addr_od = 0x12, + .mask_od = 0x40, }, .addr_ihl = 0x12, .mask_ihl = 0x80, - .addr_od = 0x12, - .mask_od = 0x40, .stat_drdy = { .addr = ST_SENSORS_DEFAULT_STAT_ADDR, .mask = 0x03, diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index e6c646d5d6d4..f9bd6e8ab138 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -130,32 +130,36 @@ struct st_sensor_das { u8 mask; }; +/** + * struct st_sensor_int_drdy - ST sensor device drdy line parameters + * @addr: address of INT drdy register. + * @mask: mask to enable drdy line. + * @addr_od: address to enable/disable Open Drain on the INT line. + * @mask_od: mask to enable/disable Open Drain on the INT line. + */ +struct st_sensor_int_drdy { + u8 addr; + u8 mask; + u8 addr_od; + u8 mask_od; +}; + /** * struct st_sensor_data_ready_irq - ST sensor device data-ready interrupt * struct int1 - data-ready configuration register for INT1 pin. * struct int2 - data-ready configuration register for INT2 pin. * @addr_ihl: address to enable/disable active low on the INT lines. * @mask_ihl: mask to enable/disable active low on the INT lines. - * @addr_od: address to enable/disable Open Drain on the INT lines. - * @mask_od: mask to enable/disable Open Drain on the INT lines. * struct stat_drdy - status register of DRDY (data ready) interrupt. * struct ig1 - represents the Interrupt Generator 1 of sensors. * @en_addr: address of the enable ig1 register. * @en_mask: mask to write the on/off value for enable. */ struct st_sensor_data_ready_irq { - struct { - u8 addr; - u8 mask; - } int1; - struct { - u8 addr; - u8 mask; - } int2; + struct st_sensor_int_drdy int1; + struct st_sensor_int_drdy int2; u8 addr_ihl; u8 mask_ihl; - u8 addr_od; - u8 mask_od; struct { u8 addr; u8 mask; -- cgit v1.2.3 From eca8b53a6769e60d6d8240d71202d73b0af81901 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:55:59 -0700 Subject: blk-stat: delete useless code Fix two issues: - the per-cpu stat flush is unnecessary, nobody uses per-cpu stat except sum it to global stat. We can do the calculation there. The flush just wastes cpu time. - some fields are signed int/s64. I don't see the point. 
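The mean can then be computed once, when a per-cpu stat is folded into the global one. A minimal sketch of the resulting merge, assuming src is a per-cpu stat whose batch field holds the plain sum of its samples and whose mean is never maintained:

	/* weighted merge of src into dst */
	dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
			    dst->nr_samples + src->nr_samples);
	dst->nr_samples += src->nr_samples;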
Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-stat.c | 45 +++++++-------------------------------------- include/linux/blk_types.h | 5 ++--- 2 files changed, 9 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/block/blk-stat.c b/block/blk-stat.c index c52356d90fe3..3a2f3c96f367 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -11,8 +11,6 @@ #include "blk-mq.h" #include "blk.h" -#define BLK_RQ_STAT_BATCH 64 - struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; @@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; -} - -static void blk_stat_flush_batch(struct blk_rq_stat *stat) -{ - const s32 nr_batch = READ_ONCE(stat->nr_batch); - const s32 nr_samples = READ_ONCE(stat->nr_samples); - - if (!nr_batch) - return; - if (!nr_samples) - stat->mean = div64_s64(stat->batch, nr_batch); - else { - stat->mean = div64_s64((stat->mean * nr_samples) + - stat->batch, - nr_batch + nr_samples); - } - - stat->nr_samples += nr_batch; - stat->nr_batch = stat->batch = 0; + stat->batch = 0; } +/* src is a per-cpu stat, mean isn't initialized */ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - blk_stat_flush_batch(src); - if (!src->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); - if (!dst->nr_samples) - dst->mean = src->mean; - else { - dst->mean = div64_s64((src->mean * src->nr_samples) + - (dst->mean * dst->nr_samples), - dst->nr_samples + src->nr_samples); - } + dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, + dst->nr_samples + src->nr_samples); + dst->nr_samples += src->nr_samples; } @@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); - - if (stat->batch + value < stat->batch || - stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); - stat->batch += value; - stat->nr_batch++; + stat->nr_samples++; } void blk_stat_add(struct request *rq) @@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq) struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket; - s64 now, value; + u64 now, value; now = __blk_stat_time(ktime_to_ns(ktime_get())); if (now < blk_stat_time(&rq->issue_stat)) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a2d2aa709cef..3385c89f402e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_rq_stat { - s64 mean; + u64 mean; u64 min; u64 max; - s32 nr_samples; - s32 nr_batch; + u32 nr_samples; u64 batch; }; -- cgit v1.2.3 From d59158162e032917a428704160a2063a02405ec6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 10 Oct 2017 15:51:37 -0700 Subject: tracing: Add support for preempt and irq enable/disable events Preempt and irq trace events can be used for tracing the start and end of an atomic section which can be used by a trace viewer like systrace to graphically view the start and end of an atomic section and correlate them with latencies and scheduling issues. This also serves as a prelude to using synthetic events or probes to rewrite the preempt and irqsoff tracers, along with numerous benefits of using trace events features for these events. 
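For example, assuming debugfs is mounted at /sys/kernel/debug, the new events can be exercised from tracefs like this:

	# echo 1 > /sys/kernel/debug/tracing/events/preemptirq/enable
	# cat /sys/kernel/debug/tracing/trace_pipe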
Link: http://lkml.kernel.org/r/20171006005432.14244-3-joelaf@google.com Link: http://lkml.kernel.org/r/20171010225137.17370-1-joelaf@google.com Cc: Peter Zijlstra Cc: kernel-team@android.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +- include/trace/events/preemptirq.h | 70 +++++++++++++++++++++++++++++++++++++++ kernel/trace/Kconfig | 11 ++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 35 +++++++++++++++++++- 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 include/trace/events/preemptirq.h (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 346f8294e40a..1f8545caa691 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -769,7 +769,8 @@ static inline unsigned long get_lock_parent_ip(void) static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif -#ifdef CONFIG_PREEMPT_TRACER +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) extern void trace_preempt_on(unsigned long a0, unsigned long a1); extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h new file mode 100644 index 000000000000..f5024c560d8f --- /dev/null +++ b/include/trace/events/preemptirq.h @@ -0,0 +1,70 @@ +#ifdef CONFIG_PREEMPTIRQ_EVENTS + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM preemptirq + +#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PREEMPTIRQ_H + +#include +#include +#include +#include + +DECLARE_EVENT_CLASS(preemptirq_template, + + TP_PROTO(unsigned long ip, unsigned long parent_ip), + + TP_ARGS(ip, parent_ip), + + TP_STRUCT__entry( + __field(u32, caller_offs) + __field(u32, parent_offs) + ), + + TP_fast_assign( + __entry->caller_offs = (u32)(ip - (unsigned long)_stext); + __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext); + ), + + TP_printk("caller=%pF parent=%pF", + (void *)((unsigned long)(_stext) + __entry->caller_offs), + (void *)((unsigned long)(_stext) + __entry->parent_offs)) +); + +#ifndef CONFIG_PROVE_LOCKING +DEFINE_EVENT(preemptirq_template, irq_disable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); + +DEFINE_EVENT(preemptirq_template, irq_enable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); +#endif + +#ifdef CONFIG_DEBUG_PREEMPT +DEFINE_EVENT(preemptirq_template, preempt_disable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); + +DEFINE_EVENT(preemptirq_template, preempt_enable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); +#endif + +#endif /* _TRACE_PREEMPTIRQ_H */ + +#include + +#else /* !CONFIG_PREEMPTIRQ_EVENTS */ + +#define trace_irq_enable(...) +#define trace_irq_disable(...) +#define trace_preempt_enable(...) +#define trace_preempt_disable(...) +#define trace_irq_enable_rcuidle(...) +#define trace_irq_disable_rcuidle(...) +#define trace_preempt_enable_rcuidle(...) +#define trace_preempt_disable_rcuidle(...) + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..b8395a020821 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER address on the current task structure into a stack of calls. 
+config PREEMPTIRQ_EVENTS + bool "Enable trace events for preempt and irq disable/enable" + select TRACE_IRQFLAGS + depends on DEBUG_PREEMPT || !PROVE_LOCKING + default n + help + Enable tracing of disable and enable events for preemption and irqs. + For tracing preempt disable/enable events, DEBUG_PREEMPT must be + enabled. For tracing irq disable/enable events, PROVE_LOCKING must + be disabled. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 90f2701d92a7..9f62eee61f14 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 0e3033c00474..03ecb4465ee4 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,9 @@ #include "trace.h" +#define CREATE_TRACE_POINTS +#include + #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -777,26 +780,53 @@ static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } #endif #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + void trace_hardirqs_on(void) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_off(); } EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_on_caller(caller_addr); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_off_caller(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -818,14 +848,17 @@ inline void print_irqtrace_events(struct task_struct *curr) } #endif -#ifdef CONFIG_PREEMPT_TRACER +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preempt_enable_rcuidle(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preempt_disable_rcuidle(a0, a1); tracer_preempt_off(a0, a1); } #endif -- cgit v1.2.3 From 604a7aeb4325b8ecb23df163c89fc12248302a4e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 5 Oct 2017 17:26:21 +0530 Subject: PM / OPP: Rename dev_pm_opp_register_put_opp_helper() The routine 
is named incorrectly since the first attempt as there is nothing like a put_opp() helper. We wanted to unregister the set_opp() helper here and so it should rather be named as dev_pm_opp_unregister_set_opp_helper(). Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki --- drivers/opp/core.c | 6 +++--- include/linux/pm_opp.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 0459b1204694..80c21207e48c 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1476,13 +1476,13 @@ err: EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper); /** - * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for + * dev_pm_opp_unregister_set_opp_helper() - Releases resources blocked for * set_opp helper * @opp_table: OPP table returned from dev_pm_opp_register_set_opp_helper(). * * Release resources blocked for platform specific set_opp helper. */ -void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table) +void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) { if (!opp_table->set_opp) { pr_err("%s: Doesn't have custom set_opp helper set\n", @@ -1497,7 +1497,7 @@ void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table) dev_pm_opp_put_opp_table(opp_table); } -EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper); +EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_set_opp_helper); /** * dev_pm_opp_add() - Add an OPP table from a table definitions diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 51ec727b4824..849d21dc4ca7 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -124,7 +124,7 @@ void dev_pm_opp_put_regulators(struct opp_table *opp_table); struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name); void dev_pm_opp_put_clkname(struct opp_table *opp_table); struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); -void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table); +void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask); int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask); @@ -243,7 +243,7 @@ static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device return ERR_PTR(-ENOTSUPP); } -static inline void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table) {} +static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {} static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name) { -- cgit v1.2.3 From e901b9873876ca30a09253731bd3a6b00c44b5b0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 4 Oct 2017 16:15:59 +0200 Subject: usb: core: Add a helper function to check the validity of EP type in URB This patch adds a new helper function to perform a sanity check of the given URB to see whether it contains a valid endpoint. It's a lightweight version of what usb_submit_urb() does, but without the kernel warning followed by the stack trace; it just returns an error code. Especially for a driver that doesn't parse the descriptor but fills the URB with the fixed endpoint (e.g. some quirks for non-compliant devices), this kind of check is preferable at the probe phase before actually submitting the urb. 
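For illustration, a driver that fills its URB with a fixed endpoint could guard its probe path roughly as follows (a sketch; URB and buffer allocation are elided, and complete_fn/ctx are placeholder names):

	usb_fill_int_urb(urb, udev, usb_rcvintpipe(udev, 1), buf, len,
			 complete_fn, ctx, 8);
	if (usb_urb_ep_type_check(urb))
		return -ENODEV;	/* pipe type does not match the descriptor */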
Tested-by: Andrey Konovalov Acked-by: Greg Kroah-Hartman Signed-off-by: Takashi Iwai --- drivers/usb/core/urb.c | 30 ++++++++++++++++++++++++++---- include/linux/usb.h | 2 ++ 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 47903d510955..8b800e34407b 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -187,6 +187,31 @@ EXPORT_SYMBOL_GPL(usb_unanchor_urb); /*-------------------------------------------------------------------*/ +static const int pipetypes[4] = { + PIPE_CONTROL, PIPE_ISOCHRONOUS, PIPE_BULK, PIPE_INTERRUPT +}; + +/** + * usb_urb_ep_type_check - sanity check of endpoint in the given urb + * @urb: urb to be checked + * + * This performs a light-weight sanity check for the endpoint in the + * given urb. It returns 0 if the urb contains a valid endpoint, otherwise + * a negative error code. + */ +int usb_urb_ep_type_check(const struct urb *urb) +{ + const struct usb_host_endpoint *ep; + + ep = usb_pipe_endpoint(urb->dev, urb->pipe); + if (!ep) + return -EINVAL; + if (usb_pipetype(urb->pipe) != pipetypes[usb_endpoint_type(&ep->desc)]) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(usb_urb_ep_type_check); + /** * usb_submit_urb - issue an asynchronous transfer request for an endpoint * @urb: pointer to the urb describing the request @@ -326,9 +351,6 @@ EXPORT_SYMBOL_GPL(usb_unanchor_urb); */ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) { - static int pipetypes[4] = { - PIPE_CONTROL, PIPE_ISOCHRONOUS, PIPE_BULK, PIPE_INTERRUPT - }; int xfertype, max; struct usb_device *dev; struct usb_host_endpoint *ep; @@ -444,7 +466,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) */ /* Check that the pipe's type matches the endpoint's type */ - if (usb_pipetype(urb->pipe) != pipetypes[xfertype]) + if (usb_urb_ep_type_check(urb)) dev_WARN(&dev->dev, "BOGUS urb xfer, pipe %x != type %x\n", usb_pipetype(urb->pipe), pipetypes[xfertype]); diff --git a/include/linux/usb.h b/include/linux/usb.h index cb9fbd54386e..2b861804fffa 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1728,6 +1728,8 @@ static inline int usb_urb_dir_out(struct urb *urb) return (urb->transfer_flags & URB_DIR_MASK) == URB_DIR_OUT; } +int usb_urb_ep_type_check(const struct urb *urb); + void *usb_alloc_coherent(struct usb_device *dev, size_t size, gfp_t mem_flags, dma_addr_t *dma); void usb_free_coherent(struct usb_device *dev, size_t size, -- cgit v1.2.3 From 63705c406a8adbd6f26691148b09d466dd4d8d2f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Oct 2017 18:49:22 +0200 Subject: ACPI / PM: Combine two identical device resume routines Notice that acpi_dev_runtime_resume() and acpi_dev_resume_early() are actually literally identical after some more-or-less recent changes, so rename acpi_dev_runtime_resume() to acpi_dev_resume(), use it everywhere instead of acpi_dev_resume_early() and drop the latter. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson --- drivers/acpi/acpi_lpss.c | 6 +++--- drivers/acpi/device_pm.c | 35 ++++++----------------------------- include/linux/acpi.h | 3 +-- 3 files changed, 10 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 032ae44710e5..81b6096c4177 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -693,7 +693,7 @@ static int acpi_lpss_activate(struct device *dev) struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev)); int ret; - ret = acpi_dev_runtime_resume(dev); + ret = acpi_dev_resume(dev); if (ret) return ret; @@ -737,7 +737,7 @@ static int acpi_lpss_resume_early(struct device *dev) struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev)); int ret; - ret = acpi_dev_resume_early(dev); + ret = acpi_dev_resume(dev); if (ret) return ret; @@ -872,7 +872,7 @@ static int acpi_lpss_runtime_resume(struct device *dev) if (lpss_quirks & LPSS_QUIRK_ALWAYS_POWER_ON && iosf_mbi_available()) lpss_iosf_exit_d3_state(); - ret = acpi_dev_runtime_resume(dev); + ret = acpi_dev_resume(dev); if (ret) return ret; diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index fbcc73f7a099..6eb51145dcf7 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -882,14 +882,13 @@ int acpi_dev_runtime_suspend(struct device *dev) EXPORT_SYMBOL_GPL(acpi_dev_runtime_suspend); /** - * acpi_dev_runtime_resume - Put device into the full-power state using ACPI. + * acpi_dev_resume - Put device into the full-power state using ACPI. * @dev: Device to put into the full-power state. * * Put the given device into the full-power state using the standard ACPI - * mechanism at run time. Set the power state of the device to ACPI D0 and - * disable remote wakeup. + * mechanism. Set the power state of the device to ACPI D0 and disable wakeup. */ -int acpi_dev_runtime_resume(struct device *dev) +int acpi_dev_resume(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); int error; @@ -901,7 +900,7 @@ int acpi_dev_runtime_resume(struct device *dev) acpi_device_wakeup_disable(adev); return error; } -EXPORT_SYMBOL_GPL(acpi_dev_runtime_resume); +EXPORT_SYMBOL_GPL(acpi_dev_resume); /** * acpi_subsys_runtime_suspend - Suspend device using ACPI. @@ -926,7 +925,7 @@ EXPORT_SYMBOL_GPL(acpi_subsys_runtime_suspend); */ int acpi_subsys_runtime_resume(struct device *dev) { - int ret = acpi_dev_runtime_resume(dev); + int ret = acpi_dev_resume(dev); return ret ? ret : pm_generic_runtime_resume(dev); } EXPORT_SYMBOL_GPL(acpi_subsys_runtime_resume); @@ -967,28 +966,6 @@ int acpi_dev_suspend_late(struct device *dev) } EXPORT_SYMBOL_GPL(acpi_dev_suspend_late); -/** - * acpi_dev_resume_early - Put device into the full-power state using ACPI. - * @dev: Device to put into the full-power state. - * - * Put the given device into the full-power state using the standard ACPI - * mechanism during system transition to the working state. Set the power - * state of the device to ACPI D0 and disable remote wakeup. - */ -int acpi_dev_resume_early(struct device *dev) -{ - struct acpi_device *adev = ACPI_COMPANION(dev); - int error; - - if (!adev) - return 0; - - error = acpi_dev_pm_full_power(adev); - acpi_device_wakeup_disable(adev); - return error; -} -EXPORT_SYMBOL_GPL(acpi_dev_resume_early); - /** * acpi_subsys_prepare - Prepare device for system transition to a sleep state. * @dev: Device to prepare. 
@@ -1057,7 +1034,7 @@ EXPORT_SYMBOL_GPL(acpi_subsys_suspend_late); */ int acpi_subsys_resume_early(struct device *dev) { - int ret = acpi_dev_resume_early(dev); + int ret = acpi_dev_resume(dev); return ret ? ret : pm_generic_resume_early(dev); } EXPORT_SYMBOL_GPL(acpi_subsys_resume_early); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3b89b4fe6812..d18c92d4ba19 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -865,7 +865,7 @@ static inline void arch_reserve_mem_area(acpi_physical_address addr, #if defined(CONFIG_ACPI) && defined(CONFIG_PM) int acpi_dev_runtime_suspend(struct device *dev); -int acpi_dev_runtime_resume(struct device *dev); +int acpi_dev_resume(struct device *dev); int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); @@ -882,7 +882,6 @@ static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP) int acpi_dev_suspend_late(struct device *dev); -int acpi_dev_resume_early(struct device *dev); int acpi_subsys_prepare(struct device *dev); void acpi_subsys_complete(struct device *dev); int acpi_subsys_suspend_late(struct device *dev); -- cgit v1.2.3 From eeb2d80d502af28e5660ff4bbe00f90ceb82c2db Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 5 Oct 2017 16:24:03 -0700 Subject: ACPI / LPIT: Add Low Power Idle Table (LPIT) support Add functionality to read the LPIT table, which provides: - Sysfs interface to read residency counters via /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us /sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us Here the count "low_power_idle_cpu_residency_us" shows the time spent by the CPU package in a low power state. This is read via the MSR interface, which points to the MSR for PKG C10. Here the count "low_power_idle_system_residency_us" shows the time the system was in a low power state. This is read via the MMIO interface. This is mapped to SLP_S0 residency on modern Intel systems. This residency is achieved only when the CPU is in PKG C10 and all functional blocks are in a low power state. It is possible that none, one, or all of the above counters are present. For example: On my Kabylake system both of the above counters are present. After suspend to idle, these counts were updated and read back as: 6916179 6998564 These counters can be read by tools like turbostat for display. They can also be used to debug whether modern systems are reaching the desired low power state. - Provides an interface to read the residency counter memory address This address can be used to get the base address of the PMC memory mapped IO. This is utilized by the intel_pmc_core driver to print more debug information. In addition, to avoid code duplication when reading iomem, remove the iomem read from acpi_os_read_memory() in osl.c and make a common function, acpi_os_read_iomem(). This new function is used for reading iomem in both osl.c and acpi_lpit.c. Link: http://www.uefi.org/sites/default/files/resources/Intel_ACPI_Low_Power_S0_Idle.pdf Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- Documentation/acpi/lpit.txt | 25 +++++++ drivers/acpi/Kconfig | 5 ++ drivers/acpi/Makefile | 1 + drivers/acpi/acpi_lpit.c | 162 ++++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/internal.h | 6 ++ drivers/acpi/osl.c | 42 +++++++----- drivers/acpi/scan.c | 1 + include/acpi/acpiosxf.h | 2 + include/linux/acpi.h | 9 +++ 9 files changed, 237 insertions(+), 16 deletions(-) create mode 100644 Documentation/acpi/lpit.txt create mode 100644 drivers/acpi/acpi_lpit.c (limited to 'include/linux') diff --git a/Documentation/acpi/lpit.txt b/Documentation/acpi/lpit.txt new file mode 100644 index 000000000000..b426398d2e97 --- /dev/null +++ b/Documentation/acpi/lpit.txt @@ -0,0 +1,25 @@ +To enumerate platform Low Power Idle states, Intel platforms are using +“Low Power Idle Table” (LPIT). More details about this table can be +downloaded from: +http://www.uefi.org/sites/default/files/resources/Intel_ACPI_Low_Power_S0_Idle.pdf + +Residencies for each low power state can be read via FFH +(Function fixed hardware) or a memory mapped interface. + +On platforms supporting S0ix sleep states, there can be two types of +residencies: +- CPU PKG C10 (Read via FFH interface) +- Platform Controller Hub (PCH) SLP_S0 (Read via memory mapped interface) + +The following attributes are added dynamically to the cpuidle +sysfs attribute group: + /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us + /sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us + +The "low_power_idle_cpu_residency_us" attribute shows time spent +by the CPU package in PKG C10 + +The "low_power_idle_system_residency_us" attribute shows SLP_S0 +residency, or system time spent with the SLP_S0# signal asserted. +This is the lowest possible system power state, achieved only when CPU is in +PKG C10 and all functional blocks in PCH are in a low power state. diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 1ce52f84dc23..4bfef0f78cde 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -80,6 +80,11 @@ endif config ACPI_SPCR_TABLE bool +config ACPI_LPIT + bool + depends on X86_64 + default y + config ACPI_SLEEP bool depends on SUSPEND || HIBERNATION diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 90265ab4437a..6a19bd7aba21 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -56,6 +56,7 @@ acpi-$(CONFIG_DEBUG_FS) += debugfs.o acpi-$(CONFIG_ACPI_NUMA) += numa.o acpi-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o acpi-y += acpi_lpat.o +acpi-$(CONFIG_ACPI_LPIT) += acpi_lpit.o acpi-$(CONFIG_ACPI_GENERIC_GSI) += irq.o acpi-$(CONFIG_ACPI_WATCHDOG) += acpi_watchdog.o diff --git a/drivers/acpi/acpi_lpit.c b/drivers/acpi/acpi_lpit.c new file mode 100644 index 000000000000..e94e478dd18b --- /dev/null +++ b/drivers/acpi/acpi_lpit.c @@ -0,0 +1,162 @@ + +/* + * acpi_lpit.c - LPIT table processing functions + * + * Copyright (C) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include + +struct lpit_residency_info { + struct acpi_generic_address gaddr; + u64 frequency; + void __iomem *iomem_addr; +}; + +/* Storage for the memory mapped and FFH based entries */ +static struct lpit_residency_info residency_info_mem; +static struct lpit_residency_info residency_info_ffh; + +static int lpit_read_residency_counter_us(u64 *counter, bool io_mem) +{ + int err; + + if (io_mem) { + u64 count = 0; + int error; + + error = acpi_os_read_iomem(residency_info_mem.iomem_addr, &count, + residency_info_mem.gaddr.bit_width); + if (error) + return error; + + *counter = div64_u64(count * 1000000ULL, residency_info_mem.frequency); + return 0; + } + + err = rdmsrl_safe(residency_info_ffh.gaddr.address, counter); + if (!err) { + u64 mask = GENMASK_ULL(residency_info_ffh.gaddr.bit_offset + + residency_info_ffh.gaddr.bit_width - 1, + residency_info_ffh.gaddr.bit_offset); + + *counter &= mask; + *counter >>= residency_info_ffh.gaddr.bit_offset; + *counter = div64_u64(*counter * 1000000ULL, residency_info_ffh.frequency); + return 0; + } + + return -ENODATA; +} + +static ssize_t low_power_idle_system_residency_us_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + u64 counter; + int ret; + + ret = lpit_read_residency_counter_us(&counter, true); + if (ret) + return ret; + + return sprintf(buf, "%llu\n", counter); +} +static DEVICE_ATTR_RO(low_power_idle_system_residency_us); + +static ssize_t low_power_idle_cpu_residency_us_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + u64 counter; + int ret; + + ret = lpit_read_residency_counter_us(&counter, false); + if (ret) + return ret; + + return sprintf(buf, "%llu\n", counter); +} +static DEVICE_ATTR_RO(low_power_idle_cpu_residency_us); + +int lpit_read_residency_count_address(u64 *address) +{ + if (!residency_info_mem.gaddr.address) + return -EINVAL; + + *address = residency_info_mem.gaddr.address; + + return 0; +} + +static void lpit_update_residency(struct lpit_residency_info *info, + struct acpi_lpit_native *lpit_native) +{ + info->frequency = lpit_native->counter_frequency ? 
+ lpit_native->counter_frequency : tsc_khz * 1000; + if (!info->frequency) + info->frequency = 1; + + info->gaddr = lpit_native->residency_counter; + if (info->gaddr.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) { + info->iomem_addr = ioremap_nocache(info->gaddr.address, + info->gaddr.bit_width / 8); + if (!info->iomem_addr) + return; + + /* Silently fail, if cpuidle attribute group is not present */ + sysfs_add_file_to_group(&cpu_subsys.dev_root->kobj, + &dev_attr_low_power_idle_system_residency_us.attr, + "cpuidle"); + } else if (info->gaddr.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { + /* Silently fail, if cpuidle attribute group is not present */ + sysfs_add_file_to_group(&cpu_subsys.dev_root->kobj, + &dev_attr_low_power_idle_cpu_residency_us.attr, + "cpuidle"); + } +} + +static void lpit_process(u64 begin, u64 end) +{ + while (begin + sizeof(struct acpi_lpit_native) < end) { + struct acpi_lpit_native *lpit_native = (struct acpi_lpit_native *)begin; + + if (!lpit_native->header.type && !lpit_native->header.flags) { + if (lpit_native->residency_counter.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY && + !residency_info_mem.gaddr.address) { + lpit_update_residency(&residency_info_mem, lpit_native); + } else if (lpit_native->residency_counter.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && + !residency_info_ffh.gaddr.address) { + lpit_update_residency(&residency_info_ffh, lpit_native); + } + } + begin += lpit_native->header.length; + } +} + +void acpi_init_lpit(void) +{ + acpi_status status; + u64 lpit_begin; + struct acpi_table_lpit *lpit; + + status = acpi_get_table(ACPI_SIG_LPIT, 0, (struct acpi_table_header **)&lpit); + + if (ACPI_FAILURE(status)) + return; + + lpit_begin = (u64)lpit + sizeof(*lpit); + lpit_process(lpit_begin, lpit_begin + lpit->header.length); +} diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 4361c4415b4f..fc8c43e76707 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -248,4 +248,10 @@ void acpi_watchdog_init(void); static inline void acpi_watchdog_init(void) {} #endif +#ifdef CONFIG_ACPI_LPIT +void acpi_init_lpit(void); +#else +static inline void acpi_init_lpit(void) { } +#endif + #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index db78d353bab1..3bb46cb24a99 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -663,6 +663,29 @@ acpi_status acpi_os_write_port(acpi_io_address port, u32 value, u32 width) EXPORT_SYMBOL(acpi_os_write_port); +int acpi_os_read_iomem(void __iomem *virt_addr, u64 *value, u32 width) +{ + + switch (width) { + case 8: + *(u8 *) value = readb(virt_addr); + break; + case 16: + *(u16 *) value = readw(virt_addr); + break; + case 32: + *(u32 *) value = readl(virt_addr); + break; + case 64: + *(u64 *) value = readq(virt_addr); + break; + default: + return -EINVAL; + } + + return 0; +} + acpi_status acpi_os_read_memory(acpi_physical_address phys_addr, u64 *value, u32 width) { @@ -670,6 +693,7 @@ acpi_os_read_memory(acpi_physical_address phys_addr, u64 *value, u32 width) unsigned int size = width / 8; bool unmap = false; u64 dummy; + int error; rcu_read_lock(); virt_addr = acpi_map_vaddr_lookup(phys_addr, size); @@ -684,22 +708,8 @@ acpi_os_read_memory(acpi_physical_address phys_addr, u64 *value, u32 width) if (!value) value = &dummy; - switch (width) { - case 8: - *(u8 *) value = readb(virt_addr); - break; - case 16: - *(u16 *) value = readw(virt_addr); - break; - case 32: - *(u32 *) value = readl(virt_addr); - break; - case 64: - *(u64 *) value = readq(virt_addr); - break; - 
default: - BUG(); - } + error = acpi_os_read_iomem(virt_addr, value, width); + BUG_ON(error); if (unmap) iounmap(virt_addr); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 602f8ff212f2..81367edc8a10 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2122,6 +2122,7 @@ int __init acpi_scan_init(void) acpi_int340x_thermal_init(); acpi_amba_init(); acpi_watchdog_init(); + acpi_init_lpit(); acpi_scan_add_handler(&generic_device_handler); diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index c66eb8ffa454..d5c0f5153c4e 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -287,6 +287,8 @@ acpi_status acpi_os_write_port(acpi_io_address address, u32 value, u32 width); /* * Platform and hardware-independent physical memory interfaces */ +int acpi_os_read_iomem(void __iomem *virt_addr, u64 *value, u32 width); + #ifndef ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_read_memory acpi_status acpi_os_read_memory(acpi_physical_address address, u64 *value, u32 width); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d18c92d4ba19..2b1738f840ab 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1248,4 +1248,13 @@ int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) } #endif +#ifdef CONFIG_ACPI_LPIT +int lpit_read_residency_count_address(u64 *address); +#else +static inline int lpit_read_residency_count_address(u64 *address) +{ + return -EINVAL; +} +#endif + #endif /*_LINUX_ACPI_H*/ -- cgit v1.2.3 From 0e708fc602531b8355b5de6ea7c98f09129b223f Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 3 Oct 2017 09:11:07 +0200 Subject: PM / sleep: Remove pm_complete_with_resume_check() Following recent changes for ACPI, there are no longer any users of pm_complete_with_resume_check(), so let's drop it. Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/generic_ops.c | 23 ----------------------- include/linux/pm.h | 1 - 2 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 07c3c4a9522d..b2ed606265a8 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -9,7 +9,6 @@ #include #include #include -#include #ifdef CONFIG_PM /** @@ -298,26 +297,4 @@ void pm_generic_complete(struct device *dev) if (drv && drv->pm && drv->pm->complete) drv->pm->complete(dev); } - -/** - * pm_complete_with_resume_check - Complete a device power transition. - * @dev: Device to handle. - * - * Complete a device power transition during a system-wide power transition and - * optionally schedule a runtime resume of the device if the system resume in - * progress has been initiated by the platform firmware and the device had its - * power.direct_complete flag set. - */ -void pm_complete_with_resume_check(struct device *dev) -{ - pm_generic_complete(dev); - /* - * If the device had been runtime-suspended before the system went into - * the sleep state it is going out of and it has never been resumed till - * now, resume it in case the firmware powered it up.
- */ - if (dev->power.direct_complete && pm_resume_via_firmware()) - pm_request_resume(dev); -} -EXPORT_SYMBOL_GPL(pm_complete_with_resume_check); #endif /* CONFIG_PM_SLEEP */ diff --git a/include/linux/pm.h b/include/linux/pm.h index 47ded8aa8a5d..a0ceeccf2846 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -736,7 +736,6 @@ extern int pm_generic_poweroff_noirq(struct device *dev); extern int pm_generic_poweroff_late(struct device *dev); extern int pm_generic_poweroff(struct device *dev); extern void pm_generic_complete(struct device *dev); -extern void pm_complete_with_resume_check(struct device *dev); #else /* !CONFIG_PM_SLEEP */ -- cgit v1.2.3 From e81cef5d3001501350b4e596b4bd6dfd26187afa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Sep 2017 09:25:39 -0400 Subject: blk_rq_map_user_iov(): move iov_iter_advance() down ... into bio_{map,copy}_user_iov() Signed-off-by: Al Viro --- block/bio.c | 6 ++++-- block/blk-map.c | 1 - include/linux/bio.h | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index d1ca7eecc8aa..cd1282db03cb 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1195,7 +1195,7 @@ int bio_uncopy_user(struct bio *bio) */ struct bio *bio_copy_user_iov(struct request_queue *q, struct rq_map_data *map_data, - const struct iov_iter *iter, + struct iov_iter *iter, gfp_t gfp_mask) { struct bio_map_data *bmd; @@ -1298,6 +1298,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (ret) goto cleanup; } + iov_iter_advance(iter, bio->bi_iter.bi_size); bio->bi_private = bmd; return bio; @@ -1320,7 +1321,7 @@ out_bmd: * device. Returns an error pointer in case of error. */ struct bio *bio_map_user_iov(struct request_queue *q, - const struct iov_iter *iter, + struct iov_iter *iter, gfp_t gfp_mask) { int j; @@ -1399,6 +1400,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, * reference to it */ bio_get(bio); + iov_iter_advance(iter, bio->bi_iter.bi_size); return bio; out_unmap: diff --git a/block/blk-map.c b/block/blk-map.c index 2547016aa7aa..891eea11f68e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -69,7 +69,6 @@ static int __blk_rq_map_user_iov(struct request *rq, if (map_data && map_data->null_mapped) bio_set_flag(bio, BIO_NULL_MAPPED); - iov_iter_advance(iter, bio->bi_iter.bi_size); if (map_data) map_data->offset += bio->bi_iter.bi_size; diff --git a/include/linux/bio.h b/include/linux/bio.h index 275c91c99516..6050c0caa4e1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -462,7 +462,7 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, - const struct iov_iter *, gfp_t); + struct iov_iter *, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, gfp_t); @@ -494,7 +494,7 @@ extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, - const struct iov_iter *, + struct iov_iter *, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); -- cgit v1.2.3 From faea13297ea739f94913d56d7b865134b4fc8726 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Sep 2017 14:03:29 -0400 Subject: kill iov_shorten() Signed-off-by: Al Viro --- fs/read_write.c | 21 --------------------- include/linux/uio.h | 2 -- 2 files changed, 23 
deletions(-) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index f0d4b16873e8..b8bf2cb3298a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -634,27 +634,6 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, return ret; } -/* - * Reduce an iovec's length in-place. Return the resulting number of segments - */ -unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) -{ - unsigned long seg = 0; - size_t len = 0; - - while (seg < nr_segs) { - seg++; - if (len + iov->iov_len >= to) { - iov->iov_len = to - len; - break; - } - len += iov->iov_len; - iov++; - } - return seg; -} -EXPORT_SYMBOL(iov_shorten); - static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { diff --git a/include/linux/uio.h b/include/linux/uio.h index 8a642cda641c..5885daeae721 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -80,8 +80,6 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) ((iov = iov_iter_iovec(&(iter))), 1); \ iov_iter_advance(&(iter), (iov).iov_len)) -unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to); - size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes); -- cgit v1.2.3 From 09cf698a594276139b7dfafb232af3fe4fbc4438 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Feb 2017 01:44:03 -0500 Subject: new primitive: iov_iter_for_each_range() For kvec and bvec: feeds segments to the given callback as long as it returns 0. For iovec and pipe: fails. Signed-off-by: Al Viro --- include/linux/uio.h | 4 ++++ lib/iov_iter.c | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 5885daeae721..e67e12adb136 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -244,4 +244,8 @@ int compat_import_iovec(int type, const struct compat_iovec __user * uvector, int import_single_range(int type, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i); +int iov_iter_for_each_range(struct iov_iter *i, size_t bytes, + int (*f)(struct kvec *vec, void *context), + void *context); + #endif diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 1c1c06ddc20a..970212670b6a 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1446,3 +1446,25 @@ int import_single_range(int rw, void __user *buf, size_t len, return 0; } EXPORT_SYMBOL(import_single_range); + +int iov_iter_for_each_range(struct iov_iter *i, size_t bytes, + int (*f)(struct kvec *vec, void *context), + void *context) +{ + struct kvec w; + int err = -EINVAL; + if (!bytes) + return 0; + + iterate_all_kinds(i, bytes, v, -EINVAL, ({ + w.iov_base = kmap(v.bv_page) + v.bv_offset; + w.iov_len = v.bv_len; + err = f(&w, context); + kunmap(v.bv_page); + err;}), ({ + w = v; + err = f(&w, context);}) + ) + return err; +} +EXPORT_SYMBOL(iov_iter_for_each_range); -- cgit v1.2.3 From 4e659dbe2d02a56ca0df25c77e099760252a329c Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 14 Aug 2017 15:46:17 -0700 Subject: firmware: qcom: scm: Expose secure IO service The secure IO service provides operations for reading and writing secure memory from non-secure mode; expose this API through SCM.
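For illustration, a minimal consumer-side sketch of the API this adds; the register address below is hypothetical and only demonstrates the intended read-modify-write pattern, it is not part of this patch:

#include <linux/qcom_scm.h>

/* Hypothetical secure register address, for illustration only. */
#define EXAMPLE_SECURE_REG 0x01905008

static int example_set_secure_bit(void)
{
	unsigned int val;
	int ret;

	/* Read the register from non-secure mode via the SCM secure IO service. */
	ret = qcom_scm_io_readl(EXAMPLE_SECURE_REG, &val);
	if (ret)
		return ret;

	/* Write it back with bit 0 set. */
	return qcom_scm_io_writel(EXAMPLE_SECURE_REG, val | 0x1);
}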
Reviewed-by: Stephen Boyd Signed-off-by: Bjorn Andersson Signed-off-by: Andy Gross --- drivers/firmware/qcom_scm-32.c | 18 ++++++++++++++++++ drivers/firmware/qcom_scm-64.c | 31 +++++++++++++++++++++++++++++++ drivers/firmware/qcom_scm.c | 12 ++++++++++++ drivers/firmware/qcom_scm.h | 6 ++++++ include/linux/qcom_scm.h | 4 ++++ 5 files changed, 71 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/qcom_scm-32.c b/drivers/firmware/qcom_scm-32.c index 93e3b96b6dfa..11fdb1584823 100644 --- a/drivers/firmware/qcom_scm-32.c +++ b/drivers/firmware/qcom_scm-32.c @@ -596,3 +596,21 @@ int __qcom_scm_iommu_secure_ptbl_init(struct device *dev, u64 addr, u32 size, { return -ENODEV; } + +int __qcom_scm_io_readl(struct device *dev, phys_addr_t addr, + unsigned int *val) +{ + int ret; + + ret = qcom_scm_call_atomic1(QCOM_SCM_SVC_IO, QCOM_SCM_IO_READ, addr); + if (ret >= 0) + *val = ret; + + return ret < 0 ? ret : 0; +} + +int __qcom_scm_io_writel(struct device *dev, phys_addr_t addr, unsigned int val) +{ + return qcom_scm_call_atomic2(QCOM_SCM_SVC_IO, QCOM_SCM_IO_WRITE, + addr, val); +} diff --git a/drivers/firmware/qcom_scm-64.c b/drivers/firmware/qcom_scm-64.c index 6e6d561708e2..bf50fb59852e 100644 --- a/drivers/firmware/qcom_scm-64.c +++ b/drivers/firmware/qcom_scm-64.c @@ -439,3 +439,34 @@ int __qcom_scm_iommu_secure_ptbl_init(struct device *dev, u64 addr, u32 size, return ret; } + +int __qcom_scm_io_readl(struct device *dev, phys_addr_t addr, + unsigned int *val) +{ + struct qcom_scm_desc desc = {0}; + struct arm_smccc_res res; + int ret; + + desc.args[0] = addr; + desc.arginfo = QCOM_SCM_ARGS(1); + + ret = qcom_scm_call(dev, QCOM_SCM_SVC_IO, QCOM_SCM_IO_READ, + &desc, &res); + if (ret >= 0) + *val = res.a1; + + return ret < 0 ? ret : 0; +} + +int __qcom_scm_io_writel(struct device *dev, phys_addr_t addr, unsigned int val) +{ + struct qcom_scm_desc desc = {0}; + struct arm_smccc_res res; + + desc.args[0] = addr; + desc.args[1] = val; + desc.arginfo = QCOM_SCM_ARGS(2); + + return qcom_scm_call(dev, QCOM_SCM_SVC_IO, QCOM_SCM_IO_WRITE, + &desc, &res); +} diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c index bb16510d75ba..e18d63935648 100644 --- a/drivers/firmware/qcom_scm.c +++ b/drivers/firmware/qcom_scm.c @@ -333,6 +333,18 @@ int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare) } EXPORT_SYMBOL(qcom_scm_iommu_secure_ptbl_init); +int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val) +{ + return __qcom_scm_io_readl(__scm->dev, addr, val); +} +EXPORT_SYMBOL(qcom_scm_io_readl); + +int qcom_scm_io_writel(phys_addr_t addr, unsigned int val) +{ + return __qcom_scm_io_writel(__scm->dev, addr, val); +} +EXPORT_SYMBOL(qcom_scm_io_writel); + /** * qcom_scm_is_available() - Checks if SCM is available */ diff --git a/drivers/firmware/qcom_scm.h b/drivers/firmware/qcom_scm.h index 9bea691f30fb..a60e4b9b1394 100644 --- a/drivers/firmware/qcom_scm.h +++ b/drivers/firmware/qcom_scm.h @@ -30,6 +30,12 @@ extern int __qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus); #define QCOM_SCM_CMD_CORE_HOTPLUGGED 0x10 extern void __qcom_scm_cpu_power_down(u32 flags); +#define QCOM_SCM_SVC_IO 0x5 +#define QCOM_SCM_IO_READ 0x1 +#define QCOM_SCM_IO_WRITE 0x2 +extern int __qcom_scm_io_readl(struct device *dev, phys_addr_t addr, unsigned int *val); +extern int __qcom_scm_io_writel(struct device *dev, phys_addr_t addr, unsigned int val); + #define QCOM_SCM_SVC_INFO 0x6 #define QCOM_IS_CALL_AVAIL_CMD 0x1 extern int __qcom_scm_is_call_available(struct device 
*dev, u32 svc_id, diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index e5380471c2cd..e8357f570695 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -43,6 +43,8 @@ extern int qcom_scm_set_remote_state(u32 state, u32 id); extern int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare); extern int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size); extern int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare); +extern int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val); +extern int qcom_scm_io_writel(phys_addr_t addr, unsigned int val); #else static inline int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus) @@ -73,5 +75,7 @@ qcom_scm_set_remote_state(u32 state,u32 id) { return -ENODEV; } static inline int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) { return -ENODEV; } static inline int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size) { return -ENODEV; } static inline int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare) { return -ENODEV; } +static inline int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val) { return -ENODEV; } +static inline int qcom_scm_io_writel(phys_addr_t addr, unsigned int val) { return -ENODEV; } #endif #endif -- cgit v1.2.3 From 538d5b333216c3daa7a5821307164f10af73ec8c Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Wed, 20 Sep 2017 10:52:02 +0200 Subject: iommu/iova: Make rcache flush optional on IOVA allocation failure Since IOVA allocation failure is not an unusual case, we need to flush the CPUs' rcache in the hope of succeeding in the next round. However, it is useful to decide whether the rcache flush step is needed, for two reasons: - Scalability. On a large system with ~100 CPUs, iterating over and flushing the rcache for each CPU becomes a serious bottleneck, so we may want to defer it. - free_cpu_cached_iovas() does not care about the max PFN we are interested in. Thus we may flush our rcaches and still get no new IOVA, as in this commonly used scenario: if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev)) iova = alloc_iova_fast(iovad, iova_len, DMA_BIT_MASK(32) >> shift); if (!iova) iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift); 1. First alloc_iova_fast() call is limited to DMA_BIT_MASK(32) to get PCI devices a SAC address 2. alloc_iova() fails due to full 32-bit space 3. rcaches contain PFNs out of 32-bit space, so free_cpu_cached_iovas() throws entries away for nothing and alloc_iova() fails again 4. Next alloc_iova_fast() call cannot take advantage of the rcache since we have just defeated the caches. In this case we pick the slowest option to proceed. This patch reworks the flushed_rcache local flag into an additional function argument that controls the rcache flush step. It also updates all users to do the flush only as a last resort.
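The resulting call pattern, condensed from the dma-iommu.c hunk below (variable names are those of that caller):

	/* First attempt: DMA32-limited, without the costly rcache flush. */
	iova = alloc_iova_fast(iovad, iova_len,
			       DMA_BIT_MASK(32) >> shift, false);
	if (!iova)
		/* Last resort: full limit, allow flushing the rcaches. */
		iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift,
				       true);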
Signed-off-by: Tomasz Nowicki Reviewed-by: Robin Murphy Tested-by: Nate Watterson Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 5 +++-- drivers/iommu/dma-iommu.c | 6 ++++-- drivers/iommu/intel-iommu.c | 5 +++-- drivers/iommu/iova.c | 11 ++++++----- include/linux/iova.h | 5 +++-- 5 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 647ab7691aee..3d64c844d8b1 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1546,10 +1546,11 @@ static unsigned long dma_ops_alloc_iova(struct device *dev, if (dma_mask > DMA_BIT_MASK(32)) pfn = alloc_iova_fast(&dma_dom->iovad, pages, - IOVA_PFN(DMA_BIT_MASK(32))); + IOVA_PFN(DMA_BIT_MASK(32)), false); if (!pfn) - pfn = alloc_iova_fast(&dma_dom->iovad, pages, IOVA_PFN(dma_mask)); + pfn = alloc_iova_fast(&dma_dom->iovad, pages, + IOVA_PFN(dma_mask), true); return (pfn << PAGE_SHIFT); } diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 191be9c80a8a..25914d36c5ac 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -370,10 +370,12 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, /* Try to get PCI devices a SAC address */ if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev)) - iova = alloc_iova_fast(iovad, iova_len, DMA_BIT_MASK(32) >> shift); + iova = alloc_iova_fast(iovad, iova_len, + DMA_BIT_MASK(32) >> shift, false); if (!iova) - iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift); + iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, + true); return (dma_addr_t)iova << shift; } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ebb48353dd39..b3914fce8254 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3469,11 +3469,12 @@ static unsigned long intel_alloc_iova(struct device *dev, * from higher range */ iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, - IOVA_PFN(DMA_BIT_MASK(32))); + IOVA_PFN(DMA_BIT_MASK(32)), false); if (iova_pfn) return iova_pfn; } - iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask)); + iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, + IOVA_PFN(dma_mask), true); if (unlikely(!iova_pfn)) { pr_err("Allocating %ld-page iova for %s failed", nrpages, dev_name(dev)); diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 3aee64b99df1..84bda3a4dafc 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -395,14 +395,15 @@ EXPORT_SYMBOL_GPL(free_iova); * @iovad: - iova domain in question * @size: - size of page frames to allocate * @limit_pfn: - max limit address + * @flush_rcache: - set to flush rcache on regular allocation failure * This function tries to satisfy an iova allocation from the rcache, - * and falls back to regular allocation on failure. + * and falls back to regular allocation on failure. If regular allocation + * fails too and the flush_rcache flag is set then the rcache will be flushed. */ unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, - unsigned long limit_pfn) + unsigned long limit_pfn, bool flush_rcache) { - bool flushed_rcache = false; unsigned long iova_pfn; struct iova *new_iova; @@ -415,11 +416,11 @@ retry: if (!new_iova) { unsigned int cpu; - if (flushed_rcache) + if (!flush_rcache) return 0; /* Try replenishing IOVAs by flushing rcache. 
*/ - flushed_rcache = true; + flush_rcache = false; for_each_online_cpu(cpu) free_cpu_cached_iovas(cpu, iovad); goto retry; diff --git a/include/linux/iova.h b/include/linux/iova.h index c696ee81054e..928442dda565 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -150,7 +150,7 @@ void queue_iova(struct iova_domain *iovad, unsigned long pfn, unsigned long pages, unsigned long data); unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, - unsigned long limit_pfn); + unsigned long limit_pfn, bool flush_rcache); struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); @@ -212,7 +212,8 @@ static inline void queue_iova(struct iova_domain *iovad, static inline unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, - unsigned long limit_pfn) + unsigned long limit_pfn, + bool flush_rcache) { return 0; } -- cgit v1.2.3 From 24d654fadd70884a719c45ffabb0fe6033996f92 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 12:08:16 +0200 Subject: kfifo: Fix comments Fix some typos. Signed-off-by: Christophe JAILLET Signed-off-by: Jiri Kosina --- include/linux/kfifo.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 41eb6fdf87a8..7b45959ebd92 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -325,7 +325,7 @@ __kfifo_uint_must_check_helper( \ * * This macro dynamically allocates a new fifo buffer. * - * The numer of elements will be rounded-up to a power of 2. + * The number of elements will be rounded-up to a power of 2. * The fifo will be release with kfifo_free(). * Return 0 if no error, otherwise an error code. */ @@ -358,9 +358,9 @@ __kfifo_int_must_check_helper( \ * @buffer: the preallocated buffer to be used * @size: the size of the internal buffer, this have to be a power of 2 * - * This macro initialize a fifo using a preallocated buffer. + * This macro initializes a fifo using a preallocated buffer. * - * The numer of elements will be rounded-up to a power of 2. + * The number of elements will be rounded-up to a power of 2. * Return 0 if no error, otherwise an error code. */ #define kfifo_init(fifo, buffer, size) \ -- cgit v1.2.3 From 7f66721a7d5bf6251f9c49aada9200c61ddcecc5 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Thu, 12 Oct 2017 19:58:10 +0300 Subject: fs/block_dev: remove vfs_msg() interface Replaced by pr_err usage in commit ef51042472f5 ("block, dax: move "select DAX" from BLOCK to FS_DAX") Signed-off-by: Rakesh Pandit Acked-by: Ross Zwisler Signed-off-by: Jens Axboe --- fs/block_dev.c | 12 ------------ include/linux/blkdev.h | 11 ----------- 2 files changed, 23 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 93d088ffc05c..07ddccd17801 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
-{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); - va_end(args); -} - static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9fb71fc7d0e8..72637028f3c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -917,17 +917,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) } #endif -#ifdef CONFIG_PRINTK -#define vfs_msg(sb, level, fmt, ...) \ - __vfs_msg(sb, level, fmt, ##__VA_ARGS__) -#else -#define vfs_msg(sb, level, fmt, ...) \ -do { \ - no_printk(fmt, ##__VA_ARGS__); \ - __vfs_msg(sb, "", " "); \ -} while (0) -#endif - extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); -- cgit v1.2.3 From 2355a6546a053b1c16ebefd6ce1f0cccc00e1da5 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Thu, 12 Oct 2017 10:21:25 +0200 Subject: net: phy: broadcom: support new device flag for setting master mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of Broadcom's PHYs run by default in slave mode with Automatic Slave/Master configuration disabled. This stops them from working properly with some devices. So far it has been verified for BCM54210E and BCM50212E, which don't work well with Intel's I217-LM and I218-LM: http://ark.intel.com/products/60019/Intel-Ethernet-Connection-I217-LM http://ark.intel.com/products/71307/Intel-Ethernet-Connection-I218-LM I was told there is massive ping loss. This commit adds support for a new flag which can be set by an Ethernet driver to fix up the PHY setup. Signed-off-by: Rafał Miłecki Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 6 ++++++ include/linux/brcmphy.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 1e9ad30a35c8..d7ed69deabfb 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -43,6 +43,12 @@ static int bcm54210e_config_init(struct phy_device *phydev) val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN; bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val); + if (phydev->dev_flags & PHY_BRCM_EN_MASTER_MODE) { + val = phy_read(phydev, MII_CTRL1000); + val |= CTL1000_AS_MASTER | CTL1000_ENABLE_MASTER; + phy_write(phydev, MII_CTRL1000, val); + } + return 0; } diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index abcda9b458ab..9ac9e3e3d1e5 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -63,6 +63,7 @@ #define PHY_BRCM_EXT_IBND_TX_ENABLE 0x00002000 #define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 +#define PHY_BRCM_EN_MASTER_MODE 0x00010000 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) -- cgit v1.2.3 From 511cb17448d95e4277451cdee882e72b6a9a3099 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 30 Aug 2017 14:59:10 +0530 Subject: mfd: tps65217: Introduce dependency on CONFIG_OF Currently the driver boots only via device tree, hence add a dependency on CONFIG_OF. This leaves a bunch of unused code behind, so clean that up. This patch also makes use of the probe_new function in place of the probe function so as to avoid passing the i2c_device_id.
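For reference, a minimal sketch of the OF-only probe_new pattern being adopted here; the driver name, compatible string, and property are placeholders, not part of this patch:

#include <linux/i2c.h>
#include <linux/module.h>
#include <linux/of.h>

static const struct of_device_id example_of_match[] = {
	{ .compatible = "vendor,example" },
	{ /* sentinel */ },
};
MODULE_DEVICE_TABLE(of, example_of_match);

/* probe_new drops the i2c_device_id; options come straight from the DT node. */
static int example_probe(struct i2c_client *client)
{
	bool opt = of_property_read_bool(client->dev.of_node,
					 "vendor,example-option");

	dev_dbg(&client->dev, "example option: %d\n", opt);
	return 0;
}

static struct i2c_driver example_driver = {
	.driver = {
		.name = "example",
		.of_match_table = example_of_match,
	},
	.probe_new = example_probe,
};
module_i2c_driver(example_driver);

MODULE_LICENSE("GPL");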
Signed-off-by: Keerthy Reviewed-by: Javier Martinez Canillas Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 2 +- drivers/mfd/tps65217.c | 28 +++++----------------------- drivers/regulator/tps65217-regulator.c | 5 ----- drivers/video/backlight/tps65217_bl.c | 14 +++----------- include/linux/mfd/tps65217.h | 6 ------ 5 files changed, 9 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index fc5e4fef89d2..b47876877c9c 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1338,7 +1338,7 @@ config MFD_TPS65090 config MFD_TPS65217 tristate "TI TPS65217 Power Management / White LED chips" - depends on I2C + depends on I2C && OF select MFD_CORE select REGMAP_I2C select IRQ_DOMAIN diff --git a/drivers/mfd/tps65217.c b/drivers/mfd/tps65217.c index f769c7d4e335..7566ce4457a0 100644 --- a/drivers/mfd/tps65217.c +++ b/drivers/mfd/tps65217.c @@ -311,37 +311,20 @@ static const struct regmap_config tps65217_regmap_config = { }; static const struct of_device_id tps65217_of_match[] = { - { .compatible = "ti,tps65217", .data = (void *)TPS65217 }, + { .compatible = "ti,tps65217"}, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, tps65217_of_match); -static int tps65217_probe(struct i2c_client *client, - const struct i2c_device_id *ids) +static int tps65217_probe(struct i2c_client *client) { struct tps65217 *tps; unsigned int version; - unsigned long chip_id = ids->driver_data; - const struct of_device_id *match; bool status_off = false; int ret; - if (client->dev.of_node) { - match = of_match_device(tps65217_of_match, &client->dev); - if (!match) { - dev_err(&client->dev, - "Failed to find matching dt id\n"); - return -EINVAL; - } - chip_id = (unsigned long)match->data; - status_off = of_property_read_bool(client->dev.of_node, - "ti,pmic-shutdown-controller"); - } - - if (!chip_id) { - dev_err(&client->dev, "id is null.\n"); - return -ENODEV; - } + status_off = of_property_read_bool(client->dev.of_node, + "ti,pmic-shutdown-controller"); tps = devm_kzalloc(&client->dev, sizeof(*tps), GFP_KERNEL); if (!tps) @@ -349,7 +332,6 @@ static int tps65217_probe(struct i2c_client *client, i2c_set_clientdata(client, tps); tps->dev = &client->dev; - tps->id = chip_id; tps->regmap = devm_regmap_init_i2c(client, &tps65217_regmap_config); if (IS_ERR(tps->regmap)) { @@ -430,7 +412,7 @@ static struct i2c_driver tps65217_driver = { .of_match_table = tps65217_of_match, }, .id_table = tps65217_id_table, - .probe = tps65217_probe, + .probe_new = tps65217_probe, .remove = tps65217_remove, }; diff --git a/drivers/regulator/tps65217-regulator.c b/drivers/regulator/tps65217-regulator.c index 5324dc9e6d6e..7b12e880d1ea 100644 --- a/drivers/regulator/tps65217-regulator.c +++ b/drivers/regulator/tps65217-regulator.c @@ -228,11 +228,6 @@ static int tps65217_regulator_probe(struct platform_device *pdev) int i, ret; unsigned int val; - if (tps65217_chip_id(tps) != TPS65217) { - dev_err(&pdev->dev, "Invalid tps chip version\n"); - return -ENODEV; - } - /* Allocate memory for strobes */ tps->strobes = devm_kzalloc(&pdev->dev, sizeof(u8) * TPS65217_NUM_REGULATOR, GFP_KERNEL); diff --git a/drivers/video/backlight/tps65217_bl.c b/drivers/video/backlight/tps65217_bl.c index fd524ad860a5..5ce8791f4523 100644 --- a/drivers/video/backlight/tps65217_bl.c +++ b/drivers/video/backlight/tps65217_bl.c @@ -275,17 +275,9 @@ static int tps65217_bl_probe(struct platform_device *pdev) struct tps65217_bl_pdata *pdata; struct backlight_properties bl_props; - if (tps->dev->of_node) { - 
pdata = tps65217_bl_parse_dt(pdev); - if (IS_ERR(pdata)) - return PTR_ERR(pdata); - } else { - pdata = dev_get_platdata(&pdev->dev); - if (!pdata) { - dev_err(&pdev->dev, "no platform data provided\n"); - return -EINVAL; - } - } + pdata = tps65217_bl_parse_dt(pdev); + if (IS_ERR(pdata)) + return PTR_ERR(pdata); tps65217_bl = devm_kzalloc(&pdev->dev, sizeof(*tps65217_bl), GFP_KERNEL); diff --git a/include/linux/mfd/tps65217.h b/include/linux/mfd/tps65217.h index eac285756b37..b5dd108421c8 100644 --- a/include/linux/mfd/tps65217.h +++ b/include/linux/mfd/tps65217.h @@ -263,7 +263,6 @@ struct tps65217_board { struct tps65217 { struct device *dev; struct tps65217_board *pdata; - unsigned long id; struct regulator_desc desc[TPS65217_NUM_REGULATOR]; struct regmap *regmap; u8 *strobes; @@ -278,11 +277,6 @@ static inline struct tps65217 *dev_to_tps65217(struct device *dev) return dev_get_drvdata(dev); } -static inline unsigned long tps65217_chip_id(struct tps65217 *tps65217) -{ - return tps65217->id; -} - int tps65217_reg_read(struct tps65217 *tps, unsigned int reg, unsigned int *val); int tps65217_reg_write(struct tps65217 *tps, unsigned int reg, -- cgit v1.2.3 From 8275b77a15131673f955f738f0704e1d40a8edae Mon Sep 17 00:00:00 2001 From: Rui Feng Date: Thu, 7 Sep 2017 16:26:39 +0800 Subject: mfd: rts5249: Add support for RTS5250S power saving Enable power saving for RTS5250S via the following steps: 1. Set 0xFE58 to enable clock power management. 2. Check the config space to see whether L1SS is supported. 3. If L1SS is supported, set 0xFF03 to free CLKREQ. 4. When entering the idle state, enable ASPM and set the parameters for L1SS and LTR. 5. When entering the run state, disable ASPM and set the parameters for L1SS and LTR. If L1SS mode is entered successfully, the current draw will be below 2 mA. Signed-off-by: Rui Feng Signed-off-by: Lee Jones --- drivers/mfd/rts5249.c | 155 ++++++++++++++++++++++++++++++++++++++++++- drivers/mfd/rtsx_pcr.c | 142 +++++++++++++++++++++++++++++++++++++-- drivers/mfd/rtsx_pcr.h | 14 ++++ include/linux/mfd/rtsx_pci.h | 84 +++++++++++++++++++++++ 4 files changed, 389 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/rts5249.c b/drivers/mfd/rts5249.c index 40f8bb14fc59..7fcf37ba922c 100644 --- a/drivers/mfd/rts5249.c +++ b/drivers/mfd/rts5249.c @@ -103,8 +103,64 @@ static void rtsx_base_force_power_down(struct rtsx_pcr *pcr, u8 pm_state) rtsx_pci_write_register(pcr, FPDCTL, 0x03, 0x03); } +static void rts5249_init_from_cfg(struct rtsx_pcr *pcr) +{ + struct rtsx_cr_option *option = &(pcr->option); + u32 lval; + + if (CHK_PCI_PID(pcr, PID_524A)) + rtsx_pci_read_config_dword(pcr, + PCR_ASPM_SETTING_REG1, &lval); + else + rtsx_pci_read_config_dword(pcr, + PCR_ASPM_SETTING_REG2, &lval); + + if (lval & ASPM_L1_1_EN_MASK) + rtsx_set_dev_flag(pcr, ASPM_L1_1_EN); + + if (lval & ASPM_L1_2_EN_MASK) + rtsx_set_dev_flag(pcr, ASPM_L1_2_EN); + + if (lval & PM_L1_1_EN_MASK) + rtsx_set_dev_flag(pcr, PM_L1_1_EN); + + if (lval & PM_L1_2_EN_MASK) + rtsx_set_dev_flag(pcr, PM_L1_2_EN); + + if (option->ltr_en) { + u16 val; + + pcie_capability_read_word(pcr->pci, PCI_EXP_DEVCTL2, &val); + if (val & PCI_EXP_DEVCTL2_LTR_EN) { + option->ltr_enabled = true; + option->ltr_active = true; + rtsx_set_ltr_latency(pcr, option->ltr_active_latency); + } else { + option->ltr_enabled = false; + } + } +} + +static int rts5249_init_from_hw(struct rtsx_pcr *pcr) +{ + struct rtsx_cr_option *option = &(pcr->option); + + if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN + | PM_L1_1_EN | PM_L1_2_EN)) +
option->force_clkreq_0 = false; + else + option->force_clkreq_0 = true; + + return 0; +} + static int rts5249_extra_init_hw(struct rtsx_pcr *pcr) { + struct rtsx_cr_option *option = &(pcr->option); + + rts5249_init_from_cfg(pcr); + rts5249_init_from_hw(pcr); + rtsx_pci_init_cmd(pcr); /* Rest L1SUB Config */ @@ -125,7 +181,18 @@ static int rts5249_extra_init_hw(struct rtsx_pcr *pcr) else rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0xB0, 0x80); - return rtsx_pci_send_cmd(pcr, 100); + /* + * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced + * to drive low, and we forcibly request clock. + */ + if (option->force_clkreq_0) + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, + FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW); + else + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, + FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH); + + return rtsx_pci_send_cmd(pcr, CMD_TIMEOUT_DEF); } static int rts5249_optimize_phy(struct rtsx_pcr *pcr) @@ -285,6 +352,31 @@ static int rtsx_base_switch_output_voltage(struct rtsx_pcr *pcr, u8 voltage) return rtsx_pci_send_cmd(pcr, 100); } +static void rts5249_set_aspm(struct rtsx_pcr *pcr, bool enable) +{ + struct rtsx_cr_option *option = &pcr->option; + u8 val = 0; + + if (pcr->aspm_enabled == enable) + return; + + if (option->dev_aspm_mode == DEV_ASPM_DYNAMIC) { + if (enable) + val = pcr->aspm_en; + rtsx_pci_update_cfg_byte(pcr, + pcr->pcie_cap + PCI_EXP_LNKCTL, + ASPM_MASK_NEG, val); + } else if (option->dev_aspm_mode == DEV_ASPM_BACKDOOR) { + u8 mask = FORCE_ASPM_VAL_MASK | FORCE_ASPM_CTL0; + + if (!enable) + val = FORCE_ASPM_CTL0; + rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val); + } + + pcr->aspm_enabled = enable; +} + static const struct pcr_ops rts5249_pcr_ops = { .fetch_vendor_settings = rtsx_base_fetch_vendor_settings, .extra_init_hw = rts5249_extra_init_hw, @@ -297,6 +389,7 @@ static const struct pcr_ops rts5249_pcr_ops = { .card_power_off = rtsx_base_card_power_off, .switch_output_voltage = rtsx_base_switch_output_voltage, .force_power_down = rtsx_base_force_power_down, + .set_aspm = rts5249_set_aspm, }; /* SD Pull Control Enable: @@ -353,6 +446,8 @@ static const u32 rts5249_ms_pull_ctl_disable_tbl[] = { void rts5249_init_params(struct rtsx_pcr *pcr) { + struct rtsx_cr_option *option = &(pcr->option); + pcr->extra_caps = EXTRA_CAPS_SD_SDR50 | EXTRA_CAPS_SD_SDR104; pcr->num_slots = 2; pcr->ops = &rts5249_pcr_ops; @@ -372,6 +467,20 @@ void rts5249_init_params(struct rtsx_pcr *pcr) pcr->ms_pull_ctl_disable_tbl = rts5249_ms_pull_ctl_disable_tbl; pcr->reg_pm_ctrl3 = PM_CTRL3; + + option->dev_flags = (LTR_L1SS_PWR_GATE_CHECK_CARD_EN + | LTR_L1SS_PWR_GATE_EN); + option->ltr_en = true; + + /* Init latency of active, idle, L1OFF to 60us, 300us, 3ms */ + option->ltr_active_latency = LTR_ACTIVE_LATENCY_DEF; + option->ltr_idle_latency = LTR_IDLE_LATENCY_DEF; + option->ltr_l1off_latency = LTR_L1OFF_LATENCY_DEF; + option->dev_aspm_mode = DEV_ASPM_DYNAMIC; + option->l1_snooze_delay = L1_SNOOZE_DELAY_DEF; + option->ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5249_DEF; + option->ltr_l1off_snooze_sspwrgate = + LTR_L1OFF_SNOOZE_SSPWRGATE_5249_DEF; } static int rts524a_write_phy(struct rtsx_pcr *pcr, u8 addr, u16 val) @@ -459,6 +568,40 @@ static int rts524a_extra_init_hw(struct rtsx_pcr *pcr) return 0; } +static void rts5250_set_l1off_cfg_sub_d0(struct rtsx_pcr *pcr, int active) +{ + struct rtsx_cr_option *option = &(pcr->option); + + u32 interrupt = rtsx_pci_readl(pcr, RTSX_BIPR); + int card_exist = (interrupt & SD_EXIST) | (interrupt & MS_EXIST); + int aspm_L1_1, 
aspm_L1_2; + u8 val = 0; + + aspm_L1_1 = rtsx_check_dev_flag(pcr, ASPM_L1_1_EN); + aspm_L1_2 = rtsx_check_dev_flag(pcr, ASPM_L1_2_EN); + + if (active) { + /* Run, latency: 60us */ + if (aspm_L1_1) + val = option->ltr_l1off_snooze_sspwrgate; + } else { + /* L1off, latency: 300us */ + if (aspm_L1_2) + val = option->ltr_l1off_sspwrgate; + } + + if (aspm_L1_1 || aspm_L1_2) { + if (rtsx_check_dev_flag(pcr, + LTR_L1SS_PWR_GATE_CHECK_CARD_EN)) { + if (card_exist) + val &= ~L1OFF_MBIAS2_EN_5250; + else + val |= L1OFF_MBIAS2_EN_5250; + } + } + rtsx_set_l1off_sub(pcr, val); +} + static const struct pcr_ops rts524a_pcr_ops = { .write_phy = rts524a_write_phy, .read_phy = rts524a_read_phy, @@ -473,11 +616,16 @@ static const struct pcr_ops rts524a_pcr_ops = { .card_power_off = rtsx_base_card_power_off, .switch_output_voltage = rtsx_base_switch_output_voltage, .force_power_down = rtsx_base_force_power_down, + .set_l1off_cfg_sub_d0 = rts5250_set_l1off_cfg_sub_d0, + .set_aspm = rts5249_set_aspm, }; void rts524a_init_params(struct rtsx_pcr *pcr) { rts5249_init_params(pcr); + pcr->option.ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5250_DEF; + pcr->option.ltr_l1off_snooze_sspwrgate = + LTR_L1OFF_SNOOZE_SSPWRGATE_5250_DEF; pcr->reg_pm_ctrl3 = RTS524A_PM_CTRL3; pcr->ops = &rts524a_pcr_ops; @@ -576,11 +724,16 @@ static const struct pcr_ops rts525a_pcr_ops = { .card_power_off = rtsx_base_card_power_off, .switch_output_voltage = rts525a_switch_output_voltage, .force_power_down = rtsx_base_force_power_down, + .set_l1off_cfg_sub_d0 = rts5250_set_l1off_cfg_sub_d0, + .set_aspm = rts5249_set_aspm, }; void rts525a_init_params(struct rtsx_pcr *pcr) { rts5249_init_params(pcr); + pcr->option.ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5250_DEF; + pcr->option.ltr_l1off_snooze_sspwrgate = + LTR_L1OFF_SNOOZE_SSPWRGATE_5250_DEF; pcr->reg_pm_ctrl3 = RTS524A_PM_CTRL3; pcr->ops = &rts525a_pcr_ops; diff --git a/drivers/mfd/rtsx_pcr.c b/drivers/mfd/rtsx_pcr.c index 3cf69e5c5703..590fb9aad77d 100644 --- a/drivers/mfd/rtsx_pcr.c +++ b/drivers/mfd/rtsx_pcr.c @@ -79,6 +79,96 @@ static inline void rtsx_pci_disable_aspm(struct rtsx_pcr *pcr) 0xFC, 0); } +int rtsx_comm_set_ltr_latency(struct rtsx_pcr *pcr, u32 latency) +{ + rtsx_pci_write_register(pcr, MSGTXDATA0, + MASK_8_BIT_DEF, (u8) (latency & 0xFF)); + rtsx_pci_write_register(pcr, MSGTXDATA1, + MASK_8_BIT_DEF, (u8)((latency >> 8) & 0xFF)); + rtsx_pci_write_register(pcr, MSGTXDATA2, + MASK_8_BIT_DEF, (u8)((latency >> 16) & 0xFF)); + rtsx_pci_write_register(pcr, MSGTXDATA3, + MASK_8_BIT_DEF, (u8)((latency >> 24) & 0xFF)); + rtsx_pci_write_register(pcr, LTR_CTL, LTR_TX_EN_MASK | + LTR_LATENCY_MODE_MASK, LTR_TX_EN_1 | LTR_LATENCY_MODE_SW); + + return 0; +} + +int rtsx_set_ltr_latency(struct rtsx_pcr *pcr, u32 latency) +{ + if (pcr->ops->set_ltr_latency) + return pcr->ops->set_ltr_latency(pcr, latency); + else + return rtsx_comm_set_ltr_latency(pcr, latency); +} + +static void rtsx_comm_set_aspm(struct rtsx_pcr *pcr, bool enable) +{ + struct rtsx_cr_option *option = &pcr->option; + + if (pcr->aspm_enabled == enable) + return; + + if (option->dev_aspm_mode == DEV_ASPM_DYNAMIC) { + if (enable) + rtsx_pci_enable_aspm(pcr); + else + rtsx_pci_disable_aspm(pcr); + } else if (option->dev_aspm_mode == DEV_ASPM_BACKDOOR) { + u8 mask = FORCE_ASPM_VAL_MASK; + u8 val = 0; + + if (enable) + val = pcr->aspm_en; + rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val); + } + + pcr->aspm_enabled = enable; +} + +static void rtsx_disable_aspm(struct rtsx_pcr *pcr) +{ + if (pcr->ops->set_aspm) + 
pcr->ops->set_aspm(pcr, false); + else + rtsx_comm_set_aspm(pcr, false); +} + +int rtsx_set_l1off_sub(struct rtsx_pcr *pcr, u8 val) +{ + rtsx_pci_write_register(pcr, L1SUB_CONFIG3, 0xFF, val); + + return 0; +} + +void rtsx_set_l1off_sub_cfg_d0(struct rtsx_pcr *pcr, int active) +{ + if (pcr->ops->set_l1off_cfg_sub_d0) + pcr->ops->set_l1off_cfg_sub_d0(pcr, active); +} + +static void rtsx_comm_pm_full_on(struct rtsx_pcr *pcr) +{ + struct rtsx_cr_option *option = &pcr->option; + + rtsx_disable_aspm(pcr); + + if (option->ltr_enabled) + rtsx_set_ltr_latency(pcr, option->ltr_active_latency); + + if (rtsx_check_dev_flag(pcr, LTR_L1SS_PWR_GATE_EN)) + rtsx_set_l1off_sub_cfg_d0(pcr, 1); +} + +void rtsx_pm_full_on(struct rtsx_pcr *pcr) +{ + if (pcr->ops->full_on) + pcr->ops->full_on(pcr); + else + rtsx_comm_pm_full_on(pcr); +} + void rtsx_pci_start_run(struct rtsx_pcr *pcr) { /* If pci device removed, don't queue idle work any more */ @@ -89,9 +179,7 @@ void rtsx_pci_start_run(struct rtsx_pcr *pcr) pcr->state = PDEV_STAT_RUN; if (pcr->ops->enable_auto_blink) pcr->ops->enable_auto_blink(pcr); - - if (pcr->aspm_en) - rtsx_pci_disable_aspm(pcr); + rtsx_pm_full_on(pcr); } mod_delayed_work(system_wq, &pcr->idle_work, msecs_to_jiffies(200)); @@ -958,6 +1046,41 @@ static int rtsx_pci_acquire_irq(struct rtsx_pcr *pcr) return 0; } +static void rtsx_enable_aspm(struct rtsx_pcr *pcr) +{ + if (pcr->ops->set_aspm) + pcr->ops->set_aspm(pcr, true); + else + rtsx_comm_set_aspm(pcr, true); +} + +static void rtsx_comm_pm_power_saving(struct rtsx_pcr *pcr) +{ + struct rtsx_cr_option *option = &pcr->option; + + if (option->ltr_enabled) { + u32 latency = option->ltr_l1off_latency; + + if (rtsx_check_dev_flag(pcr, L1_SNOOZE_TEST_EN)) + mdelay(option->l1_snooze_delay); + + rtsx_set_ltr_latency(pcr, latency); + } + + if (rtsx_check_dev_flag(pcr, LTR_L1SS_PWR_GATE_EN)) + rtsx_set_l1off_sub_cfg_d0(pcr, 0); + + rtsx_enable_aspm(pcr); +} + +void rtsx_pm_power_saving(struct rtsx_pcr *pcr) +{ + if (pcr->ops->power_saving) + pcr->ops->power_saving(pcr); + else + rtsx_comm_pm_power_saving(pcr); +} + static void rtsx_pci_idle_work(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); @@ -974,8 +1097,7 @@ static void rtsx_pci_idle_work(struct work_struct *work) if (pcr->ops->turn_off_led) pcr->ops->turn_off_led(pcr); - if (pcr->aspm_en) - rtsx_pci_enable_aspm(pcr); + rtsx_pm_power_saving(pcr); mutex_unlock(&pcr->pcr_mutex); } @@ -1063,6 +1185,16 @@ static int rtsx_pci_init_hw(struct rtsx_pcr *pcr) if (err < 0) return err; + switch (PCI_PID(pcr)) { + case PID_5250: + case PID_524A: + case PID_525A: + rtsx_pci_write_register(pcr, PM_CLK_FORCE_CTL, 1, 1); + break; + default: + break; + } + /* Enable clk_request_n to enable clock power management */ rtsx_pci_write_config_byte(pcr, pcr->pcie_cap + PCI_EXP_LNKCTL + 1, 1); /* Enter L1 when host tx idle */ diff --git a/drivers/mfd/rtsx_pcr.h b/drivers/mfd/rtsx_pcr.h index 931d1ae3ce32..ec784e04fe20 100644 --- a/drivers/mfd/rtsx_pcr.h +++ b/drivers/mfd/rtsx_pcr.h @@ -32,6 +32,18 @@ #define RTS524A_PME_FORCE_CTL 0xFF78 #define RTS524A_PM_CTRL3 0xFF7E +#define LTR_ACTIVE_LATENCY_DEF 0x883C +#define LTR_IDLE_LATENCY_DEF 0x892C +#define LTR_L1OFF_LATENCY_DEF 0x9003 +#define L1_SNOOZE_DELAY_DEF 1 +#define LTR_L1OFF_SSPWRGATE_5249_DEF 0xAF +#define LTR_L1OFF_SSPWRGATE_5250_DEF 0xFF +#define LTR_L1OFF_SNOOZE_SSPWRGATE_5249_DEF 0xAC +#define LTR_L1OFF_SNOOZE_SSPWRGATE_5250_DEF 0xF8 +#define CMD_TIMEOUT_DEF 100 +#define ASPM_MASK_NEG 0xFC +#define MASK_8_BIT_DEF 0xFF + int 
__rtsx_pci_write_phy_register(struct rtsx_pcr *pcr, u8 addr, u16 val); int __rtsx_pci_read_phy_register(struct rtsx_pcr *pcr, u8 addr, u16 *val); @@ -85,5 +97,7 @@ do { \ /* generic operations */ int rtsx_gops_pm_reset(struct rtsx_pcr *pcr); +int rtsx_set_ltr_latency(struct rtsx_pcr *pcr, u32 latency); +int rtsx_set_l1off_sub(struct rtsx_pcr *pcr, u8 val); #endif diff --git a/include/linux/mfd/rtsx_pci.h b/include/linux/mfd/rtsx_pci.h index 116816fb9110..57e23ad1c2bc 100644 --- a/include/linux/mfd/rtsx_pci.h +++ b/include/linux/mfd/rtsx_pci.h @@ -573,6 +573,12 @@ #define MSGTXDATA3 0xFE47 #define MSGTXCTL 0xFE48 #define LTR_CTL 0xFE4A +#define LTR_TX_EN_MASK BIT(7) +#define LTR_TX_EN_1 BIT(7) +#define LTR_TX_EN_0 0 +#define LTR_LATENCY_MODE_MASK BIT(6) +#define LTR_LATENCY_MODE_HW 0 +#define LTR_LATENCY_MODE_SW BIT(6) #define OBFF_CFG 0xFE4C #define CDRESUMECTL 0xFE52 @@ -616,11 +622,15 @@ #define L1SUB_CONFIG2 0xFE8E #define L1SUB_AUTO_CFG 0x02 #define L1SUB_CONFIG3 0xFE8F +#define L1OFF_MBIAS2_EN_5250 BIT(7) #define DUMMY_REG_RESET_0 0xFE90 #define AUTOLOAD_CFG_BASE 0xFF00 #define PETXCFG 0xFF03 +#define FORCE_CLKREQ_DELINK_MASK BIT(7) +#define FORCE_CLKREQ_LOW 0x80 +#define FORCE_CLKREQ_HIGH 0x00 #define PM_CTRL1 0xFF44 #define CD_RESUME_EN_MASK 0xF0 @@ -844,6 +854,9 @@ #define PHY_DIG1E_RX_EN_KEEP 0x0001 #define PHY_DUM_REG 0x1F +#define PCR_ASPM_SETTING_REG1 0x160 +#define PCR_ASPM_SETTING_REG2 0x168 + #define PCR_SETTING_REG1 0x724 #define PCR_SETTING_REG2 0x814 #define PCR_SETTING_REG3 0x747 @@ -876,14 +889,79 @@ struct pcr_ops { int (*conv_clk_and_div_n)(int clk, int dir); void (*fetch_vendor_settings)(struct rtsx_pcr *pcr); void (*force_power_down)(struct rtsx_pcr *pcr, u8 pm_state); + + void (*set_aspm)(struct rtsx_pcr *pcr, bool enable); + int (*set_ltr_latency)(struct rtsx_pcr *pcr, u32 latency); + int (*set_l1off_sub)(struct rtsx_pcr *pcr, u8 val); + void (*set_l1off_cfg_sub_d0)(struct rtsx_pcr *pcr, int active); + void (*full_on)(struct rtsx_pcr *pcr); + void (*power_saving)(struct rtsx_pcr *pcr); }; enum PDEV_STAT {PDEV_STAT_IDLE, PDEV_STAT_RUN}; +#define ASPM_L1_1_EN_MASK BIT(3) +#define ASPM_L1_2_EN_MASK BIT(2) +#define PM_L1_1_EN_MASK BIT(1) +#define PM_L1_2_EN_MASK BIT(0) + +#define ASPM_L1_1_EN BIT(0) +#define ASPM_L1_2_EN BIT(1) +#define PM_L1_1_EN BIT(2) +#define PM_L1_2_EN BIT(3) +#define LTR_L1SS_PWR_GATE_EN BIT(4) +#define L1_SNOOZE_TEST_EN BIT(5) +#define LTR_L1SS_PWR_GATE_CHECK_CARD_EN BIT(6) + +enum dev_aspm_mode { + DEV_ASPM_DISABLE = 0, + DEV_ASPM_DYNAMIC, + DEV_ASPM_BACKDOOR, + DEV_ASPM_STATIC, +}; + +/* + * struct rtsx_cr_option - card reader option + * @dev_flags: device flags + * @force_clkreq_0: force clock request + * @ltr_en: enable ltr mode flag + * @ltr_enabled: ltr mode in configure space flag + * @ltr_active: ltr mode status + * @ltr_active_latency: ltr mode active latency + * @ltr_idle_latency: ltr mode idle latency + * @ltr_l1off_latency: ltr mode l1off latency + * @dev_aspm_mode: device aspm mode + * @l1_snooze_delay: l1 snooze delay + * @ltr_l1off_sspwrgate: ltr l1off sspwrgate + * @ltr_l1off_snooze_sspwrgate: ltr l1off snooze sspwrgate + */ +struct rtsx_cr_option { + u32 dev_flags; + bool force_clkreq_0; + bool ltr_en; + bool ltr_enabled; + bool ltr_active; + u32 ltr_active_latency; + u32 ltr_idle_latency; + u32 ltr_l1off_latency; + enum dev_aspm_mode dev_aspm_mode; + u32 l1_snooze_delay; + u8 ltr_l1off_sspwrgate; + u8 ltr_l1off_snooze_sspwrgate; +}; + +#define rtsx_set_dev_flag(cr, flag) \ + ((cr)->option.dev_flags |= (flag)) +#define 
rtsx_clear_dev_flag(cr, flag) \ + ((cr)->option.dev_flags &= ~(flag)) +#define rtsx_check_dev_flag(cr, flag) \ + ((cr)->option.dev_flags & (flag)) + struct rtsx_pcr { struct pci_dev *pci; unsigned int id; int pcie_cap; + struct rtsx_cr_option option; /* pci resources */ unsigned long addr; @@ -940,6 +1018,7 @@ struct rtsx_pcr { u8 card_drive_sel; #define ASPM_L1_EN 0x02 u8 aspm_en; + bool aspm_enabled; #define PCR_MS_PMOS (1 << 0) #define PCR_REVERSE_SOCKET (1 << 1) @@ -964,6 +1043,11 @@ struct rtsx_pcr { u8 dma_error_count; }; +#define PID_524A 0x524A +#define PID_5249 0x5249 +#define PID_5250 0x5250 +#define PID_525A 0x525A + #define CHK_PCI_PID(pcr, pid) ((pcr)->pci->device == (pid)) #define PCI_VID(pcr) ((pcr)->pci->vendor) #define PCI_PID(pcr) ((pcr)->pci->device) -- cgit v1.2.3 From 900148296b78c61aa8c443dc594c0da968c3be53 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:50 +0200 Subject: lightnvm: prevent target type module removal when in use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a target type module, e.g. pblk here, is unloaded (rmmod) while the module is in use (after creating a target), the system crashes. We fix this by using the module API refcount. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 4 ++++ drivers/lightnvm/pblk-init.c | 1 + include/linux/lightnvm.h | 1 + 3 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ddae430b6eae..60e163be5a89 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -316,6 +317,8 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) list_add_tail(&t->list, &dev->targets); mutex_unlock(&dev->mlock); + __module_get(tt->owner); + return 0; err_sysfs: if (tt->exit) @@ -351,6 +354,7 @@ static void __nvm_remove_target(struct nvm_target *t) nvm_remove_tgt_dev(t->dev, 1); put_disk(tdisk); + module_put(t->type->owner); list_del(&t->list); kfree(t); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 1b0f61233c21..6df65d14a2c5 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1044,6 +1044,7 @@ static struct nvm_tgt_type tt_pblk = { .sysfs_init = pblk_sysfs_init, .sysfs_exit = pblk_sysfs_exit, + .owner = THIS_MODULE, }; static int __init pblk_module_init(void) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7dfa56ebbc6d..7b80ac911d26 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -460,6 +460,7 @@ struct nvm_tgt_type { /* For internal use */ struct list_head list; + struct module *owner; }; extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); -- cgit v1.2.3 From ef56b9ce562753cacf518f081a4ff3227efdab25 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:30 +0200 Subject: lightnvm: remove unused argument from nvm_set_tgt_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vblk isn't being used anyway, and if we ever have a use case we can introduce it again. This makes the logic easier and removes unnecessary checks.
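To make the retained plane expansion concrete, here is a standalone userspace toy (not kernel code) that reproduces the indexing kept in nvm_set_rqd_ppalist() for a hypothetical plane_cnt of 2 and nr_ppas of 3:

#include <stdio.h>

int main(void)
{
	int plane_cnt = 2, nr_ppas = 3;
	int ppa_list[6];

	/* Same indexing as rqd->ppa_list[(pl_idx * nr_ppas) + i]. */
	for (int i = 0; i < nr_ppas; i++)
		for (int pl_idx = 0; pl_idx < plane_cnt; pl_idx++)
			ppa_list[(pl_idx * nr_ppas) + i] = i;

	/* Prints ppa0..ppa2 for plane 0, then ppa0..ppa2 for plane 1. */
	for (int k = 0; k < plane_cnt * nr_ppas; k++)
		printf("slot %d -> ppa%d (plane %d)\n", k, ppa_list[k], k / nr_ppas);

	return 0;
}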
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 29 ++++++++++++----------------- include/linux/lightnvm.h | 2 +- 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 798964f511cd..231c92899431 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -633,7 +633,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, memset(&rqd, 0, sizeof(struct nvm_rq)); - nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); + nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); nvm_rq_tgt_to_dev(tgt_dev, &rqd); ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); @@ -697,7 +697,7 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, rqd.private = &wait; rqd.flags = geo->plane_mode >> 1; - ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); + ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); if (ret) return ret; @@ -793,14 +793,14 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) EXPORT_SYMBOL(nvm_put_area); int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas, int vblk) + const struct ppa_addr *ppas, int nr_ppas) { struct nvm_dev *dev = tgt_dev->parent; struct nvm_geo *geo = &tgt_dev->geo; int i, plane_cnt, pl_idx; struct ppa_addr ppa; - if ((!vblk || geo->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { + if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { rqd->nr_ppas = nr_ppas; rqd->ppa_addr = ppas[0]; @@ -814,19 +814,14 @@ int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, return -ENOMEM; } - if (!vblk) { - for (i = 0; i < nr_ppas; i++) - rqd->ppa_list[i] = ppas[i]; - } else { - plane_cnt = geo->plane_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } + plane_cnt = geo->plane_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; } } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7b80ac911d26..32ec35b5e18b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -481,7 +481,7 @@ extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int, int); + const struct ppa_addr *, int); extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From eb6f168f97438bf1cac8b9b1301c662eace9e39f Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:31 +0200 Subject: lightnvm: remove stale extern and unused exported symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all exported symbols are being used outside core and there were some stale entries in lightnvm.h Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 129 
+++++++++++++++++++++++------------------------ include/linux/lightnvm.h | 7 --- 2 files changed, 64 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 231c92899431..0e5f77234c79 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -226,6 +226,24 @@ static const struct block_device_operations nvm_fops = { .owner = THIS_MODULE, }; +static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) +{ + struct nvm_tgt_type *tmp, *tt = NULL; + + if (lock) + down_write(&nvm_tgtt_lock); + + list_for_each_entry(tmp, &nvm_tgt_types, list) + if (!strcmp(name, tmp->name)) { + tt = tmp; + break; + } + + if (lock) + up_write(&nvm_tgtt_lock); + return tt; +} + static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) { struct nvm_ioctl_create_simple *s = &create->conf.s; @@ -549,25 +567,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, } EXPORT_SYMBOL(nvm_part_to_tgt); -struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) -{ - struct nvm_tgt_type *tmp, *tt = NULL; - - if (lock) - down_write(&nvm_tgtt_lock); - - list_for_each_entry(tmp, &nvm_tgt_types, list) - if (!strcmp(name, tmp->name)) { - tt = tmp; - break; - } - - if (lock) - up_write(&nvm_tgtt_lock); - return tt; -} -EXPORT_SYMBOL(nvm_find_target_type); - int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; @@ -619,6 +618,52 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } +static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, + const struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_geo *geo = &tgt_dev->geo; + int i, plane_cnt, pl_idx; + struct ppa_addr ppa; + + if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { + rqd->nr_ppas = nr_ppas; + rqd->ppa_addr = ppas[0]; + + return 0; + } + + rqd->nr_ppas = nr_ppas; + rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("nvm: failed to allocate dma memory\n"); + return -ENOMEM; + } + + plane_cnt = geo->plane_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; + } + } + + return 0; +} + +static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, + struct nvm_rq *rqd) +{ + if (!rqd->ppa_list) + return; + + nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); +} + + int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas, int type) { @@ -792,52 +837,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) } EXPORT_SYMBOL(nvm_put_area); -int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &tgt_dev->geo; - int i, plane_cnt, pl_idx; - struct ppa_addr ppa; - - if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_ppas = nr_ppas; - rqd->ppa_addr = ppas[0]; - - return 0; - } - - rqd->nr_ppas = nr_ppas; - rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("nvm: failed to allocate dma memory\n"); - return -ENOMEM; - } - - plane_cnt = geo->plane_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = 
ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } - } - - return 0; -} -EXPORT_SYMBOL(nvm_set_rqd_ppalist); - -void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - if (!rqd->ppa_list) - return; - - nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); -} -EXPORT_SYMBOL(nvm_free_rqd_ppalist); - void nvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_dev *tgt_dev = rqd->dev; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32ec35b5e18b..4f0e4a0fd204 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -463,8 +463,6 @@ struct nvm_tgt_type { struct module *owner; }; -extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); - extern int nvm_register_tgt_type(struct nvm_tgt_type *); extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); @@ -480,9 +478,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int); -extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); @@ -491,8 +486,6 @@ extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -extern int nvm_dev_factory(struct nvm_dev *, int flags); - extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ -- cgit v1.2.3 From 1a94b2d484677dc559c96251dd0e7c7b8811c378 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:47 +0200 Subject: lightnvm: implement generic path for sync I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a generic path for sending sync I/O on LightNVM. This allows reusing the standard synchronous path through blk_execute_rq(), instead of implementing a wait_for_completion on the target side (e.g., pblk).
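For illustration only (this sketch is not part of the patch; example_erase_sync() is a hypothetical helper), the target-side conversion looks like this, modelled on the pblk_blk_erase_sync() change below:

    /* Hypothetical target-side helper sketching the conversion:
     * erase a single PPA synchronously via the new generic path.
     */
    static int example_erase_sync(struct nvm_tgt_dev *tgt_dev,
                                  struct ppa_addr ppa)
    {
            struct nvm_rq rqd;

            memset(&rqd, 0, sizeof(struct nvm_rq));
            rqd.opcode = NVM_OP_ERASE;
            rqd.ppa_addr = ppa;
            rqd.nr_ppas = 1;

            /* Before: set rqd.end_io/rqd.private to a completion, call
             * nvm_submit_io() and wait_for_completion_io() on the target
             * side. After: one call that blocks in blk_execute_rq();
             * per-command status is still reported through rqd.error.
             */
            return nvm_submit_io_sync(tgt_dev, &rqd);
    }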
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 25 +++++++++----- drivers/lightnvm/pblk-core.c | 74 ++++++++++++---------------------------- drivers/lightnvm/pblk-read.c | 21 ++---------- drivers/lightnvm/pblk-recovery.c | 31 ++--------------- drivers/lightnvm/pblk.h | 42 +++++++++++++++++++++-- drivers/nvme/host/lightnvm.c | 70 +++++++++++++++++++++++++++++-------- include/linux/lightnvm.h | 3 ++ 7 files changed, 143 insertions(+), 123 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0e5f77234c79..fe21f4dd33e9 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -720,12 +720,25 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io); -static void nvm_end_io_sync(struct nvm_rq *rqd) +int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { - struct completion *waiting = rqd->private; + struct nvm_dev *dev = tgt_dev->parent; + int ret; + + if (!dev->ops->submit_io_sync) + return -ENODEV; + + nvm_rq_tgt_to_dev(tgt_dev, rqd); - complete(waiting); + rqd->dev = tgt_dev; + + /* In case of error, fail with right address format */ + ret = dev->ops->submit_io_sync(dev, rqd); + nvm_rq_dev_to_tgt(tgt_dev, rqd); + + return ret; } +EXPORT_SYMBOL(nvm_submit_io_sync); int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas) @@ -733,25 +746,21 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, struct nvm_geo *geo = &tgt_dev->geo; struct nvm_rq rqd; int ret; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); rqd.opcode = NVM_OP_ERASE; - rqd.end_io = nvm_end_io_sync; - rqd.private = &wait; rqd.flags = geo->plane_mode >> 1; ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); if (ret) return ret; - ret = nvm_submit_io(tgt_dev, &rqd); + ret = nvm_submit_io_sync(tgt_dev, &rqd); if (ret) { pr_err("rrpr: erase I/O submission failed: %d\n", ret); goto free_ppa_list; } - wait_for_completion_io(&wait); free_ppa_list: nvm_free_rqd_ppalist(tgt_dev, &rqd); diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 4199119a0754..ce90213a42fa 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -412,39 +412,33 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) struct nvm_tgt_dev *dev = pblk->dev; #ifdef CONFIG_NVM_DEBUG - struct ppa_addr *ppa_list; + int ret; - ppa_list = (rqd->nr_ppas > 1) ? 
rqd->ppa_list : &rqd->ppa_addr; - if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { - WARN_ON(1); - return -EINVAL; - } + ret = pblk_check_io(pblk, rqd); + if (ret) + return ret; +#endif - if (rqd->opcode == NVM_OP_PWRITE) { - struct pblk_line *line; - struct ppa_addr ppa; - int i; + atomic_inc(&pblk->inflight_io); - for (i = 0; i < rqd->nr_ppas; i++) { - ppa = ppa_list[i]; - line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + return nvm_submit_io(dev, rqd); +} - spin_lock(&line->lock); - if (line->state != PBLK_LINESTATE_OPEN) { - pr_err("pblk: bad ppa: line:%d,state:%d\n", - line->id, line->state); - WARN_ON(1); - spin_unlock(&line->lock); - return -EINVAL; - } - spin_unlock(&line->lock); - } - } +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + +#ifdef CONFIG_NVM_DEBUG + int ret; + + ret = pblk_check_io(pblk, rqd); + if (ret) + return ret; #endif atomic_inc(&pblk->inflight_io); - return nvm_submit_io(dev, rqd); + return nvm_submit_io_sync(dev, rqd); } static void pblk_bio_map_addr_endio(struct bio *bio) @@ -597,7 +591,6 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, int cmd_op, bio_op; int i, j; int ret; - DECLARE_COMPLETION_ONSTACK(wait); if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; @@ -639,8 +632,6 @@ next_rq: rqd.dma_ppa_list = dma_ppa_list; rqd.opcode = cmd_op; rqd.nr_ppas = rq_ppas; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; if (dir == PBLK_WRITE) { struct pblk_sec_meta *meta_list = rqd.meta_list; @@ -694,19 +685,14 @@ next_rq: } } - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { pr_err("pblk: emeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_rqd_dma; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: emeta I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); if (rqd.error) { if (dir == PBLK_WRITE) @@ -750,7 +736,6 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, int i, ret; int cmd_op, bio_op; int flags; - DECLARE_COMPLETION_ONSTACK(wait); if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; @@ -787,8 +772,6 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.opcode = cmd_op; rqd.flags = flags; rqd.nr_ppas = lm->smeta_sec; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; for (i = 0; i < lm->smeta_sec; i++, paddr++) { struct pblk_sec_meta *meta_list = rqd.meta_list; @@ -807,17 +790,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, * the write thread is the only one sending write and erase commands, * there is no need to take the LUN semaphore. */ - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { pr_err("pblk: smeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_ppa_list; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: smeta I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); if (rqd.error) { @@ -861,19 +840,15 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq rqd; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); pblk_setup_e_rq(pblk, &rqd, ppa); - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; - /* The write thread schedules erases so that it minimizes disturbances * with writes. 
Thus, there is no need to take the LUN semaphore. */ - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; @@ -886,11 +861,6 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) goto out; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: sync erase timed out\n"); - } - out: rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 71c58503f1a4..ca79d8fb3e60 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -207,7 +207,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, int nr_secs = rqd->nr_ppas; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); int i, ret, hole; - DECLARE_COMPLETION_ONSTACK(wait); /* Re-use allocated memory for intermediate lbas */ lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); @@ -232,8 +231,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->bio = new_bio; rqd->nr_ppas = nr_holes; rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (unlikely(nr_holes == 1)) { ppa_ptr = rqd->ppa_list; @@ -241,18 +238,13 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->ppa_addr = rqd->ppa_list[0]; } - ret = pblk_submit_read_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { bio_put(rqd->bio); - pr_err("pblk: read IO submission failed\n"); + pr_err("pblk: sync read IO submission failed\n"); goto err; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: partial read I/O timed out\n"); - } - if (rqd->error) { atomic_long_inc(&pblk->read_failed); #ifdef CONFIG_NVM_DEBUG @@ -537,7 +529,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) struct nvm_rq rqd; int data_len; int ret = NVM_IO_OK; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -577,22 +568,16 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) bio_set_op_attrs(bio, REQ_OP_READ, 0); rqd.opcode = NVM_OP_PREAD; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; rqd.nr_ppas = gc_rq->secs_to_gc; rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; - if (pblk_submit_read_io(pblk, &rqd)) { + if (pblk_submit_io_sync(pblk, &rqd)) { ret = -EIO; pr_err("pblk: GC read request failed\n"); goto err_free_bio; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: GC read I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); if (rqd.error) { diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 9772a947ca4f..eadb3eb5d4dc 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -216,7 +216,6 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, int rq_ppas, rq_len; int i, j; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -253,8 +252,6 @@ next_read_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -280,19 +277,13 
@@ next_read_rq: } /* If read fails, more padding is needed */ - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - return -EINTR; - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* At this point, the read should not fail. If it does, it is a problem * we cannot recover from here. Need FTL log. @@ -504,7 +495,6 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, int ret = 0; int rec_round; int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -539,8 +529,6 @@ next_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -566,18 +554,13 @@ next_rq: addr_to_gen_ppa(pblk, w_ptr, line->id); } - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* This should not happen since the read failed during normal recovery, * but the media works funny sometimes... @@ -645,7 +628,6 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, int i, j; int ret = 0; int left_ppas = pblk_calc_sec_in_line(pblk, line); - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -678,8 +660,6 @@ next_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -705,19 +685,14 @@ next_rq: addr_to_gen_ppa(pblk, paddr, line->id); } - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); bio_put(bio); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* Reached the end of the written line */ if (rqd->error) { diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 6c9ea9a93704..6b64288de6f7 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -714,6 +714,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, @@ -1203,7 +1204,6 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); } -#endif static inline int 
pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas) @@ -1224,14 +1224,50 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, ppa->g.sec < geo->sec_per_pg) continue; -#ifdef CONFIG_NVM_DEBUG print_ppa(ppa, "boundary", i); -#endif + return 1; } return 0; } +static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa_list; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { + WARN_ON(1); + return -EINVAL; + } + + if (rqd->opcode == NVM_OP_PWRITE) { + struct pblk_line *line; + struct ppa_addr ppa; + int i; + + for (i = 0; i < rqd->nr_ppas; i++) { + ppa = ppa_list[i]; + line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + + spin_lock(&line->lock); + if (line->state != PBLK_LINESTATE_OPEN) { + pr_err("pblk: bad ppa: line:%d,state:%d\n", + line->id, line->state); + WARN_ON(1); + spin_unlock(&line->lock); + return -EINVAL; + } + spin_unlock(&line->lock); + } + } + + return 0; +} +#endif + static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) { struct pblk_line_meta *lm = &pblk->lm; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 6017153c2439..8fc949c5b49b 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -492,34 +492,47 @@ static void nvme_nvm_end_io(struct request *rq, blk_status_t status) blk_mq_free_request(rq); } -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static struct request *nvme_nvm_alloc_request(struct request_queue *q, + struct nvm_rq *rqd, + struct nvme_nvm_command *cmd) { - struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct request *rq; - struct bio *bio = rqd->bio; - struct nvme_nvm_command *cmd; - - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); - if (!cmd) - return -ENOMEM; nvme_nvm_rqtocmd(rqd, ns, cmd); rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); - if (IS_ERR(rq)) { - kfree(cmd); - return PTR_ERR(rq); - } + if (IS_ERR(rq)) + return rq; + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (bio) { - blk_init_request_from_bio(rq, bio); + if (rqd->bio) { + blk_init_request_from_bio(rq, rqd->bio); } else { rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); rq->__data_len = 0; } + return rq; +} + +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct nvme_nvm_command *cmd; + struct request *rq; + + cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + rq = nvme_nvm_alloc_request(q, rqd, cmd); + if (IS_ERR(rq)) { + kfree(cmd); + return PTR_ERR(rq); + } + rq->end_io_data = rqd; blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); @@ -527,6 +540,34 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) return 0; } +static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct request *rq; + struct nvme_nvm_command cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(struct nvme_nvm_command)); + + rq = nvme_nvm_alloc_request(q, rqd, &cmd); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + /* I/Os can fail and the error is signaled through rqd. Callers must + * handle the error accordingly. 
+ */ + blk_execute_rq(q, NULL, rq, 0); + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + + rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); + rqd->error = nvme_req(rq)->status; + + blk_mq_free_request(rq); + + return ret; +} + static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { struct nvme_ns *ns = nvmdev->q->queuedata; @@ -562,6 +603,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .set_bb_tbl = nvme_nvm_set_bb_tbl, .submit_io = nvme_nvm_submit_io, + .submit_io_sync = nvme_nvm_submit_io_sync, .create_dma_pool = nvme_nvm_create_dma_pool, .destroy_dma_pool = nvme_nvm_destroy_dma_pool, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4f0e4a0fd204..b7f111ff4d3b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -56,6 +56,7 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -69,6 +70,7 @@ struct nvm_dev_ops { nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; + nvm_submit_io_sync_fn *submit_io_sync; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; @@ -477,6 +479,7 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); +extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From 1b0c22e45508ffbd645430e049c52c46bdaad3b4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 14:12:45 +0200 Subject: regmap: avoid -Wint-in-bool-context warning When we pass the result of a multiplication as the timeout or the delay, we can get a warning from gcc-7: drivers/mmc/host/bcm2835.c:596:149: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/mfd/arizona-core.c:247:195: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c:49:27: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] The warning is a bit questionable inside of a macro, but this is intentional on the side of the gcc developers. It is also an indication of another problem: we evaluate the timeout and sleep arguments multiple times, which can have undesired side-effects when those are complex expressions. This changes the two regmap variants to use local variables for storing copies of the timeouts. This adds some more type safety, and avoids both the double-evaluation and the gcc warning. 
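As an illustration (hypothetical caller, not from this patch; REG_STATUS and STATUS_READY are made-up names), the kind of invocation that triggered the warning passes a product as the timeout. With the arguments copied into __timeout_us/__sleep_us once, the multiplication is evaluated a single time and never ends up in a boolean context:

    /* Hypothetical caller: poll until the ready bit is set, sleeping
     * roughly 10us between reads and giving up after 50ms. The timeout
     * product is now evaluated once, not on every loop iteration.
     */
    err = regmap_read_poll_timeout(map, REG_STATUS, val,
                                   val & STATUS_READY,
                                   10, 50 * USEC_PER_MSEC);
    if (err == -ETIMEDOUT)
            dev_warn(dev, "device never became ready\n");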
Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81484 Link: http://lkml.kernel.org/r/20170726133756.2161367-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Signed-off-by: Mark Brown --- include/linux/regmap.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 1e369f0e921d..edad98890b9b 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -120,22 +120,24 @@ struct reg_sequence { */ #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t __timeout = ktime_add_us(ktime_get(), timeout_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __sleep_us = (sleep_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ int __ret; \ - might_sleep_if(sleep_us); \ + might_sleep_if(__sleep_us); \ for (;;) { \ __ret = regmap_read((map), (addr), &(val)); \ if (__ret) \ break; \ if (cond) \ break; \ - if ((timeout_us) && \ + if ((__timeout_us) && \ ktime_compare(ktime_get(), __timeout) > 0) { \ __ret = regmap_read((map), (addr), &(val)); \ break; \ } \ - if (sleep_us) \ - usleep_range(((sleep_us) >> 2) + 1, sleep_us); \ + if (__sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ } \ __ret ?: ((cond) ? 0 : -ETIMEDOUT); \ }) @@ -160,21 +162,23 @@ struct reg_sequence { */ #define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __sleep_us = (sleep_us); \ + ktime_t timeout = ktime_add_us(ktime_get(), __timeout_us); \ int pollret; \ - might_sleep_if(sleep_us); \ + might_sleep_if(__sleep_us); \ for (;;) { \ pollret = regmap_field_read((field), &(val)); \ if (pollret) \ break; \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + if (__timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ pollret = regmap_field_read((field), &(val)); \ break; \ } \ - if (sleep_us) \ - usleep_range((sleep_us >> 2) + 1, sleep_us); \ + if (__sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ } \ pollret ?: ((cond) ? 0 : -ETIMEDOUT); \ }) -- cgit v1.2.3 From 17a9422de78c3a59b490b400f555635c477f1476 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Wed, 11 Oct 2017 14:49:43 -0700 Subject: i40e/i40evf: don't trust VF to reset itself When using 'ethtool -L' on a VF to change number of requested queues from PF, we shouldn't trust the VF to reset itself after making the request. Doing it that way opens the door for a potentially malicious VF to do nasty things to the PF which should never be the case. This makes it such that after VF makes a successful request, PF will then reset the VF to institute required changes. Only if the request fails will PF send a message back to VF letting it know the request was unsuccessful. Testing-hints: There should be no real functional changes. This is simply hardening against a potentially malicious VF. 
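In outline (a simplified sketch of the PF-side handler as modified below; validation of the requested count against the current and available queue pairs is elided), the success path no longer sends a reply at all:

    /* Sketch of the new success path in i40e_vc_request_queues_msg():
     * instead of trusting the VF to reset itself, the PF applies the
     * change by resetting the VF directly.
     */
    if (req_pairs <= cur_pairs + pf->queues_left) {
            /* successful request */
            vf->num_req_queues = req_pairs;
            i40e_vc_notify_vf_reset(vf);
            i40e_reset_vf(vf, false);
            return 0;
    }
    /* on failure, fall through and message the VF with the number of
     * queue pairs it can actually have */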
Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 9 +++++++-- drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c | 7 +++---- include/linux/avf/virtchnl.h | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index ce0981e2f605..f8a794b72462 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2045,8 +2045,9 @@ error_param: * @msglen: msg length * * VFs get a default number of queues but can use this message to request a - * different number. Will respond with either the number requested or the - * maximum we can support. + * different number. If the request is successful, PF will reset the VF and + * return 0. If unsuccessful, PF will send message informing VF of number of + * available queues and return result of sending VF a message. **/ static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen) { @@ -2077,7 +2078,11 @@ static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen) pf->queues_left); vfres->num_queue_pairs = pf->queues_left + cur_pairs; } else { + /* successful request */ vf->num_req_queues = req_pairs; + i40e_vc_notify_vf_reset(vf); + i40e_reset_vf(vf, false); + return 0; } return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, 0, diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c index 2bb81c39d85f..46c8b8a3907c 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c @@ -407,6 +407,7 @@ int i40evf_request_queues(struct i40evf_adapter *adapter, int num) vfres.num_queue_pairs = num; adapter->current_op = VIRTCHNL_OP_REQUEST_QUEUES; + adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED; return i40evf_send_pf_msg(adapter, VIRTCHNL_OP_REQUEST_QUEUES, (u8 *)&vfres, sizeof(vfres)); } @@ -1098,15 +1099,13 @@ void i40evf_virtchnl_completion(struct i40evf_adapter *adapter, case VIRTCHNL_OP_REQUEST_QUEUES: { struct virtchnl_vf_res_request *vfres = (struct virtchnl_vf_res_request *)msg; - if (vfres->num_queue_pairs == adapter->num_req_queues) { - adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED; - i40evf_schedule_reset(adapter); - } else { + if (vfres->num_queue_pairs != adapter->num_req_queues) { dev_info(&adapter->pdev->dev, "Requested %d queues, PF can support %d\n", adapter->num_req_queues, vfres->num_queue_pairs); adapter->num_req_queues = 0; + adapter->flags &= ~I40EVF_FLAG_REINIT_ITR_NEEDED; } } break; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 60e5d90cb18a..3ce61342fa31 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -333,8 +333,8 @@ struct virtchnl_vsi_queue_config_info { * additional queues must be negotiated. This is a best effort request as it * is possible the PF does not have enough queues left to support the request. * If the PF cannot support the number requested it will respond with the - * maximum number it is able to support; otherwise it will respond with the - * number requested. + * maximum number it is able to support. If the request is successful, PF will + * then reset the VF to institute required changes. 
*/ /* VF resource request */ -- cgit v1.2.3 From 42f6284ae602469762ee721ec31ddfc6170e00bc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 12 Oct 2017 15:07:23 +0530 Subject: PM / Domains: Add support to select performance-state of domains Some platforms have the capability to configure the performance state of PM domains. This patch enhances the genpd core to support such platforms. The performance levels (within the genpd core) are identified by positive integer values, a lower value represents lower performance state. This patch adds a new genpd API, which is called by user drivers (like OPP framework): - int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state); This updates the performance state constraint of the device on its PM domain. On success, the genpd will have its performance state set to a value which is >= "state" passed to this routine. The genpd core calls the genpd->set_performance_state() callback, if implemented, else -ENODEV is returned to the caller. The PM domain drivers need to implement the following callback if they want to support performance states. - int (*set_performance_state)(struct generic_pm_domain *genpd, unsigned int state); This is called internally by the genpd core on several occasions. The genpd core passes the genpd pointer and the aggregate of the performance states of the devices supported by that genpd to this callback. This callback must update the performance state of the genpd (in a platform dependent way). The power domains can avoid supplying above callback, if they don't support setting performance-states. Currently we aren't propagating performance state changes of a subdomain to its masters as we don't have hardware that needs it right now. Over that, the performance states of subdomain and its masters may not have one-to-one mapping and would require additional information. We can get back to this once we have hardware that needs it. Tested-by: Rajendra Nayak Signed-off-by: Viresh Kumar Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 98 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_domain.h | 12 ++++++ 2 files changed, 110 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index a6e4c8d7d837..7e01ae364d78 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -237,6 +237,95 @@ static void genpd_update_accounting(struct generic_pm_domain *genpd) static inline void genpd_update_accounting(struct generic_pm_domain *genpd) {} #endif +/** + * dev_pm_genpd_set_performance_state- Set performance state of device's power + * domain. + * + * @dev: Device for which the performance-state needs to be set. + * @state: Target performance state of the device. This can be set as 0 when the + * device doesn't have any performance state constraints left (And so + * the device wouldn't participate anymore to find the target + * performance state of the genpd). + * + * It is assumed that the users guarantee that the genpd wouldn't be detached + * while this routine is getting called. + * + * Returns 0 on success and negative error values on failures. 
+ */ +int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state) +{ + struct generic_pm_domain *genpd; + struct generic_pm_domain_data *gpd_data, *pd_data; + struct pm_domain_data *pdd; + unsigned int prev; + int ret = 0; + + genpd = dev_to_genpd(dev); + if (IS_ERR(genpd)) + return -ENODEV; + + if (unlikely(!genpd->set_performance_state)) + return -EINVAL; + + if (unlikely(!dev->power.subsys_data || + !dev->power.subsys_data->domain_data)) { + WARN_ON(1); + return -EINVAL; + } + + genpd_lock(genpd); + + gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); + prev = gpd_data->performance_state; + gpd_data->performance_state = state; + + /* New requested state is same as Max requested state */ + if (state == genpd->performance_state) + goto unlock; + + /* New requested state is higher than Max requested state */ + if (state > genpd->performance_state) + goto update_state; + + /* Traverse all devices within the domain */ + list_for_each_entry(pdd, &genpd->dev_list, list_node) { + pd_data = to_gpd_data(pdd); + + if (pd_data->performance_state > state) + state = pd_data->performance_state; + } + + if (state == genpd->performance_state) + goto unlock; + + /* + * We aren't propagating performance state changes of a subdomain to its + * masters as we don't have hardware that needs it. Over that, the + * performance states of subdomain and its masters may not have + * one-to-one mapping and would require additional information. We can + * get back to this once we have hardware that needs it. For that + * reason, we don't have to consider performance state of the subdomains + * of genpd here. + */ + +update_state: + if (genpd_status_on(genpd)) { + ret = genpd->set_performance_state(genpd, state); + if (ret) { + gpd_data->performance_state = prev; + goto unlock; + } + } + + genpd->performance_state = state; + +unlock: + genpd_unlock(genpd); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_pm_genpd_set_performance_state); + static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed) { unsigned int state_idx = genpd->state_idx; @@ -256,6 +345,15 @@ static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed) return ret; elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start)); + + if (unlikely(genpd->set_performance_state)) { + ret = genpd->set_performance_state(genpd, genpd->performance_state); + if (ret) { + pr_warn("%s: Failed to set performance state %d (%d)\n", + genpd->name, genpd->performance_state, ret); + } + } + if (elapsed_ns <= genpd->states[state_idx].power_on_latency_ns) return ret; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 84f423d5633e..9af0356bd69c 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -64,8 +64,11 @@ struct generic_pm_domain { unsigned int device_count; /* Number of devices */ unsigned int suspended_count; /* System suspend device counter */ unsigned int prepared_count; /* Suspend counter of prepared devices */ + unsigned int performance_state; /* Aggregated max performance state */ int (*power_off)(struct generic_pm_domain *domain); int (*power_on)(struct generic_pm_domain *domain); + int (*set_performance_state)(struct generic_pm_domain *genpd, + unsigned int state); struct gpd_dev_ops dev_ops; s64 max_off_time_ns; /* Maximum allowed "suspended" time. 
*/ bool max_off_time_changed; @@ -121,6 +124,7 @@ struct generic_pm_domain_data { struct pm_domain_data base; struct gpd_timing_data td; struct notifier_block nb; + unsigned int performance_state; void *data; }; @@ -148,6 +152,8 @@ extern int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, extern int pm_genpd_init(struct generic_pm_domain *genpd, struct dev_power_governor *gov, bool is_off); extern int pm_genpd_remove(struct generic_pm_domain *genpd); +extern int dev_pm_genpd_set_performance_state(struct device *dev, + unsigned int state); extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; @@ -188,6 +194,12 @@ static inline int pm_genpd_remove(struct generic_pm_domain *genpd) return -ENOTSUPP; } +static inline int dev_pm_genpd_set_performance_state(struct device *dev, + unsigned int state) +{ + return -ENOTSUPP; +} + #define simple_qos_governor (*(struct dev_power_governor *)(NULL)) #define pm_domain_always_on_gov (*(struct dev_power_governor *)(NULL)) #endif -- cgit v1.2.3 From b6aa98364f842f943495408895627702ad7ad44b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 11 Oct 2017 12:54:15 +0530 Subject: PM / OPP: Add dev_pm_opp_{un}register_get_pstate_helper() This adds the dev_pm_opp_{un}register_get_pstate_helper() helper routines which will be used to set the get_pstate() callback for a device. This callback will be later called internally by the OPP core to get performance state corresponding to an OPP. This is required temporarily until the time we have proper DT bindings to include the performance state information. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/opp/core.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/opp/opp.h | 2 ++ include/linux/pm_opp.h | 10 +++++++ 3 files changed, 90 insertions(+) (limited to 'include/linux') diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 0ce8069d6843..92fa94a6dcc1 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1036,6 +1036,9 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, return ret; } + if (opp_table->get_pstate) + new_opp->pstate = opp_table->get_pstate(dev, new_opp->rate); + list_add(&new_opp->node, head); mutex_unlock(&opp_table->lock); @@ -1547,6 +1550,81 @@ void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) } EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_set_opp_helper); +/** + * dev_pm_opp_register_get_pstate_helper() - Register get_pstate() helper. + * @dev: Device for which the helper is getting registered. + * @get_pstate: Helper. + * + * TODO: Remove this callback after the same information is available via Device + * Tree. + * + * This allows a platform to initialize the performance states of individual + * OPPs for its devices, until we get similar information directly from DT. + * + * This must be called before the OPPs are initialized for the device. 
+ */ +struct opp_table *dev_pm_opp_register_get_pstate_helper(struct device *dev, + int (*get_pstate)(struct device *dev, unsigned long rate)) +{ + struct opp_table *opp_table; + int ret; + + if (!get_pstate) + return ERR_PTR(-EINVAL); + + opp_table = dev_pm_opp_get_opp_table(dev); + if (!opp_table) + return ERR_PTR(-ENOMEM); + + /* This should be called before OPPs are initialized */ + if (WARN_ON(!list_empty(&opp_table->opp_list))) { + ret = -EBUSY; + goto err; + } + + /* Already have genpd_performance_state set */ + if (WARN_ON(opp_table->genpd_performance_state)) { + ret = -EBUSY; + goto err; + } + + opp_table->genpd_performance_state = true; + opp_table->get_pstate = get_pstate; + + return opp_table; + +err: + dev_pm_opp_put_opp_table(opp_table); + + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_register_get_pstate_helper); + +/** + * dev_pm_opp_unregister_get_pstate_helper() - Releases resources blocked for + * get_pstate() helper + * @opp_table: OPP table returned from dev_pm_opp_register_get_pstate_helper(). + * + * Release resources blocked for platform specific get_pstate() helper. + */ +void dev_pm_opp_unregister_get_pstate_helper(struct opp_table *opp_table) +{ + if (!opp_table->genpd_performance_state) { + pr_err("%s: Doesn't have performance states set\n", + __func__); + return; + } + + /* Make sure there are no concurrent readers while updating opp_table */ + WARN_ON(!list_empty(&opp_table->opp_list)); + + opp_table->genpd_performance_state = false; + opp_table->get_pstate = NULL; + + dev_pm_opp_put_opp_table(opp_table); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_get_pstate_helper); + /** * dev_pm_opp_add() - Add an OPP table from a table definitions * @dev: device for which we do this operation diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index e8f767ab5814..4d00061648a3 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -140,6 +140,7 @@ enum opp_table_access { * @genpd_performance_state: Device's power domain support performance state. * @set_opp: Platform specific set_opp callback * @set_opp_data: Data to be passed to set_opp callback + * @get_pstate: Platform specific get_pstate callback * @dentry: debugfs dentry pointer of the real device directory (not links). * @dentry_name: Name of the real dentry. 
* @@ -177,6 +178,7 @@ struct opp_table { int (*set_opp)(struct dev_pm_set_opp_data *data); struct dev_pm_set_opp_data *set_opp_data; + int (*get_pstate)(struct device *dev, unsigned long rate); #ifdef CONFIG_DEBUG_FS struct dentry *dentry; diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 849d21dc4ca7..6c2d2e88f066 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -125,6 +125,8 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name); void dev_pm_opp_put_clkname(struct opp_table *opp_table); struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table); +struct opp_table *dev_pm_opp_register_get_pstate_helper(struct device *dev, int (*get_pstate)(struct device *dev, unsigned long rate)); +void dev_pm_opp_unregister_get_pstate_helper(struct opp_table *opp_table); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask); int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask); @@ -245,6 +247,14 @@ static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {} +static inline struct opp_table *dev_pm_opp_register_get_pstate_helper(struct device *dev, + int (*get_pstate)(struct device *dev, unsigned long rate)) +{ + return ERR_PTR(-ENOTSUPP); +} + +static inline void dev_pm_opp_unregister_get_pstate_helper(struct opp_table *opp_table) {} + static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name) { return ERR_PTR(-ENOTSUPP); -- cgit v1.2.3 From 20f97caf1120bd02e8ff4adbad3b44b63626feb5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Oct 2017 15:27:24 +0200 Subject: PM / QoS: Drop PM_QOS_FLAG_REMOTE_WAKEUP The PM QoS flag PM_QOS_FLAG_REMOTE_WAKEUP is not used consistently and the vast majority of code simply assumes that remote wakeup should be enabled for devices in runtime suspend if they can generate wakeup signals, so drop it. Signed-off-by: Rafael J. Wysocki Acked-by: Ulf Hansson Reviewed-by: Mika Westerberg --- Documentation/ABI/testing/sysfs-devices-power | 16 --------------- Documentation/power/pm_qos_interface.txt | 13 ++++++------- drivers/acpi/device_pm.c | 6 ++---- drivers/base/power/domain.c | 4 +--- drivers/base/power/sysfs.c | 28 --------------------------- include/linux/pm_qos.h | 1 - 6 files changed, 9 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power index 676fdf5f2a99..f4b24c327665 100644 --- a/Documentation/ABI/testing/sysfs-devices-power +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -258,19 +258,3 @@ Description: This attribute has no effect on system-wide suspend/resume and hibernation. - -What: /sys/devices/.../power/pm_qos_remote_wakeup -Date: September 2012 -Contact: Rafael J. Wysocki -Description: - The /sys/devices/.../power/pm_qos_remote_wakeup attribute - is used for manipulating the PM QoS "remote wakeup required" - flag. If set, this flag indicates to the kernel that the - device is a source of user events that have to be signaled from - its low-power states. - - Not all drivers support this attribute. If it isn't supported, - it is not present. 
- - This attribute has no effect on system-wide suspend/resume and - hibernation. diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index 21d2d48f87a2..19c5f7b1a7ba 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -98,8 +98,7 @@ Values are updated in response to changes of the request list. The target values of resume latency and active state latency tolerance are simply the minimum of the request values held in the parameter list elements. The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements' -values. Two device PM QoS flags are defined currently: PM_QOS_FLAG_NO_POWER_OFF -and PM_QOS_FLAG_REMOTE_WAKEUP. +values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF. Note: The aggregated target values are implemented in such a way that reading the aggregated value does not require any locking mechanism. @@ -153,14 +152,14 @@ PM QoS list of resume latency constraints and remove sysfs attribute pm_qos_resume_latency_us from the device's power directory. int dev_pm_qos_expose_flags(device, value) -Add a request to the device's PM QoS list of flags and create sysfs attributes -pm_qos_no_power_off and pm_qos_remote_wakeup under the device's power directory -allowing user space to change these flags' value. +Add a request to the device's PM QoS list of flags and create sysfs attribute +pm_qos_no_power_off under the device's power directory allowing user space to +change the value of the PM_QOS_FLAG_NO_POWER_OFF flag. void dev_pm_qos_hide_flags(device) Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list -of flags and remove sysfs attributes pm_qos_no_power_off and pm_qos_remote_wakeup -under the device's power directory. +of flags and remove sysfs attribute pm_qos_no_power_off from the device's power +directory. Notification mechanisms: The per-device PM QoS framework has a per-device notification tree. 
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index fbcc73f7a099..e8c820129797 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -581,8 +581,7 @@ static int acpi_dev_pm_get_state(struct device *dev, struct acpi_device *adev, d_min = ret; wakeup = device_may_wakeup(dev) && adev->wakeup.flags.valid && adev->wakeup.sleep_state >= target_state; - } else if (dev_pm_qos_flags(dev, PM_QOS_FLAG_REMOTE_WAKEUP) != - PM_QOS_FLAGS_NONE) { + } else { wakeup = adev->wakeup.flags.valid; } @@ -865,8 +864,7 @@ int acpi_dev_runtime_suspend(struct device *dev) if (!adev) return 0; - remote_wakeup = dev_pm_qos_flags(dev, PM_QOS_FLAG_REMOTE_WAKEUP) > - PM_QOS_FLAGS_NONE; + remote_wakeup = acpi_device_can_wakeup(adev); if (remote_wakeup) { error = acpi_device_wakeup_enable(adev, ACPI_STATE_S0); if (error) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index e8ca5e2cf1e5..e6414e9998bb 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -346,9 +346,7 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, list_for_each_entry(pdd, &genpd->dev_list, list_node) { enum pm_qos_flags_status stat; - stat = dev_pm_qos_flags(pdd->dev, - PM_QOS_FLAG_NO_POWER_OFF - | PM_QOS_FLAG_REMOTE_WAKEUP); + stat = dev_pm_qos_flags(pdd->dev, PM_QOS_FLAG_NO_POWER_OFF); if (stat > PM_QOS_FLAGS_NONE) return -EBUSY; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 156ab57bca77..29bf28fef136 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -309,33 +309,6 @@ static ssize_t pm_qos_no_power_off_store(struct device *dev, static DEVICE_ATTR(pm_qos_no_power_off, 0644, pm_qos_no_power_off_show, pm_qos_no_power_off_store); -static ssize_t pm_qos_remote_wakeup_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", !!(dev_pm_qos_requested_flags(dev) - & PM_QOS_FLAG_REMOTE_WAKEUP)); -} - -static ssize_t pm_qos_remote_wakeup_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t n) -{ - int ret; - - if (kstrtoint(buf, 0, &ret)) - return -EINVAL; - - if (ret != 0 && ret != 1) - return -EINVAL; - - ret = dev_pm_qos_update_flags(dev, PM_QOS_FLAG_REMOTE_WAKEUP, ret); - return ret < 0 ? ret : n; -} - -static DEVICE_ATTR(pm_qos_remote_wakeup, 0644, - pm_qos_remote_wakeup_show, pm_qos_remote_wakeup_store); - #ifdef CONFIG_PM_SLEEP static const char _enabled[] = "enabled"; static const char _disabled[] = "disabled"; @@ -671,7 +644,6 @@ static const struct attribute_group pm_qos_latency_tolerance_attr_group = { static struct attribute *pm_qos_flags_attrs[] = { &dev_attr_pm_qos_no_power_off.attr, - &dev_attr_pm_qos_remote_wakeup.attr, NULL, }; static const struct attribute_group pm_qos_flags_attr_group = { diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 032b55909145..51f0d7e0b15f 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -39,7 +39,6 @@ enum pm_qos_flags_status { #define PM_QOS_LATENCY_ANY ((s32)(~(__u32)0 >> 1)) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) -#define PM_QOS_FLAG_REMOTE_WAKEUP (1 << 1) struct pm_qos_request { struct plist_node node; -- cgit v1.2.3 From 086c321ec57bfda5b15f3553e7def33302955852 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Tue, 10 Oct 2017 14:38:07 +0200 Subject: mtd: nand: omap2: Remove omap_nand_platform_data As driver is now configured using DT, omap_nand_platform_data structure is no longer needed. 
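For reference, a minimal and purely illustrative device tree fragment of the kind the driver now depends on exclusively; node layout and property names are assumed from the GPMC NAND binding, and all values are placeholders:

    nand@0,0 {
            compatible = "ti,omap2-nand";
            reg = <0 0 4>;          /* GPMC CS0, 4-byte I/O window */
            nand-bus-width = <16>;
            ti,nand-ecc-opt = "bch8";
            ti,elm-id = <&elm>;     /* error location module, for BCH */
    };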
Signed-off-by: Ladislav Michl Acked-by: Roger Quadros Signed-off-by: Boris Brezillon --- drivers/mtd/nand/omap2.c | 37 ++++++---------------------- include/linux/platform_data/mtd-nand-omap2.h | 17 ------------- 2 files changed, 8 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c index 54540c8fa1a2..01368a8f9e3f 100644 --- a/drivers/mtd/nand/omap2.c +++ b/drivers/mtd/nand/omap2.c @@ -1588,8 +1588,7 @@ static bool is_elm_present(struct omap_nand_info *info, return true; } -static bool omap2_nand_ecc_check(struct omap_nand_info *info, - struct omap_nand_platform_data *pdata) +static bool omap2_nand_ecc_check(struct omap_nand_info *info) { bool ecc_needs_bch, ecc_needs_omap_bch, ecc_needs_elm; @@ -1804,7 +1803,6 @@ static const struct mtd_ooblayout_ops omap_sw_ooblayout_ops = { static int omap_nand_probe(struct platform_device *pdev) { struct omap_nand_info *info; - struct omap_nand_platform_data *pdata = NULL; struct mtd_info *mtd; struct nand_chip *nand_chip; int err; @@ -1821,27 +1819,9 @@ static int omap_nand_probe(struct platform_device *pdev) info->pdev = pdev; - if (dev->of_node) { - if (omap_get_dt_info(dev, info)) - return -EINVAL; - } else { - pdata = dev_get_platdata(&pdev->dev); - if (!pdata) { - dev_err(&pdev->dev, "platform data missing\n"); - return -EINVAL; - } - - info->gpmc_cs = pdata->cs; - info->reg = pdata->reg; - info->ecc_opt = pdata->ecc_opt; - if (pdata->dev_ready) - dev_info(&pdev->dev, "pdata->dev_ready is deprecated\n"); - - info->xfer_type = pdata->xfer_type; - info->devsize = pdata->devsize; - info->elm_of_node = pdata->elm_of_node; - info->flash_bbt = pdata->flash_bbt; - } + err = omap_get_dt_info(dev, info); + if (err) + return err; platform_set_drvdata(pdev, info); info->ops = gpmc_omap_get_nand_ops(&info->reg, info->gpmc_cs); @@ -2002,7 +1982,7 @@ static int omap_nand_probe(struct platform_device *pdev) goto return_error; } - if (!omap2_nand_ecc_check(info, pdata)) { + if (!omap2_nand_ecc_check(info)) { err = -EINVAL; goto return_error; } @@ -2167,10 +2147,9 @@ scan_tail: if (err) goto return_error; - if (dev->of_node) - mtd_device_register(mtd, NULL, 0); - else - mtd_device_register(mtd, pdata->parts, pdata->nr_parts); + err = mtd_device_register(mtd, NULL, 0); + if (err) + goto return_error; platform_set_drvdata(pdev, mtd); diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h index 17d57a18bac5..8b8b124e8aea 100644 --- a/include/linux/platform_data/mtd-nand-omap2.h +++ b/include/linux/platform_data/mtd-nand-omap2.h @@ -66,21 +66,4 @@ struct gpmc_nand_regs { /* Deprecated. Do not use */ void __iomem *gpmc_status; }; - -struct omap_nand_platform_data { - int cs; - struct mtd_partition *parts; - int nr_parts; - bool flash_bbt; - enum nand_io xfer_type; - int devsize; - enum omap_ecc ecc_opt; - - struct device_node *elm_of_node; - - /* deprecated */ - struct gpmc_nand_regs reg; - struct device_node *of_node; - bool dev_ready; -}; #endif -- cgit v1.2.3 From e0005bd93ab5f7a80b08a7bdff9b2ade436f9482 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 10 Oct 2017 18:15:34 +0200 Subject: iio: Drop duplicate forward declaration Commit 5f420b42079c ("staging:iio: Add extended IIO channel info") added a forward declaration for struct iio_dev to but forgot to remove an existing forward declaration further down originating from commit 7ae8cf627558 ("staging: iio: chrdev.h rationalization"). 
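The removal is safe because C permits repeating a forward declaration of the same struct tag; a toy illustration (not taken from the patch):

    /* Both lines are valid C and declare the same incomplete type; the
     * second adds nothing, which is why the older duplicate in iio.h
     * can simply be dropped.
     */
    struct iio_dev;
    struct iio_dev;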
Cc: Lars-Peter Clausen Signed-off-by: Lukas Wunner Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 50cae8504256..20b61347ea58 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -365,7 +365,6 @@ unsigned int iio_get_time_res(const struct iio_dev *indio_dev); #define INDIO_MAX_RAW_ELEMENTS 4 struct iio_trigger; /* forward declaration */ -struct iio_dev; /** * struct iio_info - constant information about device -- cgit v1.2.3 From 7c39afb394c79e72c3795b4a42d55155b34ee073 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Tue, 15 Aug 2017 13:46:04 +0300 Subject: net/mlx5: PTP code migration to driver core section PTP code is moved to core section of mlx5 driver in order to share it between ethernet and infiniband. This movement involves the following changes: - Change mlx5e_ prefix to be mlx5_ - Add clock structs to Core - Add clock object to mlx5_core_dev - Call Init/Uninit clock from core init/cleanup - Rename mlx5e_tstamp to be mlx5_clock Signed-off-by: Feras Daoud Signed-off-by: Eitan Rabin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 39 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 95 +++- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 17 +- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 3 +- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 +- .../net/ethernet/mellanox/mlx5/core/lib/clock.c | 548 +++++++++------------ .../net/ethernet/mellanox/mlx5/core/lib/clock.h | 51 ++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 + .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + include/linux/mlx5/driver.h | 24 + 12 files changed, 416 insertions(+), 382 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index cc13d3dbd366..2059122eb089 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -267,28 +267,6 @@ struct mlx5e_dcbx { }; #endif -#define MAX_PIN_NUM 8 -struct mlx5e_pps { - u8 pin_caps[MAX_PIN_NUM]; - struct work_struct out_work; - u64 start[MAX_PIN_NUM]; - u8 enabled; -}; - -struct mlx5e_tstamp { - rwlock_t lock; - struct cyclecounter cycles; - struct timecounter clock; - struct hwtstamp_config hwtstamp_config; - u32 nominal_c_mult; - unsigned long overflow_period; - struct delayed_work overflow_work; - struct mlx5_core_dev *mdev; - struct ptp_clock *ptp; - struct ptp_clock_info ptp_info; - struct mlx5e_pps pps_info; -}; - enum { MLX5E_RQ_STATE_ENABLED, MLX5E_RQ_STATE_AM, @@ -375,9 +353,10 @@ struct mlx5e_txqsq { u8 min_inline_mode; u16 edge; struct device *pdev; - struct mlx5e_tstamp *tstamp; __be32 mkey_be; unsigned long state; + struct hwtstamp_config *tstamp; + struct mlx5_clock *clock; /* control path */ struct mlx5_wq_ctrl wq_ctrl; @@ -543,10 +522,11 @@ struct mlx5e_rq { struct mlx5e_channel *channel; struct device *pdev; struct net_device *netdev; - struct mlx5e_tstamp *tstamp; struct mlx5e_rq_stats stats; struct mlx5e_cq cq; struct mlx5e_page_cache page_cache; + struct hwtstamp_config *tstamp; + struct mlx5_clock *clock; mlx5e_fp_handle_rx_cqe handle_rx_cqe; mlx5e_fp_post_rx_wqes post_wqes; @@ -588,7 +568,7 @@ struct mlx5e_channel { 
/* control */ struct mlx5e_priv *priv; struct mlx5_core_dev *mdev; - struct mlx5e_tstamp *tstamp; + struct hwtstamp_config *tstamp; int ix; }; @@ -789,7 +769,7 @@ struct mlx5e_priv { struct mlx5_core_dev *mdev; struct net_device *netdev; struct mlx5e_stats stats; - struct mlx5e_tstamp tstamp; + struct hwtstamp_config tstamp; u16 q_counter; #ifdef CONFIG_MLX5_CORE_EN_DCB struct mlx5e_dcbx dcbx; @@ -873,12 +853,6 @@ void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv); void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv); void mlx5e_set_rx_mode_work(struct work_struct *work); -void mlx5e_fill_hwstamp(struct mlx5e_tstamp *clock, u64 timestamp, - struct skb_shared_hwtstamps *hwts); -void mlx5e_timestamp_init(struct mlx5e_priv *priv); -void mlx5e_timestamp_cleanup(struct mlx5e_priv *priv); -void mlx5e_pps_event_handler(struct mlx5e_priv *priv, - struct ptp_clock_event *event); int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr); int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr); int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val); @@ -889,6 +863,7 @@ int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto, u16 vid); void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv); void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv); +void mlx5e_timestamp_set(struct mlx5e_priv *priv); struct mlx5e_redirect_rqt_param { bool is_rss; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d12e9fc0d76b..81a112e40fe3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1417,14 +1417,15 @@ static int mlx5e_set_pauseparam(struct net_device *netdev, int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, struct ethtool_ts_info *info) { + struct mlx5_core_dev *mdev = priv->mdev; int ret; ret = ethtool_op_get_ts_info(priv->netdev, info); if (ret) return ret; - info->phc_index = priv->tstamp.ptp ? - ptp_clock_index(priv->tstamp.ptp) : -1; + info->phc_index = mdev->clock.ptp ? 
+ ptp_clock_index(mdev->clock.ptp) : -1; if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) return 0; @@ -1754,7 +1755,7 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev, if (!MLX5_CAP_GEN(mdev, cqe_compression)) return -EOPNOTSUPP; - if (enable && priv->tstamp.hwtstamp_config.rx_filter != HWTSTAMP_FILTER_NONE) { + if (enable && priv->tstamp.rx_filter != HWTSTAMP_FILTER_NONE) { netdev_err(netdev, "Can't enable cqe compression while timestamping is enabled.\n"); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index cc11bbbd0309..6df00dd9745a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -373,8 +373,6 @@ static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, enum mlx5_dev_event event, unsigned long param) { struct mlx5e_priv *priv = vpriv; - struct ptp_clock_event ptp_event; - struct mlx5_eqe *eqe = NULL; if (!test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state)) return; @@ -384,14 +382,6 @@ static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, case MLX5_DEV_EVENT_PORT_DOWN: queue_work(priv->wq, &priv->update_carrier_work); break; - case MLX5_DEV_EVENT_PPS: - eqe = (struct mlx5_eqe *)param; - ptp_event.index = eqe->data.pps.pin; - ptp_event.timestamp = - timecounter_cyc2time(&priv->tstamp.clock, - be64_to_cpu(eqe->data.pps.time_stamp)); - mlx5e_pps_event_handler(vpriv, &ptp_event); - break; default: break; } @@ -585,6 +575,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->pdev = c->pdev; rq->netdev = c->netdev; rq->tstamp = c->tstamp; + rq->clock = &mdev->clock; rq->channel = c; rq->ix = c->ix; rq->mdev = mdev; @@ -1123,6 +1114,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->pdev = c->pdev; sq->tstamp = c->tstamp; + sq->clock = &mdev->clock; sq->mkey_be = c->mkey_be; sq->channel = c; sq->txq_ix = txq_ix; @@ -2678,6 +2670,12 @@ void mlx5e_switch_priv_channels(struct mlx5e_priv *priv, netif_carrier_on(netdev); } +void mlx5e_timestamp_set(struct mlx5e_priv *priv) +{ + priv->tstamp.tx_type = HWTSTAMP_TX_OFF; + priv->tstamp.rx_filter = HWTSTAMP_FILTER_NONE; +} + int mlx5e_open_locked(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -2693,7 +2691,7 @@ int mlx5e_open_locked(struct net_device *netdev) mlx5e_activate_priv_channels(priv); if (priv->profile->update_carrier) priv->profile->update_carrier(priv); - mlx5e_timestamp_init(priv); + mlx5e_timestamp_set(priv); if (priv->profile->update_stats) queue_delayed_work(priv->wq, &priv->update_stats_work, 0); @@ -2731,7 +2729,6 @@ int mlx5e_close_locked(struct net_device *netdev) clear_bit(MLX5E_STATE_OPENED, &priv->state); - mlx5e_timestamp_cleanup(priv); netif_carrier_off(priv->netdev); mlx5e_deactivate_priv_channels(priv); mlx5e_close_channels(&priv->channels); @@ -3403,6 +3400,80 @@ out: return err; } +int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) +{ + struct hwtstamp_config config; + int err; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + return -EOPNOTSUPP; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + /* TX HW timestamp */ + switch (config.tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + break; + default: + return -ERANGE; + } + + mutex_lock(&priv->state_lock); + /* RX HW timestamp */ + switch (config.rx_filter) { + case HWTSTAMP_FILTER_NONE: + /* Reset CQE compression to Admin default */ + 
mlx5e_modify_rx_cqe_compression_locked(priv, priv->channels.params.rx_cqe_compress_def); + break; + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: + /* Disable CQE compression */ + netdev_warn(priv->netdev, "Disabling cqe compression"); + err = mlx5e_modify_rx_cqe_compression_locked(priv, false); + if (err) { + netdev_err(priv->netdev, "Failed disabling cqe compression err=%d\n", err); + mutex_unlock(&priv->state_lock); + return err; + } + config.rx_filter = HWTSTAMP_FILTER_ALL; + break; + default: + mutex_unlock(&priv->state_lock); + return -ERANGE; + } + + memcpy(&priv->tstamp, &config, sizeof(config)); + mutex_unlock(&priv->state_lock); + + return copy_to_user(ifr->ifr_data, &config, + sizeof(config)) ? -EFAULT : 0; +} + +int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr) +{ + struct hwtstamp_config *cfg = &priv->tstamp; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + return -EOPNOTSUPP; + + return copy_to_user(ifr->ifr_data, cfg, sizeof(*cfg)) ? -EFAULT : 0; +} + static int mlx5e_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct mlx5e_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 15a1687483cc..7e3bfe62ef6e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -42,10 +42,11 @@ #include "en_rep.h" #include "ipoib/ipoib.h" #include "en_accel/ipsec_rxtx.h" +#include "lib/clock.h" -static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp *tstamp) +static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) { - return tstamp->hwtstamp_config.rx_filter == HWTSTAMP_FILTER_ALL; + return config->rx_filter == HWTSTAMP_FILTER_ALL; } static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cqcc, @@ -661,7 +662,6 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, struct sk_buff *skb) { struct net_device *netdev = rq->netdev; - struct mlx5e_tstamp *tstamp = rq->tstamp; int lro_num_seg; lro_num_seg = be32_to_cpu(cqe->srqn) >> 24; @@ -676,8 +676,9 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, rq->stats.lro_bytes += cqe_bcnt; } - if (unlikely(mlx5e_rx_hw_stamp(tstamp))) - mlx5e_fill_hwstamp(tstamp, get_cqe_ts(cqe), skb_hwtstamps(skb)); + if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) + skb_hwtstamps(skb)->hwtstamp = + mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix); @@ -1163,7 +1164,6 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, struct sk_buff *skb) { struct net_device *netdev = rq->netdev; - struct mlx5e_tstamp *tstamp = rq->tstamp; char *pseudo_header; u8 *dgid; u8 g; @@ -1188,8 +1188,9 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, skb->ip_summed = CHECKSUM_COMPLETE; skb->csum = csum_unfold((__force __sum16)cqe->check_sum); - if (unlikely(mlx5e_rx_hw_stamp(tstamp))) - mlx5e_fill_hwstamp(tstamp, get_cqe_ts(cqe), skb_hwtstamps(skb)); + if 
(unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) + skb_hwtstamps(skb)->hwtstamp = + mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 1d6925d4369a..a7c208a1ad83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -35,6 +35,7 @@ #include "en.h" #include "ipoib/ipoib.h" #include "en_accel/ipsec_rxtx.h" +#include "lib/clock.h" #define MLX5E_SQ_NOPS_ROOM MLX5_SEND_WQE_MAX_WQEBBS #define MLX5E_SQ_STOP_ROOM (MLX5_SEND_WQE_MAX_WQEBBS +\ @@ -452,8 +453,9 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) SKBTX_HW_TSTAMP)) { struct skb_shared_hwtstamps hwts = {}; - mlx5e_fill_hwstamp(sq->tstamp, - get_cqe_ts(cqe), &hwts); + hwts.hwtstamp = + mlx5_timecounter_cyc2time(sq->clock, + get_cqe_ts(cqe)); skb_tstamp_tx(skb, &hwts); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index fc606bfd1d6e..60771865c99c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -491,8 +491,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) break; case MLX5_EVENT_TYPE_PPS_EVENT: - if (dev->event) - dev->event(dev, MLX5_DEV_EVENT_PPS, (unsigned long)eqe); + mlx5_pps_event(dev, eqe); break; case MLX5_EVENT_TYPE_FPGA_ERROR: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 145e392ab849..14dfb577691b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -404,7 +404,7 @@ static int mlx5i_open(struct net_device *netdev) mlx5e_refresh_tirs(priv, false); mlx5e_activate_priv_channels(priv); - mlx5e_timestamp_init(priv); + mlx5e_timestamp_set(priv); mutex_unlock(&priv->state_lock); return 0; @@ -429,7 +429,6 @@ static int mlx5i_close(struct net_device *netdev) clear_bit(MLX5E_STATE_OPENED, &priv->state); - mlx5e_timestamp_cleanup(priv); netif_carrier_off(priv->netdev); mlx5e_deactivate_priv_channels(priv); mlx5e_close_channels(&priv->channels); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 84dd63e74041..fa8aed62b231 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -34,250 +34,164 @@ #include "en.h" enum { - MLX5E_CYCLES_SHIFT = 23 + MLX5_CYCLES_SHIFT = 23 }; enum { - MLX5E_PIN_MODE_IN = 0x0, - MLX5E_PIN_MODE_OUT = 0x1, + MLX5_PIN_MODE_IN = 0x0, + MLX5_PIN_MODE_OUT = 0x1, }; enum { - MLX5E_OUT_PATTERN_PULSE = 0x0, - MLX5E_OUT_PATTERN_PERIODIC = 0x1, + MLX5_OUT_PATTERN_PULSE = 0x0, + MLX5_OUT_PATTERN_PERIODIC = 0x1, }; enum { - MLX5E_EVENT_MODE_DISABLE = 0x0, - MLX5E_EVENT_MODE_REPETETIVE = 0x1, - MLX5E_EVENT_MODE_ONCE_TILL_ARM = 0x2, + MLX5_EVENT_MODE_DISABLE = 0x0, + MLX5_EVENT_MODE_REPETETIVE = 0x1, + MLX5_EVENT_MODE_ONCE_TILL_ARM = 0x2, }; enum { - MLX5E_MTPPS_FS_ENABLE = BIT(0x0), - MLX5E_MTPPS_FS_PATTERN = BIT(0x2), - MLX5E_MTPPS_FS_PIN_MODE = BIT(0x3), - MLX5E_MTPPS_FS_TIME_STAMP = BIT(0x4), - MLX5E_MTPPS_FS_OUT_PULSE_DURATION = BIT(0x5), - MLX5E_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), + MLX5_MTPPS_FS_ENABLE = BIT(0x0), + MLX5_MTPPS_FS_PATTERN = BIT(0x2), + MLX5_MTPPS_FS_PIN_MODE = BIT(0x3), + MLX5_MTPPS_FS_TIME_STAMP = BIT(0x4), + MLX5_MTPPS_FS_OUT_PULSE_DURATION = BIT(0x5), + 
MLX5_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), }; -void mlx5e_fill_hwstamp(struct mlx5e_tstamp *tstamp, u64 timestamp, - struct skb_shared_hwtstamps *hwts) +static u64 read_internal_timer(const struct cyclecounter *cc) { - u64 nsec; + struct mlx5_clock *clock = container_of(cc, struct mlx5_clock, cycles); + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, + clock); - read_lock(&tstamp->lock); - nsec = timecounter_cyc2time(&tstamp->clock, timestamp); - read_unlock(&tstamp->lock); - - hwts->hwtstamp = ns_to_ktime(nsec); -} - -static u64 mlx5e_read_internal_timer(const struct cyclecounter *cc) -{ - struct mlx5e_tstamp *tstamp = container_of(cc, struct mlx5e_tstamp, - cycles); - - return mlx5_read_internal_timer(tstamp->mdev) & cc->mask; + return mlx5_read_internal_timer(mdev) & cc->mask; } -static void mlx5e_pps_out(struct work_struct *work) +static void mlx5_pps_out(struct work_struct *work) { - struct mlx5e_pps *pps_info = container_of(work, struct mlx5e_pps, - out_work); - struct mlx5e_tstamp *tstamp = container_of(pps_info, struct mlx5e_tstamp, - pps_info); + struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, + out_work); + struct mlx5_clock *clock = container_of(pps_info, struct mlx5_clock, + pps_info); + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, + clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; unsigned long flags; int i; - for (i = 0; i < tstamp->ptp_info.n_pins; i++) { + for (i = 0; i < clock->ptp_info.n_pins; i++) { u64 tstart; - write_lock_irqsave(&tstamp->lock, flags); - tstart = tstamp->pps_info.start[i]; - tstamp->pps_info.start[i] = 0; - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + tstart = clock->pps_info.start[i]; + clock->pps_info.start[i] = 0; + write_unlock_irqrestore(&clock->lock, flags); if (!tstart) continue; MLX5_SET(mtpps_reg, in, pin, i); MLX5_SET64(mtpps_reg, in, time_stamp, tstart); - MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_TIME_STAMP); - mlx5_set_mtpps(tstamp->mdev, in, sizeof(in)); + MLX5_SET(mtpps_reg, in, field_select, MLX5_MTPPS_FS_TIME_STAMP); + mlx5_set_mtpps(mdev, in, sizeof(in)); } } -static void mlx5e_timestamp_overflow(struct work_struct *work) +static void mlx5_timestamp_overflow(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); - struct mlx5e_tstamp *tstamp = container_of(dwork, struct mlx5e_tstamp, - overflow_work); - struct mlx5e_priv *priv = container_of(tstamp, struct mlx5e_priv, tstamp); + struct mlx5_clock *clock = container_of(dwork, struct mlx5_clock, + overflow_work); unsigned long flags; - write_lock_irqsave(&tstamp->lock, flags); - timecounter_read(&tstamp->clock); - write_unlock_irqrestore(&tstamp->lock, flags); - queue_delayed_work(priv->wq, &tstamp->overflow_work, - msecs_to_jiffies(tstamp->overflow_period * 1000)); -} - -int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) -{ - struct hwtstamp_config config; - int err; - - if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) - return -EOPNOTSUPP; - - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; - - /* TX HW timestamp */ - switch (config.tx_type) { - case HWTSTAMP_TX_OFF: - case HWTSTAMP_TX_ON: - break; - default: - return -ERANGE; - } - - mutex_lock(&priv->state_lock); - /* RX HW timestamp */ - switch (config.rx_filter) { - case HWTSTAMP_FILTER_NONE: - /* Reset CQE compression to Admin default */ - mlx5e_modify_rx_cqe_compression_locked(priv, priv->channels.params.rx_cqe_compress_def); - break; - 
case HWTSTAMP_FILTER_ALL: - case HWTSTAMP_FILTER_SOME: - case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: - case HWTSTAMP_FILTER_PTP_V2_EVENT: - case HWTSTAMP_FILTER_PTP_V2_SYNC: - case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: - case HWTSTAMP_FILTER_NTP_ALL: - /* Disable CQE compression */ - netdev_warn(priv->netdev, "Disabling cqe compression"); - err = mlx5e_modify_rx_cqe_compression_locked(priv, false); - if (err) { - netdev_err(priv->netdev, "Failed disabling cqe compression err=%d\n", err); - mutex_unlock(&priv->state_lock); - return err; - } - config.rx_filter = HWTSTAMP_FILTER_ALL; - break; - default: - mutex_unlock(&priv->state_lock); - return -ERANGE; - } - - memcpy(&priv->tstamp.hwtstamp_config, &config, sizeof(config)); - mutex_unlock(&priv->state_lock); - - return copy_to_user(ifr->ifr_data, &config, - sizeof(config)) ? -EFAULT : 0; -} - -int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr) -{ - struct hwtstamp_config *cfg = &priv->tstamp.hwtstamp_config; - - if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) - return -EOPNOTSUPP; - - return copy_to_user(ifr->ifr_data, cfg, sizeof(*cfg)) ? -EFAULT : 0; + write_lock_irqsave(&clock->lock, flags); + timecounter_read(&clock->tc); + write_unlock_irqrestore(&clock->lock, flags); + schedule_delayed_work(&clock->overflow_work, clock->overflow_period); } -static int mlx5e_ptp_settime(struct ptp_clock_info *ptp, - const struct timespec64 *ts) +static int mlx5_ptp_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) { - struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, - ptp_info); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); u64 ns = timespec64_to_ns(ts); unsigned long flags; - write_lock_irqsave(&tstamp->lock, flags); - timecounter_init(&tstamp->clock, &tstamp->cycles, ns); - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + timecounter_init(&clock->tc, &clock->cycles, ns); + write_unlock_irqrestore(&clock->lock, flags); return 0; } -static int mlx5e_ptp_gettime(struct ptp_clock_info *ptp, - struct timespec64 *ts) +static int mlx5_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { - struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, - ptp_info); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); u64 ns; unsigned long flags; - write_lock_irqsave(&tstamp->lock, flags); - ns = timecounter_read(&tstamp->clock); - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + ns = timecounter_read(&clock->tc); + write_unlock_irqrestore(&clock->lock, flags); *ts = ns_to_timespec64(ns); return 0; } -static int mlx5e_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) +static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) { - struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, - ptp_info); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); unsigned long flags; - write_lock_irqsave(&tstamp->lock, flags); - timecounter_adjtime(&tstamp->clock, delta); - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + timecounter_adjtime(&clock->tc, 
delta); + write_unlock_irqrestore(&clock->lock, flags); return 0; } -static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) +static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) { u64 adj; u32 diff; unsigned long flags; int neg_adj = 0; - struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, - ptp_info); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); if (delta < 0) { neg_adj = 1; delta = -delta; } - adj = tstamp->nominal_c_mult; + adj = clock->nominal_c_mult; adj *= delta; diff = div_u64(adj, 1000000000ULL); - write_lock_irqsave(&tstamp->lock, flags); - timecounter_read(&tstamp->clock); - tstamp->cycles.mult = neg_adj ? tstamp->nominal_c_mult - diff : - tstamp->nominal_c_mult + diff; - write_unlock_irqrestore(&tstamp->lock, flags); + write_lock_irqsave(&clock->lock, flags); + timecounter_read(&clock->tc); + clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff : + clock->nominal_c_mult + diff; + write_unlock_irqrestore(&clock->lock, flags); return 0; } -static int mlx5e_extts_configure(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, - int on) +static int mlx5_extts_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) { - struct mlx5e_tstamp *tstamp = - container_of(ptp, struct mlx5e_tstamp, ptp_info); - struct mlx5e_priv *priv = - container_of(tstamp, struct mlx5e_priv, tstamp); + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev = + container_of(clock, struct mlx5_core_dev, clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u32 field_select = 0; u8 pin_mode = 0; @@ -285,24 +199,24 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, int pin = -1; int err = 0; - if (!MLX5_PPS_CAP(priv->mdev)) + if (!MLX5_PPS_CAP(mdev)) return -EOPNOTSUPP; - if (rq->extts.index >= tstamp->ptp_info.n_pins) + if (rq->extts.index >= clock->ptp_info.n_pins) return -EINVAL; if (on) { - pin = ptp_find_pin(tstamp->ptp, PTP_PF_EXTTS, rq->extts.index); + pin = ptp_find_pin(clock->ptp, PTP_PF_EXTTS, rq->extts.index); if (pin < 0) return -EBUSY; - pin_mode = MLX5E_PIN_MODE_IN; + pin_mode = MLX5_PIN_MODE_IN; pattern = !!(rq->extts.flags & PTP_FALLING_EDGE); - field_select = MLX5E_MTPPS_FS_PIN_MODE | - MLX5E_MTPPS_FS_PATTERN | - MLX5E_MTPPS_FS_ENABLE; + field_select = MLX5_MTPPS_FS_PIN_MODE | + MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_ENABLE; } else { pin = rq->extts.index; - field_select = MLX5E_MTPPS_FS_ENABLE; + field_select = MLX5_MTPPS_FS_ENABLE; } MLX5_SET(mtpps_reg, in, pin, pin); @@ -311,22 +225,22 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, MLX5_SET(mtpps_reg, in, enable, on); MLX5_SET(mtpps_reg, in, field_select, field_select); - err = mlx5_set_mtpps(priv->mdev, in, sizeof(in)); + err = mlx5_set_mtpps(mdev, in, sizeof(in)); if (err) return err; - return mlx5_set_mtppse(priv->mdev, pin, 0, - MLX5E_EVENT_MODE_REPETETIVE & on); + return mlx5_set_mtppse(mdev, pin, 0, + MLX5_EVENT_MODE_REPETETIVE & on); } -static int mlx5e_perout_configure(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, - int on) +static int mlx5_perout_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) { - struct mlx5e_tstamp *tstamp = - container_of(ptp, struct mlx5e_tstamp, ptp_info); - struct mlx5e_priv *priv = - container_of(tstamp, struct mlx5e_priv, tstamp); + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev = + container_of(clock, struct 
mlx5_core_dev, clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u64 nsec_now, nsec_delta, time_stamp = 0; u64 cycles_now, cycles_delta; @@ -339,20 +253,20 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, int err = 0; s64 ns; - if (!MLX5_PPS_CAP(priv->mdev)) + if (!MLX5_PPS_CAP(mdev)) return -EOPNOTSUPP; - if (rq->perout.index >= tstamp->ptp_info.n_pins) + if (rq->perout.index >= clock->ptp_info.n_pins) return -EINVAL; if (on) { - pin = ptp_find_pin(tstamp->ptp, PTP_PF_PEROUT, + pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index); if (pin < 0) return -EBUSY; - pin_mode = MLX5E_PIN_MODE_OUT; - pattern = MLX5E_OUT_PATTERN_PERIODIC; + pin_mode = MLX5_PIN_MODE_OUT; + pattern = MLX5_OUT_PATTERN_PERIODIC; ts.tv_sec = rq->perout.period.sec; ts.tv_nsec = rq->perout.period.nsec; ns = timespec64_to_ns(&ts); @@ -363,21 +277,21 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, ts.tv_sec = rq->perout.start.sec; ts.tv_nsec = rq->perout.start.nsec; ns = timespec64_to_ns(&ts); - cycles_now = mlx5_read_internal_timer(tstamp->mdev); - write_lock_irqsave(&tstamp->lock, flags); - nsec_now = timecounter_cyc2time(&tstamp->clock, cycles_now); + cycles_now = mlx5_read_internal_timer(mdev); + write_lock_irqsave(&clock->lock, flags); + nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << tstamp->cycles.shift, - tstamp->cycles.mult); - write_unlock_irqrestore(&tstamp->lock, flags); + cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, + clock->cycles.mult); + write_unlock_irqrestore(&clock->lock, flags); time_stamp = cycles_now + cycles_delta; - field_select = MLX5E_MTPPS_FS_PIN_MODE | - MLX5E_MTPPS_FS_PATTERN | - MLX5E_MTPPS_FS_ENABLE | - MLX5E_MTPPS_FS_TIME_STAMP; + field_select = MLX5_MTPPS_FS_PIN_MODE | + MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_ENABLE | + MLX5_MTPPS_FS_TIME_STAMP; } else { pin = rq->perout.index; - field_select = MLX5E_MTPPS_FS_ENABLE; + field_select = MLX5_MTPPS_FS_ENABLE; } MLX5_SET(mtpps_reg, in, pin, pin); @@ -387,233 +301,225 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp); MLX5_SET(mtpps_reg, in, field_select, field_select); - err = mlx5_set_mtpps(priv->mdev, in, sizeof(in)); + err = mlx5_set_mtpps(mdev, in, sizeof(in)); if (err) return err; - return mlx5_set_mtppse(priv->mdev, pin, 0, - MLX5E_EVENT_MODE_REPETETIVE & on); + return mlx5_set_mtppse(mdev, pin, 0, + MLX5_EVENT_MODE_REPETETIVE & on); } -static int mlx5e_pps_configure(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, - int on) +static int mlx5_pps_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) { - struct mlx5e_tstamp *tstamp = - container_of(ptp, struct mlx5e_tstamp, ptp_info); + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); - tstamp->pps_info.enabled = !!on; + clock->pps_info.enabled = !!on; return 0; } -static int mlx5e_ptp_enable(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, - int on) +static int mlx5_ptp_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) { switch (rq->type) { case PTP_CLK_REQ_EXTTS: - return mlx5e_extts_configure(ptp, rq, on); + return mlx5_extts_configure(ptp, rq, on); case PTP_CLK_REQ_PEROUT: - return mlx5e_perout_configure(ptp, rq, on); + return mlx5_perout_configure(ptp, rq, on); case PTP_CLK_REQ_PPS: - return mlx5e_pps_configure(ptp, rq, on); + return mlx5_pps_configure(ptp, rq, on); default: return 
-EOPNOTSUPP; } return 0; } -static int mlx5e_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, - enum ptp_pin_function func, unsigned int chan) +static int mlx5_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, + enum ptp_pin_function func, unsigned int chan) { return (func == PTP_PF_PHYSYNC) ? -EOPNOTSUPP : 0; } -static const struct ptp_clock_info mlx5e_ptp_clock_info = { +static const struct ptp_clock_info mlx5_ptp_clock_info = { .owner = THIS_MODULE, + .name = "mlx5_p2p", .max_adj = 100000000, .n_alarm = 0, .n_ext_ts = 0, .n_per_out = 0, .n_pins = 0, .pps = 0, - .adjfreq = mlx5e_ptp_adjfreq, - .adjtime = mlx5e_ptp_adjtime, - .gettime64 = mlx5e_ptp_gettime, - .settime64 = mlx5e_ptp_settime, + .adjfreq = mlx5_ptp_adjfreq, + .adjtime = mlx5_ptp_adjtime, + .gettime64 = mlx5_ptp_gettime, + .settime64 = mlx5_ptp_settime, .enable = NULL, .verify = NULL, }; -static void mlx5e_timestamp_init_config(struct mlx5e_tstamp *tstamp) -{ - tstamp->hwtstamp_config.tx_type = HWTSTAMP_TX_OFF; - tstamp->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; -} - -static int mlx5e_init_pin_config(struct mlx5e_tstamp *tstamp) +static int mlx5_init_pin_config(struct mlx5_clock *clock) { int i; - tstamp->ptp_info.pin_config = - kzalloc(sizeof(*tstamp->ptp_info.pin_config) * - tstamp->ptp_info.n_pins, GFP_KERNEL); - if (!tstamp->ptp_info.pin_config) + clock->ptp_info.pin_config = + kzalloc(sizeof(*clock->ptp_info.pin_config) * + clock->ptp_info.n_pins, GFP_KERNEL); + if (!clock->ptp_info.pin_config) return -ENOMEM; - tstamp->ptp_info.enable = mlx5e_ptp_enable; - tstamp->ptp_info.verify = mlx5e_ptp_verify; - tstamp->ptp_info.pps = 1; + clock->ptp_info.enable = mlx5_ptp_enable; + clock->ptp_info.verify = mlx5_ptp_verify; + clock->ptp_info.pps = 1; - for (i = 0; i < tstamp->ptp_info.n_pins; i++) { - snprintf(tstamp->ptp_info.pin_config[i].name, - sizeof(tstamp->ptp_info.pin_config[i].name), + for (i = 0; i < clock->ptp_info.n_pins; i++) { + snprintf(clock->ptp_info.pin_config[i].name, + sizeof(clock->ptp_info.pin_config[i].name), "mlx5_pps%d", i); - tstamp->ptp_info.pin_config[i].index = i; - tstamp->ptp_info.pin_config[i].func = PTP_PF_NONE; - tstamp->ptp_info.pin_config[i].chan = i; + clock->ptp_info.pin_config[i].index = i; + clock->ptp_info.pin_config[i].func = PTP_PF_NONE; + clock->ptp_info.pin_config[i].chan = i; } return 0; } -static void mlx5e_get_pps_caps(struct mlx5e_priv *priv, - struct mlx5e_tstamp *tstamp) +static void mlx5_get_pps_caps(struct mlx5_core_dev *mdev) { + struct mlx5_clock *clock = &mdev->clock; u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; - mlx5_query_mtpps(priv->mdev, out, sizeof(out)); - - tstamp->ptp_info.n_pins = MLX5_GET(mtpps_reg, out, - cap_number_of_pps_pins); - tstamp->ptp_info.n_ext_ts = MLX5_GET(mtpps_reg, out, - cap_max_num_of_pps_in_pins); - tstamp->ptp_info.n_per_out = MLX5_GET(mtpps_reg, out, - cap_max_num_of_pps_out_pins); - - tstamp->pps_info.pin_caps[0] = MLX5_GET(mtpps_reg, out, cap_pin_0_mode); - tstamp->pps_info.pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode); - tstamp->pps_info.pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode); - tstamp->pps_info.pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode); - tstamp->pps_info.pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode); - tstamp->pps_info.pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode); - tstamp->pps_info.pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode); - tstamp->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); + mlx5_query_mtpps(mdev, out, sizeof(out)); + + 
clock->ptp_info.n_pins = MLX5_GET(mtpps_reg, out, + cap_number_of_pps_pins); + clock->ptp_info.n_ext_ts = MLX5_GET(mtpps_reg, out, + cap_max_num_of_pps_in_pins); + clock->ptp_info.n_per_out = MLX5_GET(mtpps_reg, out, + cap_max_num_of_pps_out_pins); + + clock->pps_info.pin_caps[0] = MLX5_GET(mtpps_reg, out, cap_pin_0_mode); + clock->pps_info.pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode); + clock->pps_info.pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode); + clock->pps_info.pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode); + clock->pps_info.pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode); + clock->pps_info.pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode); + clock->pps_info.pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode); + clock->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); } -void mlx5e_pps_event_handler(struct mlx5e_priv *priv, - struct ptp_clock_event *event) +void mlx5_pps_event(struct mlx5_core_dev *mdev, + struct mlx5_eqe *eqe) { - struct net_device *netdev = priv->netdev; - struct mlx5e_tstamp *tstamp = &priv->tstamp; + struct mlx5_clock *clock = &mdev->clock; + struct ptp_clock_event ptp_event; struct timespec64 ts; u64 nsec_now, nsec_delta; u64 cycles_now, cycles_delta; - int pin = event->index; + int pin = eqe->data.pps.pin; s64 ns; unsigned long flags; - switch (tstamp->ptp_info.pin_config[pin].func) { + switch (clock->ptp_info.pin_config[pin].func) { case PTP_PF_EXTTS: - if (tstamp->pps_info.enabled) { - event->type = PTP_CLOCK_PPSUSR; - event->pps_times.ts_real = ns_to_timespec64(event->timestamp); + if (clock->pps_info.enabled) { + ptp_event.type = PTP_CLOCK_PPSUSR; + ptp_event.pps_times.ts_real = ns_to_timespec64(eqe->data.pps.time_stamp); } else { - event->type = PTP_CLOCK_EXTTS; + ptp_event.type = PTP_CLOCK_EXTTS; } - ptp_clock_event(tstamp->ptp, event); + ptp_clock_event(clock->ptp, &ptp_event); break; case PTP_PF_PEROUT: - mlx5e_ptp_gettime(&tstamp->ptp_info, &ts); - cycles_now = mlx5_read_internal_timer(tstamp->mdev); + mlx5_ptp_gettime(&clock->ptp_info, &ts); + cycles_now = mlx5_read_internal_timer(mdev); ts.tv_sec += 1; ts.tv_nsec = 0; ns = timespec64_to_ns(&ts); - write_lock_irqsave(&tstamp->lock, flags); - nsec_now = timecounter_cyc2time(&tstamp->clock, cycles_now); + write_lock_irqsave(&clock->lock, flags); + nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << tstamp->cycles.shift, - tstamp->cycles.mult); - tstamp->pps_info.start[pin] = cycles_now + cycles_delta; - queue_work(priv->wq, &tstamp->pps_info.out_work); - write_unlock_irqrestore(&tstamp->lock, flags); + cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, + clock->cycles.mult); + clock->pps_info.start[pin] = cycles_now + cycles_delta; + schedule_work(&clock->pps_info.out_work); + write_unlock_irqrestore(&clock->lock, flags); break; default: - netdev_err(netdev, "%s: Unhandled event\n", __func__); + mlx5_core_err(mdev, " Unhandled event\n"); } } -void mlx5e_timestamp_init(struct mlx5e_priv *priv) +void mlx5_init_clock(struct mlx5_core_dev *mdev) { - struct mlx5e_tstamp *tstamp = &priv->tstamp; + struct mlx5_clock *clock = &mdev->clock; u64 ns; u64 frac = 0; u32 dev_freq; - mlx5e_timestamp_init_config(tstamp); - dev_freq = MLX5_CAP_GEN(priv->mdev, device_frequency_khz); + dev_freq = MLX5_CAP_GEN(mdev, device_frequency_khz); if (!dev_freq) { - mlx5_core_warn(priv->mdev, "invalid device_frequency_khz, aborting HW clock init\n"); + mlx5_core_warn(mdev, "invalid 
device_frequency_khz, aborting HW clock init\n"); return; } - rwlock_init(&tstamp->lock); - tstamp->cycles.read = mlx5e_read_internal_timer; - tstamp->cycles.shift = MLX5E_CYCLES_SHIFT; - tstamp->cycles.mult = clocksource_khz2mult(dev_freq, - tstamp->cycles.shift); - tstamp->nominal_c_mult = tstamp->cycles.mult; - tstamp->cycles.mask = CLOCKSOURCE_MASK(41); - tstamp->mdev = priv->mdev; - - timecounter_init(&tstamp->clock, &tstamp->cycles, + rwlock_init(&clock->lock); + clock->cycles.read = read_internal_timer; + clock->cycles.shift = MLX5_CYCLES_SHIFT; + clock->cycles.mult = clocksource_khz2mult(dev_freq, + clock->cycles.shift); + clock->nominal_c_mult = clock->cycles.mult; + clock->cycles.mask = CLOCKSOURCE_MASK(41); + + timecounter_init(&clock->tc, &clock->cycles, ktime_to_ns(ktime_get_real())); /* Calculate period in seconds to call the overflow watchdog - to make * sure counter is checked at least once every wrap around. */ - ns = cyclecounter_cyc2ns(&tstamp->cycles, tstamp->cycles.mask, + ns = cyclecounter_cyc2ns(&clock->cycles, clock->cycles.mask, frac, &frac); do_div(ns, NSEC_PER_SEC / 2 / HZ); - tstamp->overflow_period = ns; + clock->overflow_period = ns; - INIT_WORK(&tstamp->pps_info.out_work, mlx5e_pps_out); - INIT_DELAYED_WORK(&tstamp->overflow_work, mlx5e_timestamp_overflow); - if (tstamp->overflow_period) - queue_delayed_work(priv->wq, &tstamp->overflow_work, 0); + INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); + INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow); + if (clock->overflow_period) + schedule_delayed_work(&clock->overflow_work, 0); else - mlx5_core_warn(priv->mdev, "invalid overflow period, overflow_work is not scheduled\n"); + mlx5_core_warn(mdev, "invalid overflow period, overflow_work is not scheduled\n"); /* Configure the PHC */ - tstamp->ptp_info = mlx5e_ptp_clock_info; - snprintf(tstamp->ptp_info.name, 16, "mlx5 ptp"); + clock->ptp_info = mlx5_ptp_clock_info; /* Initialize 1PPS data structures */ - if (MLX5_PPS_CAP(priv->mdev)) - mlx5e_get_pps_caps(priv, tstamp); - if (tstamp->ptp_info.n_pins) - mlx5e_init_pin_config(tstamp); - - tstamp->ptp = ptp_clock_register(&tstamp->ptp_info, - &priv->mdev->pdev->dev); - if (IS_ERR(tstamp->ptp)) { - mlx5_core_warn(priv->mdev, "ptp_clock_register failed %ld\n", - PTR_ERR(tstamp->ptp)); - tstamp->ptp = NULL; + if (MLX5_PPS_CAP(mdev)) + mlx5_get_pps_caps(mdev); + if (clock->ptp_info.n_pins) + mlx5_init_pin_config(clock); + + clock->ptp = ptp_clock_register(&clock->ptp_info, + &mdev->pdev->dev); + if (IS_ERR(clock->ptp)) { + mlx5_core_warn(mdev, "ptp_clock_register failed %ld\n", + PTR_ERR(clock->ptp)); + clock->ptp = NULL; } } -void mlx5e_timestamp_cleanup(struct mlx5e_priv *priv) +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) { - struct mlx5e_tstamp *tstamp = &priv->tstamp; + struct mlx5_clock *clock = &mdev->clock; - if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) return; - if (priv->tstamp.ptp) { - ptp_clock_unregister(priv->tstamp.ptp); - priv->tstamp.ptp = NULL; + if (clock->ptp) { + ptp_clock_unregister(clock->ptp); + clock->ptp = NULL; } - cancel_work_sync(&tstamp->pps_info.out_work); - cancel_delayed_work_sync(&tstamp->overflow_work); - kfree(tstamp->ptp_info.pin_config); + cancel_work_sync(&clock->pps_info.out_work); + cancel_delayed_work_sync(&clock->overflow_work); + kfree(clock->ptp_info.pin_config); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h new file mode 
100644 index 000000000000..a8eecedd46c2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __LIB_CLOCK_H__ +#define __LIB_CLOCK_H__ + +void mlx5_init_clock(struct mlx5_core_dev *mdev); +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); + +static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + u64 nsec; + + read_lock(&clock->lock); + nsec = timecounter_cyc2time(&clock->tc, timestamp); + read_unlock(&clock->lock); + + return ns_to_ktime(nsec); +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 0d2c8dcd6eae..ecbe9fad22d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -59,6 +59,7 @@ #include "lib/mlx5.h" #include "fpga/core.h" #include "accel/ipsec.h" +#include "lib/clock.h" MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox Connect-IB, ConnectX-4 core driver"); @@ -889,6 +890,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_reserved_gids(dev); + mlx5_init_clock(dev); + err = mlx5_init_rl_table(dev); if (err) { dev_err(&pdev->dev, "Failed to init rate limiting\n"); @@ -949,6 +952,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_eswitch_cleanup(dev->priv.eswitch); mlx5_mpfs_cleanup(dev); mlx5_cleanup_rl_table(dev); + mlx5_cleanup_clock(dev); mlx5_cleanup_reserved_gids(dev); mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index b7c2900b75f9..8f00de2fe283 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -93,6 +93,7 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); void mlx5_core_page_fault(struct mlx5_core_dev *dev, struct mlx5_pagefault *pfault); +void mlx5_pps_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); void mlx5_port_module_event(struct 
mlx5_core_dev *dev, struct mlx5_eqe *eqe); void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force); void mlx5_disable_device(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 401c8972cc3a..08c77b7e59cb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -49,6 +49,8 @@ #include #include #include +#include +#include enum { MLX5_BOARD_ID_LEN = 64, @@ -760,6 +762,27 @@ struct mlx5_rsvd_gids { struct ida ida; }; +#define MAX_PIN_NUM 8 +struct mlx5_pps { + u8 pin_caps[MAX_PIN_NUM]; + struct work_struct out_work; + u64 start[MAX_PIN_NUM]; + u8 enabled; +}; + +struct mlx5_clock { + rwlock_t lock; + struct cyclecounter cycles; + struct timecounter tc; + struct hwtstamp_config hwtstamp_config; + u32 nominal_c_mult; + unsigned long overflow_period; + struct delayed_work overflow_work; + struct ptp_clock *ptp; + struct ptp_clock_info ptp_info; + struct mlx5_pps pps_info; +}; + struct mlx5_core_dev { struct pci_dev *pdev; /* sync pci state */ @@ -800,6 +823,7 @@ struct mlx5_core_dev { #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap; #endif + struct mlx5_clock clock; }; struct mlx5_db { -- cgit v1.2.3
From ad7532cefd11d11a0814a75fb814c205ee3d9d4c Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 11 Oct 2017 09:35:01 -0700 Subject: iio: hid-sensor-trigger: Don't touch sensors unless user space requests

A user reported that on a Thinkpad Yoga S1, commit f1664eaacec3 ("iio: hid-sensor-trigger: Fix the race with user space powering up sensors") causes the system to resume immediately on suspend (S3 operation). On this system the sensor hub is on USB and is a wakeup device from S3, so if any sensor sends data on motion, the system wakes up. Waking the device on motion can be a legitimate use case, but it needs proper user space support to set the right thresholds.

In fact the above commit didn't cause this regression; any operation that wakes up the sensors would have caused the same issue. If the user reads the raw sensor data, the same problem occurs, with or without this commit. The only difference is that the above commit by default triggers a power up and power down of the sensors as part of runtime PM enable (runtime enable causes a runtime resume callback followed by a runtime_suspend callback), whereas previously the user had to perform some action on the sensors first. On investigation it was observed that the current driver correctly changes the state of all sensors to power off, yet some sensors still send data afterwards, so the only reliable option is to never power up any sensor. The only good workaround is to disable USB as a wakeup device via the sysfs interface, which needs no driver change.

Since some users don't care about sensors, this change brings back the old behaviour for them: as long as they don't trigger any operation that powers up the sensors (like a raw read, or starting the iio-sensor-proxy service), the sensors are never touched. This is done by delaying runtime PM enable until user space performs some operation with the sensors.
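For illustration, a minimal, self-contained userspace sketch of the one-shot latch this change relies on: atomic_add_unless(&v, 1, 1) increments only when the value is not already 1, so exactly one caller wins and performs the enable. All names below are hypothetical stand-ins, not the driver's actual code.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int runtime_pm_enabled;	/* 0 = not yet enabled */

/* Userspace analogue of the kernel's atomic_add_unless(v, a, u):
 * add a to *v unless *v == u; return 1 if the add happened. */
static int add_unless(atomic_int *v, int a, int u)
{
	int old = atomic_load(v);

	while (old != u) {
		if (atomic_compare_exchange_weak(v, &old, old + a))
			return 1;
	}
	return 0;
}

/* Hypothetical stand-in for hid_sensor_power_state(): the first
 * user-space request performs the enable, later requests skip it. */
static void power_state(int on)
{
	if (add_unless(&runtime_pm_enabled, 1, 1))
		printf("pm_runtime_enable() called (first request only)\n");
	printf("power state -> %d\n", on);
}

int main(void)
{
	power_state(1);	/* enables runtime PM lazily */
	power_state(0);	/* latch already set, no re-enable */
	return 0;
}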
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196853 Signed-off-by: Srinivas Pandruvada Signed-off-by: Jonathan Cameron --- drivers/iio/common/hid-sensors/hid-sensor-trigger.c | 12 +++++++++--- include/linux/hid-sensor-hub.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c index 55dfd6288d18..cfb6588565ba 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c @@ -179,6 +179,10 @@ int hid_sensor_power_state(struct hid_sensor_common *st, bool state) int ret; atomic_set(&st->user_requested_state, state); + + if (atomic_add_unless(&st->runtime_pm_enable, 1, 1)) + pm_runtime_enable(&st->pdev->dev); + if (state) ret = pm_runtime_get_sync(&st->pdev->dev); else { @@ -221,7 +225,8 @@ static void hid_sensor_set_power_work(struct work_struct *work) if (attrb->latency_ms > 0) hid_sensor_set_report_latency(attrb, attrb->latency_ms); - _hid_sensor_power_state(attrb, true); + if (atomic_read(&attrb->user_requested_state)) + _hid_sensor_power_state(attrb, true); } static int hid_sensor_data_rdy_trigger_set_state(struct iio_trigger *trig, @@ -232,7 +237,9 @@ static int hid_sensor_data_rdy_trigger_set_state(struct iio_trigger *trig, void hid_sensor_remove_trigger(struct hid_sensor_common *attrb) { - pm_runtime_disable(&attrb->pdev->dev); + if (atomic_read(&attrb->runtime_pm_enable)) + pm_runtime_disable(&attrb->pdev->dev); + pm_runtime_set_suspended(&attrb->pdev->dev); pm_runtime_put_noidle(&attrb->pdev->dev); @@ -282,7 +289,6 @@ int hid_sensor_setup_trigger(struct iio_dev *indio_dev, const char *name, INIT_WORK(&attrb->work, hid_sensor_set_power_work); pm_suspend_ignore_children(&attrb->pdev->dev, true); - pm_runtime_enable(&attrb->pdev->dev); /* Default to 3 seconds, but can be changed from sysfs */ pm_runtime_set_autosuspend_delay(&attrb->pdev->dev, 3000); diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h index fc7aae64dcde..331dc377c275 100644 --- a/include/linux/hid-sensor-hub.h +++ b/include/linux/hid-sensor-hub.h @@ -231,6 +231,7 @@ struct hid_sensor_common { unsigned usage_id; atomic_t data_ready; atomic_t user_requested_state; + atomic_t runtime_pm_enable; int poll_interval; int raw_hystersis; int latency_ms; -- cgit v1.2.3 From 53fd88ab61948f711147204c1c5017c7301979e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 14 Oct 2017 23:00:54 -0400 Subject: make vfs_ustat() static Signed-off-by: Al Viro --- fs/statfs.c | 2 +- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/statfs.c b/fs/statfs.c index fab9b6a3c116..6327edf79e0f 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -216,7 +216,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user return error; } -int vfs_ustat(dev_t dev, struct kstatfs *sbuf) +static int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { struct super_block *s = user_get_super(dev); int err; diff --git a/include/linux/fs.h b/include/linux/fs.h index 339e73742e73..89323e03e648 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2169,7 +2169,6 @@ extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -extern int vfs_ustat(dev_t, struct kstatfs *); 
extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); extern bool our_mnt(struct vfsmount *mnt); -- cgit v1.2.3
From f175f307dd0bd1ca3825d244f9b870ff12981d3c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Oct 2017 00:38:00 -0400 Subject: stubs for mount_bdev() and kill_block_super() in !CONFIG_BLOCK case Signed-off-by: Al Viro --- include/linux/fs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 89323e03e648..31f8b2ea358c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2094,9 +2094,18 @@ struct file_system_type { extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, void *data, void *ns, struct user_namespace *user_ns, int (*fill_super)(struct super_block *, void *, int)); +#ifdef CONFIG_BLOCK extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); +#else +static inline struct dentry *mount_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + return ERR_PTR(-ENODEV); +} +#endif extern struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); @@ -2105,7 +2114,14 @@ extern struct dentry *mount_nodev(struct file_system_type *fs_type, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void generic_shutdown_super(struct super_block *sb); +#ifdef CONFIG_BLOCK void kill_block_super(struct super_block *sb); +#else +static inline void kill_block_super(struct super_block *sb) +{ + BUG(); +} +#endif void kill_anon_super(struct super_block *sb); void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); -- cgit v1.2.3
From e3d4939267925ab66f39123744ffb4bc74a13149 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 28 Sep 2017 14:03:33 +0100 Subject: ACPI/IORT: Improve functions return type/storage class specifier indentation

Some function definitions are indented in a style that is frowned upon, with the return value type/storage class specifier on a separate line. Reindent the function definitions to fix them.
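For reference, a hypothetical before/after pair showing the reindentation (the names here are made up; the real conversions follow in the diff):

/* Frowned upon: storage class/return type on a line of its own */
static inline
struct foo *get_foo(struct bar *b);

/* Preferred: specifiers stay on the definition line; wrap the
 * parameter list instead when the line would get too long */
static inline struct foo *get_foo(
	struct bar *b);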
Signed-off-by: Lorenzo Pieralisi Acked-by: Hanjun Guo Cc: Hanjun Guo Cc: Sudeep Holla --- drivers/acpi/arm64/iort.c | 36 +++++++++++++++++------------------- include/linux/acpi_iort.h | 4 ++-- 2 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 89c42aa71858..5b5630e52281 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -88,8 +88,8 @@ static inline int iort_set_fwnode(struct acpi_iort_node *iort_node, * * Returns: fwnode_handle pointer on success, NULL on failure */ -static inline -struct fwnode_handle *iort_get_fwnode(struct acpi_iort_node *node) +static inline struct fwnode_handle *iort_get_fwnode( + struct acpi_iort_node *node) { struct iort_fwnode *curr; struct fwnode_handle *fwnode = NULL; @@ -306,9 +306,8 @@ static int iort_id_map(struct acpi_iort_id_mapping *map, u8 type, u32 rid_in, return 0; } -static -struct acpi_iort_node *iort_node_get_id(struct acpi_iort_node *node, - u32 *id_out, int index) +static struct acpi_iort_node *iort_node_get_id(struct acpi_iort_node *node, + u32 *id_out, int index) { struct acpi_iort_node *parent; struct acpi_iort_id_mapping *map; @@ -392,10 +391,9 @@ fail_map: return NULL; } -static -struct acpi_iort_node *iort_node_map_platform_id(struct acpi_iort_node *node, - u32 *id_out, u8 type_mask, - int index) +static struct acpi_iort_node *iort_node_map_platform_id( + struct acpi_iort_node *node, u32 *id_out, u8 type_mask, + int index) { struct acpi_iort_node *parent; u32 id; @@ -623,14 +621,14 @@ static inline bool iort_iommu_driver_enabled(u8 type) } #ifdef CONFIG_IOMMU_API -static inline -const struct iommu_ops *iort_fwspec_iommu_ops(struct iommu_fwspec *fwspec) +static inline const struct iommu_ops *iort_fwspec_iommu_ops( + struct iommu_fwspec *fwspec) { return (fwspec && fwspec->ops) ? 
fwspec->ops : NULL; } -static inline -int iort_add_device_replay(const struct iommu_ops *ops, struct device *dev) +static inline int iort_add_device_replay(const struct iommu_ops *ops, + struct device *dev) { int err = 0; @@ -640,11 +638,11 @@ int iort_add_device_replay(const struct iommu_ops *ops, struct device *dev) return err; } #else -static inline -const struct iommu_ops *iort_fwspec_iommu_ops(struct iommu_fwspec *fwspec) +static inline const struct iommu_ops *iort_fwspec_iommu_ops( + struct iommu_fwspec *fwspec) { return NULL; } -static inline -int iort_add_device_replay(const struct iommu_ops *ops, struct device *dev) +static inline int iort_add_device_replay(const struct iommu_ops *ops, + struct device *dev) { return 0; } #endif @@ -1077,8 +1075,8 @@ static const struct iort_iommu_config iort_arm_smmu_cfg __initconst = { .iommu_init_resources = arm_smmu_init_resources }; -static __init -const struct iort_iommu_config *iort_get_iommu_cfg(struct acpi_iort_node *node) +static __init const struct iort_iommu_config *iort_get_iommu_cfg( + struct acpi_iort_node *node) { switch (node->type) { case ACPI_IORT_NODE_SMMU_V3: diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 8d3f0bf80379..2f7a29242b87 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -49,8 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ static inline void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size) { } -static inline -const struct iommu_ops *iort_iommu_configure(struct device *dev) +static inline const struct iommu_ops *iort_iommu_configure( + struct device *dev) { return NULL; } #endif -- cgit v1.2.3 From 3c1818275cc65c6d53e0adfde0c989bfe89ab8d7 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 26 Jul 2017 10:14:55 -0400 Subject: NFS: Create NFS_ACCESS_* flags Passing the NFS v4 flags into the v3 code seems weird to me, even if they are defined to the same values. 
This patch adds generic flags to help me feel better. Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 16 ++++++++-------- include/linux/nfs_fs.h | 10 ++++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5ceaeb1f6fb6..80f397dc96bb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2369,15 +2369,15 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) } EXPORT_SYMBOL_GPL(nfs_access_add_cache); -#define NFS_MAY_READ (NFS4_ACCESS_READ) -#define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND | \ - NFS4_ACCESS_DELETE) -#define NFS_FILE_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND) +#define NFS_MAY_READ (NFS_ACCESS_READ) +#define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND | \ + NFS_ACCESS_DELETE) +#define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND) #define NFS_DIR_MAY_WRITE NFS_MAY_WRITE -#define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP) -#define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE) +#define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP) +#define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE) static int nfs_access_calc_mask(u32 access_result, umode_t umode) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a0282ceaa48b..453f491a5fda 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -183,6 +183,16 @@ struct nfs_inode { struct inode vfs_inode; }; +/* + * Access bit flags + */ +#define NFS_ACCESS_READ 0x0001 +#define NFS_ACCESS_LOOKUP 0x0002 +#define NFS_ACCESS_MODIFY 0x0004 +#define NFS_ACCESS_EXTEND 0x0008 +#define NFS_ACCESS_DELETE 0x0010 +#define NFS_ACCESS_EXECUTE 0x0020 + /* * Cache validity bit flags */ -- cgit v1.2.3
From 36689ecd2c065a8879035e5bf1b4a0f4d5b65160 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 29 Sep 2017 20:08:28 -0500 Subject: of: remove struct property.unique_id for FDT

Only Sparc uses unique_id, so remove it for FDT builds and shrink struct property a bit, making the unflattened DT less of a memory hog.

Tested-by: Nicolas Pitre Reviewed-by: Frank Rowand Acked-by: Grant Likely Signed-off-by: Rob Herring --- include/linux/of.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index cfc34117fc92..8f9e96752837 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -38,7 +38,9 @@ struct property { void *value; struct property *next; unsigned long _flags; +#if defined(CONFIG_OF_PROMTREE) unsigned int unique_id; +#endif struct bin_attribute attr; }; -- cgit v1.2.3
From 16bba30eab137aaa37538349c0a7496720e90c66 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 4 Oct 2017 14:30:02 -0500 Subject: of: make struct property _flags field configurable

Only Sparc and CONFIG_OF_DYNAMIC use the struct property._flags field, so make it conditional, shrinking struct property a bit.
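A standalone sketch of the effect. The field subset, the bin_attribute stand-in, and the exact savings are illustrative assumptions, not the kernel layout; building once with and once without -DCONFIG_OF_DYNAMIC shows the per-property shrink.

#include <stdio.h>

struct bin_attribute { void *placeholder; };	/* stand-in for the sysfs type */

struct property {
	char	*name;
	int	length;
	void	*value;
	struct property *next;
#if defined(CONFIG_OF_DYNAMIC) || defined(CONFIG_SPARC)
	unsigned long _flags;
#endif
#if defined(CONFIG_OF_PROMTREE)
	unsigned int unique_id;
#endif
	struct bin_attribute attr;
};

int main(void)
{
	/* On a typical LP64 build, compiling _flags out drops the
	 * struct by 8 bytes per property. */
	printf("sizeof(struct property) = %zu\n", sizeof(struct property));
	return 0;
}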
Tested-by: Nicolas Pitre Reviewed-by: Frank Rowand Acked-by: Grant Likely Signed-off-by: Rob Herring --- include/linux/of.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 8f9e96752837..7eb94b7fbcf3 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -37,7 +37,9 @@ struct property { int length; void *value; struct property *next; +#if defined(CONFIG_OF_DYNAMIC) || defined(CONFIG_SPARC) unsigned long _flags; +#endif #if defined(CONFIG_OF_PROMTREE) unsigned int unique_id; #endif @@ -205,6 +207,7 @@ static inline void of_node_clear_flag(struct device_node *n, unsigned long flag) clear_bit(flag, &n->_flags); } +#if defined(CONFIG_OF_DYNAMIC) || defined(CONFIG_SPARC) static inline int of_property_check_flag(struct property *p, unsigned long flag) { return test_bit(flag, &p->_flags); @@ -219,6 +222,7 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag { clear_bit(flag, &p->_flags); } +#endif extern struct device_node *__of_find_all_nodes(struct device_node *prev); extern struct device_node *of_find_all_nodes(struct device_node *prev); -- cgit v1.2.3 From 0c3c234b95fa7f1dfa19e1456a47ebafc300dd6b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 4 Oct 2017 14:04:01 -0500 Subject: of: wrap accesses to device_node kobject In preparation to make kobject element in struct device_node optional, provide and use a macro to return the kobject pointer. The only user outside the DT core is the driver core. Acked-by: Greg Kroah-Hartman Tested-by: Nicolas Pitre Reviewed-by: Frank Rowand Acked-by: Grant Likely Signed-off-by: Rob Herring --- drivers/base/core.c | 2 +- include/linux/of.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 12ebd055724c..c07b47059538 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1571,7 +1571,7 @@ static int device_add_class_symlinks(struct device *dev) int error; if (of_node) { - error = sysfs_create_link(&dev->kobj, &of_node->kobj,"of_node"); + error = sysfs_create_link(&dev->kobj, of_node_kobj(of_node), "of_node"); if (error) dev_warn(dev, "Error %d creating of_node link\n",error); /* An error here doesn't warrant bringing down the device */ diff --git a/include/linux/of.h b/include/linux/of.h index 7eb94b7fbcf3..2d685e769409 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -111,6 +111,8 @@ static inline void of_node_init(struct device_node *node) node->fwnode.ops = &of_fwnode_ops; } +#define of_node_kobj(n) (&(n)->kobj) + /* true when node is initialized */ static inline int of_node_is_initialized(struct device_node *node) { -- cgit v1.2.3 From b56b5528f5b3c3d47e7c0ca67318c45e980d93f0 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 4 Oct 2017 14:09:40 -0500 Subject: of: make kobject and bin_attribute support configurable Having device_nodes be kobjects is only needed if sysfs or OF_DYNAMIC is enabled. Otherwise, having a kobject in struct device_node is unnecessary bloat in minimal kernel configurations. Likewise, bin_attribute is only needed in struct property when sysfs is enabled, so we can make it configurable too. 
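A minimal sketch of the pattern this enables, assuming a CONFIG_OF_KOBJ-style switch; the types here are stand-ins, not the real definitions. Callers always go through the accessor introduced in the previous commit, so the embedded kobject can be compiled out without touching them.

#include <stddef.h>

struct kobject { int state_initialized; };	/* stand-in */

struct device_node {
#if defined(CONFIG_OF_KOBJ)
	struct kobject kobj;
#endif
	const char *full_name;
};

#if defined(CONFIG_OF_KOBJ)
#define of_node_kobj(n) (&(n)->kobj)
#else
/* No sysfs representation: the accessor still compiles, and the
 * sysfs-only call sites are themselves compiled out. */
#define of_node_kobj(n) ((struct kobject *)NULL)
#endif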
Tested-by: Nicolas Pitre Reviewed-by: Frank Rowand Acked-by: Grant Likely Signed-off-by: Rob Herring --- drivers/of/Kconfig | 4 ++ drivers/of/Makefile | 1 + drivers/of/base.c | 130 -------------------------------------- drivers/of/dynamic.c | 22 ------- drivers/of/kobj.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/of/of_private.h | 23 +++++++ include/linux/of.h | 22 +++---- 7 files changed, 202 insertions(+), 164 deletions(-) create mode 100644 drivers/of/kobj.c (limited to 'include/linux') diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index ba7b034b2b91..ad9a9578f9c4 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -46,10 +46,14 @@ config OF_EARLY_FLATTREE config OF_PROMTREE bool +config OF_KOBJ + def_bool SYSFS + # Hardly any platforms need this. It is safe to select, but only do so if you # need it. config OF_DYNAMIC bool "Support for dynamic device trees" if OF_UNITTEST + select OF_KOBJ help On some platforms, the device tree can be manipulated at runtime. While this option is selected automatically on such platforms, you diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 97dc01c81438..83d61a7c5e82 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -1,4 +1,5 @@ obj-y = base.o device.o platform.o property.o +obj-$(CONFIG_OF_KOBJ) += kobj.o obj-$(CONFIG_OF_DYNAMIC) += dynamic.o obj-$(CONFIG_OF_FLATTREE) += fdt.o obj-$(CONFIG_OF_EARLY_FLATTREE) += fdt_address.o diff --git a/drivers/of/base.c b/drivers/of/base.c index 9e9bd17121fb..b98f3adffbb0 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -95,105 +95,6 @@ int __weak of_node_to_nid(struct device_node *np) } #endif -#ifndef CONFIG_OF_DYNAMIC -static void of_node_release(struct kobject *kobj) -{ - /* Without CONFIG_OF_DYNAMIC, no nodes gets freed */ -} -#endif /* CONFIG_OF_DYNAMIC */ - -struct kobj_type of_node_ktype = { - .release = of_node_release, -}; - -static ssize_t of_node_property_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t offset, size_t count) -{ - struct property *pp = container_of(bin_attr, struct property, attr); - return memory_read_from_buffer(buf, count, &offset, pp->value, pp->length); -} - -/* always return newly allocated name, caller must free after use */ -static const char *safe_name(struct kobject *kobj, const char *orig_name) -{ - const char *name = orig_name; - struct kernfs_node *kn; - int i = 0; - - /* don't be a hero. After 16 tries give up */ - while (i < 16 && (kn = sysfs_get_dirent(kobj->sd, name))) { - sysfs_put(kn); - if (name != orig_name) - kfree(name); - name = kasprintf(GFP_KERNEL, "%s#%i", orig_name, ++i); - } - - if (name == orig_name) { - name = kstrdup(orig_name, GFP_KERNEL); - } else { - pr_warn("Duplicate name in %s, renamed to \"%s\"\n", - kobject_name(kobj), name); - } - return name; -} - -int __of_add_property_sysfs(struct device_node *np, struct property *pp) -{ - int rc; - - /* Important: Don't leak passwords */ - bool secure = strncmp(pp->name, "security-", 9) == 0; - - if (!IS_ENABLED(CONFIG_SYSFS)) - return 0; - - if (!of_kset || !of_node_is_attached(np)) - return 0; - - sysfs_bin_attr_init(&pp->attr); - pp->attr.attr.name = safe_name(&np->kobj, pp->name); - pp->attr.attr.mode = secure ? 0400 : 0444; - pp->attr.size = secure ? 
0 : pp->length; - pp->attr.read = of_node_property_read; - - rc = sysfs_create_bin_file(&np->kobj, &pp->attr); - WARN(rc, "error adding attribute %s to node %pOF\n", pp->name, np); - return rc; -} - -int __of_attach_node_sysfs(struct device_node *np) -{ - const char *name; - struct kobject *parent; - struct property *pp; - int rc; - - if (!of_kset) - return 0; - - np->kobj.kset = of_kset; - if (!np->parent) { - /* Nodes without parents are new top level trees */ - name = safe_name(&of_kset->kobj, "base"); - parent = NULL; - } else { - name = safe_name(&np->parent->kobj, kbasename(np->full_name)); - parent = &np->parent->kobj; - } - if (!name) - return -ENOMEM; - rc = kobject_add(&np->kobj, parent, "%s", name); - kfree(name); - if (rc) - return rc; - - for_each_property_of_node(np, pp) - __of_add_property_sysfs(np, pp); - - return 0; -} - void __init of_core_init(void) { struct device_node *np; @@ -1501,22 +1402,6 @@ int __of_remove_property(struct device_node *np, struct property *prop) return 0; } -void __of_sysfs_remove_bin_file(struct device_node *np, struct property *prop) -{ - sysfs_remove_bin_file(&np->kobj, &prop->attr); - kfree(prop->attr.attr.name); -} - -void __of_remove_property_sysfs(struct device_node *np, struct property *prop) -{ - if (!IS_ENABLED(CONFIG_SYSFS)) - return; - - /* at early boot, bail here and defer setup to of_init() */ - if (of_kset && of_node_is_attached(np)) - __of_sysfs_remove_bin_file(np, prop); -} - /** * of_remove_property - Remove a property from a node. * @@ -1576,21 +1461,6 @@ int __of_update_property(struct device_node *np, struct property *newprop, return 0; } -void __of_update_property_sysfs(struct device_node *np, struct property *newprop, - struct property *oldprop) -{ - if (!IS_ENABLED(CONFIG_SYSFS)) - return; - - /* At early boot, bail out and defer setup to of_init() */ - if (!of_kset) - return; - - if (oldprop) - __of_sysfs_remove_bin_file(np, oldprop); - __of_add_property_sysfs(np, newprop); -} - /* * of_update_property - Update a property in a node, if the property does * not exist, add it. diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index 9d6ba18c529f..39e8cf731764 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -48,28 +48,6 @@ void of_node_put(struct device_node *node) } EXPORT_SYMBOL(of_node_put); -void __of_detach_node_sysfs(struct device_node *np) -{ - struct property *pp; - - if (!IS_ENABLED(CONFIG_SYSFS)) - return; - - BUG_ON(!of_node_is_initialized(np)); - if (!of_kset) - return; - - /* only remove properties if on sysfs */ - if (of_node_is_attached(np)) { - for_each_property_of_node(np, pp) - __of_sysfs_remove_bin_file(np, pp); - kobject_del(&np->kobj); - } - - /* finally remove the kobj_init ref */ - of_node_put(np); -} - static BLOCKING_NOTIFIER_HEAD(of_reconfig_chain); int of_reconfig_notifier_register(struct notifier_block *nb) diff --git a/drivers/of/kobj.c b/drivers/of/kobj.c new file mode 100644 index 000000000000..250fc7bb550f --- /dev/null +++ b/drivers/of/kobj.c @@ -0,0 +1,164 @@ +#include <linux/of.h> +#include <linux/slab.h> + +#include "of_private.h" + +/* true when node is initialized */ +static int of_node_is_initialized(struct device_node *node) +{ + return node && node->kobj.state_initialized; +} + +/* true when node is attached (i.e.
present on sysfs) */ +int of_node_is_attached(struct device_node *node) +{ + return node && node->kobj.state_in_sysfs; +} + + +#ifndef CONFIG_OF_DYNAMIC +static void of_node_release(struct kobject *kobj) +{ + /* Without CONFIG_OF_DYNAMIC, no nodes gets freed */ +} +#endif /* CONFIG_OF_DYNAMIC */ + +struct kobj_type of_node_ktype = { + .release = of_node_release, +}; + +static ssize_t of_node_property_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t offset, size_t count) +{ + struct property *pp = container_of(bin_attr, struct property, attr); + return memory_read_from_buffer(buf, count, &offset, pp->value, pp->length); +} + +/* always return newly allocated name, caller must free after use */ +static const char *safe_name(struct kobject *kobj, const char *orig_name) +{ + const char *name = orig_name; + struct kernfs_node *kn; + int i = 0; + + /* don't be a hero. After 16 tries give up */ + while (i < 16 && (kn = sysfs_get_dirent(kobj->sd, name))) { + sysfs_put(kn); + if (name != orig_name) + kfree(name); + name = kasprintf(GFP_KERNEL, "%s#%i", orig_name, ++i); + } + + if (name == orig_name) { + name = kstrdup(orig_name, GFP_KERNEL); + } else { + pr_warn("Duplicate name in %s, renamed to \"%s\"\n", + kobject_name(kobj), name); + } + return name; +} + +int __of_add_property_sysfs(struct device_node *np, struct property *pp) +{ + int rc; + + /* Important: Don't leak passwords */ + bool secure = strncmp(pp->name, "security-", 9) == 0; + + if (!IS_ENABLED(CONFIG_SYSFS)) + return 0; + + if (!of_kset || !of_node_is_attached(np)) + return 0; + + sysfs_bin_attr_init(&pp->attr); + pp->attr.attr.name = safe_name(&np->kobj, pp->name); + pp->attr.attr.mode = secure ? 0400 : 0444; + pp->attr.size = secure ? 0 : pp->length; + pp->attr.read = of_node_property_read; + + rc = sysfs_create_bin_file(&np->kobj, &pp->attr); + WARN(rc, "error adding attribute %s to node %pOF\n", pp->name, np); + return rc; +} + +void __of_sysfs_remove_bin_file(struct device_node *np, struct property *prop) +{ + if (!IS_ENABLED(CONFIG_SYSFS)) + return; + + sysfs_remove_bin_file(&np->kobj, &prop->attr); + kfree(prop->attr.attr.name); +} + +void __of_remove_property_sysfs(struct device_node *np, struct property *prop) +{ + /* at early boot, bail here and defer setup to of_init() */ + if (of_kset && of_node_is_attached(np)) + __of_sysfs_remove_bin_file(np, prop); +} + +void __of_update_property_sysfs(struct device_node *np, struct property *newprop, + struct property *oldprop) +{ + /* At early boot, bail out and defer setup to of_init() */ + if (!of_kset) + return; + + if (oldprop) + __of_sysfs_remove_bin_file(np, oldprop); + __of_add_property_sysfs(np, newprop); +} + +int __of_attach_node_sysfs(struct device_node *np) +{ + const char *name; + struct kobject *parent; + struct property *pp; + int rc; + + if (!of_kset) + return 0; + + np->kobj.kset = of_kset; + if (!np->parent) { + /* Nodes without parents are new top level trees */ + name = safe_name(&of_kset->kobj, "base"); + parent = NULL; + } else { + name = safe_name(&np->parent->kobj, kbasename(np->full_name)); + parent = &np->parent->kobj; + } + if (!name) + return -ENOMEM; + rc = kobject_add(&np->kobj, parent, "%s", name); + kfree(name); + if (rc) + return rc; + + for_each_property_of_node(np, pp) + __of_add_property_sysfs(np, pp); + + return 0; +} + +void __of_detach_node_sysfs(struct device_node *np) +{ + struct property *pp; + + BUG_ON(!of_node_is_initialized(np)); + if (!of_kset) + return; + + /* only remove properties if 
on sysfs */ + if (of_node_is_attached(np)) { + for_each_property_of_node(np, pp) + __of_sysfs_remove_bin_file(np, pp); + kobject_del(&np->kobj); + } + + /* finally remove the kobj_init ref */ + of_node_put(np); +} + diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index cc86a974f35f..43df14f0cbce 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -49,6 +49,29 @@ static inline int of_property_notify(int action, struct device_node *np, } #endif /* CONFIG_OF_DYNAMIC */ +#if defined(CONFIG_OF_KOBJ) +int of_node_is_attached(struct device_node *node); +int __of_add_property_sysfs(struct device_node *np, struct property *pp); +void __of_remove_property_sysfs(struct device_node *np, struct property *prop); +void __of_update_property_sysfs(struct device_node *np, struct property *newprop, + struct property *oldprop); +int __of_attach_node_sysfs(struct device_node *np); +void __of_detach_node_sysfs(struct device_node *np); +#else +static inline int __of_add_property_sysfs(struct device_node *np, struct property *pp) +{ + return 0; +} +static inline void __of_remove_property_sysfs(struct device_node *np, struct property *prop) {} +static inline void __of_update_property_sysfs(struct device_node *np, + struct property *newprop, struct property *oldprop) {} +static inline int __of_attach_node_sysfs(struct device_node *np) +{ + return 0; +} +static inline void __of_detach_node_sysfs(struct device_node *np) {} +#endif + #if defined(CONFIG_OF_UNITTEST) && defined(CONFIG_OF_OVERLAY) extern void __init unittest_unflatten_overlay_base(void); #else diff --git a/include/linux/of.h b/include/linux/of.h index 2d685e769409..7b0f17be7830 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -43,7 +43,9 @@ struct property { #if defined(CONFIG_OF_PROMTREE) unsigned int unique_id; #endif +#if defined(CONFIG_OF_KOBJ) struct bin_attribute attr; +#endif }; #if defined(CONFIG_SPARC) @@ -62,7 +64,9 @@ struct device_node { struct device_node *parent; struct device_node *child; struct device_node *sibling; +#if defined(CONFIG_OF_KOBJ) struct kobject kobj; +#endif unsigned long _flags; void *data; #if defined(CONFIG_SPARC) @@ -107,23 +111,17 @@ extern struct kobj_type of_node_ktype; extern const struct fwnode_operations of_fwnode_ops; static inline void of_node_init(struct device_node *node) { +#if defined(CONFIG_OF_KOBJ) kobject_init(&node->kobj, &of_node_ktype); +#endif node->fwnode.ops = &of_fwnode_ops; } +#if defined(CONFIG_OF_KOBJ) #define of_node_kobj(n) (&(n)->kobj) - -/* true when node is initialized */ -static inline int of_node_is_initialized(struct device_node *node) -{ - return node && node->kobj.state_initialized; -} - -/* true when node is attached (i.e. present on sysfs) */ -static inline int of_node_is_attached(struct device_node *node) -{ - return node && node->kobj.state_in_sysfs; -} +#else +#define of_node_kobj(n) NULL +#endif #ifdef CONFIG_OF_DYNAMIC extern struct device_node *of_node_get(struct device_node *node); -- cgit v1.2.3 From 62518c02f75ff9b19c07b53b8e13ed878211b795 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Mon, 16 Oct 2017 18:04:22 +0200 Subject: irqchip/irq-omap-intc: Remove omap3_init_irq() All mach-omap2 variants are device tree only now, so this function is dead code. Remove it. 
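With the board files gone, the controller is initialized exclusively through its device tree match; a sketch of the IRQCHIP_DECLARE() path that remains (hypothetical names, shown only to contrast with the removed hardcoded-base-address hook):

	#include <linux/init.h>
	#include <linux/irqchip.h>
	#include <linux/of.h>

	/* Sketch: base address and IRQ count now come from the DT node,
	 * so the OMAP24XX/OMAP34XX base #defines are no longer needed. */
	static int __init example_intc_of_init(struct device_node *node,
					       struct device_node *parent)
	{
		/* ioremap the "reg" resource, register the irq domain, ... */
		return 0;
	}
	IRQCHIP_DECLARE(example_omap_intc, "ti,omap3-intc", example_intc_of_init);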
Signed-off-by: Ladislav Michl Signed-off-by: Thomas Gleixner Acked-by: Tony Lindgren Cc: Marc Zyngier Cc: linux-omap@vger.kernel.org Cc: Jason Cooper Link: https://lkml.kernel.org/r/20171016160422.uu2i7vvrgy7cc4aw@lenoch --- drivers/irqchip/irq-omap-intc.c | 12 ------------ include/linux/irqchip/irq-omap-intc.h | 2 -- 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-omap-intc.c b/drivers/irqchip/irq-omap-intc.c index b04a8ac6e744..05f7f06dd711 100644 --- a/drivers/irqchip/irq-omap-intc.c +++ b/drivers/irqchip/irq-omap-intc.c @@ -25,10 +25,6 @@ #include <linux/irqchip/irq-omap-intc.h> -/* Define these here for now until we drop all board-files */ -#define OMAP24XX_IC_BASE 0x480fe000 -#define OMAP34XX_IC_BASE 0x48200000 - /* selected INTC register offsets */ #define INTC_REVISION 0x0000 @@ -364,14 +360,6 @@ omap_intc_handle_irq(struct pt_regs *regs) handle_domain_irq(domain, irqnr, regs); } -void __init omap3_init_irq(void) -{ - omap_nr_irqs = 96; - omap_nr_pending = 3; - omap_init_irq(OMAP34XX_IC_BASE, NULL); - set_handle_irq(omap_intc_handle_irq); -} - static int __init intc_of_init(struct device_node *node, struct device_node *parent) { diff --git a/include/linux/irqchip/irq-omap-intc.h b/include/linux/irqchip/irq-omap-intc.h index 2e3d1afeb674..f19ccee7749f 100644 --- a/include/linux/irqchip/irq-omap-intc.h +++ b/include/linux/irqchip/irq-omap-intc.h @@ -18,8 +18,6 @@ #ifndef __INCLUDE_LINUX_IRQCHIP_IRQ_OMAP_INTC_H #define __INCLUDE_LINUX_IRQCHIP_IRQ_OMAP_INTC_H -void omap3_init_irq(void); - int omap_irq_pending(void); void omap_intc_save_context(void); void omap_intc_restore_context(void); -- cgit v1.2.3 From 8fd0fbbe8888f295eb34172a7e47bf7d3a0a4687 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:29 +0200 Subject: perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function") Revert commit: 75e8387685f6 ("perf/ftrace: Fix double traces of perf on ftrace:function") The reason I instantly stumbled on that patch is that it only addresses the ftrace situation and doesn't mention the other _5_ places that use this interface. It doesn't explain why those don't have the problem and, if they don't, why their solution doesn't work for ftrace. It doesn't; the reverted patch was just putting more duct tape on.
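Concretely, the revert drops the struct perf_event * parameter again; the restored declaration in include/linux/perf_event.h (visible in full in the diff below) is:

	void perf_tp_event(u16 event_type, u64 count, void *record,
			   int entry_size, struct pt_regs *regs,
			   struct hlist_head *head, int rctx,
			   struct task_struct *task);

so every caller goes back to dispatching over the per-cpu hlist alone.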
Link: http://lkml.kernel.org/r/20171011080224.200565770@infradead.org Cc: Zhou Chengming Cc: Jiri Olsa Cc: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- include/linux/perf_event.h | 2 +- include/linux/trace_events.h | 4 ++-- kernel/events/core.c | 13 ++++--------- kernel/trace/trace_event_perf.c | 4 +--- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 7 files changed, 13 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8e22f24ded6a..569d1b54e201 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1184,7 +1184,7 @@ extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task, struct perf_event *event); + struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2e0f22298fe9..a6349b76fd39 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -507,9 +507,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, u64 count, struct pt_regs *regs, void *head, - struct task_struct *task, struct perf_event *event) + struct task_struct *task) { - perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event); + perf_tp_event(type, count, raw_data, size, regs, head, rctx, task); } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..b8db80c5513b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,15 +7954,16 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task, NULL); + rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task, struct perf_event *event) + struct task_struct *task) { struct perf_sample_data data; + struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7976,15 +7977,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - /* Use the given event instead of the hlist */ - if (event) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); - } else { - hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } } /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 13ba2d3f6a91..562fa69df5d3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -306,7 +306,6 @@ static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { - struct perf_event *event; struct ftrace_entry *entry; struct hlist_head *head; struct pt_regs regs; @@ -330,9 +329,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - 
event = container_of(ops, struct perf_event, ftrace_ops); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, &regs, head, NULL, event); + 1, &regs, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index af6134f2e597..996902a526d4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..934b0da72679 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -622,7 +622,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -716,7 +716,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) } perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, - 1, regs, head, NULL, NULL); + 1, regs, head, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b34965e62fb9..402120ba4594 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, } perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); out: preempt_enable(); } -- cgit v1.2.3 From 466c81c45b650deca213fda3d0ec4761667379a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Oct 2017 17:15:47 +0200 Subject: perf/ftrace: Fix function trace events The function-trace <-> perf interface is a tad messed up. Where all the other trace <-> perf interfaces use a single trace hook registration and use per-cpu RCU based hlist to iterate the events, function-trace actually needs multiple hook registrations in order to minimize function entry patching when filters are present. The end result is that we iterate events both on the trace hook and on the hlist, which results in reporting events multiple times. Since function-trace cannot use the regular scheme, fix it the other way around, use singleton hlists.
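The trick works because hlist iteration only ever follows the ->first and ->next pointers, so a one-element list can be fabricated on the stack around a node whose ->next is NULL; a self-contained sketch of the idea (plain C with hand-rolled types, not kernel code):

	#include <stddef.h>
	#include <stdio.h>

	struct hlist_node { struct hlist_node *next, **pprev; };
	struct hlist_head { struct hlist_node *first; };
	struct event { int id; struct hlist_node hlist_entry; };

	int main(void)
	{
		struct event ev = { .id = 42, .hlist_entry = { NULL, NULL } };
		/* Singleton list: point head at the one node; no list_add,
		 * and iteration visits exactly one element. */
		struct hlist_head head = { .first = &ev.hlist_entry };
		struct hlist_node *pos;

		for (pos = head.first; pos; pos = pos->next) {
			struct event *e = (struct event *)((char *)pos -
					offsetof(struct event, hlist_entry));
			printf("dispatch event %d\n", e->id);
		}
		return 0;
	}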
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 5 +++ kernel/trace/trace_event_perf.c | 80 +++++++++++++++++++++++++---------------- 2 files changed, 54 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a6349b76fd39..ca4e67e466a7 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -173,6 +173,11 @@ enum trace_reg { TRACE_REG_PERF_UNREGISTER, TRACE_REG_PERF_OPEN, TRACE_REG_PERF_CLOSE, + /* + * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a + * custom action was taken and the default action is not to be + * performed. + */ TRACE_REG_PERF_ADD, TRACE_REG_PERF_DEL, #endif diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 562fa69df5d3..e73f9ab15939 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - struct hlist_head __percpu *pcpu_list; - struct hlist_head *list; - - pcpu_list = tp_event->perf_events; - if (WARN_ON_ONCE(!pcpu_list)) - return -EINVAL; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; - list = this_cpu_ptr(pcpu_list); - hlist_add_head_rcu(&p_event->hlist_entry, list); + /* + * If TRACE_REG_PERF_ADD returns false; no custom action was performed + * and we need to take the default action of enqueueing our event on + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) { + struct hlist_head __percpu *pcpu_list; + struct hlist_head *list; + + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) + return -EINVAL; + + list = this_cpu_ptr(pcpu_list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + } - return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); + return 0; } void perf_trace_del(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - hlist_del_rcu(&p_event->hlist_entry); - tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); + + /* + * If TRACE_REG_PERF_DEL returns false; no custom action was performed + * and we need to take the default action of dequeueing our event from + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event)) + hlist_del_rcu(&p_event->hlist_entry); } void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) @@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { struct ftrace_entry *entry; - struct hlist_head *head; + struct perf_event *event; + struct hlist_head head; struct pt_regs regs; int rctx; - head = this_cpu_ptr(event_function.perf_events); - if (hlist_empty(head)) + if ((unsigned long)ops->private != smp_processor_id()) return; + event = container_of(ops, struct perf_event, ftrace_ops); + + /* + * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all + * the perf code does is hlist_for_each_entry_rcu(), so we can + * get away with simply setting the @head.first pointer in order + * to create a singular list. 
+ */ + head.first = &event->hlist_entry; + #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) @@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, &regs, head, NULL); + 1, &regs, &head, NULL); #undef ENTRY_SIZE } @@ -339,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU; - ops->func = perf_ftrace_function_call; + ops->flags |= FTRACE_OPS_FL_RCU; + ops->func = perf_ftrace_function_call; + ops->private = (void *)(unsigned long)nr_cpu_ids; + return register_ftrace_function(ops); } @@ -352,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event) return ret; } -static void perf_ftrace_function_enable(struct perf_event *event) -{ - ftrace_function_local_enable(&event->ftrace_ops); -} - -static void perf_ftrace_function_disable(struct perf_event *event) -{ - ftrace_function_local_disable(&event->ftrace_ops); -} - int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { + struct perf_event *event = data; + switch (type) { case TRACE_REG_REGISTER: case TRACE_REG_UNREGISTER: @@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call, case TRACE_REG_PERF_CLOSE: return perf_ftrace_function_unregister(data); case TRACE_REG_PERF_ADD: - perf_ftrace_function_enable(data); - return 0; + event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id(); + return 1; case TRACE_REG_PERF_DEL: - perf_ftrace_function_disable(data); - return 0; + event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids; + return 1; } return -EINVAL; -- cgit v1.2.3 From b3a88803ac5b4bda26017b485c8722a8487fefb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:32 +0200 Subject: ftrace: Kill FTRACE_OPS_FL_PER_CPU The one and only user of FTRACE_OPS_FL_PER_CPU is gone, remove the lot. Link: http://lkml.kernel.org/r/20171011080224.372422809@infradead.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 83 +++++++++----------------------------------------- kernel/trace/ftrace.c | 55 ++++----------------------------- 2 files changed, 20 insertions(+), 118 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f8545caa691..252e334e7b5f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -102,10 +102,6 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * ENABLED - set/unset when ftrace_ops is registered/unregistered * DYNAMIC - set when ftrace_ops is registered to denote dynamically * allocated ftrace_ops which need special care - * PER_CPU - set manualy by ftrace_ops user to denote the ftrace_ops - * could be controlled by following calls: - * ftrace_function_local_enable - * ftrace_function_local_disable * SAVE_REGS - The ftrace_ops wants regs saved at each function called * and passed to the callback. If this flag is set, but the
If this flag is set, but the * architecture does not support passing regs @@ -149,21 +145,20 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_DYNAMIC = 1 << 1, - FTRACE_OPS_FL_PER_CPU = 1 << 2, - FTRACE_OPS_FL_SAVE_REGS = 1 << 3, - FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 4, - FTRACE_OPS_FL_RECURSION_SAFE = 1 << 5, - FTRACE_OPS_FL_STUB = 1 << 6, - FTRACE_OPS_FL_INITIALIZED = 1 << 7, - FTRACE_OPS_FL_DELETED = 1 << 8, - FTRACE_OPS_FL_ADDING = 1 << 9, - FTRACE_OPS_FL_REMOVING = 1 << 10, - FTRACE_OPS_FL_MODIFYING = 1 << 11, - FTRACE_OPS_FL_ALLOC_TRAMP = 1 << 12, - FTRACE_OPS_FL_IPMODIFY = 1 << 13, - FTRACE_OPS_FL_PID = 1 << 14, - FTRACE_OPS_FL_RCU = 1 << 15, - FTRACE_OPS_FL_TRACE_ARRAY = 1 << 16, + FTRACE_OPS_FL_SAVE_REGS = 1 << 2, + FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 3, + FTRACE_OPS_FL_RECURSION_SAFE = 1 << 4, + FTRACE_OPS_FL_STUB = 1 << 5, + FTRACE_OPS_FL_INITIALIZED = 1 << 6, + FTRACE_OPS_FL_DELETED = 1 << 7, + FTRACE_OPS_FL_ADDING = 1 << 8, + FTRACE_OPS_FL_REMOVING = 1 << 9, + FTRACE_OPS_FL_MODIFYING = 1 << 10, + FTRACE_OPS_FL_ALLOC_TRAMP = 1 << 11, + FTRACE_OPS_FL_IPMODIFY = 1 << 12, + FTRACE_OPS_FL_PID = 1 << 13, + FTRACE_OPS_FL_RCU = 1 << 14, + FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -198,7 +193,6 @@ struct ftrace_ops { unsigned long flags; void *private; ftrace_func_t saved_func; - int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_ops_hash local_hash; struct ftrace_ops_hash *func_hash; @@ -230,55 +224,6 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); -/** - * ftrace_function_local_enable - enable ftrace_ops on current cpu - * - * This function enables tracing on current cpu by decreasing - * the per cpu control variable. - * It must be called with preemption disabled and only on ftrace_ops - * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption - * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. - */ -static inline void ftrace_function_local_enable(struct ftrace_ops *ops) -{ - if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) - return; - - (*this_cpu_ptr(ops->disabled))--; -} - -/** - * ftrace_function_local_disable - disable ftrace_ops on current cpu - * - * This function disables tracing on current cpu by increasing - * the per cpu control variable. - * It must be called with preemption disabled and only on ftrace_ops - * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption - * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. - */ -static inline void ftrace_function_local_disable(struct ftrace_ops *ops) -{ - if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) - return; - - (*this_cpu_ptr(ops->disabled))++; -} - -/** - * ftrace_function_local_disabled - returns ftrace_ops disabled value - * on current cpu - * - * This function returns value of ftrace_ops::disabled on current cpu. - * It must be called with preemption disabled and only on ftrace_ops - * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption - * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. 
- */ -static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) -{ - WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)); - return *this_cpu_ptr(ops->disabled); -} - extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct pt_regs *regs); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e0a98225666b..2fd3edaec6de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -203,30 +203,6 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static void per_cpu_ops_disable_all(struct ftrace_ops *ops) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(ops->disabled, cpu) = 1; -} - -static int per_cpu_ops_alloc(struct ftrace_ops *ops) -{ - int __percpu *disabled; - - if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) - return -EINVAL; - - disabled = alloc_percpu(int); - if (!disabled) - return -ENOMEM; - - ops->disabled = disabled; - per_cpu_ops_disable_all(ops); - return 0; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) * If this is a dynamic, RCU, or per CPU ops, or we force list func, * then it needs to call the list anyway. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU | - FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC) + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) || + FTRACE_FORCE_LIST_FUNC) return ftrace_ops_list_func; return ftrace_ops_get_func(ops); @@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_PER_CPU) { - if (per_cpu_ops_alloc(ops)) - return -ENOMEM; - } - add_ftrace_ops(&ftrace_ops_list, ops); /* Always save the function, and reset at unregistering */ @@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } -static void per_cpu_ops_free(struct ftrace_ops *ops) -{ - free_percpu(ops->disabled); -} - static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) goto free_ops; return 0; @@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * The same goes for freeing the per_cpu data of the per_cpu * ops. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) { /* * We need to do a hard force of sched synchronization. * This is because we use preempt_disable() to do RCU, but @@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) free_ops: arch_ftrace_trampoline_free(ops); - - if (ops->flags & FTRACE_OPS_FL_PER_CPU) - per_cpu_ops_free(ops); } return 0; @@ -6355,10 +6318,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * If any of the above fails then the op->func() is not executed. 
*/ if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && - (!(op->flags & FTRACE_OPS_FL_PER_CPU) || - !ftrace_function_local_disabled(op)) && ftrace_ops_test(op, ip, regs)) { - if (FTRACE_WARN_ON(!op->func)) { pr_warn("op=%p %pS\n", op, op); goto out; @@ -6416,10 +6376,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); - if (!(op->flags & FTRACE_OPS_FL_PER_CPU) || - !ftrace_function_local_disabled(op)) { - op->func(ip, parent_ip, op, regs); - } + op->func(ip, parent_ip, op, regs); preempt_enable_notrace(); trace_clear_recursion(bit); @@ -6443,7 +6400,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) * or does per cpu logic, then we need to call the assist handler. */ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || - ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU)) + ops->flags & FTRACE_OPS_FL_RCU) return ftrace_ops_assist_func; return ops->func; -- cgit v1.2.3 From cbe25ce37d6c2623b5ac09128987e98848a54c6c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 14 Oct 2017 17:43:15 +0200 Subject: ACPI / PM: Combine device suspend routines On top of a previous change getting rid of the PM QoS flag PM_QOS_FLAG_REMOTE_WAKEUP, combine two ACPI device suspend routines, acpi_dev_runtime_suspend() and acpi_dev_suspend_late(), into one, acpi_dev_suspend(), to eliminate some code duplication. It also avoids enabling wakeup for devices handled by the ACPI LPSS middle layer on driver removal. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson --- drivers/acpi/acpi_lpss.c | 6 ++--- drivers/acpi/device_pm.c | 61 +++++++++++------------------------------------- include/linux/acpi.h | 3 +-- 3 files changed, 18 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 97b753dd2e6e..8ec19e7c7b61 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -713,7 +713,7 @@ static int acpi_lpss_activate(struct device *dev) static void acpi_lpss_dismiss(struct device *dev) { - acpi_dev_runtime_suspend(dev); + acpi_dev_suspend(dev, false); } #ifdef CONFIG_PM_SLEEP @@ -729,7 +729,7 @@ static int acpi_lpss_suspend_late(struct device *dev) if (pdata->dev_desc->flags & LPSS_SAVE_CTX) acpi_lpss_save_ctx(dev, pdata); - return acpi_dev_suspend_late(dev); + return acpi_dev_suspend(dev, device_may_wakeup(dev)); } static int acpi_lpss_resume_early(struct device *dev) @@ -847,7 +847,7 @@ static int acpi_lpss_runtime_suspend(struct device *dev) if (pdata->dev_desc->flags & LPSS_SAVE_CTX) acpi_lpss_save_ctx(dev, pdata); - ret = acpi_dev_runtime_suspend(dev); + ret = acpi_dev_suspend(dev, true); /* * This call must be last in the sequence, otherwise PMC will return diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index d74000acb658..17e8eb93a76c 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -847,37 +847,39 @@ static int acpi_dev_pm_full_power(struct acpi_device *adev) } /** - * acpi_dev_runtime_suspend - Put device into a low-power state using ACPI. + * acpi_dev_suspend - Put device into a low-power state using ACPI. * @dev: Device to put into a low-power state. + * @wakeup: Whether or not to enable wakeup for the device. * - * Put the given device into a runtime low-power state using the standard ACPI + * Put the given device into a low-power state using the standard ACPI * mechanism. 
Set up remote wakeup if desired, choose the state to put the * device into (this checks if remote wakeup is expected to work too), and set * the power state of the device. */ -int acpi_dev_runtime_suspend(struct device *dev) +int acpi_dev_suspend(struct device *dev, bool wakeup) { struct acpi_device *adev = ACPI_COMPANION(dev); - bool remote_wakeup; + u32 target_state = acpi_target_system_state(); int error; if (!adev) return 0; - remote_wakeup = acpi_device_can_wakeup(adev); - if (remote_wakeup) { - error = acpi_device_wakeup_enable(adev, ACPI_STATE_S0); + if (wakeup && acpi_device_can_wakeup(adev)) { + error = acpi_device_wakeup_enable(adev, target_state); if (error) return -EAGAIN; + } else { + wakeup = false; } - error = acpi_dev_pm_low_power(dev, adev, ACPI_STATE_S0); - if (error && remote_wakeup) + error = acpi_dev_pm_low_power(dev, adev, target_state); + if (error && wakeup) acpi_device_wakeup_disable(adev); return error; } -EXPORT_SYMBOL_GPL(acpi_dev_runtime_suspend); +EXPORT_SYMBOL_GPL(acpi_dev_suspend); /** * acpi_dev_resume - Put device into the full-power state using ACPI. @@ -910,7 +912,7 @@ EXPORT_SYMBOL_GPL(acpi_dev_resume); int acpi_subsys_runtime_suspend(struct device *dev) { int ret = pm_generic_runtime_suspend(dev); - return ret ? ret : acpi_dev_runtime_suspend(dev); + return ret ? ret : acpi_dev_suspend(dev, true); } EXPORT_SYMBOL_GPL(acpi_subsys_runtime_suspend); @@ -929,41 +931,6 @@ int acpi_subsys_runtime_resume(struct device *dev) EXPORT_SYMBOL_GPL(acpi_subsys_runtime_resume); #ifdef CONFIG_PM_SLEEP -/** - * acpi_dev_suspend_late - Put device into a low-power state using ACPI. - * @dev: Device to put into a low-power state. - * - * Put the given device into a low-power state during system transition to a - * sleep state using the standard ACPI mechanism. Set up system wakeup if - * desired, choose the state to put the device into (this checks if system - * wakeup is expected to work too), and set the power state of the device. - */ -int acpi_dev_suspend_late(struct device *dev) -{ - struct acpi_device *adev = ACPI_COMPANION(dev); - u32 target_state; - bool wakeup; - int error; - - if (!adev) - return 0; - - target_state = acpi_target_system_state(); - wakeup = device_may_wakeup(dev) && acpi_device_can_wakeup(adev); - if (wakeup) { - error = acpi_device_wakeup_enable(adev, target_state); - if (error) - return error; - } - - error = acpi_dev_pm_low_power(dev, adev, target_state); - if (error && wakeup) - acpi_device_wakeup_disable(adev); - - return error; -} -EXPORT_SYMBOL_GPL(acpi_dev_suspend_late); - static bool acpi_dev_needs_resume(struct device *dev, struct acpi_device *adev) { u32 sys_target = acpi_target_system_state(); @@ -1046,7 +1013,7 @@ EXPORT_SYMBOL_GPL(acpi_subsys_suspend); int acpi_subsys_suspend_late(struct device *dev) { int ret = pm_generic_suspend_late(dev); - return ret ? ret : acpi_dev_suspend_late(dev); + return ret ? 
ret : acpi_dev_suspend(dev, device_may_wakeup(dev)); } EXPORT_SYMBOL_GPL(acpi_subsys_suspend_late); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2b1738f840ab..0ada2a948b44 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -864,7 +864,7 @@ static inline void arch_reserve_mem_area(acpi_physical_address addr, #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM) -int acpi_dev_runtime_suspend(struct device *dev); +int acpi_dev_suspend(struct device *dev, bool wakeup); int acpi_dev_resume(struct device *dev); int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); @@ -889,7 +889,6 @@ int acpi_subsys_resume_early(struct device *dev); int acpi_subsys_suspend(struct device *dev); int acpi_subsys_freeze(struct device *dev); #else -static inline int acpi_dev_suspend_late(struct device *dev) { return 0; } static inline int acpi_dev_resume_early(struct device *dev) { return 0; } static inline int acpi_subsys_prepare(struct device *dev) { return 0; } static inline void acpi_subsys_complete(struct device *dev) {} -- cgit v1.2.3 From 023b7b07ccb57317bee4d971be546cc4469f4e7e Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 31 Aug 2017 11:30:45 +0530 Subject: thermal: Remove const to make the prototypes match The prototype of thermal_zone_device_register() does not match its static inline stub for the !CONFIG_THERMAL case: the real declaration takes a non-const struct thermal_zone_params pointer, while the stub took a const one. Drop the const from the stub so the two declarations agree. Signed-off-by: Arvind Yadav Signed-off-by: Zhang Rui --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index fd5b959c753c..8c5302374eaa 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -488,7 +488,7 @@ static inline int power_actor_set_power(struct thermal_cooling_device *cdev, static inline struct thermal_zone_device *thermal_zone_device_register( const char *type, int trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, - const struct thermal_zone_params *tzp, + struct thermal_zone_params *tzp, int passive_delay, int polling_delay) { return ERR_PTR(-ENODEV); } static inline void thermal_zone_device_unregister( -- cgit v1.2.3 From 0d96a4f6a03797b73bee465cada39133b7972e8d Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Fri, 6 Oct 2017 13:45:53 +0200 Subject: memory: omap-gpmc: Drop gpmc_status This field is no longer used, drop it.
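Since gpmc_update_nand_reg() only ever set the member to NULL, any driver still reading it was exercising dead code; a sketch of the misuse the field invited (hypothetical driver snippet, header paths assumed), which after this patch fails to build instead of silently reading nothing:

	#include <linux/io.h>
	#include <linux/omap-gpmc.h>
	#include <linux/platform_data/mtd-nand-omap2.h>

	static u32 example_read_status(void)
	{
		struct gpmc_nand_regs regs;

		gpmc_update_nand_reg(&regs, 0);
		if (!regs.gpmc_status)		/* always true before this patch */
			return 0;
		return readl(regs.gpmc_status);	/* never reached */
	}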
Signed-off-by: Ladislav Michl Signed-off-by: Roger Quadros --- drivers/memory/omap-gpmc.c | 1 - include/linux/platform_data/mtd-nand-omap2.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c index 7059bbda2fac..f2aef0b87bc6 100644 --- a/drivers/memory/omap-gpmc.c +++ b/drivers/memory/omap-gpmc.c @@ -1079,7 +1079,6 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs) { int i; - reg->gpmc_status = NULL; /* deprecated */ reg->gpmc_nand_command = gpmc_base + GPMC_CS0_OFFSET + GPMC_CS_NAND_COMMAND + GPMC_CS_SIZE * cs; reg->gpmc_nand_address = gpmc_base + GPMC_CS0_OFFSET + diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h index 17d57a18bac5..25e267f1970c 100644 --- a/include/linux/platform_data/mtd-nand-omap2.h +++ b/include/linux/platform_data/mtd-nand-omap2.h @@ -63,8 +63,6 @@ struct gpmc_nand_regs { void __iomem *gpmc_bch_result4[GPMC_BCH_NUM_REMAINDER]; void __iomem *gpmc_bch_result5[GPMC_BCH_NUM_REMAINDER]; void __iomem *gpmc_bch_result6[GPMC_BCH_NUM_REMAINDER]; - /* Deprecated. Do not use */ - void __iomem *gpmc_status; }; struct omap_nand_platform_data { -- cgit v1.2.3 From 5a6cd6de76ae78b651e7c36eba8b1da465d65f06 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Thu, 5 Oct 2017 14:53:40 -0700 Subject: ethtool: add ethtool_intersect_link_masks This function provides a way to intersect two link masks together to find the common ground between them. For example in i40e, the driver first generates link masks for what is supported by the PHY type. The driver then gets the link masks for what the NVM supports. The resulting intersection between them yields what can truly be supported. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/ethtool.h | 10 ++++++++++ net/core/ethtool.c | 16 ++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4587a4c36923..c77fa3529e15 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -163,6 +163,16 @@ extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); +/** + * ethtool_intersect_link_masks - Given two link masks, AND them together + * @dst: first mask and where result is stored + * @src: second mask to intersect with + * + * Given two link mode masks, AND them together and save the result in dst. + */ +void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src); + void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3228411ada0f..0c406306792a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -403,6 +403,22 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return 0; } +/* Given two link masks, AND them together and save the result in dst. 
*/ +void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src) +{ + unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int idx = 0; + + for (; idx < size; idx++) { + dst->link_modes.supported[idx] &= + src->link_modes.supported[idx]; + dst->link_modes.advertising[idx] &= + src->link_modes.advertising[idx]; + } +} +EXPORT_SYMBOL(ethtool_intersect_link_masks); + void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { -- cgit v1.2.3 From 0290c4ca2536a35e55c53cfb9058465b1f987b17 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Tue, 17 Oct 2017 16:36:23 -0700 Subject: of: overlay: rename identifiers to more reflect what they do This patch is aimed primarily at drivers/of/overlay.c, but those changes also have a small impact in a few other files. overlay.c is difficult to read and maintain. Improve readability: - Rename functions, types and variables to better reflect what they do and to be consistent with names in other places, such as the device tree overlay FDT (flattened device tree), and make the algorithms more clear - Use the same names consistently throughout the file - Update comments for name changes - Fix incorrect comments This patch is intended to not introduce any functional change. Signed-off-by: Frank Rowand Signed-off-by: Rob Herring --- Documentation/devicetree/overlay-notes.txt | 12 +- drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c | 5 +- drivers/of/dynamic.c | 2 +- drivers/of/overlay.c | 530 ++++++++++++++------------- drivers/of/unittest.c | 20 +- include/linux/of.h | 12 +- 6 files changed, 310 insertions(+), 271 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/overlay-notes.txt b/Documentation/devicetree/overlay-notes.txt index eb7f2685fda1..c4aa0adf13ec 100644 --- a/Documentation/devicetree/overlay-notes.txt +++ b/Documentation/devicetree/overlay-notes.txt @@ -87,15 +87,15 @@ Overlay in-kernel API The API is quite easy to use. -1. Call of_overlay_create() to create and apply an overlay. The return value -is a cookie identifying this overlay. +1. Call of_overlay_apply() to create and apply an overlay changeset. The return +value is an error or a cookie identifying this overlay. -2. Call of_overlay_destroy() to remove and cleanup the overlay previously -created via the call to of_overlay_create(). Removal of an overlay that -is stacked by another will not be permitted. +2. Call of_overlay_remove() to remove and cleanup the overlay changeset +previously created via the call to of_overlay_apply(). Removal of an overlay +changeset that is stacked by another will not be permitted. Finally, if you need to remove all overlays in one-go, just call -of_overlay_destroy_all() which will remove every single one in the correct +of_overlay_remove_all() which will remove every single one in the correct order. 
Overlay DTS Format diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c index 623a9140493c..5f5b7ba35f1d 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c @@ -247,9 +247,10 @@ static void __init tilcdc_convert_slave_node(void) tilcdc_node_disable(slave); - ret = of_overlay_create(overlay); + ret = of_overlay_apply(overlay); if (ret) - pr_err("%s: Creating overlay failed: %d\n", __func__, ret); + pr_err("%s: Applying overlay changeset failed: %d\n", + __func__, ret); else pr_info("%s: ti,tilcdc,slave node successfully converted\n", __func__); diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index 39e8cf731764..4bc96af4d8e2 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -758,7 +758,7 @@ int of_changeset_revert(struct of_changeset *ocs) EXPORT_SYMBOL_GPL(of_changeset_revert); /** - * of_changeset_action - Perform a changeset action + * of_changeset_action - Add an action to the tail of the changeset list * * @ocs: changeset pointer * @action: action to perform diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index d3f4a5974a11..69610637af88 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -25,67 +25,63 @@ #include "of_private.h" /** - * struct of_overlay_info - Holds a single overlay info + * struct fragment - info about fragment nodes in overlay expanded device tree * @target: target of the overlay operation - * @overlay: pointer to the overlay contents node - * - * Holds a single overlay state, including all the overlay logs & - * records. + * @overlay: pointer to the __overlay__ node */ -struct of_overlay_info { +struct fragment { struct device_node *target; struct device_node *overlay; bool is_symbols_node; }; /** - * struct of_overlay - Holds a complete overlay transaction - * @node: List on which we are located - * @count: Count of ovinfo structures - * @ovinfo_tab: Overlay info table (count sized) - * @cset: Changeset to be used - * - * Holds a complete overlay transaction + * struct overlay_changeset + * @ovcs_list: list on which we are located + * @count: count of @fragments structures + * @fragments: info about fragment nodes in overlay expanded device tree + * @cset: changeset to apply fragments to live device tree */ -struct of_overlay { +struct overlay_changeset { int id; - struct list_head node; + struct list_head ovcs_list; int count; - struct of_overlay_info *ovinfo_tab; + struct fragment *fragments; struct of_changeset cset; }; -static int of_overlay_apply_one(struct of_overlay *ov, - struct device_node *target, const struct device_node *overlay, +static int build_changeset_next_level(struct overlay_changeset *ovcs, + struct device_node *target_node, + const struct device_node *overlay_node, bool is_symbols_node); -static BLOCKING_NOTIFIER_HEAD(of_overlay_chain); +static BLOCKING_NOTIFIER_HEAD(overlay_notify_chain); int of_overlay_notifier_register(struct notifier_block *nb) { - return blocking_notifier_chain_register(&of_overlay_chain, nb); + return blocking_notifier_chain_register(&overlay_notify_chain, nb); } EXPORT_SYMBOL_GPL(of_overlay_notifier_register); int of_overlay_notifier_unregister(struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&of_overlay_chain, nb); + return blocking_notifier_chain_unregister(&overlay_notify_chain, nb); } EXPORT_SYMBOL_GPL(of_overlay_notifier_unregister); -static int of_overlay_notify(struct of_overlay *ov, - enum of_overlay_notify_action action) +static int 
overlay_notify(struct overlay_changeset *ovcs, + enum of_overlay_notify_action action) { struct of_overlay_notify_data nd; int i, ret; - for (i = 0; i < ov->count; i++) { - struct of_overlay_info *ovinfo = &ov->ovinfo_tab[i]; + for (i = 0; i < ovcs->count; i++) { + struct fragment *fragment = &ovcs->fragments[i]; - nd.target = ovinfo->target; - nd.overlay = ovinfo->overlay; + nd.target = fragment->target; + nd.overlay = fragment->overlay; - ret = blocking_notifier_call_chain(&of_overlay_chain, + ret = blocking_notifier_call_chain(&overlay_notify_chain, action, &nd); if (ret) return notifier_to_errno(ret); @@ -94,10 +90,10 @@ static int of_overlay_notify(struct of_overlay *ov, return 0; } -static struct property *dup_and_fixup_symbol_prop(struct of_overlay *ov, - const struct property *prop) +static struct property *dup_and_fixup_symbol_prop( + struct overlay_changeset *ovcs, const struct property *prop) { - struct of_overlay_info *ovinfo; + struct fragment *fragment; struct property *new; const char *overlay_name; char *label_path; @@ -116,18 +112,18 @@ static struct property *dup_and_fixup_symbol_prop(struct of_overlay *ov, if (!new) return NULL; - for (k = 0; k < ov->count; k++) { - ovinfo = &ov->ovinfo_tab[k]; - overlay_name = ovinfo->overlay->full_name; + for (k = 0; k < ovcs->count; k++) { + fragment = &ovcs->fragments[k]; + overlay_name = fragment->overlay->full_name; overlay_name_len = strlen(overlay_name); if (!strncasecmp(symbol_path, overlay_name, overlay_name_len)) break; } - if (k >= ov->count) + if (k >= ovcs->count) goto err_free; - target_path = ovinfo->target->full_name; + target_path = fragment->target->full_name; target_path_len = strlen(target_path); label_path = symbol_path + overlay_name_len; @@ -156,82 +152,119 @@ static struct property *dup_and_fixup_symbol_prop(struct of_overlay *ov, } -/* +/** + * add_changeset_property() - add @overlay_prop to overlay changeset + * @ovcs: overlay changeset + * @target_node: where to place @overlay_prop in live tree + * @overlay_prop: property to add or update, from overlay tree + * is_symbols_node: 1 if @target_node is "/__symbols__" + * + * If @overlay_prop does not already exist in @target_node, add changeset entry + * to add @overlay_prop in @target_node, else add changeset entry to update + * value of @overlay_prop. + * * Some special properties are not updated (no error returned). + * * Update of property in symbols node is not allowed. + * + * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if + * invalid @overlay. 
*/ -static int of_overlay_apply_single_property(struct of_overlay *ov, - struct device_node *target, struct property *prop, +static int add_changeset_property(struct overlay_changeset *ovcs, + struct device_node *target_node, + struct property *overlay_prop, bool is_symbols_node) { - struct property *propn = NULL, *tprop; + struct property *new_prop = NULL, *prop; int ret = 0; - tprop = of_find_property(target, prop->name, NULL); + prop = of_find_property(target_node, overlay_prop->name, NULL); - if (!of_prop_cmp(prop->name, "name") || - !of_prop_cmp(prop->name, "phandle") || - !of_prop_cmp(prop->name, "linux,phandle")) + if (!of_prop_cmp(overlay_prop->name, "name") || + !of_prop_cmp(overlay_prop->name, "phandle") || + !of_prop_cmp(overlay_prop->name, "linux,phandle")) return 0; if (is_symbols_node) { - if (tprop) + if (prop) return -EINVAL; - propn = dup_and_fixup_symbol_prop(ov, prop); + new_prop = dup_and_fixup_symbol_prop(ovcs, overlay_prop); } else { - propn = __of_prop_dup(prop, GFP_KERNEL); + new_prop = __of_prop_dup(overlay_prop, GFP_KERNEL); } - if (!propn) + if (!new_prop) return -ENOMEM; - if (!tprop) - ret = of_changeset_add_property(&ov->cset, target, propn); + if (!prop) + ret = of_changeset_add_property(&ovcs->cset, target_node, + new_prop); else - ret = of_changeset_update_property(&ov->cset, target, propn); + ret = of_changeset_update_property(&ovcs->cset, target_node, + new_prop); if (ret) { - kfree(propn->name); - kfree(propn->value); - kfree(propn); + kfree(new_prop->name); + kfree(new_prop->value); + kfree(new_prop); } return ret; } -static int of_overlay_apply_single_device_node(struct of_overlay *ov, - struct device_node *target, struct device_node *child) +/** + * add_changeset_node() - add @node (and children) to overlay changeset + * @ovcs: overlay changeset + * @target_node: where to place @node in live tree + * @node: node from within overlay device tree fragment + * + * If @node does not already exist in @target_node, add changeset entry + * to add @node in @target_node. + * + * If @node already exists in @target_node, and the existing node has + * a phandle, the overlay node is not allowed to have a phandle. + * + * If @node has child nodes, add the children recursively via + * build_changeset_next_level(). + * + * NOTE: Multiple mods of created nodes not supported. + * + * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if + * invalid @overlay. 
+ */ +static int add_changeset_node(struct overlay_changeset *ovcs, + struct device_node *target_node, struct device_node *node) { - const char *cname; + const char *node_kbasename; struct device_node *tchild; int ret = 0; - cname = kbasename(child->full_name); - if (!cname) + node_kbasename = kbasename(node->full_name); + if (!node_kbasename) return -ENOMEM; - for_each_child_of_node(target, tchild) - if (!of_node_cmp(cname, kbasename(tchild->full_name))) + for_each_child_of_node(target_node, tchild) + if (!of_node_cmp(node_kbasename, kbasename(tchild->full_name))) break; if (tchild) { - if (child->phandle) + if (node->phandle) return -EINVAL; - /* apply overlay recursively */ - ret = of_overlay_apply_one(ov, tchild, child, 0); + ret = build_changeset_next_level(ovcs, tchild, node, 0); of_node_put(tchild); } else { - tchild = __of_node_dup(child, "%pOF/%s", target, cname); + tchild = __of_node_dup(node, "%pOF/%s", + target_node, node_kbasename); if (!tchild) return -ENOMEM; - tchild->parent = target; + tchild->parent = target_node; - ret = of_changeset_attach_node(&ov->cset, tchild); + ret = of_changeset_attach_node(&ovcs->cset, tchild); if (ret) return ret; - ret = of_overlay_apply_one(ov, tchild, child, 0); + ret = build_changeset_next_level(ovcs, tchild, node, 0); if (ret) return ret; } @@ -239,29 +272,37 @@ static int of_overlay_apply_single_device_node(struct of_overlay *ov, return ret; } -/* - * Apply a single overlay node recursively. +/** + * build_changeset_next_level() - add level of overlay changeset + * @ovcs: overlay changeset + * @target_node: where to place @overlay_node in live tree + * @overlay_node: node from within an overlay device tree fragment + * @is_symbols_node: @overlay_node is node "/__symbols__" * - * Note that the in case of an error the target node is left - * in a inconsistent state. Error recovery should be performed - * by using the changeset. + * Add the properties (if any) and nodes (if any) from @overlay_node to the + * @ovcs->cset changeset. If an added node has child nodes, they will + * be added recursively. * * Do not allow symbols node to have any children. + * + * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if + * invalid @overlay_node. 
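+ *
+ * Processing order (illustrative): all properties of @overlay_node are
+ * handled first via add_changeset_property(), then each child node is
+ * recursed into via add_changeset_node().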
*/ -static int of_overlay_apply_one(struct of_overlay *ov, - struct device_node *target, const struct device_node *overlay, +static int build_changeset_next_level(struct overlay_changeset *ovcs, + struct device_node *target_node, + const struct device_node *overlay_node, bool is_symbols_node) { struct device_node *child; struct property *prop; int ret; - for_each_property_of_node(overlay, prop) { - ret = of_overlay_apply_single_property(ov, target, prop, - is_symbols_node); + for_each_property_of_node(overlay_node, prop) { + ret = add_changeset_property(ovcs, target_node, prop, + is_symbols_node); if (ret) { pr_err("Failed to apply prop @%pOF/%s\n", - target, prop->name); + target_node, prop->name); return ret; } } @@ -269,11 +310,11 @@ static int of_overlay_apply_one(struct of_overlay *ov, if (is_symbols_node) return 0; - for_each_child_of_node(overlay, child) { - ret = of_overlay_apply_single_device_node(ov, target, child); + for_each_child_of_node(overlay_node, child) { + ret = add_changeset_node(ovcs, target_node, child); if (ret) { - pr_err("Failed to apply single node @%pOF/%s\n", - target, child->name); + pr_err("Failed to apply node @%pOF/%s\n", + target_node, child->name); of_node_put(child); return ret; } @@ -283,26 +324,30 @@ static int of_overlay_apply_one(struct of_overlay *ov, } /** - * of_overlay_apply() - Apply @count overlays pointed at by @ovinfo_tab - * @ov: Overlay to apply + * build_changeset() - populate overlay changeset in @ovcs from @ovcs->fragments + * @ovcs: Overlay changeset * - * Applies the overlays given, while handling all error conditions - * appropriately. Either the operation succeeds, or if it fails the - * live tree is reverted to the state before the attempt. - * Returns 0, or an error if the overlay attempt failed. + * Create changeset @ovcs->cset to contain the nodes and properties of the + * overlay device tree fragments in @ovcs->fragments[]. If an error occurs, + * any portions of the changeset that were successfully created will remain + * in @ovcs->cset. + * + * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if + * invalid overlay in @ovcs->fragments[]. */ -static int of_overlay_apply(struct of_overlay *ov) +static int build_changeset(struct overlay_changeset *ovcs) { - int i, err; + int i, ret; - for (i = 0; i < ov->count; i++) { - struct of_overlay_info *ovinfo = &ov->ovinfo_tab[i]; + for (i = 0; i < ovcs->count; i++) { + struct fragment *fragment = &ovcs->fragments[i]; - err = of_overlay_apply_one(ov, ovinfo->target, ovinfo->overlay, - ovinfo->is_symbols_node); - if (err) { - pr_err("apply failed '%pOF'\n", ovinfo->target); - return err; + ret = build_changeset_next_level(ovcs, fragment->target, + fragment->overlay, + fragment->is_symbols_node); + if (ret) { + pr_err("apply failed '%pOF'\n", fragment->target); + return ret; } } @@ -350,45 +395,46 @@ static struct device_node *find_target_node(struct device_node *info_node) * * Returns 0 on success, or a negative error value. 
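 *
 * A fragment node in the overlay tree (illustrative; label and unit
 * address are placeholders) has the form:
 *
 *     fragment@0 {
 *             target = <&some_label>;
 *             __overlay__ { ... };
 *     };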
*/ -static int of_fill_overlay_info(struct of_overlay *ov, - struct device_node *info_node, struct of_overlay_info *ovinfo) +static int of_fill_overlay_info(struct overlay_changeset *ovcset, + struct device_node *info_node, struct fragment *fragment) { - ovinfo->overlay = of_get_child_by_name(info_node, "__overlay__"); - if (!ovinfo->overlay) + fragment->overlay = of_get_child_by_name(info_node, "__overlay__"); + if (!fragment->overlay) goto err_fail; - ovinfo->target = find_target_node(info_node); - if (!ovinfo->target) + fragment->target = find_target_node(info_node); + if (!fragment->target) goto err_fail; return 0; err_fail: - of_node_put(ovinfo->target); - of_node_put(ovinfo->overlay); + of_node_put(fragment->target); + of_node_put(fragment->overlay); - memset(ovinfo, 0, sizeof(*ovinfo)); + memset(fragment, 0, sizeof(*fragment)); return -EINVAL; } /** - * of_build_overlay_info() - Build an overlay info array - * @ov Overlay to build - * @tree: Device node containing all the overlays + * init_overlay_changeset() - initialize overlay changeset from overlay tree + * @ovcs Overlay changeset to build + * @tree: Contains all the overlay fragments and overlay fixup nodes * - * Helper function that given a tree containing overlay information, - * allocates and builds an overlay info array containing it, ready - * for use using of_overlay_apply. + * Initialize @ovcs. Populate @ovcs->fragments with node information from + * the top level of @tree. The relevant top level nodes are the fragment + * nodes and the __symbols__ node. Any other top level node will be ignored. * - * Returns 0 on success with the @cntp @ovinfop pointers valid, - * while on error a negative error value is returned. + * Returns 0 on success, -ENOMEM if memory allocation failure, -EINVAL if error + * detected in @tree, or -ENODEV if no valid nodes found. 
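+ *
+ * For example (illustrative): a @tree with top level nodes fragment@0,
+ * fragment@1, and __symbols__ yields three entries in @ovcs->fragments[],
+ * assuming each fragment contains an __overlay__ node.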
*/ -static int of_build_overlay_info(struct of_overlay *ov, +static int init_overlay_changeset(struct overlay_changeset *ovcs, struct device_node *tree) { struct device_node *node; - struct of_overlay_info *ovinfo; - int cnt, err; + struct fragment *fragment; + struct fragment *fragments; + int cnt, ret; cnt = 0; for_each_child_of_node(tree, node) @@ -397,24 +443,25 @@ static int of_build_overlay_info(struct of_overlay *ov, if (of_get_child_by_name(tree, "__symbols__")) cnt++; - ovinfo = kcalloc(cnt, sizeof(*ovinfo), GFP_KERNEL); - if (!ovinfo) + fragments = kcalloc(cnt, sizeof(*fragments), GFP_KERNEL); + if (!fragments) return -ENOMEM; cnt = 0; for_each_child_of_node(tree, node) { - err = of_fill_overlay_info(ov, node, &ovinfo[cnt]); - if (!err) + ret = of_fill_overlay_info(ovcs, node, &fragments[cnt]); + if (!ret) cnt++; } node = of_get_child_by_name(tree, "__symbols__"); if (node) { - ovinfo[cnt].overlay = node; - ovinfo[cnt].target = of_find_node_by_path("/__symbols__"); - ovinfo[cnt].is_symbols_node = 1; + fragment = &fragments[cnt]; + fragment->overlay = node; + fragment->target = of_find_node_by_path("/__symbols__"); + fragment->is_symbols_node = 1; - if (!ovinfo[cnt].target) { + if (!fragment->target) { pr_err("no symbols in root of device tree.\n"); return -EINVAL; } @@ -423,137 +470,127 @@ static int of_build_overlay_info(struct of_overlay *ov, } if (!cnt) { - kfree(ovinfo); + kfree(fragments); return -ENODEV; } - ov->count = cnt; - ov->ovinfo_tab = ovinfo; + ovcs->count = cnt; + ovcs->fragments = fragments; return 0; } /** - * of_free_overlay_info() - Free an overlay info array - * @ov Overlay to free the overlay info from - * @ovinfo_tab: Array of overlay_info's to free + * free_overlay_fragments() - Free a fragments array + * @ovcs Overlay to free the overlay info from * - * Releases the memory of a previously allocated ovinfo array - * by of_build_overlay_info. - * Returns 0, or an error if the arguments are bogus. + * Frees the memory of an ovcs->fragments[] array. */ -static int of_free_overlay_info(struct of_overlay *ov) +static void free_overlay_fragments(struct overlay_changeset *ovcs) { - struct of_overlay_info *ovinfo; int i; /* do it in reverse */ - for (i = ov->count - 1; i >= 0; i--) { - ovinfo = &ov->ovinfo_tab[i]; - - of_node_put(ovinfo->target); - of_node_put(ovinfo->overlay); + for (i = ovcs->count - 1; i >= 0; i--) { + of_node_put(ovcs->fragments[i].target); + of_node_put(ovcs->fragments[i].overlay); } - kfree(ov->ovinfo_tab); - return 0; + kfree(ovcs->fragments); } -static LIST_HEAD(ov_list); -static DEFINE_IDR(ov_idr); +static LIST_HEAD(ovcs_list); +static DEFINE_IDR(ovcs_idr); /** - * of_overlay_create() - Create and apply an overlay - * @tree: Device node containing all the overlays + * of_overlay_apply() - Create and apply an overlay changeset + * @tree: Expanded overlay device tree * - * Creates and applies an overlay while also keeping track - * of the overlay in a list. This list can be used to prevent - * illegal overlay removals. + * Creates and applies an overlay changeset. If successful, the overlay + * changeset is added to the overlay changeset list. 
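+ *
+ * Typical use (illustrative):
+ *
+ *     id = of_overlay_apply(overlay_tree);
+ *     if (id < 0)
+ *             return id;
+ *     ...
+ *     of_overlay_remove(id);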
* - * Returns the id of the created overlay, or a negative error number + * Returns the id of the created overlay changeset, or a negative error number */ -int of_overlay_create(struct device_node *tree) +int of_overlay_apply(struct device_node *tree) { - struct of_overlay *ov; - int err, id; + struct overlay_changeset *ovcs; + int id, ret; - ov = kzalloc(sizeof(*ov), GFP_KERNEL); - if (!ov) + ovcs = kzalloc(sizeof(*ovcs), GFP_KERNEL); + if (!ovcs) return -ENOMEM; - ov->id = -1; + ovcs->id = -1; - INIT_LIST_HEAD(&ov->node); + INIT_LIST_HEAD(&ovcs->ovcs_list); - of_changeset_init(&ov->cset); + of_changeset_init(&ovcs->cset); mutex_lock(&of_mutex); - id = idr_alloc(&ov_idr, ov, 0, 0, GFP_KERNEL); + id = idr_alloc(&ovcs_idr, ovcs, 0, 0, GFP_KERNEL); if (id < 0) { - err = id; + ret = id; goto err_destroy_trans; } - ov->id = id; + ovcs->id = id; - err = of_build_overlay_info(ov, tree); - if (err) { - pr_err("of_build_overlay_info() failed for tree@%pOF\n", + ret = init_overlay_changeset(ovcs, tree); + if (ret) { + pr_err("init_overlay_changeset() failed for tree@%pOF\n", tree); goto err_free_idr; } - err = of_overlay_notify(ov, OF_OVERLAY_PRE_APPLY); - if (err < 0) { - pr_err("%s: Pre-apply notifier failed (err=%d)\n", - __func__, err); - goto err_free_idr; + ret = overlay_notify(ovcs, OF_OVERLAY_PRE_APPLY); + if (ret < 0) { + pr_err("%s: Pre-apply notifier failed (ret=%d)\n", + __func__, ret); + goto err_free_overlay_fragments; } - err = of_overlay_apply(ov); - if (err) - goto err_abort_trans; - - err = __of_changeset_apply(&ov->cset); - if (err) - goto err_revert_overlay; + ret = build_changeset(ovcs); + if (ret) + goto err_free_overlay_fragments; + ret = __of_changeset_apply(&ovcs->cset); + if (ret) + goto err_free_overlay_fragments; - list_add_tail(&ov->node, &ov_list); + list_add_tail(&ovcs->ovcs_list, &ovcs_list); - of_overlay_notify(ov, OF_OVERLAY_POST_APPLY); + overlay_notify(ovcs, OF_OVERLAY_POST_APPLY); mutex_unlock(&of_mutex); return id; -err_revert_overlay: -err_abort_trans: - of_free_overlay_info(ov); +err_free_overlay_fragments: + free_overlay_fragments(ovcs); err_free_idr: - idr_remove(&ov_idr, ov->id); + idr_remove(&ovcs_idr, ovcs->id); err_destroy_trans: - of_changeset_destroy(&ov->cset); - kfree(ov); + of_changeset_destroy(&ovcs->cset); + kfree(ovcs); mutex_unlock(&of_mutex); - return err; + return ret; } -EXPORT_SYMBOL_GPL(of_overlay_create); +EXPORT_SYMBOL_GPL(of_overlay_apply); /* - * check whether the given node, lies under the given tree - * return 1 if under tree, else 0 + * Find @np in @tree. + * + * Returns 1 if @np is @tree or is contained in @tree, else 0 */ -static int overlay_subtree_check(struct device_node *tree, - struct device_node *dn) +static int find_node(struct device_node *tree, struct device_node *np) { struct device_node *child; - if (tree == dn) + if (tree == np) return 1; for_each_child_of_node(tree, child) { - if (overlay_subtree_check(child, dn)) { + if (find_node(child, np)) { of_node_put(child); return 1; } @@ -563,30 +600,32 @@ static int overlay_subtree_check(struct device_node *tree, } /* - * check whether this overlay is the topmost - * return 1 if topmost, else 0 + * Is @remove_ce_np a child of or the same as any + * node in an overlay changeset more topmost than @remove_ovcs? 
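+ *
+ * For example (illustrative): if changeset A adds node /soc/foo and a
+ * later changeset B updates a property of /soc/foo, B's changeset
+ * entries reference /soc/foo, so A can not be removed while B remains
+ * applied.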
+ * + * Returns 1 if found, else 0 */ -static int overlay_is_topmost(struct of_overlay *ov, struct device_node *dn) +static int node_in_later_cs(struct overlay_changeset *remove_ovcs, + struct device_node *remove_ce_np) { - struct of_overlay *ovt; + struct overlay_changeset *ovcs; struct of_changeset_entry *ce; - list_for_each_entry_reverse(ovt, &ov_list, node) { - /* if we hit ourselves, we're done */ - if (ovt == ov) + list_for_each_entry_reverse(ovcs, &ovcs_list, ovcs_list) { + if (ovcs == remove_ovcs) break; - /* check against each subtree affected by this overlay */ - list_for_each_entry(ce, &ovt->cset.entries, node) { - if (overlay_subtree_check(ce->np, dn)) { + list_for_each_entry(ce, &ovcs->cset.entries, node) { + if (find_node(ce->np, remove_ce_np)) { pr_err("%s: #%d clashes #%d @%pOF\n", - __func__, ov->id, ovt->id, dn); - return 0; + __func__, remove_ovcs->id, ovcs->id, + remove_ce_np); + return 1; } } } - return 1; + return 0; } /* @@ -599,13 +638,13 @@ static int overlay_is_topmost(struct of_overlay *ov, struct device_node *dn) * the one closest to the tail. If another overlay has affected this * device node and is closest to the tail, then removal is not permited. */ -static int overlay_removal_is_ok(struct of_overlay *ov) +static int overlay_removal_is_ok(struct overlay_changeset *remove_ovcs) { - struct of_changeset_entry *ce; + struct of_changeset_entry *remove_ce; - list_for_each_entry(ce, &ov->cset.entries, node) { - if (!overlay_is_topmost(ov, ce->np)) { - pr_err("overlay #%d is not topmost\n", ov->id); + list_for_each_entry(remove_ce, &remove_ovcs->cset.entries, node) { + if (node_in_later_cs(remove_ovcs, remove_ce->np)) { + pr_err("overlay #%d is not topmost\n", remove_ovcs->id); return 0; } } @@ -614,74 +653,73 @@ static int overlay_removal_is_ok(struct of_overlay *ov) } /** - * of_overlay_destroy() - Removes an overlay - * @id: Overlay id number returned by a previous call to of_overlay_create + * of_overlay_remove() - Revert and free an overlay changeset + * @ovcs_id: Overlay changeset id number * - * Removes an overlay if it is permissible. + * Removes an overlay if it is permissible. ovcs_id was previously returned + * by of_overlay_apply(). 
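+ *
+ * Overlay changesets are expected to be removed in reverse order of
+ * application: if a later changeset still references a node affected by
+ * this changeset, removal fails with -EBUSY.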
* * Returns 0 on success, or a negative error number */ -int of_overlay_destroy(int id) +int of_overlay_remove(int ovcs_id) { - struct of_overlay *ov; - int err; + struct overlay_changeset *ovcs; + int ret = 0; mutex_lock(&of_mutex); - ov = idr_find(&ov_idr, id); - if (!ov) { - err = -ENODEV; - pr_err("destroy: Could not find overlay #%d\n", id); + ovcs = idr_find(&ovcs_idr, ovcs_id); + if (!ovcs) { + ret = -ENODEV; + pr_err("remove: Could not find overlay #%d\n", ovcs_id); goto out; } - if (!overlay_removal_is_ok(ov)) { - err = -EBUSY; + if (!overlay_removal_is_ok(ovcs)) { + ret = -EBUSY; goto out; } - of_overlay_notify(ov, OF_OVERLAY_PRE_REMOVE); - list_del(&ov->node); - __of_changeset_revert(&ov->cset); - of_overlay_notify(ov, OF_OVERLAY_POST_REMOVE); - of_free_overlay_info(ov); - idr_remove(&ov_idr, id); - of_changeset_destroy(&ov->cset); - kfree(ov); - - err = 0; + overlay_notify(ovcs, OF_OVERLAY_PRE_REMOVE); + list_del(&ovcs->ovcs_list); + __of_changeset_revert(&ovcs->cset); + overlay_notify(ovcs, OF_OVERLAY_POST_REMOVE); + free_overlay_fragments(ovcs); + idr_remove(&ovcs_idr, ovcs_id); + of_changeset_destroy(&ovcs->cset); + kfree(ovcs); out: mutex_unlock(&of_mutex); - return err; + return ret; } -EXPORT_SYMBOL_GPL(of_overlay_destroy); +EXPORT_SYMBOL_GPL(of_overlay_remove); /** - * of_overlay_destroy_all() - Removes all overlays from the system + * of_overlay_remove_all() - Reverts and frees all overlay changesets * * Removes all overlays from the system in the correct order. * * Returns 0 on success, or a negative error number */ -int of_overlay_destroy_all(void) +int of_overlay_remove_all(void) { - struct of_overlay *ov, *ovn; + struct overlay_changeset *ovcs, *ovcs_n; mutex_lock(&of_mutex); /* the tail of list is guaranteed to be safe to remove */ - list_for_each_entry_safe_reverse(ov, ovn, &ov_list, node) { - list_del(&ov->node); - __of_changeset_revert(&ov->cset); - of_free_overlay_info(ov); - idr_remove(&ov_idr, ov->id); - kfree(ov); + list_for_each_entry_safe_reverse(ovcs, ovcs_n, &ovcs_list, ovcs_list) { + list_del(&ovcs->ovcs_list); + __of_changeset_revert(&ovcs->cset); + free_overlay_fragments(ovcs); + idr_remove(&ovcs_idr, ovcs->id); + kfree(ovcs); } mutex_unlock(&of_mutex); return 0; } -EXPORT_SYMBOL_GPL(of_overlay_destroy_all); +EXPORT_SYMBOL_GPL(of_overlay_remove_all); diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index add02cfbfb88..bbdaf5606820 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -1230,7 +1230,7 @@ static void of_unittest_destroy_tracked_overlays(void) if (!(overlay_id_bits[BIT_WORD(id)] & BIT_MASK(id))) continue; - ret = of_overlay_destroy(id + overlay_first_id); + ret = of_overlay_remove(id + overlay_first_id); if (ret == -ENODEV) { pr_warn("%s: no overlay to destroy for #%d\n", __func__, id + overlay_first_id); @@ -1262,7 +1262,7 @@ static int of_unittest_apply_overlay(int overlay_nr, int unittest_nr, goto out; } - ret = of_overlay_create(np); + ret = of_overlay_apply(np); if (ret < 0) { unittest(0, "could not create overlay from \"%s\"\n", overlay_path(overlay_nr)); @@ -1347,7 +1347,7 @@ static int of_unittest_apply_revert_overlay_check(int overlay_nr, return -EINVAL; } - ret = of_overlay_destroy(ov_id); + ret = of_overlay_remove(ov_id); if (ret != 0) { unittest(0, "overlay @\"%s\" failed to be destroyed @\"%s\"\n", overlay_path(overlay_nr), @@ -1476,7 +1476,7 @@ static void of_unittest_overlay_6(void) return; } - ret = of_overlay_create(np); + ret = of_overlay_apply(np); if (ret < 0) { unittest(0, "could not create 
overlay from \"%s\"\n", overlay_path(overlay_nr + i)); @@ -1500,7 +1500,7 @@ static void of_unittest_overlay_6(void) } for (i = 1; i >= 0; i--) { - ret = of_overlay_destroy(ov_id[i]); + ret = of_overlay_remove(ov_id[i]); if (ret != 0) { unittest(0, "overlay @\"%s\" failed destroy @\"%s\"\n", overlay_path(overlay_nr + i), @@ -1546,7 +1546,7 @@ static void of_unittest_overlay_8(void) return; } - ret = of_overlay_create(np); + ret = of_overlay_apply(np); if (ret < 0) { unittest(0, "could not create overlay from \"%s\"\n", overlay_path(overlay_nr + i)); @@ -1557,7 +1557,7 @@ static void of_unittest_overlay_8(void) } /* now try to remove first overlay (it should fail) */ - ret = of_overlay_destroy(ov_id[0]); + ret = of_overlay_remove(ov_id[0]); if (ret == 0) { unittest(0, "overlay @\"%s\" was destroyed @\"%s\"\n", overlay_path(overlay_nr + 0), @@ -1568,7 +1568,7 @@ static void of_unittest_overlay_8(void) /* removing them in order should work */ for (i = 1; i >= 0; i--) { - ret = of_overlay_destroy(ov_id[i]); + ret = of_overlay_remove(ov_id[i]); if (ret != 0) { unittest(0, "overlay @\"%s\" not destroyed @\"%s\"\n", overlay_path(overlay_nr + i), @@ -2149,9 +2149,9 @@ static int __init overlay_data_add(int onum) goto out_free_np_overlay; } - ret = of_overlay_create(info->np_overlay); + ret = of_overlay_apply(info->np_overlay); if (ret < 0) { - pr_err("of_overlay_create() (ret=%d), %d\n", ret, onum); + pr_err("of_overlay_apply() (ret=%d), %d\n", ret, onum); goto out_free_np_overlay; } else { info->overlay_id = ret; diff --git a/include/linux/of.h b/include/linux/of.h index 7b0f17be7830..7569e9cc45de 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1312,26 +1312,26 @@ struct of_overlay_notify_data { #ifdef CONFIG_OF_OVERLAY /* ID based overlays; the API for external users */ -int of_overlay_create(struct device_node *tree); -int of_overlay_destroy(int id); -int of_overlay_destroy_all(void); +int of_overlay_apply(struct device_node *tree); +int of_overlay_remove(int id); +int of_overlay_remove_all(void); int of_overlay_notifier_register(struct notifier_block *nb); int of_overlay_notifier_unregister(struct notifier_block *nb); #else -static inline int of_overlay_create(struct device_node *tree) +static inline int of_overlay_apply(struct device_node *tree) { return -ENOTSUPP; } -static inline int of_overlay_destroy(int id) +static inline int of_overlay_remove(int id) { return -ENOTSUPP; } -static inline int of_overlay_destroy_all(void) +static inline int of_overlay_remove_all(void) { return -ENOTSUPP; } -- cgit v1.2.3 From 24789c5ce5a373dd55640f9cd79117fcc3ccc46d Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Tue, 17 Oct 2017 16:36:26 -0700 Subject: of: overlay: detect cases where device tree may become corrupt When an attempt to apply an overlay changeset fails, an effort is made to revert any partial application of the changeset. When an attempt to remove an overlay changeset fails, an effort is made to re-apply any partial reversion of the changeset. The existing code does not check for failure to recover a failed overlay changeset application or overlay changeset revert. Add the missing checks and flag the devicetree as corrupt if the state of the devicetree can not be determined. Improve and expand the returned errors to more fully reflect the result of the effort to undo the partial effects of a failed attempt to apply or remove an overlay changeset. If the device tree might be corrupt, do not allow further attempts to apply or remove an overlay changeset. 
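As a minimal sketch, the guard reduces to the sticky flag test below
(the names match the code added to drivers/of/overlay.c by this patch):

    /* flags are sticky - once set, do not reset */
    static int devicetree_state_flags;
    #define DTSF_APPLY_FAIL  0x01
    #define DTSF_REVERT_FAIL 0x02

    static int devicetree_corrupt(void)
    {
            return devicetree_state_flags &
                   (DTSF_APPLY_FAIL | DTSF_REVERT_FAIL);
    }

with of_overlay_apply() and of_overlay_remove() returning -EBUSY early
whenever devicetree_corrupt() is true.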
When creating an overlay changeset from an overlay device tree, add some additional warnings if the state of the overlay device tree is not as expected. Signed-off-by: Frank Rowand Signed-off-by: Rob Herring --- drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c | 5 +- drivers/of/dynamic.c | 135 +++++++++++--- drivers/of/of_private.h | 8 +- drivers/of/overlay.c | 253 ++++++++++++++++++++++----- drivers/of/unittest.c | 57 +++--- include/linux/of.h | 10 +- 6 files changed, 372 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c index 5f5b7ba35f1d..7a7be0515bfd 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c @@ -204,7 +204,7 @@ static void __init tilcdc_convert_slave_node(void) /* For all memory needed for the overlay tree. This memory can be freed after the overlay has been applied. */ struct kfree_table kft; - int ret; + int ovcs_id, ret; if (kfree_table_init(&kft)) return; @@ -247,7 +247,8 @@ static void __init tilcdc_convert_slave_node(void) tilcdc_node_disable(slave); - ret = of_overlay_apply(overlay); + ovcs_id = 0; + ret = of_overlay_apply(overlay, &ovcs_id); if (ret) pr_err("%s: Applying overlay changeset failed: %d\n", __func__, ret); diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index 4bc96af4d8e2..747d87619faf 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -491,11 +491,12 @@ static void __of_changeset_entry_invert(struct of_changeset_entry *ce, } } -static void __of_changeset_entry_notify(struct of_changeset_entry *ce, bool revert) +static int __of_changeset_entry_notify(struct of_changeset_entry *ce, + bool revert) { struct of_reconfig_data rd; struct of_changeset_entry ce_inverted; - int ret; + int ret = 0; if (revert) { __of_changeset_entry_invert(ce, &ce_inverted); @@ -517,11 +518,12 @@ static void __of_changeset_entry_notify(struct of_changeset_entry *ce, bool reve default: pr_err("invalid devicetree changeset action: %i\n", (int)ce->action); - return; + ret = -EINVAL; } if (ret) pr_err("changeset notifier error @%pOF\n", ce->np); + return ret; } static int __of_changeset_entry_apply(struct of_changeset_entry *ce) @@ -655,32 +657,82 @@ void of_changeset_destroy(struct of_changeset *ocs) } EXPORT_SYMBOL_GPL(of_changeset_destroy); -int __of_changeset_apply(struct of_changeset *ocs) +/* + * Apply the changeset entries in @ocs. + * If apply fails, an attempt is made to revert the entries that were + * successfully applied. + * + * If multiple revert errors occur then only the final revert error is reported. + * + * Returns 0 on success, a negative error value in case of an error. + * If a revert error occurs, it is returned in *ret_revert. 
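+ *
+ * For example (illustrative): if the 5th of 10 entries fails to apply,
+ * entries 4..1 are reverted in reverse order; if one of those reverts
+ * also fails, its error is stored in *ret_revert while the original
+ * apply error is returned.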
+ */ +int __of_changeset_apply_entries(struct of_changeset *ocs, int *ret_revert) { struct of_changeset_entry *ce; - int ret; + int ret, ret_tmp; - /* perform the rest of the work */ pr_debug("changeset: applying...\n"); list_for_each_entry(ce, &ocs->entries, node) { ret = __of_changeset_entry_apply(ce); if (ret) { pr_err("Error applying changeset (%d)\n", ret); - list_for_each_entry_continue_reverse(ce, &ocs->entries, node) - __of_changeset_entry_revert(ce); + list_for_each_entry_continue_reverse(ce, &ocs->entries, + node) { + ret_tmp = __of_changeset_entry_revert(ce); + if (ret_tmp) + *ret_revert = ret_tmp; + } return ret; } } - pr_debug("changeset: applied, emitting notifiers.\n"); + + return 0; +} + +/* + * Returns 0 on success, a negative error value in case of an error. + * + * If multiple changset entry notification errors occur then only the + * final notification error is reported. + */ +int __of_changeset_apply_notify(struct of_changeset *ocs) +{ + struct of_changeset_entry *ce; + int ret = 0, ret_tmp; + + pr_debug("changeset: emitting notifiers.\n"); /* drop the global lock while emitting notifiers */ mutex_unlock(&of_mutex); - list_for_each_entry(ce, &ocs->entries, node) - __of_changeset_entry_notify(ce, 0); + list_for_each_entry(ce, &ocs->entries, node) { + ret_tmp = __of_changeset_entry_notify(ce, 0); + if (ret_tmp) + ret = ret_tmp; + } mutex_lock(&of_mutex); pr_debug("changeset: notifiers sent.\n"); - return 0; + return ret; +} + +/* + * Returns 0 on success, a negative error value in case of an error. + * + * If a changeset entry apply fails, an attempt is made to revert any + * previous entries in the changeset. If any of the reverts fails, + * that failure is not reported. Thus the state of the device tree + * is unknown if an apply error occurs. + */ +static int __of_changeset_apply(struct of_changeset *ocs) +{ + int ret, ret_revert = 0; + + ret = __of_changeset_apply_entries(ocs, &ret_revert); + if (!ret) + ret = __of_changeset_apply_notify(ocs); + + return ret; } /** @@ -707,31 +759,74 @@ int of_changeset_apply(struct of_changeset *ocs) } EXPORT_SYMBOL_GPL(of_changeset_apply); -int __of_changeset_revert(struct of_changeset *ocs) +/* + * Revert the changeset entries in @ocs. + * If revert fails, an attempt is made to re-apply the entries that were + * successfully removed. + * + * If multiple re-apply errors occur then only the final apply error is + * reported. + * + * Returns 0 on success, a negative error value in case of an error. + * If an apply error occurs, it is returned in *ret_apply. + */ +int __of_changeset_revert_entries(struct of_changeset *ocs, int *ret_apply) { struct of_changeset_entry *ce; - int ret; + int ret, ret_tmp; pr_debug("changeset: reverting...\n"); list_for_each_entry_reverse(ce, &ocs->entries, node) { ret = __of_changeset_entry_revert(ce); if (ret) { pr_err("Error reverting changeset (%d)\n", ret); - list_for_each_entry_continue(ce, &ocs->entries, node) - __of_changeset_entry_apply(ce); + list_for_each_entry_continue(ce, &ocs->entries, node) { + ret_tmp = __of_changeset_entry_apply(ce); + if (ret_tmp) + *ret_apply = ret_tmp; + } return ret; } } - pr_debug("changeset: reverted, emitting notifiers.\n"); + + return 0; +} + +/* + * If multiple changset entry notification errors occur then only the + * final notification error is reported. 
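+ *
+ * As with __of_changeset_apply_notify(), the notifiers are emitted with
+ * of_mutex temporarily dropped.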
+ */ +int __of_changeset_revert_notify(struct of_changeset *ocs) +{ + struct of_changeset_entry *ce; + int ret = 0, ret_tmp; + + pr_debug("changeset: emitting notifiers.\n"); /* drop the global lock while emitting notifiers */ mutex_unlock(&of_mutex); - list_for_each_entry_reverse(ce, &ocs->entries, node) - __of_changeset_entry_notify(ce, 1); + list_for_each_entry_reverse(ce, &ocs->entries, node) { + ret_tmp = __of_changeset_entry_notify(ce, 1); + if (ret_tmp) + ret = ret_tmp; + } mutex_lock(&of_mutex); pr_debug("changeset: notifiers sent.\n"); - return 0; + return ret; +} + +static int __of_changeset_revert(struct of_changeset *ocs) +{ + int ret, ret_reply; + + ret_reply = 0; + ret = __of_changeset_revert_entries(ocs, &ret_reply); + + if (!ret) + ret = __of_changeset_revert_notify(ocs); + + return ret; } /** diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index 43df14f0cbce..36357f571df2 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -39,8 +39,12 @@ extern struct kset *of_kset; extern int of_property_notify(int action, struct device_node *np, struct property *prop, struct property *old_prop); extern void of_node_release(struct kobject *kobj); -extern int __of_changeset_apply(struct of_changeset *ocs); -extern int __of_changeset_revert(struct of_changeset *ocs); +extern int __of_changeset_apply_entries(struct of_changeset *ocs, + int *ret_revert); +extern int __of_changeset_apply_notify(struct of_changeset *ocs); +extern int __of_changeset_revert_entries(struct of_changeset *ocs, + int *ret_apply); +extern int __of_changeset_revert_notify(struct of_changeset *ocs); #else /* CONFIG_OF_DYNAMIC */ static inline int of_property_notify(int action, struct device_node *np, struct property *prop, struct property *old_prop) diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index 905916e17eec..78c50fd57750 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -50,6 +50,22 @@ struct overlay_changeset { struct of_changeset cset; }; +/* flags are sticky - once set, do not reset */ +static int devicetree_state_flags; +#define DTSF_APPLY_FAIL 0x01 +#define DTSF_REVERT_FAIL 0x02 + +/* + * If a changeset apply or revert encounters an error, an attempt will + * be made to undo partial changes, but may fail. If the undo fails + * we do not know the state of the devicetree. + */ +static int devicetree_corrupt(void) +{ + return devicetree_state_flags & + (DTSF_APPLY_FAIL | DTSF_REVERT_FAIL); +} + static int build_changeset_next_level(struct overlay_changeset *ovcs, struct device_node *target_node, const struct device_node *overlay_node, @@ -72,6 +88,13 @@ int of_overlay_notifier_unregister(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(of_overlay_notifier_unregister); +static char *of_overlay_action_name[] = { + "pre-apply", + "post-apply", + "pre-remove", + "post-remove", +}; + static int overlay_notify(struct overlay_changeset *ovcs, enum of_overlay_notify_action action) { @@ -86,8 +109,14 @@ static int overlay_notify(struct overlay_changeset *ovcs, ret = blocking_notifier_call_chain(&overlay_notify_chain, action, &nd); - if (ret) - return notifier_to_errno(ret); + if (ret == NOTIFY_STOP) + return 0; + if (ret) { + ret = notifier_to_errno(ret); + pr_err("overlay changeset %s notifier error %d, target: %pOF\n", + of_overlay_action_name[action], ret, nd.target); + return ret; + } } return 0; @@ -240,6 +269,14 @@ static int add_changeset_property(struct overlay_changeset *ovcs, * build_changeset_next_level(). 
* * NOTE: Multiple mods of created nodes not supported. + * If more than one fragment contains a node that does not already exist + * in the live tree, then for each fragment of_changeset_attach_node() + * will add a changeset entry to add the node. When the changeset is + * applied, __of_attach_node() will attach the node twice (once for + * each fragment). At this point the device tree will be corrupted. + * + * TODO: add integrity check to ensure that multiple fragments do not + * create the same node. * * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if * invalid @overlay. @@ -312,8 +349,8 @@ static int build_changeset_next_level(struct overlay_changeset *ovcs, ret = add_changeset_property(ovcs, target_node, prop, is_symbols_node); if (ret) { - pr_err("Failed to apply prop @%pOF/%s\n", - target_node, prop->name); + pr_debug("Failed to apply prop @%pOF/%s, err=%d\n", + target_node, prop->name, ret); return ret; } } @@ -324,8 +361,8 @@ static int build_changeset_next_level(struct overlay_changeset *ovcs, for_each_child_of_node(overlay_node, child) { ret = add_changeset_node(ovcs, target_node, child); if (ret) { - pr_err("Failed to apply node @%pOF/%s\n", - target_node, child->name); + pr_debug("Failed to apply node @%pOF/%s, err=%d\n", + target_node, child->name, ret); of_node_put(child); return ret; } @@ -357,7 +394,7 @@ static int build_changeset(struct overlay_changeset *ovcs) fragment->overlay, fragment->is_symbols_node); if (ret) { - pr_err("apply failed '%pOF'\n", fragment->target); + pr_debug("apply failed '%pOF'\n", fragment->target); return ret; } } @@ -412,6 +449,19 @@ static int init_overlay_changeset(struct overlay_changeset *ovcs, struct fragment *fragments; int cnt, ret; + /* + * Warn for some issues. Can not return -EINVAL for these until + * of_unittest_apply_overlay() is fixed to pass these checks. + */ + if (!of_node_check_flag(tree, OF_DYNAMIC)) + pr_debug("%s() tree is not dynamic\n", __func__); + + if (!of_node_check_flag(tree, OF_DETACHED)) + pr_debug("%s() tree is not detached\n", __func__); + + if (!of_node_is_root(tree)) + pr_debug("%s() tree is not root\n", __func__); + INIT_LIST_HEAD(&ovcs->ovcs_list); of_changeset_init(&ovcs->cset); @@ -485,12 +535,13 @@ static int init_overlay_changeset(struct overlay_changeset *ovcs, return 0; - err_free_fragments: kfree(fragments); err_free_idr: idr_remove(&ovcs_idr, ovcs->id); + pr_err("%s() failed, ret = %d\n", __func__, ret); + return ret; } @@ -517,33 +568,71 @@ static void free_overlay_changeset(struct overlay_changeset *ovcs) /** * of_overlay_apply() - Create and apply an overlay changeset * @tree: Expanded overlay device tree + * @ovcs_id: Pointer to overlay changeset id + * + * Creates and applies an overlay changeset. * - * Creates and applies an overlay changeset. If successful, the overlay - * changeset is added to the overlay changeset list. + * If an error occurs in a pre-apply notifier, then no changes are made + * to the device tree. * - * Returns the id of the created overlay changeset, or a negative error number + + * A non-zero return value will not have created the changeset if error is from: + * - parameter checks + * - building the changeset + * - overlay changset pre-apply notifier + * + * If an error is returned by an overlay changeset pre-apply notifier + * then no further overlay changeset pre-apply notifier will be called. 
+ * + * A non-zero return value will have created the changeset if error is from: + * - overlay changeset entry notifier + * - overlay changset post-apply notifier + * + * If an error is returned by an overlay changeset post-apply notifier + * then no further overlay changeset post-apply notifier will be called. + * + * If more than one notifier returns an error, then the last notifier + * error to occur is returned. + * + * If an error occurred while applying the overlay changeset, then an + * attempt is made to revert any changes that were made to the + * device tree. If there were any errors during the revert attempt + * then the state of the device tree can not be determined, and any + * following attempt to apply or remove an overlay changeset will be + * refused. + * + * Returns 0 on success, or a negative error number. Overlay changeset + * id is returned to *ovcs_id. */ -int of_overlay_apply(struct device_node *tree) + +int of_overlay_apply(struct device_node *tree, int *ovcs_id) { struct overlay_changeset *ovcs; - int ret; + int ret = 0, ret_revert, ret_tmp; + + *ovcs_id = 0; + + if (devicetree_corrupt()) { + pr_err("devicetree state suspect, refuse to apply overlay\n"); + ret = -EBUSY; + goto out; + } ovcs = kzalloc(sizeof(*ovcs), GFP_KERNEL); - if (!ovcs) - return -ENOMEM; + if (!ovcs) { + ret = -ENOMEM; + goto out; + } mutex_lock(&of_mutex); ret = init_overlay_changeset(ovcs, tree); - if (ret) { - pr_err("init_overlay_changeset() failed, ret = %d\n", ret); + if (ret) goto err_free_overlay_changeset; - } ret = overlay_notify(ovcs, OF_OVERLAY_PRE_APPLY); - if (ret < 0) { - pr_err("%s: Pre-apply notifier failed (ret=%d)\n", - __func__, ret); + if (ret) { + pr_err("overlay changeset pre-apply notify error %d\n", ret); goto err_free_overlay_changeset; } @@ -551,23 +640,46 @@ int of_overlay_apply(struct device_node *tree) if (ret) goto err_free_overlay_changeset; - ret = __of_changeset_apply(&ovcs->cset); - if (ret) + ret_revert = 0; + ret = __of_changeset_apply_entries(&ovcs->cset, &ret_revert); + if (ret) { + if (ret_revert) { + pr_debug("overlay changeset revert error %d\n", + ret_revert); + devicetree_state_flags |= DTSF_APPLY_FAIL; + } goto err_free_overlay_changeset; + } else { + ret = __of_changeset_apply_notify(&ovcs->cset); + if (ret) + pr_err("overlay changeset entry notify error %d\n", + ret); + /* fall through */ + } list_add_tail(&ovcs->ovcs_list, &ovcs_list); - - overlay_notify(ovcs, OF_OVERLAY_POST_APPLY); + *ovcs_id = ovcs->id; + + ret_tmp = overlay_notify(ovcs, OF_OVERLAY_POST_APPLY); + if (ret_tmp) { + pr_err("overlay changeset post-apply notify error %d\n", + ret_tmp); + if (!ret) + ret = ret_tmp; + } mutex_unlock(&of_mutex); - return ovcs->id; + goto out; err_free_overlay_changeset: free_overlay_changeset(ovcs); mutex_unlock(&of_mutex); +out: + pr_debug("%s() err=%d\n", __func__, ret); + return ret; } EXPORT_SYMBOL_GPL(of_overlay_apply); @@ -649,45 +761,106 @@ static int overlay_removal_is_ok(struct overlay_changeset *remove_ovcs) /** * of_overlay_remove() - Revert and free an overlay changeset - * @ovcs_id: Overlay changeset id number + * @ovcs_id: Pointer to overlay changeset id * - * Removes an overlay if it is permissible. ovcs_id was previously returned + * Removes an overlay if it is permissible. @ovcs_id was previously returned * by of_overlay_apply(). 
* - * Returns 0 on success, or a negative error number + * If an error occurred while attempting to revert the overlay changeset, + * then an attempt is made to re-apply any changeset entry that was + * reverted. If an error occurs on re-apply then the state of the device + * tree can not be determined, and any following attempt to apply or remove + * an overlay changeset will be refused. + * + * A non-zero return value will not revert the changeset if error is from: + * - parameter checks + * - overlay changset pre-remove notifier + * - overlay changeset entry revert + * + * If an error is returned by an overlay changeset pre-remove notifier + * then no further overlay changeset pre-remove notifier will be called. + * + * If more than one notifier returns an error, then the last notifier + * error to occur is returned. + * + * A non-zero return value will revert the changeset if error is from: + * - overlay changeset entry notifier + * - overlay changset post-remove notifier + * + * If an error is returned by an overlay changeset post-remove notifier + * then no further overlay changeset post-remove notifier will be called. + * + * Returns 0 on success, or a negative error number. *ovcs_id is set to + * zero after reverting the changeset, even if a subsequent error occurs. */ -int of_overlay_remove(int ovcs_id) +int of_overlay_remove(int *ovcs_id) { struct overlay_changeset *ovcs; - int ret = 0; + int ret, ret_apply, ret_tmp; + + ret = 0; + + if (devicetree_corrupt()) { + pr_err("suspect devicetree state, refuse to remove overlay\n"); + ret = -EBUSY; + goto out; + } mutex_lock(&of_mutex); - ovcs = idr_find(&ovcs_idr, ovcs_id); + ovcs = idr_find(&ovcs_idr, *ovcs_id); if (!ovcs) { ret = -ENODEV; - pr_err("remove: Could not find overlay #%d\n", ovcs_id); - goto out; + pr_err("remove: Could not find overlay #%d\n", *ovcs_id); + goto out_unlock; } if (!overlay_removal_is_ok(ovcs)) { ret = -EBUSY; - goto out; + goto out_unlock; } - overlay_notify(ovcs, OF_OVERLAY_PRE_REMOVE); + ret = overlay_notify(ovcs, OF_OVERLAY_PRE_REMOVE); + if (ret) { + pr_err("overlay changeset pre-remove notify error %d\n", ret); + goto out_unlock; + } list_del(&ovcs->ovcs_list); - __of_changeset_revert(&ovcs->cset); + ret_apply = 0; + ret = __of_changeset_revert_entries(&ovcs->cset, &ret_apply); + if (ret) { + if (ret_apply) + devicetree_state_flags |= DTSF_REVERT_FAIL; + goto out_unlock; + } else { + ret = __of_changeset_revert_notify(&ovcs->cset); + if (ret) { + pr_err("overlay changeset entry notify error %d\n", + ret); + /* fall through - changeset was reverted */ + } + } - overlay_notify(ovcs, OF_OVERLAY_POST_REMOVE); + *ovcs_id = 0; + + ret_tmp = overlay_notify(ovcs, OF_OVERLAY_POST_REMOVE); + if (ret_tmp) { + pr_err("overlay changeset post-remove notify error %d\n", + ret_tmp); + if (!ret) + ret = ret_tmp; + } free_overlay_changeset(ovcs); -out: +out_unlock: mutex_unlock(&of_mutex); +out: + pr_debug("%s() err=%d\n", __func__, ret); + return ret; } EXPORT_SYMBOL_GPL(of_overlay_remove); @@ -706,7 +879,7 @@ int of_overlay_remove_all(void) /* the tail of list is guaranteed to be safe to remove */ list_for_each_entry_safe_reverse(ovcs, ovcs_n, &ovcs_list, ovcs_list) { - ret = of_overlay_remove(ovcs->id); + ret = of_overlay_remove(&ovcs->id); if (ret) return ret; } diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index bbdaf5606820..3640dae4b9b2 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -1217,7 +1217,7 @@ static void of_unittest_untrack_overlay(int id) static void 
of_unittest_destroy_tracked_overlays(void) { - int id, ret, defers; + int id, ret, defers, ovcs_id; if (overlay_first_id < 0) return; @@ -1230,7 +1230,8 @@ static void of_unittest_destroy_tracked_overlays(void) if (!(overlay_id_bits[BIT_WORD(id)] & BIT_MASK(id))) continue; - ret = of_overlay_remove(id + overlay_first_id); + ovcs_id = id + overlay_first_id; + ret = of_overlay_remove(&ovcs_id); if (ret == -ENODEV) { pr_warn("%s: no overlay to destroy for #%d\n", __func__, id + overlay_first_id); @@ -1252,7 +1253,7 @@ static int of_unittest_apply_overlay(int overlay_nr, int unittest_nr, int *overlay_id) { struct device_node *np = NULL; - int ret, id = -1; + int ret; np = of_find_node_by_path(overlay_path(overlay_nr)); if (np == NULL) { @@ -1262,23 +1263,20 @@ static int of_unittest_apply_overlay(int overlay_nr, int unittest_nr, goto out; } - ret = of_overlay_apply(np); + *overlay_id = 0; + ret = of_overlay_apply(np, overlay_id); if (ret < 0) { unittest(0, "could not create overlay from \"%s\"\n", overlay_path(overlay_nr)); goto out; } - id = ret; - of_unittest_track_overlay(id); + of_unittest_track_overlay(*overlay_id); ret = 0; out: of_node_put(np); - if (overlay_id) - *overlay_id = id; - return ret; } @@ -1286,7 +1284,7 @@ out: static int of_unittest_apply_overlay_check(int overlay_nr, int unittest_nr, int before, int after, enum overlay_type ovtype) { - int ret; + int ret, ovcs_id; /* unittest device must not be in before state */ if (of_unittest_device_exists(unittest_nr, ovtype) != before) { @@ -1297,7 +1295,8 @@ static int of_unittest_apply_overlay_check(int overlay_nr, int unittest_nr, return -EINVAL; } - ret = of_unittest_apply_overlay(overlay_nr, unittest_nr, NULL); + ovcs_id = 0; + ret = of_unittest_apply_overlay(overlay_nr, unittest_nr, &ovcs_id); if (ret != 0) { /* of_unittest_apply_overlay already called unittest() */ return ret; @@ -1320,7 +1319,7 @@ static int of_unittest_apply_revert_overlay_check(int overlay_nr, int unittest_nr, int before, int after, enum overlay_type ovtype) { - int ret, ov_id; + int ret, ovcs_id; /* unittest device must be in before state */ if (of_unittest_device_exists(unittest_nr, ovtype) != before) { @@ -1332,7 +1331,8 @@ static int of_unittest_apply_revert_overlay_check(int overlay_nr, } /* apply the overlay */ - ret = of_unittest_apply_overlay(overlay_nr, unittest_nr, &ov_id); + ovcs_id = 0; + ret = of_unittest_apply_overlay(overlay_nr, unittest_nr, &ovcs_id); if (ret != 0) { /* of_unittest_apply_overlay already called unittest() */ return ret; @@ -1347,7 +1347,7 @@ static int of_unittest_apply_revert_overlay_check(int overlay_nr, return -EINVAL; } - ret = of_overlay_remove(ov_id); + ret = of_overlay_remove(&ovcs_id); if (ret != 0) { unittest(0, "overlay @\"%s\" failed to be destroyed @\"%s\"\n", overlay_path(overlay_nr), @@ -1449,7 +1449,7 @@ static void of_unittest_overlay_5(void) static void of_unittest_overlay_6(void) { struct device_node *np; - int ret, i, ov_id[2]; + int ret, i, ov_id[2], ovcs_id; int overlay_nr = 6, unittest_nr = 6; int before = 0, after = 1; @@ -1476,13 +1476,14 @@ static void of_unittest_overlay_6(void) return; } - ret = of_overlay_apply(np); + ovcs_id = 0; + ret = of_overlay_apply(np, &ovcs_id); if (ret < 0) { unittest(0, "could not create overlay from \"%s\"\n", overlay_path(overlay_nr + i)); return; } - ov_id[i] = ret; + ov_id[i] = ovcs_id; of_unittest_track_overlay(ov_id[i]); } @@ -1500,7 +1501,8 @@ static void of_unittest_overlay_6(void) } for (i = 1; i >= 0; i--) { - ret = of_overlay_remove(ov_id[i]); + ovcs_id = 
ov_id[i]; + ret = of_overlay_remove(&ovcs_id); if (ret != 0) { unittest(0, "overlay @\"%s\" failed destroy @\"%s\"\n", overlay_path(overlay_nr + i), @@ -1531,7 +1533,7 @@ static void of_unittest_overlay_6(void) static void of_unittest_overlay_8(void) { struct device_node *np; - int ret, i, ov_id[2]; + int ret, i, ov_id[2], ovcs_id; int overlay_nr = 8, unittest_nr = 8; /* we don't care about device state in this test */ @@ -1546,18 +1548,20 @@ static void of_unittest_overlay_8(void) return; } - ret = of_overlay_apply(np); + ovcs_id = 0; + ret = of_overlay_apply(np, &ovcs_id); if (ret < 0) { unittest(0, "could not create overlay from \"%s\"\n", overlay_path(overlay_nr + i)); return; } - ov_id[i] = ret; + ov_id[i] = ovcs_id; of_unittest_track_overlay(ov_id[i]); } /* now try to remove first overlay (it should fail) */ - ret = of_overlay_remove(ov_id[0]); + ovcs_id = ov_id[0]; + ret = of_overlay_remove(&ovcs_id); if (ret == 0) { unittest(0, "overlay @\"%s\" was destroyed @\"%s\"\n", overlay_path(overlay_nr + 0), @@ -1568,7 +1572,8 @@ static void of_unittest_overlay_8(void) /* removing them in order should work */ for (i = 1; i >= 0; i--) { - ret = of_overlay_remove(ov_id[i]); + ovcs_id = ov_id[i]; + ret = of_overlay_remove(&ovcs_id); if (ret != 0) { unittest(0, "overlay @\"%s\" not destroyed @\"%s\"\n", overlay_path(overlay_nr + i), @@ -2149,13 +2154,11 @@ static int __init overlay_data_add(int onum) goto out_free_np_overlay; } - ret = of_overlay_apply(info->np_overlay); + info->overlay_id = 0; + ret = of_overlay_apply(info->np_overlay, &info->overlay_id); if (ret < 0) { pr_err("of_overlay_apply() (ret=%d), %d\n", ret, onum); goto out_free_np_overlay; - } else { - info->overlay_id = ret; - ret = 0; } pr_debug("__dtb_overlay_begin applied, overlay id %d\n", ret); diff --git a/include/linux/of.h b/include/linux/of.h index 7569e9cc45de..96edda95c6b0 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1298,7 +1298,7 @@ static inline bool of_device_is_system_power_controller(const struct device_node */ enum of_overlay_notify_action { - OF_OVERLAY_PRE_APPLY, + OF_OVERLAY_PRE_APPLY = 0, OF_OVERLAY_POST_APPLY, OF_OVERLAY_PRE_REMOVE, OF_OVERLAY_POST_REMOVE, @@ -1312,8 +1312,8 @@ struct of_overlay_notify_data { #ifdef CONFIG_OF_OVERLAY /* ID based overlays; the API for external users */ -int of_overlay_apply(struct device_node *tree); -int of_overlay_remove(int id); +int of_overlay_apply(struct device_node *tree, int *ovcs_id); +int of_overlay_remove(int *ovcs_id); int of_overlay_remove_all(void); int of_overlay_notifier_register(struct notifier_block *nb); @@ -1321,12 +1321,12 @@ int of_overlay_notifier_unregister(struct notifier_block *nb); #else -static inline int of_overlay_apply(struct device_node *tree) +static inline int of_overlay_apply(struct device_node *tree, int *ovcs_id) { return -ENOTSUPP; } -static inline int of_overlay_remove(int id) +static inline int of_overlay_remove(int *ovcs_id) { return -ENOTSUPP; } -- cgit v1.2.3 From f948d6d8b792bb90041edc12eac35faf83030994 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Tue, 17 Oct 2017 16:36:29 -0700 Subject: of: overlay: avoid race condition between applying multiple overlays The process of applying an overlay consists of: - unflatten an overlay FDT (flattened device tree) into an EDT (expanded device tree) - fixup the phandle values in the overlay EDT to fit in a range above the phandle values in the live device tree - create the overlay changeset to reflect the contents of the overlay EDT - apply the overlay changeset, to modify 
the live device tree, potentially changing the maximum phandle value in the live device tree There is currently no protection against two overlay applies concurrently determining what range of phandle values are in use in the live device tree, and subsequently changing that range. Add a mutex to prevent multiple overlay applies from occurring simultaneously. Move of_resolve_phandles() into of_overlay_apply() so that it does not have to be duplicated by each caller of of_overlay_apply(). The test in of_resolve_phandles() that the overlay tree is detached is temporarily disabled so that old style overlay unittests do not fail. Signed-off-by: Frank Rowand Signed-off-by: Rob Herring --- drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c | 6 ------ drivers/of/of_private.h | 12 +++++++++++ drivers/of/overlay.c | 32 ++++++++++++++++++++++++++++ drivers/of/resolver.c | 7 ++++++ drivers/of/unittest.c | 22 +++++++++++++------ include/linux/of.h | 3 --- 6 files changed, 67 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c index 7a7be0515bfd..54025af534d4 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c @@ -145,7 +145,6 @@ static struct device_node * __init tilcdc_get_overlay(struct kfree_table *kft) __dtb_tilcdc_slave_compat_begin; static void *overlay_data; struct device_node *overlay; - int ret; if (!size) { pr_warn("%s: No overlay data\n", __func__); @@ -164,11 +163,6 @@ static struct device_node * __init tilcdc_get_overlay(struct kfree_table *kft) } of_node_set_flag(overlay, OF_DETACHED); - ret = of_resolve_phandles(overlay); - if (ret) { - pr_err("%s: Failed to resolve phandles: %d\n", __func__, ret); - return NULL; - } return overlay; } diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index 36357f571df2..248730567dbe 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -76,6 +76,18 @@ static inline int __of_attach_node_sysfs(struct device_node *np) static inline void __of_detach_node_sysfs(struct device_node *np) {} #endif +#if defined(CONFIG_OF_RESOLVE) +int of_resolve_phandles(struct device_node *tree); +#endif + +#if defined(CONFIG_OF_OVERLAY) +void of_overlay_mutex_lock(void); +void of_overlay_mutex_unlock(void); +#else +static inline void of_overlay_mutex_lock(void) {}; +static inline void of_overlay_mutex_unlock(void) {}; +#endif + #if defined(CONFIG_OF_UNITTEST) && defined(CONFIG_OF_OVERLAY) extern void __init unittest_unflatten_overlay_base(void); #else diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index 791753321ed2..d164f86e5541 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -71,6 +71,28 @@ static int build_changeset_next_level(struct overlay_changeset *ovcs, const struct device_node *overlay_node, bool is_symbols_node); +/* + * of_resolve_phandles() finds the largest phandle in the live tree. + * of_overlay_apply() may add a larger phandle to the live tree. 
+ * Do not allow race between two overlays being applied simultaneously: + * mutex_lock(&of_overlay_phandle_mutex) + * of_resolve_phandles() + * of_overlay_apply() + * mutex_unlock(&of_overlay_phandle_mutex) + */ +static DEFINE_MUTEX(of_overlay_phandle_mutex); + +void of_overlay_mutex_lock(void) +{ + mutex_lock(&of_overlay_phandle_mutex); +} + +void of_overlay_mutex_unlock(void) +{ + mutex_unlock(&of_overlay_phandle_mutex); +} + + static LIST_HEAD(ovcs_list); static DEFINE_IDR(ovcs_idr); @@ -624,6 +646,12 @@ int of_overlay_apply(struct device_node *tree, int *ovcs_id) goto out; } + of_overlay_mutex_lock(); + + ret = of_resolve_phandles(tree); + if (ret) + goto err_overlay_unlock; + mutex_lock(&of_mutex); ret = init_overlay_changeset(ovcs, tree); @@ -669,9 +697,13 @@ int of_overlay_apply(struct device_node *tree, int *ovcs_id) } mutex_unlock(&of_mutex); + of_overlay_mutex_unlock(); goto out; +err_overlay_unlock: + of_overlay_mutex_unlock(); + err_free_overlay_changeset: free_overlay_changeset(ovcs); diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c index bd21a66f6930..cfaeef5f6cb1 100644 --- a/drivers/of/resolver.c +++ b/drivers/of/resolver.c @@ -271,11 +271,18 @@ int of_resolve_phandles(struct device_node *overlay) err = -EINVAL; goto out; } + +#if 0 + Temporarily disable check so that old style overlay unittests + do not fail when of_resolve_phandles() is moved into + of_overlay_apply(). + if (!of_node_check_flag(overlay, OF_DETACHED)) { pr_err("overlay not detached\n"); err = -EINVAL; goto out; } +#endif phandle_delta = live_tree_max_phandle() + 1; adjust_overlay_phandles(overlay, phandle_delta); diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 3640dae4b9b2..273d78c1520b 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -993,9 +993,17 @@ static int __init unittest_data_add(void) pr_warn("%s: No tree to attach; not running tests\n", __func__); return -ENODATA; } + + /* + * This lock normally encloses of_overlay_apply() as well as + * of_resolve_phandles(). + */ + of_overlay_mutex_lock(); + rc = of_resolve_phandles(unittest_data_node); if (rc) { pr_err("%s: Failed to resolve phandles (rc=%i)\n", __func__, rc); + of_overlay_mutex_unlock(); return -EINVAL; } @@ -1005,6 +1013,7 @@ static int __init unittest_data_add(void) __of_attach_node_sysfs(np); of_aliases = of_find_node_by_path("/aliases"); of_chosen = of_find_node_by_path("/chosen"); + of_overlay_mutex_unlock(); return 0; } @@ -1017,6 +1026,9 @@ static int __init unittest_data_add(void) attach_node_and_children(np); np = next; } + + of_overlay_mutex_unlock(); + return 0; } @@ -2148,16 +2160,11 @@ static int __init overlay_data_add(int onum) goto out_free_data; } - ret = of_resolve_phandles(info->np_overlay); - if (ret) { - pr_err("resolve ot phandles (ret=%d), %d\n", ret, onum); - goto out_free_np_overlay; - } - info->overlay_id = 0; ret = of_overlay_apply(info->np_overlay, &info->overlay_id); if (ret < 0) { pr_err("of_overlay_apply() (ret=%d), %d\n", ret, onum); + of_overlay_mutex_unlock(); goto out_free_np_overlay; } @@ -2207,7 +2214,10 @@ static __init void of_unittest_overlay_high_level(void) * Could not fixup phandles in unittest_unflatten_overlay_base() * because kmalloc() was not yet available. 
*/ + of_overlay_mutex_lock(); of_resolve_phandles(overlay_base_root); + of_overlay_mutex_unlock(); + /* * do not allow overlay_base to duplicate any node already in diff --git a/include/linux/of.h b/include/linux/of.h index 96edda95c6b0..ef4c9ff5955a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1279,9 +1279,6 @@ static inline int of_reconfig_get_state_change(unsigned long action, } #endif /* CONFIG_OF_DYNAMIC */ -/* CONFIG_OF_RESOLVE api */ -extern int of_resolve_phandles(struct device_node *tree); - /** * of_device_is_system_power_controller - Tells if system-power-controller is found for device_node * @np: Pointer to the given device_node -- cgit v1.2.3 From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:28 +0200 Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP The 'cpumap' is primarily used as a backend map for XDP BPF helper call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. This patch implement the main part of the map. It is not connected to the XDP redirect system yet, and no SKB allocation are done yet. The main concern in this patch is to ensure the datapath can run without any locking. This adds complexity to the setup and tear-down procedure, which assumptions are extra carefully documented in the code comments. V2: - make sure array isn't larger than NR_CPUS - make sure CPUs added is a valid possible CPU V3: fix nitpicks from Jakub Kicinski V5: - Restrict map allocation to root / CAP_SYS_ADMIN - WARN_ON_ONCE if queue is not empty on tear-down - Return -EPERM on memlock limit instead of -ENOMEM - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup() - Moved cpu_map_enqueue() to next patch V6: all notice by Daniel Borkmann - Fix err return code in cpu_map_alloc() introduced in V5 - Move cpu_possible() check after max_entries boundary check - Forbid usage initially in check_map_func_compatibility() V7: - Fix alloc error path spotted by Daniel Borkmann - Did stress test adding+removing CPUs from the map concurrently - Fixed refcnt issue on cpu_map_entry, kthread started too soon - Make sure packets are flushed during tear-down, involved use of rcu_barrier() and kthread_run only exit after queue is empty - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring V8: - Nitpicking comments and gramma by Edward Cree - Fix missing semi-colon introduced in V7 due to rebasing - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S. 
V2:
 - make sure the array isn't larger than NR_CPUS
 - make sure each CPU added is a valid possible CPU

V3: fix nitpicks from Jakub Kicinski

V5:
 - Restrict map allocation to root / CAP_SYS_ADMIN
 - WARN_ON_ONCE if queue is not empty on tear-down
 - Return -EPERM on memlock limit instead of -ENOMEM
 - Error code in __cpu_map_entry_alloc() also handles ptr_ring_cleanup()
 - Moved cpu_map_enqueue() to next patch

V6: all noted by Daniel Borkmann
 - Fix err return code in cpu_map_alloc() introduced in V5
 - Move cpu_possible() check after max_entries boundary check
 - Forbid usage initially in check_map_func_compatibility()

V7:
 - Fix alloc error path spotted by Daniel Borkmann
 - Did stress test adding+removing CPUs from the map concurrently
 - Fixed refcnt issue on cpu_map_entry, kthread started too soon
 - Make sure packets are flushed during tear-down, which involved use
   of rcu_barrier(), and the kthread now only exits after the queue is
   empty
 - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring

V8:
 - Nitpicking comments and grammar by Edward Cree
 - Fix missing semi-colon introduced in V7 due to rebasing
 - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch

Signed-off-by: Jesper Dangaard Brouer
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 include/linux/bpf_types.h      |   1 +
 include/uapi/linux/bpf.h       |   1 +
 kernel/bpf/Makefile            |   1 +
 kernel/bpf/cpumap.c            | 560 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |   8 +-
 kernel/bpf/verifier.c          |   5 +
 tools/include/uapi/linux/bpf.h |   1 +
 7 files changed, 576 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/cpumap.c

(limited to 'include/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 6f1a567667b8..814c1081a4a9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #ifdef CONFIG_STREAM_PARSER
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6db9e1d679cd..4303fb6c3817 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };

 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 53fb09f92e3f..e597daae6120 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..e1e25ddba038
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,560 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU. The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism that allows
+ * separating the early driver network XDP layer from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage. This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+
+/* General idea: XDP packets getting XDP redirected to another CPU
+ * will at most be stored/queued for one driver ->poll() call. It is
+ * guaranteed that setting the flush bit and the flush operation happen
+ * on the same CPU. Thus, the cpu_map_flush operation can deduce via
+ * this_cpu_ptr() which queue in bpf_cpu_map_entry contains packets.
+ */
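+/* Illustrative pairing, as seen from a driver's NAPI ->poll()
+ * (a sketch, not part of the original patch): each XDP_REDIRECT'ed
+ * frame lands in the local percpu bulkq via bq_enqueue(), and one
+ * flush at the end of the poll drains the bulkqs towards the remote
+ * kthreads:
+ *
+ *	bq_enqueue(rcpu, xdp_pkt);	// per redirected frame
+ *	...
+ *	__cpu_map_flush(map);		// once, when ->poll() completes
+ */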
+
+#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+	void *q[CPU_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+	u32 qsize;	/* Queue size placeholder for map lookup */
+
+	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+	struct xdp_bulk_queue __percpu *bulkq;
+
+	/* Queue with potential multi-producers, and single-consumer kthread */
+	struct ptr_ring *queue;
+	struct task_struct *kthread;
+	struct work_struct kthread_stop_wq;
+
+	atomic_t refcnt; /* Control when this struct can be freed */
+	struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+	struct bpf_map map;
+	/* Below members specific for map type */
+	struct bpf_cpu_map_entry **cpu_map;
+	unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_cpu_map *cmap;
+	int err = -ENOMEM;
+	u64 cost;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+		return ERR_PTR(-EINVAL);
+
+	cmap = kzalloc(sizeof(*cmap), GFP_USER);
+	if (!cmap)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	cmap->map.map_type = attr->map_type;
+	cmap->map.key_size = attr->key_size;
+	cmap->map.value_size = attr->value_size;
+	cmap->map.max_entries = attr->max_entries;
+	cmap->map.map_flags = attr->map_flags;
+	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+	/* Pre-limit array size based on NR_CPUS, not final CPU check */
+	if (cmap->map.max_entries > NR_CPUS) {
+		err = -E2BIG;
+		goto free_cmap;
+	}
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_cmap;
+	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice: returns -EPERM if map size is larger than memlock limit */
+	ret = bpf_map_precharge_memlock(cmap->map.pages);
+	if (ret) {
+		err = ret;
+		goto free_cmap;
+	}
+
+	/* A per cpu bitfield with a bit per possible CPU in map */
+	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+					    __alignof__(unsigned long));
+	if (!cmap->flush_needed)
+		goto free_cmap;
+
+	/* Alloc array for possible remote "destination" CPUs */
+	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+					   sizeof(struct bpf_cpu_map_entry *),
+					   cmap->map.numa_node);
+	if (!cmap->cpu_map)
+		goto free_percpu;
+
+	return &cmap->map;
+free_percpu:
+	free_percpu(cmap->flush_needed);
+free_cmap:
+	kfree(cmap);
+	return ERR_PTR(err);
+}
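+/* Worked example of the cost accounting above (illustrative numbers):
+ * with max_entries = 64 on a 64-bit machine with 4 possible CPUs,
+ *   cost  = 64 * sizeof(struct bpf_cpu_map_entry *)  = 512 bytes
+ *   cost += BITS_TO_LONGS(64) * sizeof(long) * 4     =  32 bytes
+ * which rounds up to a single page being charged against the memlock
+ * rlimit by bpf_map_precharge_memlock().
+ */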
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+	/* The tear-down procedure should have made sure that the queue is
+	 * empty. See __cpu_map_entry_replace() and the work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	if (WARN_ON_ONCE(ptr))
+		page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to work around syscall paths using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+	struct bpf_cpu_map_entry *rcpu;
+
+	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+	 * as it waits until all in-flight call_rcu() callbacks complete.
+	 */
+	rcu_barrier();
+
+	/* kthread_stop will wake_up_process and wait for it to complete */
+	kthread_stop(rcpu->kthread);
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+	struct bpf_cpu_map_entry *rcpu = data;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/* When the kthread gets the stop order, rcpu has already been
+	 * disconnected from the map, thus no new packets can enter.
+	 * Remaining in-flight per-CPU stored packets are flushed to this
+	 * queue. Wait, honoring the kthread_stop signal, until the
+	 * queue is empty.
+	 */
+	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		struct xdp_pkt *xdp_pkt;
+
+		schedule();
+		/* Do work */
+		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
+			/* For now just "refcnt-free" */
+			page_frag_free(xdp_pkt);
+		}
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	put_cpu_map_entry(rcpu);
+	return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+	struct bpf_cpu_map_entry *rcpu;
+	int numa, err;
+
+	/* Have map->numa_node, but choose node of redirect target CPU */
+	numa = cpu_to_node(cpu);
+
+	rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+	if (!rcpu)
+		return NULL;
+
+	/* Alloc percpu bulkq */
+	rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+					 sizeof(void *), gfp);
+	if (!rcpu->bulkq)
+		goto free_rcu;
+
+	/* Alloc queue */
+	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+	if (!rcpu->queue)
+		goto free_bulkq;
+
+	err = ptr_ring_init(rcpu->queue, qsize, gfp);
+	if (err)
+		goto free_queue;
+
+	rcpu->qsize = qsize;
+
+	/* Setup kthread */
+	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+					       "cpumap/%d/map:%d", cpu, map_id);
+	if (IS_ERR(rcpu->kthread))
+		goto free_ptr_ring;
+
+	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+	/* Make sure kthread runs on a single CPU */
+	kthread_bind(rcpu->kthread, cpu);
+	wake_up_process(rcpu->kthread);
+
+	return rcpu;
+
+free_ptr_ring:
+	ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+	kfree(rcpu->queue);
+free_bulkq:
+	free_percpu(rcpu->bulkq);
+free_rcu:
+	kfree(rcpu);
+	return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+	struct bpf_cpu_map_entry *rcpu;
+	int cpu;
+
+	/* This cpu_map_entry has been disconnected from the map and one
+	 * RCU grace period has elapsed. Thus, XDP cannot queue any
+	 * new packets and cannot change/set flush_needed that can
+	 * find this entry.
+	 */
+	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+	/* Flush remaining packets in percpu bulkq */
+	for_each_online_cpu(cpu) {
+		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+		/* No concurrent bq_enqueue can run at this point */
+		bq_flush_to_queue(rcpu, bq);
+	}
+	free_percpu(rcpu->bulkq);
+	/* Cannot kthread_stop() here, the last put frees rcpu resources */
+	put_cpu_map_entry(rcpu);
+}
+
+/* After xchg of the pointer to bpf_cpu_map_entry, use call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet, because the driver-side
+ * rcu_read_lock/unlock only protects the running XDP program. The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq). A refcnt
+ * makes sure the last user (kthread_stop vs. call_rcu) frees the
+ * memory resources.
+ *
+ * The rcu callback __cpu_map_entry_free flushes remaining packets from
+ * the percpu bulkq to the queue. Because the caller map_delete_elem()
+ * has preemption disabled, we cannot call kthread_stop() here to make
+ * sure the queue is empty. Instead a work queue is started to stop the
+ * kthread, cpu_map_kthread_stop, which waits for an RCU grace period
+ * before stopping the kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+	struct bpf_cpu_map_entry *old_rcpu;
+
+	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+	if (old_rcpu) {
+		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+		schedule_work(&old_rcpu->kthread_stop_wq);
+	}
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 key_cpu = *(u32 *)key;
+
+	if (key_cpu >= map->max_entries)
+		return -EINVAL;
+
+	/* notice caller map_delete_elem() uses preempt_disable() */
+	__cpu_map_entry_replace(cmap, key_cpu, NULL);
+	return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+			u64 map_flags)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	/* Array index key corresponds to CPU number */
+	u32 key_cpu = *(u32 *)key;
+	/* Value is the queue size */
+	u32 qsize = *(u32 *)value;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(key_cpu >= cmap->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+	if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+		return -EOVERFLOW;
+
+	/* Make sure CPU is a valid possible cpu */
+	if (!cpu_possible(key_cpu))
+		return -ENODEV;
+
+	if (qsize == 0) {
+		rcpu = NULL; /* Same as deleting */
+	} else {
+		/* Updating qsize causes re-allocation of bpf_cpu_map_entry */
+		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+		if (!rcpu)
+			return -ENOMEM;
+	}
+	rcu_read_lock();
+	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
+	rcu_read_unlock();
+	return 0;
+}
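+/* Teardown flow for one entry (illustrative sketch, derived from the
+ * functions above):
+ *
+ *   __cpu_map_entry_replace(cmap, cpu, NULL)
+ *     -> call_rcu(__cpu_map_entry_free)
+ *          // after the grace period: bq_flush_to_queue() per CPU,
+ *          // then put_cpu_map_entry()
+ *     -> schedule_work(cpu_map_kthread_stop)
+ *          // rcu_barrier(), then kthread_stop(); the kthread drains
+ *          // rcpu->queue and the last put frees rcpu
+ */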
+
+void cpu_map_free(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	int cpu;
+	u32 i;
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the bpf programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+	 * It does __not__ ensure pending flush operations (if any) are
+	 * complete.
+	 */
+	synchronize_rcu();
+
+	/* To ensure all pending flush operations have completed, wait for the
+	 * flush bitmap to indicate all flush_needed bits are zero on _all_
+	 * cpus. Because the above synchronize_rcu() ensures the map is
+	 * disconnected from the program, we can assume no new bits will be
+	 * set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, cmap->map.max_entries))
+			cond_resched();
+	}
+
+	/* For cpu_map the remote CPUs can still be using the entries
+	 * (struct bpf_cpu_map_entry).
+	 */
+	for (i = 0; i < cmap->map.max_entries; i++) {
+		struct bpf_cpu_map_entry *rcpu;
+
+		rcpu = READ_ONCE(cmap->cpu_map[i]);
+		if (!rcpu)
+			continue;
+
+		/* bq flush and cleanup happens after RCU grace period */
+		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+	}
+	free_percpu(cmap->flush_needed);
+	bpf_map_area_free(cmap->cpu_map);
+	kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	rcpu = READ_ONCE(cmap->cpu_map[key]);
+	return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map_entry *rcpu =
+		__cpu_map_lookup_elem(map, *(u32 *)key);
+
+	return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= cmap->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == cmap->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+	.map_alloc	= cpu_map_alloc,
+	.map_free	= cpu_map_free,
+	.map_delete_elem = cpu_map_delete_elem,
+	.map_update_elem = cpu_map_update_elem,
+	.map_lookup_elem = cpu_map_lookup_elem,
+	.map_get_next_key = cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq)
+{
+	struct ptr_ring *q;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	q = rcpu->queue;
+	spin_lock(&q->producer_lock);
+
+	for (i = 0; i < bq->count; i++) {
+		void *xdp_pkt = bq->q[i];
+		int err;
+
+		err = __ptr_ring_produce(q, xdp_pkt);
+		if (err) {
+			/* Free xdp_pkt */
+			page_frag_free(xdp_pkt);
+		}
+	}
+	bq->count = 0;
+	spin_unlock(&q->producer_lock);
+
+	return 0;
+}
+
+/* Notice: Will change in later patch */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+};
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+		bq_flush_to_queue(rcpu, bq);
+
+	/* Notice, the xdp_buff/page MUST be queued here, long enough for
+	 * the driver code invoking us to finish, due to driver
+	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
+	 *
+	 * Thus, the incoming xdp_pkt is always queued here (else we race
+	 * with another CPU on page-refcnt and remaining driver code).
+	 * Queue time is very short, as the driver will invoke the flush
+	 * operation when completing its napi->poll call.
+	 */
+	bq->q[bq->count++] = xdp_pkt;
+	return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+	__set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+	u32 bit;
+
+	/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+	 * and __cpu_map_flush() happen on the same CPU. Thus, the percpu
+	 * bitmap indicates which percpu bulkqs have packets.
+	 */
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+		struct xdp_bulk_queue *bq;
+
+		/* This is possible if the entry is removed by user space
+		 * between xdp redirect and flush op.
+		 */
+		if (unlikely(!rcpu))
+			continue;
+
+		__clear_bit(bit, bitmap);
+
+		/* Flush all frames in bulkq to real queue */
+		bq = this_cpu_ptr(rcpu->bulkq);
+		bq_flush_to_queue(rcpu, bq);
+
+		/* If already running, costs spin_lock_irqsave + smp_mb */
+		wake_up_process(rcpu->kthread);
+	}
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d124e702e040..54fba06942f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;

+	/* Need to create a kthread, thus must support schedule */
+	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		err = map->ops->map_update_elem(map, key, value, attr->flags);
+		goto out;
+	}
+
 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
 	 * inside bpf map update or delete otherwise deadlocks are possible
 	 */
@@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr)
 	}
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9755279d94cb..cefa64be9a2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	/* Restrict bpf side of cpumap, open when use-cases appear */
+	case BPF_MAP_TYPE_CPUMAP:
+		if (func_id != BPF_FUNC_redirect_map)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fb4fb81ce5b0..fa93033dc521 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };

 enum bpf_prog_type {
-- cgit v1.2.3

From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Mon, 16 Oct 2017 12:19:34 +0200
Subject: bpf: XDP_REDIRECT enable use of cpumap

This patch connects cpumap to the xdp_do_redirect_map infrastructure.

Still no SKB allocations are done yet. The XDP frames are transferred
to the other CPU, but they are simply refcnt decremented on the remote
CPU. This served as a good benchmark for measuring the overhead of
remote refcnt decrement. If the driver page recycle cache is not
efficient, this exposes a bottleneck in the page allocator.

A shout-out to MST's ptr_ring, which is the secret behind being able
to transfer memory pointers between CPUs so efficiently, without
constantly bouncing cache-lines between CPUs.
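As a rough illustration (a sketch, not part of this patch), an XDP
program can then pick a destination CPU through the usual
bpf_redirect_map() helper; the map definition below follows the
samples/bpf conventions:

    /* Illustrative only: redirect every frame to the cpumap entry
     * for CPU 2, assuming user space has populated that slot.
     */
    struct bpf_map_def SEC("maps") cpu_map = {
        .type        = BPF_MAP_TYPE_CPUMAP,
        .key_size    = sizeof(__u32),
        .value_size  = sizeof(__u32),    /* qsize */
        .max_entries = 64,
    };

    SEC("xdp")
    int xdp_redirect_cpu_sketch(struct xdp_md *ctx)
    {
        __u32 dest_cpu = 2;

        return bpf_redirect_map(&cpu_map, dest_cpu, 0);
    }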
V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot.

V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet,
 as the implementation requires a separate upstream discussion.

V5:
 - Fix a maybe-uninitialized warning pointed out by kbuild test robot.
 - Restrict bpf-prog side access to cpumap, open when use-cases appear
 - Implement cpu_map_enqueue() as a simpler void pointer enqueue

V6:
 - Allow cpumap type for usage in helper bpf_redirect_map, general
   bpf-prog side restriction moved to earlier patch.

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
---
 include/linux/bpf.h        |  31 +++++++++-
 include/trace/events/xdp.h |  10 +++-
 kernel/bpf/cpumap.c        |  22 ++++++-
 kernel/bpf/verifier.c      |   3 +-
 net/core/filter.c          | 140 +++++++++++++++++++++++++++++++++++----------
 5 files changed, 172 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4373125de1f3..6d4dd844828a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -355,6 +355,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);

+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
+void __cpu_map_flush(struct bpf_map *map);
+struct xdp_buff;
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
+
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 {
@@ -362,7 +369,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 		attr->numa_node : NUMA_NO_NODE;
 }

-#else
+#else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -425,6 +432,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
+
+static inline
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	return NULL;
+}
+
+static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index)
+{
+}
+
+static inline void __cpu_map_flush(struct bpf_map *map)
+{
+}
+
+struct xdp_buff;
+static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_buff *xdp,
+				  struct net_device *dev_rx)
+{
+	return 0;
+}
 #endif /* CONFIG_BPF_SYSCALL */

 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4e16c43fba10..eb2ece96c1a2 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -136,12 +136,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 		  __entry->map_id, __entry->map_index)
 );

+#define devmap_ifindex(fwd, map)				\
+	(!fwd ? 0 :						\
+	 (!map ? 0 :						\
+	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	   ((struct net_device *)fwd)->ifindex : 0)))
+
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)	\
-	 trace_xdp_redirect_map(dev, xdp, fwd ?
fwd->ifindex : 0, \ + trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ 0, map, idx) #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0, \ + trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) #endif /* _TRACE_XDP_H */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e1e25ddba038..768da6a2c265 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -500,7 +500,7 @@ struct xdp_pkt { /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -520,6 +520,26 @@ int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) return 0; } +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct xdp_pkt *xdp_pkt; + int headroom; + + /* For now this is just used as a void pointer to data_hard_start. + * Followup patch will generalize this. + */ + xdp_pkt = xdp->data_hard_start; + + /* Fake writing into xdp_pkt->data to measure overhead */ + headroom = xdp->data - xdp->data_hard_start; + if (headroom < sizeof(*xdp_pkt)) + xdp_pkt->data = xdp->data; + + bq_enqueue(rcpu, xdp_pkt); + return 0; +} + void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cefa64be9a2f..e4d5136725a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1486,7 +1486,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_redirect_map: - if (map->map_type != BPF_MAP_TYPE_DEVMAP) + if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_CPUMAP) goto error; break; case BPF_FUNC_sk_redirect_map: diff --git a/net/core/filter.c b/net/core/filter.c index 140fa9f9c0f4..4d88e0665c41 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2526,10 +2526,36 @@ static int __bpf_tx_xdp(struct net_device *dev, err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); if (err) return err; - if (map) + dev->netdev_ops->ndo_xdp_flush(dev); + return 0; +} + +static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, + struct bpf_map *map, + struct xdp_buff *xdp, + u32 index) +{ + int err; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + struct net_device *dev = fwd; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + if (err) + return err; __dev_map_insert_ctx(map, index); - else - dev->netdev_ops->ndo_xdp_flush(dev); + + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + struct bpf_cpu_map_entry *rcpu = fwd; + + err = cpu_map_enqueue(rcpu, xdp, dev_rx); + if (err) + return err; + __cpu_map_insert_ctx(map, index); + } return 0; } @@ -2539,11 +2565,33 @@ void xdp_do_flush_map(void) struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; - if (map) - __dev_map_flush(map); + if (map) { + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + __dev_map_flush(map); + break; + case BPF_MAP_TYPE_CPUMAP: + __cpu_map_flush(map); + break; + default: + break; + } + } } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) +{ + switch (map->map_type) { + 
case BPF_MAP_TYPE_DEVMAP: + return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_CPUMAP: + return __cpu_map_lookup_elem(map, index); + default: + return NULL; + } +} + static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, unsigned long aux) { @@ -2556,8 +2604,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err; ri->ifindex = 0; @@ -2570,7 +2618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, goto err; } - fwd = __dev_map_lookup_elem(map, index); + fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { err = -EINVAL; goto err; @@ -2578,7 +2626,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (ri->map_to_flush && ri->map_to_flush != map) xdp_do_flush_map(); - err = __bpf_tx_xdp(fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); if (unlikely(err)) goto err; @@ -2620,54 +2668,88 @@ err: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) +{ + unsigned int len; + + if (unlikely(!(fwd->flags & IFF_UP))) + return -ENETDOWN; + + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) + return -EMSGSIZE; + + return 0; +} + +int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; - unsigned int len; int err = 0; ri->ifindex = 0; ri->map = NULL; ri->map_owner = 0; - if (map) { - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } - fwd = __dev_map_lookup_elem(map, index); - } else { - fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { + err = -EFAULT; + map = NULL; + goto err; } + fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } - if (unlikely(!(fwd->flags & IFF_UP))) { - err = -ENETDOWN; + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; + } else { + /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ + err = -EBADRQC; goto err; } - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) { - err = -EMSGSIZE; + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + return err; +} + +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + u32 index = ri->ifindex; + struct net_device *fwd; + int err = 0; + + if (ri->map) + return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + + ri->ifindex = 0; + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; goto err; } + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; - map ? 
_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index)
-	      : _trace_xdp_redirect(dev, xdp_prog, index);
+	_trace_xdp_redirect(dev, xdp_prog, index);
 	return 0;
 err:
-	map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err)
-	      : _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
 	return err;
 }
 EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
-- cgit v1.2.3

From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Mon, 16 Oct 2017 12:19:39 +0200
Subject: bpf: cpumap xdp_buff to skb conversion and allocation

This patch makes cpumap functional, by adding SKB allocation and
invoking the network stack on the dequeuing CPU.

For constructing the SKB on the remote CPU, the xdp_buff is converted
into a struct xdp_pkt, which is mapped into the top headroom of the
packet to avoid allocating separate memory. For now, struct xdp_pkt is
just a cpumap internal data structure, with info carried from enqueue
to dequeue.

If a driver doesn't have enough headroom, the packet is simply dropped,
with return code -EOVERFLOW. This will be picked up by the xdp
tracepoint infrastructure, to allow users to catch this.

V2: take into account xdp->data_meta

V4:
 - Drop busypoll tricks, keeping it more simple.
 - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei

V5: correct RCU read protection around __netif_receive_skb_core.

V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h |   1 +
 kernel/bpf/cpumap.c       | 152 +++++++++++++++++++++++++++++++++++++++-------
 net/core/dev.c            |  27 ++++++++
 3 files changed, 158 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31bb3010c69b..bf014afcb914 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
+int netif_receive_skb_core(struct sk_buff *skb);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 768da6a2c265..ee7adf4352dd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -25,6 +25,9 @@
 #include <linux/kthread.h>
 #include <linux/capability.h>

+#include <linux/netdevice.h>   /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
 /* General idea: XDP packets getting XDP redirected to another CPU
  * will at most be stored/queued for one driver ->poll() call. It is
  * guaranteed that setting the flush bit and the flush operation happen
@@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }

+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried from enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate memory.
+ */
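+/* Resulting frame layout (illustrative sketch, not from the original
+ * patch):
+ *
+ *   data_hard_start:  struct xdp_pkt  (top of the old headroom)
+ *   ...               xdp_pkt->headroom bytes of remaining headroom
+ *                     (XDP metadata, if any, sits just before data)
+ *   xdp_pkt->data:    packet data, xdp_pkt->len bytes
+ *   ...               skb_shared_info, placed here by build_skb()
+ *                     on the remote CPU
+ */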
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+	struct xdp_pkt *xdp_pkt;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if ((headroom - metasize) < sizeof(*xdp_pkt))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_pkt = xdp->data_hard_start;
+
+	xdp_pkt->data = xdp->data;
+	xdp_pkt->len = xdp->data_end - xdp->data;
+	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+	xdp_pkt->metasize = metasize;
+
+	return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_pkt *xdp_pkt)
+{
+	unsigned int frame_size;
+	void *pkt_data_start;
+	struct sk_buff *skb;
+
+	/* build_skb() needs to place skb_shared_info after the SKB end,
+	 * and also wants to know the memory "truesize". Thus, we need
+	 * to know the memory frame size backing the xdp_buff.
+	 *
+	 * XDP was designed to have PAGE_SIZE frames, but this
+	 * assumption is no longer true with ixgbe and i40e. It
+	 * would be preferred to set frame_size to 2048 or 4096
+	 * depending on the driver.
+	 *   frame_size = 2048;
+	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *
+	 * Instead, with the info available, the skb_shared_info is
+	 * placed after the packet len. This unfortunately fakes the
+	 * truesize. Another disadvantage of this approach is that,
+	 * with mixed-length packets, the skb_shared_info is not at a
+	 * fixed memory location, which is bad for cache-line hotness.
+	 */
+	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	skb = build_skb(pkt_data_start, frame_size);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, xdp_pkt->headroom);
+	__skb_put(skb, xdp_pkt->len);
+	if (xdp_pkt->metasize)
+		skb_metadata_set(skb, xdp_pkt->metasize);
+
+	/* Essential SKB info: protocol and skb->dev */
+	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+	/* Optional SKB info, currently missing:
+	 * - HW checksum info		(skb->ip_summed)
+	 * - HW RX hash			(skb_set_hash)
+	 * - RX ring dev queue index	(skb_record_rx_queue)
+	 */
+
+	return skb;
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		unsigned int processed = 0, drops = 0;
 		struct xdp_pkt *xdp_pkt;

-		schedule();
-		/* Do work */
-		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
-			/* For now just "refcnt-free" */
-			page_frag_free(xdp_pkt);
+		/* Release CPU reschedule checks */
+		if (__ptr_ring_empty(rcpu->queue)) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+		} else {
+			cond_resched();
+		}
+		__set_current_state(TASK_RUNNING);
+
+		/* Process packets in rcpu->queue */
+		local_bh_disable();
+		/*
+		 * The bpf_cpu_map_entry is single-consumer, with this
+		 * kthread pinned to a CPU. Lockless access to the
+		 * ptr_ring consume side is valid, as no resize of the
+		 * queue is allowed.
+ */ + while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + struct sk_buff *skb; + int ret; + + skb = cpu_map_build_skb(rcpu, xdp_pkt); + if (!skb) { + page_frag_free(xdp_pkt); + continue; + } + + /* Inject into network stack */ + ret = netif_receive_skb_core(skb); + if (ret == NET_RX_DROP) + drops++; + + /* Limit BH-disable period */ + if (++processed == 8) + break; } - __set_current_state(TASK_INTERRUPTIBLE); + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, return 0; } -/* Notice: Will change in later patch */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; -}; - /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ @@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { struct xdp_pkt *xdp_pkt; - int headroom; - /* For now this is just used as a void pointer to data_hard_start. - * Followup patch will generalize this. - */ - xdp_pkt = xdp->data_hard_start; + xdp_pkt = convert_to_xdp_pkt(xdp); + if (!xdp_pkt) + return -EOVERFLOW; - /* Fake writing into xdp_pkt->data to measure overhead */ - headroom = xdp->data - xdp->data_hard_start; - if (headroom < sizeof(*xdp_pkt)) - xdp_pkt->data = xdp->data; + /* Info needed when constructing SKB on remote CPU */ + xdp_pkt->dev_rx = dev_rx; bq_enqueue(rcpu, xdp_pkt); return 0; diff --git a/net/core/dev.c b/net/core/dev.c index d2b20e73080e..cf5894f0e6eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4492,6 +4492,33 @@ out: return ret; } +/** + * netif_receive_skb_core - special purpose version of netif_receive_skb + * @skb: buffer to process + * + * More direct receive version of netif_receive_skb(). It should + * only be used by callers that have a need to skip RPS and Generic XDP. + * Caller must also take care of handling if (page_is_)pfmemalloc. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb_core(struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __netif_receive_skb_core(skb, false); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb_core); + static int __netif_receive_skb(struct sk_buff *skb) { int ret; -- cgit v1.2.3 From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:53 -0700 Subject: bpf: split verifier and program ops struct bpf_verifier_ops contains both verifier ops and operations used later during program's lifetime (test_run). Split the runtime ops into a different structure. BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops to the names. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 15 ++++++++++----- include/linux/bpf_types.h | 28 ++++++++++++++-------------- kernel/bpf/syscall.c | 16 +++++++++++++--- kernel/bpf/verifier.c | 12 ++++++------ kernel/trace/bpf_trace.c | 15 ++++++++++++--- net/core/filter.c | 45 ++++++++++++++++++++++++++++++++++++--------- 6 files changed, 91 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6d4dd844828a..e1fba5504ca5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -157,6 +157,11 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +struct bpf_prog_ops { + int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +}; + struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); @@ -172,8 +177,6 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); - int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); }; struct bpf_prog_aux { @@ -184,7 +187,8 @@ struct bpf_prog_aux { u32 id; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; - const struct bpf_verifier_ops *ops; + const struct bpf_prog_ops *ops; + const struct bpf_verifier_ops *vops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; @@ -279,8 +283,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); -#define BPF_PROG_TYPE(_id, _ops) \ - extern const struct bpf_verifier_ops _ops; +#define BPF_PROG_TYPE(_id, _name) \ + extern const struct bpf_prog_ops _name ## _prog_ops; \ + extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #include diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 814c1081a4a9..36418ad43245 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -1,22 +1,22 @@ /* internal file - do not include directly */ #ifdef CONFIG_NET -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) #endif #ifdef CONFIG_BPF_EVENTS -BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, 
tracepoint_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 54fba06942f5..444902b5a30d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -739,9 +739,18 @@ err_put: return err; } -static const struct bpf_verifier_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _ops) \ - [_id] = &_ops, +static const struct bpf_prog_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _prog_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) #include #undef BPF_PROG_TYPE @@ -754,6 +763,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; + prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e4d5136725a2..38e24d69fc95 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -856,8 +856,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + } else if (env->prog->aux->vops->is_valid_access && + env->prog->aux->vops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1565,8 +1565,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->ops->get_func_proto) - fn = env->prog->aux->ops->get_func_proto(func_id); + if (env->prog->aux->vops->get_func_proto) + fn = env->prog->aux->vops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4035,7 +4035,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const struct bpf_verifier_ops *ops = env->prog->aux->vops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4236,7 +4236,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->ops->get_func_proto(insn->imm); + fn = prog->aux->vops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 04ea5314f2bc..3126da2f468a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -561,11 +561,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, 
.is_valid_access = kprobe_prog_is_valid_access, }; +const struct bpf_prog_ops kprobe_prog_ops = { +}; + BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -667,11 +670,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -727,8 +733,11 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; + +const struct bpf_prog_ops perf_event_prog_ops = { +}; diff --git a/net/core/filter.c b/net/core/filter.c index 4d88e0665c41..1dd3034f846f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4395,68 +4395,95 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops sk_filter_prog_ops = { +const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_verifier_ops tc_cls_act_prog_ops = { +const struct bpf_prog_ops sk_filter_prog_ops = { +}; + +const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops xdp_prog_ops = { +const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, +}; + +const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -const struct bpf_verifier_ops cg_skb_prog_ops = { +const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_prog_ops = { +const struct bpf_verifier_ops lwt_inout_verifier_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_inout_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_xmit_prog_ops = { +const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops cg_sock_prog_ops = { +const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = 
sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -const struct bpf_verifier_ops sock_ops_prog_ops = { +const struct bpf_prog_ops cg_sock_prog_ops = { +}; + +const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; -const struct bpf_verifier_ops sk_skb_prog_ops = { +const struct bpf_prog_ops sock_ops_prog_ops = { +}; + +const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; +const struct bpf_prog_ops sk_skb_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3 From 00176a34d9e27ab1e77db75fe13abc005cffe0ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:54 -0700 Subject: bpf: remove the verifier ops from program structure Since the verifier ops don't have to be associated with the program for its entire lifetime we can move it to verifier's struct bpf_verifier_env. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 - include/linux/bpf_verifier.h | 1 + kernel/bpf/syscall.c | 10 ---------- kernel/bpf/verifier.c | 23 +++++++++++++++++------ 4 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1fba5504ca5..cf91977e8719 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -188,7 +188,6 @@ struct bpf_prog_aux { struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; - const struct bpf_verifier_ops *vops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f00ef751c1c5..feeaea93d959 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -141,6 +141,7 @@ struct bpf_ext_analyzer_ops { */ struct bpf_verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ + const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 444902b5a30d..0e893cac6795 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -748,22 +748,12 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #undef BPF_MAP_TYPE }; -static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ - [_id] = & _name ## _verifier_ops, -#define BPF_MAP_TYPE(_id, _ops) -#include -#undef BPF_PROG_TYPE -#undef BPF_MAP_TYPE -}; - static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; - prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38e24d69fc95..3b6e2c550e96 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,15 @@ #include "disasm.h" +static const struct bpf_verifier_ops * const 
bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -856,8 +865,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->vops->is_valid_access && - env->prog->aux->vops->is_valid_access(off, size, t, &info)) { + } else if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1565,8 +1574,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->vops->get_func_proto) - fn = env->prog->aux->vops->get_func_proto(func_id); + if (env->ops->get_func_proto) + fn = env->ops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4035,7 +4044,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->vops; + const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4236,7 +4245,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->vops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ @@ -4294,6 +4303,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->insn_aux_data) goto err_free_env; env->prog = *prog; + env->ops = bpf_verifier_ops[env->prog->type]; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -4406,6 +4416,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; + env->ops = bpf_verifier_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; -- cgit v1.2.3 From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:55 -0700 Subject: bpf: move knowledge about post-translation offsets out of verifier Use the fact that verifier ops are now separate from program ops to define a separate set of callbacks for verification of already translated programs. Since we expect the analyzer ops to be defined only for a small subset of all program types initialize their array by hand (don't use linux/bpf_types.h). Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/verifier.c | 55 +++++++++++++++------------------------------------ net/core/filter.c | 40 +++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf91977e8719..d67ccdc0099f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -291,6 +291,9 @@ DECLARE_PER_CPU(int, bpf_prog_active); #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; +extern const struct bpf_verifier_ops xdp_analyzer_ops; + struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b6e2c550e96..545b8c45a578 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -822,36 +822,6 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, - struct bpf_insn_access_aux *info) -{ - switch (env->prog->type) { - case BPF_PROG_TYPE_XDP: - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - case BPF_PROG_TYPE_SCHED_CLS: - switch (off) { - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - default: - return false; - } -} - /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -860,13 +830,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - if (env->analyzer_ops) { - if (analyzer_is_valid_access(env, off, &info)) { - *reg_type = info.reg_type; - return 0; - } - } else if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -874,9 +839,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. 
*/ - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; + if (env->analyzer_ops) + return 0; + + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -4400,12 +4368,21 @@ err_free_env: return ret; } +static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { + [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, + [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, +}; + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv) { struct bpf_verifier_env *env; int ret; + if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || + !bpf_analyzer_ops[prog->type]) + return -EOPNOTSUPP; + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -4416,7 +4393,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; - env->ops = bpf_verifier_ops[env->prog->type]; + env->ops = bpf_analyzer_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; diff --git a/net/core/filter.c b/net/core/filter.c index 1dd3034f846f..7373a08fbef7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3732,6 +3732,23 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool +tc_cls_act_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3766,6 +3783,21 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } +static bool xdp_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -4411,6 +4443,10 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .gen_prologue = tc_cls_act_prologue, }; +const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { + .is_valid_access = tc_cls_act_is_valid_access_analyzer, +}; + const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -4421,6 +4457,10 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +const struct bpf_verifier_ops xdp_analyzer_ops = { + .is_valid_access = xdp_is_valid_access_analyzer, +}; + const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -- cgit v1.2.3 From 448956d619a35538e71e8a2d8f1e4bc274037767 Mon Sep 17 00:00:00 2001 From: Frank Binns Date: Wed, 18 Oct 2017 14:30:23 +0100 Subject: dma-fence: remove duplicate word in comment Signed-off-by: Frank Binns Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter Link: 
https://patchwork.freedesktop.org/patch/msgid/1508333423-5394-1-git-send-email-frank.binns@imgtec.com --- include/linux/dma-fence.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index ca974224d92e..efdabbb64e3c 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -128,7 +128,7 @@ struct dma_fence_cb { * implementation know that there is another driver waiting on * the signal (ie. hw->sw case). * - * This function can be called called from atomic context, but not + * This function can be called from atomic context, but not * from irq context, so normal spinlocks can be used. * * A return value of false indicates the fence already passed, -- cgit v1.2.3 From 149e10f8ff71439014dff97987e90e87e2684a16 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:06 +0300 Subject: block: introduce blk_mq_tagset_iter Iterator helper to apply a function on all the tags in a given tagset. export it as it will be used outside the block layer later on. Reviewed-by: Bart Van Assche Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 16 +++++++++++----- include/linux/blk-mq.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6714507aa6c7..bce1c76fc768 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (fn)(void *, struct request *)) { int i, j, ret = 0; - if (WARN_ON_ONCE(!reinit_request)) + if (WARN_ON_ONCE(!fn)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, if (!tags->static_rqs[j]) continue; - ret = reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = fn(data, tags->static_rqs[j]); if (ret) goto out; } @@ -326,6 +325,13 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, out: return ret; } +EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); + +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)) +{ + return blk_mq_tagset_iter(set, set->driver_data, reinit_request); +} EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..6bc29f73a9aa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,6 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (reinit_request)(void *, struct request *)); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, int (reinit_request)(void *, struct request *)); -- cgit v1.2.3 From dab7487bdf63c6f2d70aac604494d023b189a9fd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:08 +0300 Subject: block: remove blk_mq_reinit_tagset No callers left. 
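For illustration, a hedged sketch of the conversion this removal assumes: a driver that used to call blk_mq_reinit_tagset() now calls blk_mq_tagset_iter() directly and passes its own context pointer instead of relying on set->driver_data (the foo_* names below are hypothetical, not taken from this series).

static int foo_reinit_request(void *data, struct request *rq)
{
        struct foo_ctrl *ctrl = data;   /* hypothetical driver context */

        /* refresh per-request driver state after a controller reset */
        foo_init_request_pdu(ctrl, blk_mq_rq_to_pdu(rq));
        return 0;
}

static void foo_reset(struct foo_ctrl *ctrl)
{
        /* before: blk_mq_reinit_tagset(&ctrl->tag_set, foo_reinit_request); */
        blk_mq_tagset_iter(&ctrl->tag_set, ctrl, foo_reinit_request);
}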
Reviewed-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 7 ------- include/linux/blk-mq.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bce1c76fc768..c81b40ecd3f1 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -327,13 +327,6 @@ out: } EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) -{ - return blk_mq_tagset_iter(set, set->driver_data, reinit_request); -} -EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); - void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6bc29f73a9aa..cfd64e500d82 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -261,8 +261,6 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, int (reinit_request)(void *, struct request *)); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3 From 734f0d241d2b4e47383bd0d16e21e06f6cb8d2c3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 9 Oct 2017 12:15:34 -0700 Subject: fscrypt: clean up include file mess Filesystems have to include different header files based on whether they are compiled with encryption support or not. That's nasty and messy. Instead, rationalise the headers so we have a single include fscrypt.h and let it decide what internal implementation to include based on the __FS_HAS_ENCRYPTION define. Filesystems set __FS_HAS_ENCRYPTION to 1 before including linux/fscrypt.h if they are built with encryption support. Otherwise, they must set __FS_HAS_ENCRYPTION to 0. Add guards to prevent fscrypt_supp.h and fscrypt_notsupp.h from being directly included by filesystems. 
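For illustration, the resulting include pattern for a hypothetical filesystem "foofs", mirroring the ext4/f2fs/ubifs hunks below:

#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_FOOFS_FS_ENCRYPTION)
#include <linux/fscrypt.h>

fscrypt.h then selects fscrypt_supp.h or fscrypt_notsupp.h internally, so the filesystem no longer needs an #ifdef at each include site.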
Signed-off-by: Dave Chinner [EB: use 1 and 0 rather than defined/undefined] Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 3 +- fs/ext4/ext4.h | 8 +-- fs/f2fs/f2fs.h | 8 +-- fs/ubifs/ubifs.h | 9 ++- include/linux/fscrypt.h | 156 ++++++++++++++++++++++++++++++++++++++++ include/linux/fscrypt_common.h | 141 ------------------------------------ include/linux/fscrypt_notsupp.h | 7 +- include/linux/fscrypt_supp.h | 7 +- 8 files changed, 177 insertions(+), 162 deletions(-) create mode 100644 include/linux/fscrypt.h delete mode 100644 include/linux/fscrypt_common.h (limited to 'include/linux') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index a1d5021c31ef..a180981ee6d7 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -11,7 +11,8 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H -#include +#define __FS_HAS_ENCRYPTION 1 +#include #include /* Encryption parameters */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e2abe01c8c6b..ca6d6166b85c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -33,17 +33,15 @@ #include #include #include -#ifdef CONFIG_EXT4_FS_ENCRYPTION -#include -#else -#include -#endif #include #include #ifdef __KERNEL__ #include #endif +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) +#include + /* * The fourth extended filesystem constants/structures */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a7c90386947..fc53aebaf3ae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -23,13 +23,11 @@ #include #include #include -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#include -#else -#include -#endif #include +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) +#include + #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index cd43651f1731..6a346d4af98f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -38,12 +38,11 @@ #include #include #include -#ifdef CONFIG_UBIFS_FS_ENCRYPTION -#include -#else -#include -#endif #include + +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION) +#include + #include "ubifs-media.h" /* Version of this UBIFS implementation */ diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h new file mode 100644 index 000000000000..bfc962e62078 --- /dev/null +++ b/include/linux/fscrypt.h @@ -0,0 +1,156 @@ +/* + * fscrypt.h: declarations for per-file encryption + * + * Filesystems that implement per-file encryption include this header + * file with the __FS_HAS_ENCRYPTION set according to whether that filesystem + * is being built with encryption support or not. + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ +#ifndef _LINUX_FSCRYPT_H +#define _LINUX_FSCRYPT_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_CRYPTO_BLOCK_SIZE 16 + +struct fscrypt_info; + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. 
+ */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * fscrypt superblock flags + */ +#define FS_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto opertions for filesystems + */ +struct fscrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int (*set_context)(struct inode *, const void *, size_t, void *); + bool (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +/* Maximum value for the third parameter of fscrypt_operations.set_context(). */ +#define FSCRYPT_SET_CONTEXT_MAX_SIZE 28 + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_enc_modes(u32 contents_mode, + u32 filenames_mode) +{ + if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && + filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + return true; + + if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + return true; + + return false; +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + +#if __FS_HAS_ENCRYPTION + +static inline struct page *fscrypt_control_page(struct page *page) +{ + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +} + +static inline bool fscrypt_has_encryption_key(const struct inode *inode) +{ + return (inode->i_crypt_info != NULL); +} + +#include + +#else /* !__FS_HAS_ENCRYPTION */ + +static inline struct page *fscrypt_control_page(struct page *page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} + +static inline bool fscrypt_has_encryption_key(const struct inode *inode) +{ + return 0; +} + +#include +#endif /* __FS_HAS_ENCRYPTION */ + + +#endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h deleted file mode 100644 index 97f738628b36..000000000000 --- a/include/linux/fscrypt_common.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * fscrypt_common.h: common declarations for per-file encryption - * - * Copyright (C) 2015, Google, Inc. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. 
- */ - -#ifndef _LINUX_FSCRYPT_COMMON_H -#define _LINUX_FSCRYPT_COMMON_H - -#include -#include -#include -#include -#include -#include -#include - -#define FS_CRYPTO_BLOCK_SIZE 16 - -struct fscrypt_info; - -struct fscrypt_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - u8 flags; /* Flags */ -}; - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct fscrypt_symlink_data { - __le16 len; - char encrypted_path[1]; -} __packed; - -struct fscrypt_str { - unsigned char *name; - u32 len; -}; - -struct fscrypt_name { - const struct qstr *usr_fname; - struct fscrypt_str disk_name; - u32 hash; - u32 minor_hash; - struct fscrypt_str crypto_buf; -}; - -#define FSTR_INIT(n, l) { .name = n, .len = l } -#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) -#define fname_name(p) ((p)->disk_name.name) -#define fname_len(p) ((p)->disk_name.len) - -/* - * fscrypt superblock flags - */ -#define FS_CFLG_OWN_PAGES (1U << 1) - -/* - * crypto opertions for filesystems - */ -struct fscrypt_operations { - unsigned int flags; - const char *key_prefix; - int (*get_context)(struct inode *, void *, size_t); - int (*set_context)(struct inode *, const void *, size_t, void *); - bool (*dummy_context)(struct inode *); - bool (*is_encrypted)(struct inode *); - bool (*empty_dir)(struct inode *); - unsigned (*max_namelen)(struct inode *); -}; - -/* Maximum value for the third parameter of fscrypt_operations.set_context(). */ -#define FSCRYPT_SET_CONTEXT_MAX_SIZE 28 - -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) -{ - if (inode->i_sb->s_cop->dummy_context && - inode->i_sb->s_cop->dummy_context(inode)) - return true; - return false; -} - -static inline bool fscrypt_valid_enc_modes(u32 contents_mode, - u32 filenames_mode) -{ - if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && - filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) - return true; - - if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && - filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) - return true; - - return false; -} - -static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) -{ - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') - return true; - - return false; -} - -static inline struct page *fscrypt_control_page(struct page *page) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - return ((struct fscrypt_ctx *)page_private(page))->w.control_page; -#else - WARN_ON_ONCE(1); - return ERR_PTR(-EINVAL); -#endif -} - -static inline int fscrypt_has_encryption_key(const struct inode *inode) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - return (inode->i_crypt_info != NULL); -#else - return 0; -#endif -} - -#endif /* _LINUX_FSCRYPT_COMMON_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index ec406aed2f2f..2d0b6960831e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -3,13 +3,16 @@ * * This stubs out the fscrypt functions for filesystems configured without * encryption support. + * + * Do not include this file directly. Use fscrypt.h instead! */ +#ifndef _LINUX_FSCRYPT_H +#error "Incorrect include of linux/fscrypt_notsupp.h!" 
+#endif #ifndef _LINUX_FSCRYPT_NOTSUPP_H #define _LINUX_FSCRYPT_NOTSUPP_H -#include - /* crypto.c */ static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 32e2fcf13b01..5a90e5ef4687 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -1,14 +1,15 @@ /* * fscrypt_supp.h * - * This is included by filesystems configured with encryption support. + * Do not include this file directly. Use fscrypt.h instead! */ +#ifndef _LINUX_FSCRYPT_H +#error "Incorrect include of linux/fscrypt_supp.h!" +#endif #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H -#include - /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); -- cgit v1.2.3 From 2ee6a576be56427209d370d8a511d49340c84139 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:35 -0700 Subject: fs, fscrypt: add an S_ENCRYPTED inode flag Introduce a flag S_ENCRYPTED which can be set in ->i_flags to indicate that the inode is encrypted using the fscrypt (fs/crypto/) mechanism. Checking this flag will give the same information that inode->i_sb->s_cop->is_encrypted(inode) currently does, but will be more efficient. This will be useful for adding higher-level helper functions for filesystems to use. For example we'll be able to replace this: if (ext4_encrypted_inode(inode)) { ret = fscrypt_get_encryption_info(inode); if (ret) return ret; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } with this: ret = fscrypt_require_key(inode); if (ret) return ret; ... since we'll be able to retain the fast path for unencrypted files as a single flag check, using an inline function. This wasn't possible before because we'd have had to frequently call through the ->i_sb->s_cop->is_encrypted function pointer, even when the encryption support was disabled or not being used. Note: we don't define S_ENCRYPTED to 0 if CONFIG_FS_ENCRYPTION is disabled because we want to continue to return an error if an encrypted file is accessed without encryption support, rather than pretending that it is unencrypted. 
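For illustration, a minimal sketch of how a filesystem keeps S_ENCRYPTED in sync when loading an inode; FOOFS_ENCRYPT_FL and foofs_set_inode_flags() are hypothetical, but the ext4 and f2fs hunks below do the equivalent:

static void foofs_set_inode_flags(struct inode *inode, unsigned int flags)
{
        unsigned int new_fl = 0;

        if (flags & FOOFS_ENCRYPT_FL)   /* hypothetical on-disk flag */
                new_fl |= S_ENCRYPTED;
        inode_set_flags(inode, new_fl, S_ENCRYPTED);
}

Afterwards, IS_ENCRYPTED(inode) is a single i_flags bit test with no indirect call through ->s_cop.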
Reviewed-by: Chao Yu Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 +++++-- fs/ext4/super.c | 8 ++++++-- fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 5 ++++- fs/ubifs/ioctl.c | 5 ++++- fs/ubifs/xattr.c | 1 + include/linux/fs.h | 2 ++ 7 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 31db875bc7a1..d5a471939fbc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4589,10 +4589,13 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && - !ext4_encrypted_inode(inode)) + !(flags & EXT4_ENCRYPT_FL)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| + S_ENCRYPTED); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b104096fce9e..dcfb19539871 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1181,7 +1181,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); /* - * Update inode->i_flags - e.g. S_DAX may get disabled + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled */ ext4_set_inode_flags(inode); } @@ -1206,7 +1207,10 @@ retry: ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - /* Update inode->i_flags - e.g. S_DAX may get disabled */ + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc53aebaf3ae..34d6e9ae8422 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2947,6 +2947,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION file_set_encrypt(inode); + inode->i_flags |= S_ENCRYPTED; #endif } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 50c88e37ed66..53fb08810ee9 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -43,8 +43,11 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (f2fs_encrypted_inode(inode)) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| + S_ENCRYPTED); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index fdc311246807..0164bcc827f8 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -38,7 +38,8 @@ void ubifs_set_inode_flags(struct inode *inode) { unsigned int flags = ubifs_inode(inode)->flags; - inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC | + S_ENCRYPTED); if (flags & UBIFS_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & UBIFS_APPEND_FL) @@ -47,6 +48,8 @@ void ubifs_set_inode_flags(struct inode *inode) inode->i_flags |= S_IMMUTABLE; if (flags & UBIFS_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (flags & UBIFS_CRYPT_FL) + inode->i_flags |= S_ENCRYPTED; } /* diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index c13eae819cbc..5ddc89d564fd 100644 --- 
a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -170,6 +170,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) goto out_cancel; + ubifs_set_inode_flags(host); mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); diff --git a/include/linux/fs.h b/include/linux/fs.h index 339e73742e73..055d2fbf8eca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1853,6 +1853,7 @@ struct super_operations { #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif +#define S_ENCRYPTED 16384 /* Encrypted file (using fs/crypto/) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -1892,6 +1893,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) +#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) -- cgit v1.2.3 From e0428a266d5a29a3c2eec287fcd49be9e8e2468e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:36 -0700 Subject: fscrypt: switch from ->is_encrypted() to IS_ENCRYPTED() IS_ENCRYPTED() now gives the same information as i_sb->s_cop->is_encrypted() but is more efficient, since IS_ENCRYPTED() is just a simple flag check. Prepare to remove ->is_encrypted() by switching all callers to IS_ENCRYPTED(). Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 2 +- fs/crypto/fname.c | 3 +-- fs/crypto/keyinfo.c | 2 +- fs/crypto/policy.c | 6 +++--- include/linux/fscrypt_notsupp.h | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index c7835df7e7b8..608f6bbe0f31 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -340,7 +340,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dir = dget_parent(dentry); - if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + if (!IS_ENCRYPTED(d_inode(dir))) { dput(dir); return 0; } diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index ad9f814fdead..2878289b3ed2 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -382,8 +382,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!dir->i_sb->s_cop->is_encrypted(dir) || - fscrypt_is_dot_dotdot(iname)) { + if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 018c588c7ac3..236a68d4ea72 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -268,7 +268,7 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || - inode->i_sb->s_cop->is_encrypted(inode)) + IS_ENCRYPTED(inode)) return res; /* Fake up a context for an unencrypted directory */ memset(&ctx, 0, sizeof(ctx)); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ce07a86200f3..6a63b8a0d46c 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,7 +109,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct 
fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->is_encrypted(inode)) + if (!IS_ENCRYPTED(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -166,11 +166,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 1; /* No restrictions if the parent directory is unencrypted */ - if (!cops->is_encrypted(parent)) + if (!IS_ENCRYPTED(parent)) return 1; /* Encrypted directories must not contain unencrypted files */ - if (!cops->is_encrypted(child)) + if (!IS_ENCRYPTED(child)) return 0; /* diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 2d0b6960831e..7b390e356f7f 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -100,7 +100,7 @@ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - if (dir->i_sb->s_cop->is_encrypted(dir)) + if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(struct fscrypt_name)); -- cgit v1.2.3 From f7293e48bb1d0c482cd706deb1256a6be718f4f5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:37 -0700 Subject: fscrypt: remove ->is_encrypted() Now that all callers of fscrypt_operations.is_encrypted() have been switched to IS_ENCRYPTED(), remove ->is_encrypted(). Reviewed-by: Chao Yu Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 -- fs/f2fs/super.c | 2 -- fs/ubifs/crypto.c | 1 - fs/ubifs/super.c | 1 - fs/ubifs/ubifs.h | 9 ++------- include/linux/fscrypt.h | 1 - 6 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dcfb19539871..bc63cdf194e3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1241,13 +1241,11 @@ static const struct fscrypt_operations ext4_cryptops = { .get_context = ext4_get_context, .set_context = ext4_set_context, .dummy_context = ext4_dummy_context, - .is_encrypted = ext4_encrypted_inode, .empty_dir = ext4_empty_dir, .max_namelen = ext4_max_namelen, }; #else static const struct fscrypt_operations ext4_cryptops = { - .is_encrypted = ext4_encrypted_inode, }; #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 89f61eb3d167..1cb41f711ab8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1594,13 +1594,11 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else static const struct fscrypt_operations f2fs_cryptops = { - .is_encrypted = f2fs_encrypted_inode, }; #endif diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 114ba455bac3..8880fa7733d8 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -87,7 +87,6 @@ const struct fscrypt_operations ubifs_crypt_operations = { .key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, - .is_encrypted = __ubifs_crypt_is_encrypted, .empty_dir = ubifs_crypt_empty_dir, .max_namelen = ubifs_crypt_max_namelen, }; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5496b17b959c..adaca6088836 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2009,7 +2009,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) #ifndef CONFIG_UBIFS_FS_ENCRYPTION const struct fscrypt_operations ubifs_crypt_operations = { - .is_encrypted = 
__ubifs_crypt_is_encrypted, }; #endif diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 6a346d4af98f..63c7468147eb 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1834,18 +1834,13 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, extern const struct fscrypt_operations ubifs_crypt_operations; -static inline bool __ubifs_crypt_is_encrypted(struct inode *inode) +static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) { - struct ubifs_inode *ui = ubifs_inode(inode); + const struct ubifs_inode *ui = ubifs_inode(inode); return ui->flags & UBIFS_CRYPT_FL; } -static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) -{ - return __ubifs_crypt_is_encrypted((struct inode *)inode); -} - /* Normal UBIFS messages */ __printf(2, 3) void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index bfc962e62078..d58e6a90cfe4 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -81,7 +81,6 @@ struct fscrypt_operations { int (*get_context)(struct inode *, void *, size_t); int (*set_context)(struct inode *, const void *, size_t, void *); bool (*dummy_context)(struct inode *); - bool (*is_encrypted)(struct inode *); bool (*empty_dir)(struct inode *); unsigned (*max_namelen)(struct inode *); }; -- cgit v1.2.3 From d293c3e4e07334d761c88df9d68b3aa800a83dd9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:39 -0700 Subject: fscrypt: new helper function - fscrypt_require_key() Add a helper function which checks if an inode is encrypted, and if so, tries to set up its encryption key. This is a pattern which is duplicated in multiple places in each of ext4, f2fs, and ubifs --- for example, when a regular file is asked to be opened or truncated. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index d58e6a90cfe4..b3e2a5f93415 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -151,5 +151,30 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) #include #endif /* __FS_HAS_ENCRYPTION */ +/** + * fscrypt_require_key - require an inode's encryption key + * @inode: the inode we need the key for + * + * If the inode is encrypted, set up its encryption key if not already done. + * Then require that the key be present and return -ENOKEY otherwise. + * + * No locks are needed, and the key will live as long as the struct inode --- so + * it won't go away from under you. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int fscrypt_require_key(struct inode *inode) +{ + if (IS_ENCRYPTED(inode)) { + int err = fscrypt_get_encryption_info(inode); + + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + return 0; +} #endif /* _LINUX_FSCRYPT_H */ -- cgit v1.2.3 From efcc7ae2c9172d9a7ae94afdaf066a7abf0b9a90 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:40 -0700 Subject: fscrypt: new helper function - fscrypt_file_open() Add a helper function which prepares to open a regular file which may be encrypted. 
It handles setting up the file's encryption key, then checking that the file's encryption policy matches that of its parent directory (if the parent directory is encrypted). It may be set as the ->open() method or it can be called from another ->open() method. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/Makefile | 2 +- fs/crypto/hooks.c | 49 +++++++++++++++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 9 ++++++++ include/linux/fscrypt_supp.h | 3 +++ 4 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 fs/crypto/hooks.c (limited to 'include/linux') diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 9f6607f17b53..cb496989a6b6 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o -fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c new file mode 100644 index 000000000000..069088e91ea9 --- /dev/null +++ b/fs/crypto/hooks.c @@ -0,0 +1,49 @@ +/* + * fs/crypto/hooks.c + * + * Encryption hooks for higher-level filesystem operations. + */ + +#include +#include "fscrypt_private.h" + +/** + * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * Currently, an encrypted regular file can only be opened if its encryption key + * is available; access to the raw encrypted contents is not supported. + * Therefore, we first set up the inode's encryption key (if not already done) + * and return an error if it's unavailable. + * + * We also verify that if the parent directory (from the path via which the file + * is being opened) is encrypted, then the inode being opened uses the same + * encryption policy. This is needed as part of the enforcement that all files + * in an encrypted directory tree use the same encryption policy, as a + * protection against certain types of offline attacks. Note that this check is + * needed even when opening an *unencrypted* file, since it's forbidden to have + * an unencrypted file in an encrypted directory. 
+ * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + */ +int fscrypt_file_open(struct inode *inode, struct file *filp) +{ + int err; + struct dentry *dir; + + err = fscrypt_require_key(inode); + if (err) + return err; + + dir = dget_parent(file_dentry(filp)); + if (IS_ENCRYPTED(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); + err = -EPERM; + } + dput(dir); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_file_open); diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 7b390e356f7f..162da6517ac4 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -177,4 +177,13 @@ static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return -EOPNOTSUPP; } +/* hooks.c */ + +static inline int fscrypt_file_open(struct inode *inode, struct file *filp) +{ + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + return 0; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 5a90e5ef4687..fd2f6decaee4 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -143,4 +143,7 @@ extern void fscrypt_pullback_bio_page(struct page **, bool); extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, unsigned int); +/* hooks.c */ +extern int fscrypt_file_open(struct inode *inode, struct file *filp); + #endif /* _LINUX_FSCRYPT_SUPP_H */ -- cgit v1.2.3 From 0ea87a9644ebb5c9a3b100585d10533366de3269 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:41 -0700 Subject: fscrypt: new helper function - fscrypt_prepare_link() Introduce a helper function which prepares to link an inode into a possibly-encrypted directory. It handles setting up the target directory's encryption key, then verifying that the link won't violate the constraint that all files in an encrypted directory tree use the same encryption policy. 
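For illustration, a sketch of the intended call pattern in a filesystem's ->link() method; the foofs_* names are hypothetical, not part of this patch:

static int foofs_link(struct dentry *old_dentry, struct inode *dir,
                      struct dentry *dentry)
{
        int err;

        err = fscrypt_prepare_link(old_dentry, dir, dentry);
        if (err)
                return err;

        return foofs_add_link(dentry, d_inode(old_dentry)); /* hypothetical */
}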
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 15 +++++++++++++++ include/linux/fscrypt.h | 27 +++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 6 ++++++ include/linux/fscrypt_supp.h | 1 + 4 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 069088e91ea9..8b90217320dd 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -47,3 +47,18 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) return err; } EXPORT_SYMBOL_GPL(fscrypt_file_open); + +int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) +{ + int err; + + err = fscrypt_require_key(dir); + if (err) + return err; + + if (!fscrypt_has_permitted_context(dir, inode)) + return -EPERM; + + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b3e2a5f93415..9c9a53f99327 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -177,4 +177,31 @@ static inline int fscrypt_require_key(struct inode *inode) return 0; } +/** + * fscrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory + * @old_dentry: an existing dentry for the inode being linked + * @dir: the target directory + * @dentry: negative dentry for the target filename + * + * A new link can only be added to an encrypted directory if the directory's + * encryption key is available --- since otherwise we'd have no way to encrypt + * the filename. Therefore, we first set up the directory's encryption key (if + * not already done) and return an error if it's unavailable. + * + * We also verify that the link will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. + * + * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, + * -EPERM if the link would result in an inconsistent encryption policy, or + * another -errno code. + */ +static inline int fscrypt_prepare_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + if (IS_ENCRYPTED(dir)) + return __fscrypt_prepare_link(d_inode(old_dentry), dir); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 162da6517ac4..d7d1039eb6b5 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -186,4 +186,10 @@ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) return 0; } +static inline int __fscrypt_prepare_link(struct inode *inode, + struct inode *dir) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index fd2f6decaee4..80706283da75 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -145,5 +145,6 @@ extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, /* hooks.c */ extern int fscrypt_file_open(struct inode *inode, struct file *filp); +extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir); #endif /* _LINUX_FSCRYPT_SUPP_H */ -- cgit v1.2.3 From 94b26f3672a0e41025104c7e46943917544e1c87 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:42 -0700 Subject: fscrypt: new helper function - fscrypt_prepare_rename() Introduce a helper function which prepares to rename a file into a possibly encrypted directory. 
It handles loading the encryption keys for the source and target directories if needed, and it handles enforcing that if the target directory (and the source directory for a cross-rename) is encrypted, then the file being moved into the directory has the same encryption policy as its containing directory. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 30 ++++++++++++++++++++++++++++++ include/linux/fscrypt.h | 33 +++++++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 9 +++++++++ include/linux/fscrypt_supp.h | 5 +++++ 4 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 8b90217320dd..822cb78f9b45 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -62,3 +62,33 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); + +int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + err = fscrypt_require_key(old_dir); + if (err) + return err; + + err = fscrypt_require_key(new_dir); + if (err) + return err; + + if (old_dir != new_dir) { + if (IS_ENCRYPTED(new_dir) && + !fscrypt_has_permitted_context(new_dir, + d_inode(old_dentry))) + return -EPERM; + + if ((flags & RENAME_EXCHANGE) && + IS_ENCRYPTED(old_dir) && + !fscrypt_has_permitted_context(old_dir, + d_inode(new_dentry))) + return -EPERM; + } + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 9c9a53f99327..c422367baed9 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -204,4 +204,37 @@ static inline int fscrypt_prepare_link(struct dentry *old_dentry, return 0; } +/** + * fscrypt_prepare_rename - prepare for a rename between possibly-encrypted directories + * @old_dir: source directory + * @old_dentry: dentry for source file + * @new_dir: target directory + * @new_dentry: dentry for target location (may be negative unless exchanging) + * @flags: rename flags (we care at least about %RENAME_EXCHANGE) + * + * Prepare for ->rename() where the source and/or target directories may be + * encrypted. A new link can only be added to an encrypted directory if the + * directory's encryption key is available --- since otherwise we'd have no way + * to encrypt the filename. A rename to an existing name, on the other hand, + * *is* cryptographically possible without the key. However, we take the more + * conservative approach and just forbid all no-key renames. + * + * We also verify that the rename will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. + * + * Return: 0 on success, -ENOKEY if an encryption key is missing, -EPERM if the + * rename would cause inconsistent encryption policies, or another -errno code. 
+ */ +static inline int fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) + return __fscrypt_prepare_rename(old_dir, old_dentry, + new_dir, new_dentry, flags); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index d7d1039eb6b5..6af378d8126e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -192,4 +192,13 @@ static inline int __fscrypt_prepare_link(struct inode *inode, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 80706283da75..40f35073145f 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -146,5 +146,10 @@ extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, /* hooks.c */ extern int fscrypt_file_open(struct inode *inode, struct file *filp); extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir); +extern int __fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags); #endif /* _LINUX_FSCRYPT_SUPP_H */ -- cgit v1.2.3 From 32c3cf028e747d1e9cf0679b16dcbe3794fe4853 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:43 -0700 Subject: fscrypt: new helper function - fscrypt_prepare_lookup() Introduce a helper function which prepares to look up the given dentry in the given directory. If the directory is encrypted, it handles loading the directory's encryption key, setting the dentry's ->d_op to fscrypt_d_ops, and setting DCACHE_ENCRYPTED_WITH_KEY if the directory's encryption key is available. Note: once all filesystems switch over to this, we'll be able to move fscrypt_d_ops and fscrypt_set_encrypted_dentry() to fscrypt_private.h. 
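For illustration, a sketch of how a filesystem's ->lookup() would use the helper; the foofs_* names are hypothetical:

static struct dentry *foofs_lookup(struct inode *dir, struct dentry *dentry,
                                   unsigned int flags)
{
        int err;

        err = fscrypt_prepare_lookup(dir, dentry, flags);
        if (err)
                return ERR_PTR(err);

        return foofs_do_lookup(dir, dentry);    /* hypothetical */
}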
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 18 ++++++++++++++++++ include/linux/fscrypt.h | 28 ++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 6 ++++++ include/linux/fscrypt_supp.h | 1 + 4 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 822cb78f9b45..9f5fb2eb9cf7 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -92,3 +92,21 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); + +int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry) +{ + int err = fscrypt_get_encryption_info(dir); + + if (err) + return err; + + if (fscrypt_has_encryption_key(dir)) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + } + + d_set_d_op(dentry, &fscrypt_d_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index c422367baed9..2327859c8cd2 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -237,4 +237,32 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, return 0; } +/** + * fscrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory + * @dir: directory being searched + * @dentry: filename being looked up + * @flags: lookup flags + * + * Prepare for ->lookup() in a directory which may be encrypted. Lookups can be + * done with or without the directory's encryption key; without the key, + * filenames are presented in encrypted form. Therefore, we'll try to set up + * the directory's encryption key, but even without it the lookup can continue. + * + * To allow invalidating stale dentries if the directory's encryption key is + * added later, we also install a custom ->d_revalidate() method and use the + * DCACHE_ENCRYPTED_WITH_KEY flag to indicate whether a given dentry is a + * plaintext name (flag set) or a ciphertext name (flag cleared). 
+ * + * Return: 0 on success, -errno if a problem occurred while setting up the + * encryption key + */ +static inline int fscrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + if (IS_ENCRYPTED(dir)) + return __fscrypt_prepare_lookup(dir, dentry); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 6af378d8126e..c4c6bf2c390e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -201,4 +201,10 @@ static inline int __fscrypt_prepare_rename(struct inode *old_dir, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 40f35073145f..2db5e9706f60 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -151,5 +151,6 @@ extern int __fscrypt_prepare_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); +extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry); #endif /* _LINUX_FSCRYPT_SUPP_H */ -- cgit v1.2.3 From 815dac33b27da9efce0f6008272e69f2a1ba1a33 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:44 -0700 Subject: fscrypt: new helper function - fscrypt_prepare_setattr() Introduce a helper function for filesystems to call when processing ->setattr() on a possibly-encrypted inode. It handles enforcing that an encrypted file can only be truncated if its encryption key is available. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 2327859c8cd2..53437bfdfcbc 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -265,4 +265,29 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, return 0; } +/** + * fscrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, + * most attribute changes are allowed even without the encryption key. However, + * without the encryption key we do have to forbid truncates. This is needed + * because the size being truncated to may not be a multiple of the filesystem + * block size, and in that case we'd have to decrypt the final block, zero the + * portion past i_size, and re-encrypt it. (We *could* allow truncating to a + * filesystem block boundary, but it's simpler to just forbid all truncates --- + * and we already forbid all other contents modifications without the key.) + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. 
+ */ +static inline int fscrypt_prepare_setattr(struct dentry *dentry, + struct iattr *attr) +{ + if (attr->ia_valid & ATTR_SIZE) + return fscrypt_require_key(d_inode(dentry)); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ -- cgit v1.2.3 From 5cdda5117e125e0dbb020425cc55a4c143c6febc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Oct 2017 17:24:28 +0200 Subject: locking/static_keys: Improve uninitialized key warning Right now it says: static_key_disable_cpuslocked used before call to jump_label_init ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:161 static_key_disable_cpuslocked+0x68/0x70 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.14.0-rc5+ #1 Hardware name: SGI.COM C2112-4GP3/X10DRT-P-Series, BIOS 2.0a 05/09/2016 task: ffffffff81c0e480 task.stack: ffffffff81c00000 RIP: 0010:static_key_disable_cpuslocked+0x68/0x70 RSP: 0000:ffffffff81c03ef0 EFLAGS: 00010096 ORIG_RAX: 0000000000000000 RAX: 0000000000000041 RBX: ffffffff81c32680 RCX: ffffffff81c5cbf8 RDX: 0000000000000001 RSI: 0000000000000092 RDI: 0000000000000002 RBP: ffff88807fffd240 R08: 726f666562206465 R09: 0000000000000136 R10: 0000000000000000 R11: 696e695f6c656261 R12: ffffffff82158900 R13: ffffffff8215f760 R14: 0000000000000001 R15: 0000000000000008 FS: 0000000000000000(0000) GS:ffff883f7f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff88807ffff000 CR3: 0000000001c09000 CR4: 00000000000606b0 Call Trace: static_key_disable+0x16/0x20 start_kernel+0x15a/0x45d ? load_ucode_intel_bsp+0x11/0x2d secondary_startup_64+0xa5/0xb0 Code: 48 c7 c7 a0 15 cf 81 e9 47 53 4b 00 48 89 df e8 5f fc ff ff eb e8 48 c7 c6 \ c0 97 83 81 48 c7 c7 d0 ff a2 81 31 c0 e8 c5 9d f5 ff <0f> ff eb a7 0f ff eb \ b0 e8 eb a2 4b 00 53 48 89 fb e8 42 0e f0 but it doesn't tell me which key it is. So dump the key's name too: static_key_disable_cpuslocked(): static key 'virt_spin_lock_key' used before call to jump_label_init() And that makes pinpointing which key is causing that a lot easier. 
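For illustration, a condensed sketch (not from this patch) of the pattern that fires this warning: poking a static key before jump_label_init() has run, as the x86 virt_spin_lock_key case in the trace above does.

static DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);

void __init early_lock_setup(void) /* hypothetical pre-jump_label_init() caller */
{
        /*
         * static_branch_disable() lands in static_key_disable(), whose
         * STATIC_KEY_CHECK_USE(key) now prints the key name via '%pS'.
         */
        static_branch_disable(&virt_spin_lock_key);
}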
Signed-off-by: Borislav Petkov
Reviewed-by: Steven Rostedt (VMware)
Cc: Boris Ostrovsky
Cc: Hannes Frederic Sowa
Cc: Jason Baron
Cc: Juergen Gross
Cc: Linus Torvalds
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20171018152428.ffjgak4o25f7ept6@pd.tnic
Signed-off-by: Ingo Molnar
---
 include/linux/jump_label.h           | 14 +++++++-------
 include/linux/jump_label_ratelimit.h |  6 +++---
 kernel/jump_label.c                  | 14 +++++++-------
 3 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index cd5861651b17..979a2f2d529b 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -81,9 +81,9 @@ extern bool static_key_initialized;
 
-#define STATIC_KEY_CHECK_USE() WARN(!static_key_initialized,		      \
-				    "%s used before call to jump_label_init", \
-				    __func__)
+#define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized,	      \
+				       "%s(): static key '%pS' used before call to jump_label_init()", \
+				       __func__, (key))
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -211,13 +211,13 @@ static __always_inline bool static_key_true(struct static_key *key)
 
 static inline void static_key_slow_inc(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	atomic_inc(&key->enabled);
 }
 
 static inline void static_key_slow_dec(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	atomic_dec(&key->enabled);
 }
 
@@ -236,7 +236,7 @@ static inline int jump_label_apply_nops(struct module *mod)
 
 static inline void static_key_enable(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	if (atomic_read(&key->enabled) != 0) {
 		WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
@@ -247,7 +247,7 @@ static inline void static_key_enable(struct static_key *key)
 
 static inline void static_key_disable(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	if (atomic_read(&key->enabled) != 1) {
 		WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h
index 23da3af459fe..93086df0a847 100644
--- a/include/linux/jump_label_ratelimit.h
+++ b/include/linux/jump_label_ratelimit.h
@@ -24,18 +24,18 @@ struct static_key_deferred {
 };
 static inline void static_key_slow_dec_deferred(struct static_key_deferred *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	static_key_slow_dec(&key->key);
 }
 static inline void static_key_deferred_flush(struct static_key_deferred *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 }
 static inline void jump_label_rate_limit(struct static_key_deferred *key,
 					 unsigned long rl)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 }
 #endif	/* HAVE_JUMP_LABEL */
 #endif	/* _LINUX_JUMP_LABEL_RATELIMIT_H */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0bf2e8f5244a..8ff4ca4665ff 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -83,7 +83,7 @@ static void static_key_slow_inc_cpuslocked(struct static_key *key)
 {
 	int v, v1;
 
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	/*
 	 * Careful if we get concurrent static_key_slow_inc() calls;
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
 
 void static_key_enable_cpuslocked(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	if (atomic_read(&key->enabled) > 0) {
 		WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(static_key_enable);
 
 void static_key_disable_cpuslocked(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	if (atomic_read(&key->enabled) != 1) {
 		WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
@@ -224,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work)
 
 void static_key_slow_dec(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	__static_key_slow_dec(key, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(static_key_slow_dec);
 
 void static_key_slow_dec_deferred(struct static_key_deferred *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	__static_key_slow_dec(&key->key, key->timeout, &key->work);
 }
 EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
 
 void static_key_deferred_flush(struct static_key_deferred *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	flush_delayed_work(&key->work);
 }
 EXPORT_SYMBOL_GPL(static_key_deferred_flush);
@@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush);
 void jump_label_rate_limit(struct static_key_deferred *key,
 		unsigned long rl)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	key->timeout = rl;
 	INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
 }
-- cgit v1.2.3


From bbb1cc050890dad2ad8747a23b9f22a53e726c9a Mon Sep 17 00:00:00 2001
From: Jonathan Corbet
Date: Thu, 24 Aug 2017 16:13:18 -0600
Subject: usb: gadget: Add kerneldoc for some neglected structure fields

A couple of structures in <linux/usb/gadget.h> have incomplete kerneldoc
comments, leading to these warnings in the docs build:

  ./include/linux/usb/gadget.h:230: warning: No description found for parameter 'claimed'
  ./include/linux/usb/gadget.h:230: warning: No description found for parameter 'enabled'
  ./include/linux/usb/gadget.h:412: warning: No description found for parameter 'quirk_altset_not_supp'
  ./include/linux/usb/gadget.h:412: warning: No description found for parameter 'quirk_stall_not_supp'
  ./include/linux/usb/gadget.h:412: warning: No description found for parameter 'quirk_zlp_not_supp'

Document those fields to make the warnings go away.

Signed-off-by: Jonathan Corbet
Signed-off-by: Felipe Balbi
---
 include/linux/usb/gadget.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 21468a722c4a..523e5a2b5015 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -188,6 +188,8 @@ struct usb_ep_caps {
  * @ops: Function pointers used to access hardware-specific operations.
  * @ep_list:the gadget's ep_list holds all of its endpoints
  * @caps:The structure describing types and directions supported by endoint.
+ * @enabled: The current endpoint enabled/disabled state.
+ * @claimed: True if this endpoint is claimed by a function.
  * @maxpacket:The maximum packet size used on this endpoint.  The initial
  *	value can sometimes be reduced (hardware allowing), according to
  *	the endpoint descriptor used to configure the endpoint.
@@ -349,6 +351,9 @@ struct usb_gadget_ops {
  *	or B-Peripheral wants to take host role.
  * @quirk_ep_out_aligned_size: epout requires buffer size to be aligned to
  *	MaxPacketSize.
+ * @quirk_altset_not_supp: UDC controller doesn't support alt settings.
+ * @quirk_stall_not_supp: UDC controller doesn't support stalling.
+ * @quirk_zlp_not_supp: UDC controller doesn't support ZLP.
* @quirk_avoids_skb_reserve: udc/platform wants to avoid skb_reserve() in * u_ether.c to improve performance. * @is_selfpowered: if the gadget is self-powered. -- cgit v1.2.3 From 0f38672c629b79fa2b929d2c391bc063a08279eb Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 3 Oct 2017 20:09:14 +0900 Subject: usb: renesas_usbhs: add support for R-Car D3 This patch adds support for R-Car D3. This SoC needs to release the PLL reset by the UGCTRL register. So, since this is not the same as other R-Car Gen3 SoCs, this patch adds a new type as "USBHS_TYPE_RCAR_GEN3_WITH_PLL". Acked-by: Rob Herring Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- .../devicetree/bindings/usb/renesas_usbhs.txt | 1 + drivers/usb/renesas_usbhs/common.c | 10 ++++- drivers/usb/renesas_usbhs/rcar3.c | 48 ++++++++++++++++++++++ drivers/usb/renesas_usbhs/rcar3.h | 1 + include/linux/usb/renesas_usbhs.h | 5 ++- 5 files changed, 62 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt index 9e18e000339e..e79f6e43061a 100644 --- a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt +++ b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt @@ -10,6 +10,7 @@ Required properties: - "renesas,usbhs-r8a7794" for r8a7794 (R-Car E2) compatible device - "renesas,usbhs-r8a7795" for r8a7795 (R-Car H3) compatible device - "renesas,usbhs-r8a7796" for r8a7796 (R-Car M3-W) compatible device + - "renesas,usbhs-r8a77995" for r8a77995 (R-Car D3) compatible device - "renesas,rcar-gen2-usbhs" for R-Car Gen2 compatible device - "renesas,rcar-gen3-usbhs" for R-Car Gen3 compatible device diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c index 2a860e496b4f..3e92a784c5c3 100644 --- a/drivers/usb/renesas_usbhs/common.c +++ b/drivers/usb/renesas_usbhs/common.c @@ -485,6 +485,10 @@ static const struct of_device_id usbhs_of_match[] = { .compatible = "renesas,usbhs-r8a7796", .data = (void *)USBHS_TYPE_RCAR_GEN3, }, + { + .compatible = "renesas,usbhs-r8a77995", + .data = (void *)USBHS_TYPE_RCAR_GEN3_WITH_PLL, + }, { .compatible = "renesas,rcar-gen2-usbhs", .data = (void *)USBHS_TYPE_RCAR_GEN2, @@ -519,7 +523,8 @@ static struct renesas_usbhs_platform_info *usbhs_parse_dt(struct device *dev) dparam->enable_gpio = gpio; if (dparam->type == USBHS_TYPE_RCAR_GEN2 || - dparam->type == USBHS_TYPE_RCAR_GEN3) { + dparam->type == USBHS_TYPE_RCAR_GEN3 || + dparam->type == USBHS_TYPE_RCAR_GEN3_WITH_PLL) { dparam->has_usb_dmac = 1; dparam->pipe_configs = usbhsc_new_pipe; dparam->pipe_size = ARRAY_SIZE(usbhsc_new_pipe); @@ -584,6 +589,9 @@ static int usbhs_probe(struct platform_device *pdev) case USBHS_TYPE_RCAR_GEN3: priv->pfunc = usbhs_rcar3_ops; break; + case USBHS_TYPE_RCAR_GEN3_WITH_PLL: + priv->pfunc = usbhs_rcar3_with_pll_ops; + break; default: if (!info->platform_callback.get_id) { dev_err(&pdev->dev, "no platform callbacks"); diff --git a/drivers/usb/renesas_usbhs/rcar3.c b/drivers/usb/renesas_usbhs/rcar3.c index 02b67abfc2a1..f436e9d51127 100644 --- a/drivers/usb/renesas_usbhs/rcar3.c +++ b/drivers/usb/renesas_usbhs/rcar3.c @@ -15,24 +15,39 @@ #include "rcar3.h" #define LPSTS 0x102 +#define UGCTRL 0x180 /* 32-bit register */ #define UGCTRL2 0x184 /* 32-bit register */ +#define UGSTS 0x188 /* 32-bit register */ /* Low Power Status register (LPSTS) */ #define LPSTS_SUSPM 0x4000 +/* R-Car D3 only: USB General control register (UGCTRL) */ +#define UGCTRL_PLLRESET 
0x00000001 +#define UGCTRL_CONNECT 0x00000004 + /* * USB General control register 2 (UGCTRL2) * Remarks: bit[31:11] and bit[9:6] should be 0 */ #define UGCTRL2_RESERVED_3 0x00000001 /* bit[3:0] should be B'0001 */ +#define UGCTRL2_USB0SEL_HSUSB 0x00000020 #define UGCTRL2_USB0SEL_OTG 0x00000030 #define UGCTRL2_VBUSSEL 0x00000400 +/* R-Car D3 only: USB General status register (UGSTS) */ +#define UGSTS_LOCK 0x00000100 + static void usbhs_write32(struct usbhs_priv *priv, u32 reg, u32 data) { iowrite32(data, priv->base + reg); } +static u32 usbhs_read32(struct usbhs_priv *priv, u32 reg) +{ + return ioread32(priv->base + reg); +} + static int usbhs_rcar3_power_ctrl(struct platform_device *pdev, void __iomem *base, int enable) { @@ -52,6 +67,34 @@ static int usbhs_rcar3_power_ctrl(struct platform_device *pdev, return 0; } +/* R-Car D3 needs to release UGCTRL.PLLRESET */ +static int usbhs_rcar3_power_and_pll_ctrl(struct platform_device *pdev, + void __iomem *base, int enable) +{ + struct usbhs_priv *priv = usbhs_pdev_to_priv(pdev); + u32 val; + int timeout = 1000; + + if (enable) { + usbhs_write32(priv, UGCTRL, 0); /* release PLLRESET */ + usbhs_write32(priv, UGCTRL2, UGCTRL2_RESERVED_3 | + UGCTRL2_USB0SEL_HSUSB); + + usbhs_bset(priv, LPSTS, LPSTS_SUSPM, LPSTS_SUSPM); + do { + val = usbhs_read32(priv, UGSTS); + udelay(1); + } while (!(val & UGSTS_LOCK) && timeout--); + usbhs_write32(priv, UGCTRL, UGCTRL_CONNECT); + } else { + usbhs_write32(priv, UGCTRL, 0); + usbhs_bset(priv, LPSTS, LPSTS_SUSPM, 0); + usbhs_write32(priv, UGCTRL, UGCTRL_PLLRESET); + } + + return 0; +} + static int usbhs_rcar3_get_id(struct platform_device *pdev) { return USBHS_GADGET; @@ -61,3 +104,8 @@ const struct renesas_usbhs_platform_callback usbhs_rcar3_ops = { .power_ctrl = usbhs_rcar3_power_ctrl, .get_id = usbhs_rcar3_get_id, }; + +const struct renesas_usbhs_platform_callback usbhs_rcar3_with_pll_ops = { + .power_ctrl = usbhs_rcar3_power_and_pll_ctrl, + .get_id = usbhs_rcar3_get_id, +}; diff --git a/drivers/usb/renesas_usbhs/rcar3.h b/drivers/usb/renesas_usbhs/rcar3.h index 5f850b23ff18..7fe98175f94f 100644 --- a/drivers/usb/renesas_usbhs/rcar3.h +++ b/drivers/usb/renesas_usbhs/rcar3.h @@ -1,3 +1,4 @@ #include "common.h" extern const struct renesas_usbhs_platform_callback usbhs_rcar3_ops; +extern const struct renesas_usbhs_platform_callback usbhs_rcar3_with_pll_ops; diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h index 00a47d058d83..d2f41e4dd7a8 100644 --- a/include/linux/usb/renesas_usbhs.h +++ b/include/linux/usb/renesas_usbhs.h @@ -183,8 +183,9 @@ struct renesas_usbhs_driver_param { #define USBHS_USB_DMAC_XFER_SIZE 32 /* hardcode the xfer size */ }; -#define USBHS_TYPE_RCAR_GEN2 1 -#define USBHS_TYPE_RCAR_GEN3 2 +#define USBHS_TYPE_RCAR_GEN2 1 +#define USBHS_TYPE_RCAR_GEN3 2 +#define USBHS_TYPE_RCAR_GEN3_WITH_PLL 3 /* * option: -- cgit v1.2.3 From 93862e385ded7c60351e09fcd2a541d273650905 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 13 Oct 2017 15:08:41 -0400 Subject: livepatch: add (un)patch callbacks Provide livepatch modules a klp_object (un)patching notification mechanism. Pre and post-(un)patch callbacks allow livepatch modules to setup or synchronize changes that would be difficult to support in only patched-or-unpatched code contexts. Callbacks can be registered for target module or vmlinux klp_objects, but each implementation is klp_object specific. - Pre-(un)patch callbacks run before any (un)patching transition starts. 
- Post-(un)patch callbacks run once an object has been (un)patched and
  the klp_patch fully transitioned to its target state.

Example use cases include modification of global data and registration
of newly available services/handlers.

See Documentation/livepatch/callbacks.txt for details and
samples/livepatch/ for examples.

Signed-off-by: Joe Lawrence
Acked-by: Josh Poimboeuf
Acked-by: Miroslav Benes
Signed-off-by: Jiri Kosina
---
 Documentation/livepatch/callbacks.txt           | 605 ++++++++++++++++++++++++
 include/linux/livepatch.h                       |  26 +
 kernel/livepatch/core.c                         |  51 +-
 kernel/livepatch/core.h                         |  38 ++
 kernel/livepatch/patch.c                        |   1 +
 kernel/livepatch/transition.c                   |  21 +-
 samples/livepatch/Makefile                      |   3 +
 samples/livepatch/livepatch-callbacks-busymod.c |  72 +++
 samples/livepatch/livepatch-callbacks-demo.c    | 234 +++++++++
 samples/livepatch/livepatch-callbacks-mod.c     |  53 +++
 10 files changed, 1091 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/livepatch/callbacks.txt
 create mode 100644 samples/livepatch/livepatch-callbacks-busymod.c
 create mode 100644 samples/livepatch/livepatch-callbacks-demo.c
 create mode 100644 samples/livepatch/livepatch-callbacks-mod.c

(limited to 'include/linux')

diff --git a/Documentation/livepatch/callbacks.txt b/Documentation/livepatch/callbacks.txt
new file mode 100644
index 000000000000..c9776f48e458
--- /dev/null
+++ b/Documentation/livepatch/callbacks.txt
@@ -0,0 +1,605 @@
+======================
+(Un)patching Callbacks
+======================
+
+Livepatch (un)patch-callbacks provide a mechanism for livepatch modules
+to execute callback functions when a kernel object is (un)patched.  They
+can be considered a "power feature" that extends livepatching abilities
+to include:
+
+  - Safe updates to global data
+
+  - "Patches" to init and probe functions
+
+  - Patching otherwise unpatchable code (i.e. assembly)
+
+In most cases, (un)patch callbacks will need to be used in conjunction
+with memory barriers and kernel synchronization primitives, like
+mutexes/spinlocks, or even stop_machine(), to avoid concurrency issues.
+
+Callbacks differ from existing kernel facilities:
+
+  - Module init/exit code doesn't run when disabling and re-enabling a
+    patch.
+
+  - A module notifier can't stop a to-be-patched module from loading.
+
+Callbacks are part of the klp_object structure and their implementation
+is specific to that klp_object.  Other livepatch objects may or may not
+be patched, irrespective of the target klp_object's current state.
+
+Callbacks can be registered for the following livepatch actions:
+
+  * Pre-patch    - before a klp_object is patched
+
+  * Post-patch   - after a klp_object has been patched and is active
+                   across all tasks
+
+  * Pre-unpatch  - before a klp_object is unpatched (ie, patched code is
+                   active), used to clean up post-patch callback
+                   resources
+
+  * Post-unpatch - after a klp_object has been unpatched, all code has
+                   been restored and no tasks are running patched code,
+                   used to cleanup pre-patch callback resources
+
+Each callback is optional, omitting one does not preclude specifying any
+other.  However, the livepatching core executes the handlers in
+symmetry: pre-patch callbacks have a post-unpatch counterpart and
+post-patch callbacks have a pre-unpatch counterpart.  An unpatch
+callback will only be executed if its corresponding patch callback was
+executed.  Typical use cases pair a patch handler that acquires and
+configures resources with an unpatch handler that tears down and
+releases those same resources.
+
+A callback is only executed if its host klp_object is loaded.  For
+in-kernel vmlinux targets, this means that callbacks will always execute
+when a livepatch is enabled/disabled.  For patch target kernel modules,
+callbacks will only execute if the target module is loaded.  When a
+module target is (un)loaded, its callbacks will execute only if the
+livepatch module is enabled.
+
+The pre-patch callback, if specified, is expected to return a status
+code (0 for success, -ERRNO on error).  An error status code indicates
+to the livepatching core that patching of the current klp_object is not
+safe and to stop the current patching request.  (When no pre-patch
+callback is provided, the transition is assumed to be safe.)  If a
+pre-patch callback returns failure, the kernel's module loader will:
+
+  - Refuse to load a livepatch, if the livepatch is loaded after
+    targeted code.
+
+    or:
+
+  - Refuse to load a module, if the livepatch was already successfully
+    loaded.
+
+No post-patch, pre-unpatch, or post-unpatch callbacks will be executed
+for a given klp_object if the object failed to patch, due to a failed
+pre_patch callback or for any other reason.
+
+If a patch transition is reversed, no pre-unpatch handlers will be run
+(this follows the previously mentioned symmetry -- pre-unpatch callbacks
+will only occur if their corresponding post-patch callback executed).
+
+If the object did successfully patch, but the patch transition never
+started for some reason (e.g., if another object failed to patch),
+only the post-unpatch callback will be called.
+
+
+Example Use-cases
+=================
+
+Update global data
+------------------
+
+A pre-patch callback can be useful to update a global variable.  For
+example, 75ff39ccc1bd ("tcp: make challenge acks less predictable")
+changes a global sysctl, as well as patches the tcp_send_challenge_ack()
+function.
+
+In this case, if we're being super paranoid, it might make sense to
+patch the data *after* patching is complete with a post-patch callback,
+so that tcp_send_challenge_ack() could first be changed to read
+sysctl_tcp_challenge_ack_limit with READ_ONCE.
+
+
+Support __init and probe function patches
+-----------------------------------------
+
+Although __init and probe functions are not directly livepatch-able, it
+may be possible to implement similar updates via pre/post-patch
+callbacks.
+
+48900cb6af42 ("virtio-net: drop NETIF_F_FRAGLIST") changed the way that
+virtnet_probe() initialized its driver's net_device features.  A
+pre/post-patch callback could iterate over all such devices, making a
+similar change to their hw_features value.  (Client functions of the
+value may need to be updated accordingly.)
+
+
+Test cases
+==========
+
+What follows is not an exhaustive test suite of every possible livepatch
+pre/post-(un)patch combination, but a selection that demonstrates a few
+important concepts.  Each test case uses the kernel modules located in
+the samples/livepatch/ directory and assumes that no livepatches are
+loaded at the beginning of the test.
+
+
+Test 1
+------
+
+Test a combination of loading a kernel module and a livepatch that
+patches a function in the first module.
(Un)load the target module +before the livepatch module: + +- load target module +- load livepatch +- disable livepatch +- unload target module +- unload livepatch + +First load a target module: + + % insmod samples/livepatch/livepatch-callbacks-mod.ko + [ 34.475708] livepatch_callbacks_mod: livepatch_callbacks_mod_init + +On livepatch enable, before the livepatch transition starts, pre-patch +callbacks are executed for vmlinux and livepatch_callbacks_mod (those +klp_objects currently loaded). After klp_objects are patched according +to the klp_patch, their post-patch callbacks run and the transition +completes: + + % insmod samples/livepatch/livepatch-callbacks-demo.ko + [ 36.503719] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 36.504213] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 36.504238] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 36.504721] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state + [ 36.505849] livepatch: 'livepatch_callbacks_demo': starting patching transition + [ 37.727133] livepatch: 'livepatch_callbacks_demo': completing patching transition + [ 37.727232] livepatch_callbacks_demo: post_patch_callback: vmlinux + [ 37.727860] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state + [ 37.728792] livepatch: 'livepatch_callbacks_demo': patching complete + +Similarly, on livepatch disable, pre-patch callbacks run before the +unpatching transition starts. klp_objects are reverted, post-patch +callbacks execute and the transition completes: + + % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + [ 38.510209] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition + [ 38.510234] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux + [ 38.510982] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state + [ 38.512209] livepatch: 'livepatch_callbacks_demo': starting unpatching transition + [ 39.711132] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 39.711210] livepatch_callbacks_demo: post_unpatch_callback: vmlinux + [ 39.711779] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state + [ 39.712735] livepatch: 'livepatch_callbacks_demo': unpatching complete + + % rmmod samples/livepatch/livepatch-callbacks-demo.ko + % rmmod samples/livepatch/livepatch-callbacks-mod.ko + [ 42.534183] livepatch_callbacks_mod: livepatch_callbacks_mod_exit + + +Test 2 +------ + +This test is similar to the previous test, but (un)load the livepatch +module before the target kernel module. 
This tests the livepatch core's +module_coming handler: + +- load livepatch +- load target module +- disable livepatch +- unload livepatch +- unload target module + + +On livepatch enable, only pre/post-patch callbacks are executed for +currently loaded klp_objects, in this case, vmlinux: + + % insmod samples/livepatch/livepatch-callbacks-demo.ko + [ 44.553328] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 44.553997] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 44.554049] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 44.554845] livepatch: 'livepatch_callbacks_demo': starting patching transition + [ 45.727128] livepatch: 'livepatch_callbacks_demo': completing patching transition + [ 45.727212] livepatch_callbacks_demo: post_patch_callback: vmlinux + [ 45.727961] livepatch: 'livepatch_callbacks_demo': patching complete + +When a targeted module is subsequently loaded, only its pre/post-patch +callbacks are executed: + + % insmod samples/livepatch/livepatch-callbacks-mod.ko + [ 46.560845] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod' + [ 46.561988] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init + [ 46.563452] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init + [ 46.565495] livepatch_callbacks_mod: livepatch_callbacks_mod_init + +On livepatch disable, all currently loaded klp_objects' (vmlinux and +livepatch_callbacks_mod) pre/post-unpatch callbacks are executed: + + % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + [ 48.568885] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition + [ 48.568910] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux + [ 48.569441] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state + [ 48.570502] livepatch: 'livepatch_callbacks_demo': starting unpatching transition + [ 49.759091] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 49.759171] livepatch_callbacks_demo: post_unpatch_callback: vmlinux + [ 49.759742] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state + [ 49.760690] livepatch: 'livepatch_callbacks_demo': unpatching complete + + % rmmod samples/livepatch/livepatch-callbacks-demo.ko + % rmmod samples/livepatch/livepatch-callbacks-mod.ko + [ 52.592283] livepatch_callbacks_mod: livepatch_callbacks_mod_exit + + +Test 3 +------ + +Test loading the livepatch after a targeted kernel module, then unload +the kernel module before disabling the livepatch. 
This tests the +livepatch core's module_going handler: + +- load target module +- load livepatch +- unload target module +- disable livepatch +- unload livepatch + +First load a target module, then the livepatch: + + % insmod samples/livepatch/livepatch-callbacks-mod.ko + [ 54.607948] livepatch_callbacks_mod: livepatch_callbacks_mod_init + + % insmod samples/livepatch/livepatch-callbacks-demo.ko + [ 56.613919] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 56.614411] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 56.614436] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 56.614818] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state + [ 56.615656] livepatch: 'livepatch_callbacks_demo': starting patching transition + [ 57.759070] livepatch: 'livepatch_callbacks_demo': completing patching transition + [ 57.759147] livepatch_callbacks_demo: post_patch_callback: vmlinux + [ 57.759621] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state + [ 57.760307] livepatch: 'livepatch_callbacks_demo': patching complete + +When a target module is unloaded, the livepatch is only reverted from +that klp_object (livepatch_callbacks_mod). As such, only its pre and +post-unpatch callbacks are executed when this occurs: + + % rmmod samples/livepatch/livepatch-callbacks-mod.ko + [ 58.623409] livepatch_callbacks_mod: livepatch_callbacks_mod_exit + [ 58.623903] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away + [ 58.624658] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod' + [ 58.625305] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away + +When the livepatch is disabled, pre and post-unpatch callbacks are run +for the remaining klp_object, vmlinux: + + % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + [ 60.638420] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition + [ 60.638444] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux + [ 60.638996] livepatch: 'livepatch_callbacks_demo': starting unpatching transition + [ 61.727088] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 61.727165] livepatch_callbacks_demo: post_unpatch_callback: vmlinux + [ 61.727985] livepatch: 'livepatch_callbacks_demo': unpatching complete + + % rmmod samples/livepatch/livepatch-callbacks-demo.ko + + +Test 4 +------ + +This test is similar to the previous test, however the livepatch is +loaded first. 
This tests the livepatch core's module_coming and +module_going handlers: + +- load livepatch +- load target module +- unload target module +- disable livepatch +- unload livepatch + +First load the livepatch: + + % insmod samples/livepatch/livepatch-callbacks-demo.ko + [ 64.661552] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 64.662147] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 64.662175] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 64.662850] livepatch: 'livepatch_callbacks_demo': starting patching transition + [ 65.695056] livepatch: 'livepatch_callbacks_demo': completing patching transition + [ 65.695147] livepatch_callbacks_demo: post_patch_callback: vmlinux + [ 65.695561] livepatch: 'livepatch_callbacks_demo': patching complete + +When a targeted kernel module is subsequently loaded, only its +pre/post-patch callbacks are executed: + + % insmod samples/livepatch/livepatch-callbacks-mod.ko + [ 66.669196] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod' + [ 66.669882] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init + [ 66.670744] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init + [ 66.672873] livepatch_callbacks_mod: livepatch_callbacks_mod_init + +When the target module is unloaded, the livepatch is only reverted from +the livepatch_callbacks_mod klp_object. As such, only pre and +post-unpatch callbacks are executed when this occurs: + + % rmmod samples/livepatch/livepatch-callbacks-mod.ko + [ 68.680065] livepatch_callbacks_mod: livepatch_callbacks_mod_exit + [ 68.680688] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away + [ 68.681452] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod' + [ 68.682094] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away + + % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + [ 70.689225] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition + [ 70.689256] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux + [ 70.689882] livepatch: 'livepatch_callbacks_demo': starting unpatching transition + [ 71.711080] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 71.711481] livepatch_callbacks_demo: post_unpatch_callback: vmlinux + [ 71.711988] livepatch: 'livepatch_callbacks_demo': unpatching complete + + % rmmod samples/livepatch/livepatch-callbacks-demo.ko + + +Test 5 +------ + +A simple test of loading a livepatch without one of its patch target +klp_objects ever loaded (livepatch_callbacks_mod): + +- load livepatch +- disable livepatch +- unload livepatch + +Load the livepatch: + + % insmod samples/livepatch/livepatch-callbacks-demo.ko + [ 74.711081] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 74.711595] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 74.711639] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 74.712272] livepatch: 'livepatch_callbacks_demo': starting patching transition + [ 75.743137] livepatch: 'livepatch_callbacks_demo': completing patching transition + [ 75.743219] livepatch_callbacks_demo: post_patch_callback: vmlinux + [ 75.743867] livepatch: 'livepatch_callbacks_demo': patching complete + +As 
expected, only pre/post-(un)patch handlers are executed for vmlinux: + + % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + [ 76.716254] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition + [ 76.716278] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux + [ 76.716666] livepatch: 'livepatch_callbacks_demo': starting unpatching transition + [ 77.727089] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 77.727194] livepatch_callbacks_demo: post_unpatch_callback: vmlinux + [ 77.727907] livepatch: 'livepatch_callbacks_demo': unpatching complete + + % rmmod samples/livepatch/livepatch-callbacks-demo.ko + + +Test 6 +------ + +Test a scenario where a vmlinux pre-patch callback returns a non-zero +status (ie, failure): + +- load target module +- load livepatch -ENODEV +- unload target module + +First load a target module: + + % insmod samples/livepatch/livepatch-callbacks-mod.ko + [ 80.740520] livepatch_callbacks_mod: livepatch_callbacks_mod_init + +Load the livepatch module, setting its 'pre_patch_ret' value to -19 +(-ENODEV). When its vmlinux pre-patch callback executed, this status +code will propagate back to the module-loading subsystem. The result is +that the insmod command refuses to load the livepatch module: + + % insmod samples/livepatch/livepatch-callbacks-demo.ko pre_patch_ret=-19 + [ 82.747326] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 82.747743] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 82.747767] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 82.748237] livepatch: pre-patch callback failed for object 'vmlinux' + [ 82.748637] livepatch: failed to enable patch 'livepatch_callbacks_demo' + [ 82.749059] livepatch: 'livepatch_callbacks_demo': canceling transition, going to unpatch + [ 82.749060] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 82.749868] livepatch: 'livepatch_callbacks_demo': unpatching complete + [ 82.765809] insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-demo.ko: No such device + + % rmmod samples/livepatch/livepatch-callbacks-mod.ko + [ 84.774238] livepatch_callbacks_mod: livepatch_callbacks_mod_exit + + +Test 7 +------ + +Similar to the previous test, setup a livepatch such that its vmlinux +pre-patch callback returns success. 
However, when a targeted kernel +module is later loaded, have the livepatch return a failing status code: + +- load livepatch +- setup -ENODEV +- load target module +- disable livepatch +- unload livepatch + +Load the livepatch, notice vmlinux pre-patch callback succeeds: + + % insmod samples/livepatch/livepatch-callbacks-demo.ko + [ 86.787845] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 86.788325] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 86.788427] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 86.788821] livepatch: 'livepatch_callbacks_demo': starting patching transition + [ 87.711069] livepatch: 'livepatch_callbacks_demo': completing patching transition + [ 87.711143] livepatch_callbacks_demo: post_patch_callback: vmlinux + [ 87.711886] livepatch: 'livepatch_callbacks_demo': patching complete + +Set a trap so subsequent pre-patch callbacks to this livepatch will +return -ENODEV: + + % echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret + +The livepatch pre-patch callback for subsequently loaded target modules +will return failure, so the module loader refuses to load the kernel +module. Notice that no post-patch or pre/post-unpatch callbacks are +executed for this klp_object: + + % insmod samples/livepatch/livepatch-callbacks-mod.ko + [ 90.796976] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod' + [ 90.797834] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init + [ 90.798900] livepatch: pre-patch callback failed for object 'livepatch_callbacks_mod' + [ 90.799652] livepatch: patch 'livepatch_callbacks_demo' failed for module 'livepatch_callbacks_mod', refusing to load module 'livepatch_callbacks_mod' + [ 90.819737] insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device + +However, pre/post-unpatch callbacks run for the vmlinux klp_object: + + % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + [ 92.823547] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition + [ 92.823573] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux + [ 92.824331] livepatch: 'livepatch_callbacks_demo': starting unpatching transition + [ 93.727128] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 93.727327] livepatch_callbacks_demo: post_unpatch_callback: vmlinux + [ 93.727861] livepatch: 'livepatch_callbacks_demo': unpatching complete + + % rmmod samples/livepatch/livepatch-callbacks-demo.ko + + +Test 8 +------ + +Test loading multiple targeted kernel modules. This test-case is +mainly for comparing with the next test-case. + +- load busy target module (0s sleep), +- load livepatch +- load target module +- unload target module +- disable livepatch +- unload livepatch +- unload busy target module + + +Load a target "busy" kernel module which kicks off a worker function +that immediately exits: + + % insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=0 + [ 96.910107] livepatch_callbacks_busymod: livepatch_callbacks_mod_init + [ 96.910600] livepatch_callbacks_busymod: busymod_work_func, sleeping 0 seconds ... 
+ [ 96.913024] livepatch_callbacks_busymod: busymod_work_func exit + +Proceed with loading the livepatch and another ordinary target module, +notice that the post-patch callbacks are executed and the transition +completes quickly: + + % insmod samples/livepatch/livepatch-callbacks-demo.ko + [ 98.917892] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 98.918426] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 98.918453] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 98.918955] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state + [ 98.923835] livepatch: 'livepatch_callbacks_demo': starting patching transition + [ 99.743104] livepatch: 'livepatch_callbacks_demo': completing patching transition + [ 99.743156] livepatch_callbacks_demo: post_patch_callback: vmlinux + [ 99.743679] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state + [ 99.744616] livepatch: 'livepatch_callbacks_demo': patching complete + + % insmod samples/livepatch/livepatch-callbacks-mod.ko + [ 100.930955] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod' + [ 100.931668] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init + [ 100.932645] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init + [ 100.934125] livepatch_callbacks_mod: livepatch_callbacks_mod_init + + % rmmod samples/livepatch/livepatch-callbacks-mod.ko + [ 102.942805] livepatch_callbacks_mod: livepatch_callbacks_mod_exit + [ 102.943640] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away + [ 102.944585] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod' + [ 102.945455] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away + + % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + [ 104.953815] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition + [ 104.953838] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux + [ 104.954431] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state + [ 104.955426] livepatch: 'livepatch_callbacks_demo': starting unpatching transition + [ 106.719073] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 106.722633] livepatch_callbacks_demo: post_unpatch_callback: vmlinux + [ 106.723282] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state + [ 106.724279] livepatch: 'livepatch_callbacks_demo': unpatching complete + + % rmmod samples/livepatch/livepatch-callbacks-demo.ko + % rmmod samples/livepatch/livepatch-callbacks-busymod.ko + [ 108.975660] livepatch_callbacks_busymod: livepatch_callbacks_mod_exit + + +Test 9 +------ + +A similar test as the previous one, but force the "busy" kernel module +to do longer work. + +The livepatching core will refuse to patch a task that is currently +executing a to-be-patched function -- the consistency model stalls the +current patch transition until this safety-check is met. Test a +scenario where one of a livepatch's target klp_objects sits on such a +function for a long time. 
Meanwhile, load and unload other target +kernel modules while the livepatch transition is in progress. + +- load busy target module (30s sleep) +- load livepatch +- load target module +- unload target module +- disable livepatch +- unload livepatch +- unload busy target module + + +Load the "busy" kernel module, this time make it do 30 seconds worth of +work: + + % insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30 + [ 110.993362] livepatch_callbacks_busymod: livepatch_callbacks_mod_init + [ 110.994059] livepatch_callbacks_busymod: busymod_work_func, sleeping 30 seconds ... + +Meanwhile, the livepatch is loaded. Notice that the patch transition +does not complete as the targeted "busy" module is sitting on a +to-be-patched function: + + % insmod samples/livepatch/livepatch-callbacks-demo.ko + [ 113.000309] livepatch: enabling patch 'livepatch_callbacks_demo' + [ 113.000764] livepatch: 'livepatch_callbacks_demo': initializing patching transition + [ 113.000791] livepatch_callbacks_demo: pre_patch_callback: vmlinux + [ 113.001289] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state + [ 113.005208] livepatch: 'livepatch_callbacks_demo': starting patching transition + +Load a second target module (this one is an ordinary idle kernel +module). Note that *no* post-patch callbacks will be executed while the +livepatch is still in transition: + + % insmod samples/livepatch/livepatch-callbacks-mod.ko + [ 115.012740] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod' + [ 115.013406] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init + [ 115.015315] livepatch_callbacks_mod: livepatch_callbacks_mod_init + +Request an unload of the simple kernel module. The patch is still +transitioning, so its pre-unpatch callbacks are skipped: + + % rmmod samples/livepatch/livepatch-callbacks-mod.ko + [ 117.022626] livepatch_callbacks_mod: livepatch_callbacks_mod_exit + [ 117.023376] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod' + [ 117.024533] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away + +Finally the livepatch is disabled. 
Since none of the patch's +klp_object's post-patch callbacks executed, the remaining klp_object's +pre-unpatch callbacks are skipped: + + % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + [ 119.035408] livepatch: 'livepatch_callbacks_demo': reversing transition from patching to unpatching + [ 119.035485] livepatch: 'livepatch_callbacks_demo': starting unpatching transition + [ 119.711166] livepatch: 'livepatch_callbacks_demo': completing unpatching transition + [ 119.714179] livepatch_callbacks_demo: post_unpatch_callback: vmlinux + [ 119.714653] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state + [ 119.715437] livepatch: 'livepatch_callbacks_demo': unpatching complete + + % rmmod samples/livepatch/livepatch-callbacks-demo.ko + % rmmod samples/livepatch/livepatch-callbacks-busymod.ko + [ 141.279111] livepatch_callbacks_busymod: busymod_work_func exit + [ 141.279760] livepatch_callbacks_busymod: livepatch_callbacks_mod_exit diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index d08eddc00497..fc5c1be3f6f4 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -87,10 +87,35 @@ struct klp_func { bool transition; }; +struct klp_object; + +/** + * struct klp_callbacks - pre/post live-(un)patch callback structure + * @pre_patch: executed before code patching + * @post_patch: executed after code patching + * @pre_unpatch: executed before code unpatching + * @post_unpatch: executed after code unpatching + * @post_unpatch_enabled: flag indicating if post-unpatch callback + * should run + * + * All callbacks are optional. Only the pre-patch callback, if provided, + * will be unconditionally executed. If the parent klp_object fails to + * patch for any reason, including a non-zero error status returned from + * the pre-patch callback, no further callbacks will be executed. 
+ */ +struct klp_callbacks { + int (*pre_patch)(struct klp_object *obj); + void (*post_patch)(struct klp_object *obj); + void (*pre_unpatch)(struct klp_object *obj); + void (*post_unpatch)(struct klp_object *obj); + bool post_unpatch_enabled; +}; + /** * struct klp_object - kernel object structure for live patching * @name: module name (or NULL for vmlinux) * @funcs: function entries for functions to be patched in the object + * @callbacks: functions to be executed pre/post (un)patching * @kobj: kobject for sysfs resources * @mod: kernel module associated with the patched object * (NULL for vmlinux) @@ -100,6 +125,7 @@ struct klp_object { /* external */ const char *name; struct klp_func *funcs; + struct klp_callbacks callbacks; /* internal */ struct kobject kobj; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index b9628e43c78f..cafb5a84417d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj) return obj->name; } -static bool klp_is_object_loaded(struct klp_object *obj) -{ - return !obj->name || obj->mod; -} - /* sets obj->mod if object is not vmlinux and module is found */ static void klp_find_object_module(struct klp_object *obj) { @@ -285,6 +280,8 @@ static int klp_write_object_relocations(struct module *pmod, static int __klp_disable_patch(struct klp_patch *patch) { + struct klp_object *obj; + if (klp_transition_patch) return -EBUSY; @@ -295,6 +292,10 @@ static int __klp_disable_patch(struct klp_patch *patch) klp_init_transition(patch, KLP_UNPATCHED); + klp_for_each_object(patch, obj) + if (patch->enabled && obj->patched) + klp_pre_unpatch_callback(obj); + /* * Enforce the order of the func->transition writes in * klp_init_transition() and the TIF_PATCH_PENDING writes in @@ -388,13 +389,18 @@ static int __klp_enable_patch(struct klp_patch *patch) if (!klp_is_object_loaded(obj)) continue; - ret = klp_patch_object(obj); + ret = klp_pre_patch_callback(obj); if (ret) { - pr_warn("failed to enable patch '%s'\n", - patch->mod->name); + pr_warn("pre-patch callback failed for object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } - klp_cancel_transition(); - return ret; + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to patch object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; } } @@ -403,6 +409,11 @@ static int __klp_enable_patch(struct klp_patch *patch) patch->enabled = true; return 0; +err: + pr_warn("failed to enable patch '%s'\n", patch->mod->name); + + klp_cancel_transition(); + return ret; } /** @@ -871,13 +882,27 @@ int klp_module_coming(struct module *mod) pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); + ret = klp_pre_patch_callback(obj); + if (ret) { + pr_warn("pre-patch callback failed for object '%s'\n", + obj->name); + goto err; + } + ret = klp_patch_object(obj); if (ret) { pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); + + if (patch != klp_transition_patch) + klp_post_unpatch_callback(obj); + goto err; } + if (patch != klp_transition_patch) + klp_post_patch_callback(obj); + break; } } @@ -927,9 +952,15 @@ void klp_module_going(struct module *mod) * is in transition. 
	 */
 	if (patch->enabled || patch == klp_transition_patch) {
+
+			if (patch != klp_transition_patch)
+				klp_pre_unpatch_callback(obj);
+
 			pr_notice("reverting patch '%s' on unloading module '%s'\n",
 				  patch->mod->name, obj->mod->name);
 			klp_unpatch_object(obj);
+
+			klp_post_unpatch_callback(obj);
 		}
 
 		klp_free_object_loaded(obj);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index c74f24c47837..6fc907b54e71 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -1,6 +1,44 @@
 #ifndef _LIVEPATCH_CORE_H
 #define _LIVEPATCH_CORE_H
 
+#include <linux/livepatch.h>
+
 extern struct mutex klp_mutex;
 
+static inline bool klp_is_object_loaded(struct klp_object *obj)
+{
+	return !obj->name || obj->mod;
+}
+
+static inline int klp_pre_patch_callback(struct klp_object *obj)
+{
+	int ret;
+
+	ret = (obj->callbacks.pre_patch) ?
+	      (*obj->callbacks.pre_patch)(obj) : 0;
+
+	obj->callbacks.post_unpatch_enabled = !ret;
+
+	return ret;
+}
+
+static inline void klp_post_patch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.post_patch)
+		(*obj->callbacks.post_patch)(obj);
+}
+
+static inline void klp_pre_unpatch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.pre_unpatch)
+		(*obj->callbacks.pre_unpatch)(obj);
+}
+
+static inline void klp_post_unpatch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.post_unpatch_enabled &&
+	    obj->callbacks.post_unpatch)
+		(*obj->callbacks.post_unpatch)(obj);
+}
+
 #endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 52c4e907c14b..82d584225dc6 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include "core.h"
 #include "patch.h"
 #include "transition.h"
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index b004a1fb6032..7bf55b7f3687 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -109,9 +109,6 @@ static void klp_complete_transition(void)
 		}
 	}
 
-	if (klp_target_state == KLP_UNPATCHED && !immediate_func)
-		module_put(klp_transition_patch->mod);
-
 	/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
 	if (klp_target_state == KLP_PATCHED)
 		klp_synchronize_transition();
@@ -130,6 +127,24 @@ static void klp_complete_transition(void)
 	}
 
 done:
+	klp_for_each_object(klp_transition_patch, obj) {
+		if (!klp_is_object_loaded(obj))
+			continue;
+		if (klp_target_state == KLP_PATCHED)
+			klp_post_patch_callback(obj);
+		else if (klp_target_state == KLP_UNPATCHED)
+			klp_post_unpatch_callback(obj);
+	}
+
+	/*
+	 * See complementary comment in __klp_enable_patch() for why we
+	 * keep the module reference for immediate patches.
+	 */
+	if (!klp_transition_patch->immediate && !immediate_func &&
+	    klp_target_state == KLP_UNPATCHED) {
+		module_put(klp_transition_patch->mod);
+	}
+
 	klp_target_state = KLP_UNDEFINED;
 	klp_transition_patch = NULL;
 }
diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile
index 539e81d433cd..2472ce39a18d 100644
--- a/samples/livepatch/Makefile
+++ b/samples/livepatch/Makefile
@@ -2,3 +2,6 @@ obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o
diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c
new file mode 100644
index 000000000000..80d06e103f1b
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-busymod.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone.  See the "Usage"
+ * section of livepatch-callbacks-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+
+static int sleep_secs;
+module_param(sleep_secs, int, 0644);
+MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)");
+
+static void busymod_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(work, busymod_work_func);
+
+static void busymod_work_func(struct work_struct *work)
+{
+	pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs);
+	msleep(sleep_secs * 1000);
+	pr_info("%s exit\n", __func__);
+}
+
+static int livepatch_callbacks_mod_init(void)
+{
+	pr_info("%s\n", __func__);
+	schedule_delayed_work(&work,
+		msecs_to_jiffies(1000 * 0));
+	return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+	cancel_delayed_work_sync(&work);
+	pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c
new file mode 100644
index 000000000000..3d115bd68442
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-demo.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Demonstration of registering livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - load the simple module
+ *
+ *   insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ *
+ * Step 2 - load the demonstration livepatch (with callbacks)
+ *
+ *   insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *
+ * Step 3 - cleanup
+ *
+ *   echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ *   rmmod livepatch_callbacks_demo
+ *   rmmod livepatch_callbacks_mod
+ *
+ * Watch dmesg output to see livepatch enablement, callback execution
+ * and patching operations for both vmlinux and module targets.
+ *
+ * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and
+ *       livepatch-callbacks-demo.ko to observe what happens when a
+ *       target module is loaded after a livepatch with callbacks.
+ *
+ * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch
+ *       callback return status.  Try setting up a non-zero status
+ *       such as -19 (-ENODEV):
+ *
+ *       # Load demo livepatch, vmlinux is patched
+ *       insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *       # Setup next pre-patch callback to return -ENODEV
+ *       echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
+ *
+ *       # Module loader refuses to load the target module
+ *       insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *       insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
+ *
+ * NOTE: There is a second target module,
+ *       livepatch-callbacks-busymod.ko, available for experimenting
+ *       with livepatch (un)patch callbacks.  This module contains
+ *       a 'sleep_secs' parameter that parks the module on one of the
+ *       functions that the livepatch demo module wants to patch.
+ *       Modifying this value and tweaking the order of module loads can
+ *       effectively demonstrate stalled patch transitions:
+ *
+ *       # Load a target module, let it park on 'busymod_work_func' for
+ *       # thirty seconds
+ *       insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
+ *
+ *       # Meanwhile load the livepatch
+ *       insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *       # ... then load and unload another target module while the
then load and unload another target module while the + * # transition is in progress + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * rmmod samples/livepatch/livepatch-callbacks-mod.ko + * + * # Finally cleanup + * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + * rmmod samples/livepatch/livepatch-callbacks-demo.ko + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static int pre_patch_ret; +module_param(pre_patch_ret, int, 0644); +MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)"); + +static const char *const module_state[] = { + [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state", + [MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Full formed, running module_init", + [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away", + [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up", +}; + +static void callback_info(const char *callback, struct klp_object *obj) +{ + if (obj->mod) + pr_info("%s: %s -> %s\n", callback, obj->mod->name, + module_state[obj->mod->state]); + else + pr_info("%s: vmlinux\n", callback); +} + +/* Executed on object patching (ie, patch enablement) */ +static int pre_patch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); + return pre_patch_ret; +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void post_patch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void pre_unpatch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void post_unpatch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +static void patched_work_func(struct work_struct *work) +{ + pr_info("%s\n", __func__); +} + +static struct klp_func no_funcs[] = { + { } +}; + +static struct klp_func busymod_funcs[] = { + { + .old_name = "busymod_work_func", + .new_func = patched_work_func, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = NULL, /* vmlinux */ + .funcs = no_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { + .name = "livepatch_callbacks_mod", + .funcs = no_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { + .name = "livepatch_callbacks_busymod", + .funcs = busymod_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_callbacks_demo_init(void) +{ + int ret; + + if (!klp_have_reliable_stack() && !patch.immediate) { + /* + * WARNING: Be very careful when using 'patch.immediate' in + * your patches. It's ok to use it for simple patches like + * this, but for more complex patches which change function + * semantics, locking semantics, or data structures, it may not + * be safe. Use of this option will also prevent removal of + * the patch. + * + * See Documentation/livepatch/livepatch.txt for more details. + */ + patch.immediate = true; + pr_notice("The consistency model isn't supported for your architecture. 
Bypassing safety mechanisms and applying the patch immediately.\n"); + } + + ret = klp_register_patch(&patch); + if (ret) + return ret; + ret = klp_enable_patch(&patch); + if (ret) { + WARN_ON(klp_unregister_patch(&patch)); + return ret; + } + return 0; +} + +static void livepatch_callbacks_demo_exit(void) +{ + WARN_ON(klp_unregister_patch(&patch)); +} + +module_init(livepatch_callbacks_demo_init); +module_exit(livepatch_callbacks_demo_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c new file mode 100644 index 000000000000..e610ce29ba44 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-mod.c @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2017 Joe Lawrence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +/* + * livepatch-callbacks-mod.c - (un)patching callbacks demo support module + * + * + * Purpose + * ------- + * + * Simple module to demonstrate livepatch (un)patching callbacks. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-callbacks-demo.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +static int livepatch_callbacks_mod_init(void) +{ + pr_info("%s\n", __func__); + return 0; +} + +static void livepatch_callbacks_mod_exit(void) +{ + pr_info("%s\n", __func__); +} + +module_init(livepatch_callbacks_mod_init); +module_exit(livepatch_callbacks_mod_exit); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From f1d783585486c7c612f277c2a6f0c9bb5a67e463 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 Oct 2017 10:44:54 +0900 Subject: irqdomain: Move revmap_trees_mutex to struct irq_domain The revmap_trees_mutex protects domain->revmap_tree. There is no need for it to be global because it is safe to modify the revmap_tree of two different domains concurrently. Having said that, this would not be an actual bottleneck because interrupt map/unmap operations do not occur very often. Rather, the motivation is to tidy up the code from a data structure point of view.
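To make the resulting locking rule concrete, here is a minimal sketch of the pattern this change moves to: each domain embeds its own mutex, so updates to the revmap trees of two different domains never contend. This is an illustration only; struct demo_domain and demo_domain_insert() are hypothetical names, not kernel API.

#include <linux/mutex.h>
#include <linux/radix-tree.h>

struct demo_domain {
        struct radix_tree_root revmap_tree;     /* INIT_RADIX_TREE'd elsewhere */
        struct mutex revmap_tree_mutex;         /* protects revmap_tree only */
};

static int demo_domain_insert(struct demo_domain *d, unsigned long hwirq,
                              void *data)
{
        int ret;

        /* Only this domain's tree is locked; other domains proceed freely. */
        mutex_lock(&d->revmap_tree_mutex);
        ret = radix_tree_insert(&d->revmap_tree, hwirq, data);
        mutex_unlock(&d->revmap_tree_mutex);

        return ret;
}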
Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 2 ++ kernel/irq/irqdomain.c | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 7d0c6c144708..df162f7a4aad 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -32,6 +32,7 @@ #include #include #include +#include #include struct device_node; @@ -176,6 +177,7 @@ struct irq_domain { unsigned int revmap_direct_max_irq; unsigned int revmap_size; struct radix_tree_root revmap_tree; + struct mutex revmap_tree_mutex; unsigned int linear_revmap[]; }; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b50f737574ae..31d0f9ff4f00 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -21,7 +21,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; static void irq_domain_check_hierarchy(struct irq_domain *domain); @@ -211,6 +210,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); + mutex_init(&domain->revmap_tree_mutex); domain->ops = ops; domain->host_data = host_data; domain->hwirq_max = hwirq_max; @@ -462,9 +462,9 @@ static void irq_domain_clear_mapping(struct irq_domain *domain, if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { - mutex_lock(&revmap_trees_mutex); + mutex_lock(&domain->revmap_tree_mutex); radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&domain->revmap_tree_mutex); } } @@ -475,9 +475,9 @@ static void irq_domain_set_mapping(struct irq_domain *domain, if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = irq_data->irq; } else { - mutex_lock(&revmap_trees_mutex); + mutex_lock(&domain->revmap_tree_mutex); radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&domain->revmap_tree_mutex); } } @@ -1459,11 +1459,11 @@ static void irq_domain_fix_revmap(struct irq_data *d) return; /* Not using radix tree. */ /* Fix up the revmap. */ - mutex_lock(&revmap_trees_mutex); + mutex_lock(&d->domain->revmap_tree_mutex); slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&d->domain->revmap_tree_mutex); } /** -- cgit v1.2.3 From eda0d04acc5e317da675ee93a3f09e7c2e2fa592 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 6 Oct 2017 10:24:00 -0500 Subject: irqchip/gic-v3: Add support for Range Selector (RS) feature A new feature Range Selector (RS) has been added to GIC specification in order to support more than 16 CPUs at affinity level 0. New fields are introduced in SGI system registers (ICC_SGI0R_EL1, ICC_SGI1R_EL1 and ICC_ASGI1R_EL1) to relax an artificial limit of 16 at level 0. - A new RSS field in ICC_CTLR_EL3, ICC_CTLR_EL1 and ICV_CTLR_EL1: [18] - Range Selector Support (RSS) 0b0 = Targeted SGIs with affinity level 0 values of 0-15 are supported. 0b1 = Targeted SGIs with affinity level 0 values of 0-255 are supported. 
- A new RS field in ICC_SGI0R_EL1, ICC_SGI1R_EL1 and ICC_ASGI1R_EL1: [47:44] - RangeSelector (RS) which group of 16 TargetList[n] field TargetList[n] represents aff0 value ((RS*16)+n) When ICC_CTLR_EL3.RSS==0 or ICC_CTLR_EL1.RSS==0, RS is RES0. - A new RSS field in GICD_TYPER: [26] - Range Selector Support (RSS) 0b0 = Targeted SGIs with affinity level 0 values of 0-15 are supported. 0b1 = Targeted SGIs with affinity level 0 values of 0-255 are supported. Signed-off-by: Shanker Donthineni Signed-off-by: Marc Zyngier --- arch/arm/include/asm/arch_gicv3.h | 5 ++++ arch/arm64/include/asm/arch_gicv3.h | 5 ++++ drivers/irqchip/irq-gic-v3.c | 50 ++++++++++++++++++++++++++++++------- include/linux/irqchip/arm-gic-v3.h | 4 +++ 4 files changed, 55 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index eee269321923..1070044f5c3f 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -196,6 +196,11 @@ static inline void gic_write_ctlr(u32 val) isb(); } +static inline u32 gic_read_ctlr(void) +{ + return read_sysreg(ICC_CTLR); +} + static inline void gic_write_grpen1(u32 val) { write_sysreg(val, ICC_IGRPEN1); diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index b7e3f74822da..9becba9ab392 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -87,6 +87,11 @@ static inline void gic_write_ctlr(u32 val) isb(); } +static inline u32 gic_read_ctlr(void) +{ + return read_sysreg_s(SYS_ICC_CTLR_EL1); +} + static inline void gic_write_grpen1(u32 val) { write_sysreg_s(val, SYS_ICC_IGRPEN1_EL1); diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index b5df99c6f680..b54b55597ffb 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -55,6 +55,7 @@ struct gic_chip_data { struct irq_domain *domain; u64 redist_stride; u32 nr_redist_regions; + bool has_rss; unsigned int irq_nr; struct partition_desc *ppi_descs[16]; }; @@ -63,7 +64,9 @@ static struct gic_chip_data gic_data __read_mostly; static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE; static struct gic_kvm_info gic_v3_kvm_info; +static DEFINE_PER_CPU(bool, has_rss); +#define MPIDR_RS(mpidr) (((mpidr) & 0xF0UL) >> 4) #define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K) @@ -526,6 +529,10 @@ static void gic_update_vlpi_properties(void) static void gic_cpu_sys_reg_init(void) { + int i, cpu = smp_processor_id(); + u64 mpidr = cpu_logical_map(cpu); + u64 need_rss = MPIDR_RS(mpidr); + /* * Need to check that the SRE bit has actually been set. If * not, it means that SRE is disabled at EL2. We're going to @@ -557,6 +564,30 @@ static void gic_cpu_sys_reg_init(void) /* ... and let's hit the road... 
*/ gic_write_grpen1(1); + + /* Keep the RSS capability status in per_cpu variable */ + per_cpu(has_rss, cpu) = !!(gic_read_ctlr() & ICC_CTLR_EL1_RSS); + + /* Check all the CPUs have capable of sending SGIs to other CPUs */ + for_each_online_cpu(i) { + bool have_rss = per_cpu(has_rss, i) && per_cpu(has_rss, cpu); + + need_rss |= MPIDR_RS(cpu_logical_map(i)); + if (need_rss && (!have_rss)) + pr_crit("CPU%d (%lx) can't SGI CPU%d (%lx), no RSS\n", + cpu, (unsigned long)mpidr, + i, (unsigned long)cpu_logical_map(i)); + } + + /** + * GIC spec says, when ICC_CTLR_EL1.RSS==1 and GICD_TYPER.RSS==0, + * writing ICC_ASGI1R_EL1 register with RS != 0 is a CONSTRAINED + * UNPREDICTABLE choice of : + * - The write is ignored. + * - The RS field is treated as 0. + */ + if (need_rss && (!gic_data.has_rss)) + pr_crit_once("RSS is required but GICD doesn't support it\n"); } static int gic_dist_supports_lpis(void) @@ -591,6 +622,9 @@ static void gic_cpu_init(void) #ifdef CONFIG_SMP +#define MPIDR_TO_SGI_RS(mpidr) (MPIDR_RS(mpidr) << ICC_SGI1R_RS_SHIFT) +#define MPIDR_TO_SGI_CLUSTER_ID(mpidr) ((mpidr) & ~0xFUL) + static int gic_starting_cpu(unsigned int cpu) { gic_cpu_init(); @@ -605,13 +639,6 @@ static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask, u16 tlist = 0; while (cpu < nr_cpu_ids) { - /* - * If we ever get a cluster of more than 16 CPUs, just - * scream and skip that CPU. - */ - if (WARN_ON((mpidr & 0xff) >= 16)) - goto out; - tlist |= 1 << (mpidr & 0xf); next_cpu = cpumask_next(cpu, mask); @@ -621,7 +648,7 @@ static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask, mpidr = cpu_logical_map(cpu); - if (cluster_id != (mpidr & ~0xffUL)) { + if (cluster_id != MPIDR_TO_SGI_CLUSTER_ID(mpidr)) { cpu--; goto out; } @@ -643,6 +670,7 @@ static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq) MPIDR_TO_SGI_AFFINITY(cluster_id, 2) | irq << ICC_SGI1R_SGI_ID_SHIFT | MPIDR_TO_SGI_AFFINITY(cluster_id, 1) | + MPIDR_TO_SGI_RS(cluster_id) | tlist << ICC_SGI1R_TARGET_LIST_SHIFT); pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val); @@ -663,7 +691,7 @@ static void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) smp_wmb(); for_each_cpu(cpu, mask) { - unsigned long cluster_id = cpu_logical_map(cpu) & ~0xffUL; + u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(cpu_logical_map(cpu)); u16 tlist; tlist = gic_compute_target_list(&cpu, mask, cluster_id); @@ -1007,6 +1035,10 @@ static int __init gic_init_bases(void __iomem *dist_base, goto out_free; } + gic_data.has_rss = !!(typer & GICD_TYPER_RSS); + pr_info("Distributor has %sRange Selector support\n", + gic_data.has_rss ? 
"" : "no "); + set_handle_irq(gic_handle_irq); gic_update_vlpi_properties(); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 1ea576c8126f..b8b59989bd73 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -68,6 +68,7 @@ #define GICD_CTLR_ENABLE_SS_G1 (1U << 1) #define GICD_CTLR_ENABLE_SS_G0 (1U << 0) +#define GICD_TYPER_RSS (1U << 26) #define GICD_TYPER_LPIS (1U << 17) #define GICD_TYPER_MBIS (1U << 16) @@ -459,6 +460,7 @@ #define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) #define ICC_CTLR_EL1_A3V_SHIFT 15 #define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) +#define ICC_CTLR_EL1_RSS (0x1 << 18) #define ICC_PMR_EL1_SHIFT 0 #define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) #define ICC_BPR0_EL1_SHIFT 0 @@ -547,6 +549,8 @@ #define ICC_SGI1R_AFFINITY_2_SHIFT 32 #define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 +#define ICC_SGI1R_RS_SHIFT 44 +#define ICC_SGI1R_RS_MASK (0xfULL << ICC_SGI1R_RS_SHIFT) #define ICC_SGI1R_AFFINITY_3_SHIFT 48 #define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) -- cgit v1.2.3 From ab60491ee5d346557f152c7e8d3e7238c9b96c5c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 8 Oct 2017 18:48:06 +0100 Subject: irqchip/gic-v3-its: Make GICv4_ITS_LIST_MAX globally available As we're about to make use of the maximum number of ITSs in a GICv4 system, let's make this value global (and rename it to GICv4_ITS_LIST_MAX). Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 10 ++-------- include/linux/irqchip/arm-gic-v4.h | 6 ++++++ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 994f0879e7df..a63b4ee34860 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -156,12 +156,6 @@ static DEFINE_SPINLOCK(its_lock); static struct rdists *gic_rdists; static struct irq_domain *its_parent; -/* - * We have a maximum number of 16 ITSs in the whole system if we're - * using the ITSList mechanism - */ -#define ITS_LIST_MAX 16 - static unsigned long its_list_map; static u16 vmovp_seq_num; static DEFINE_RAW_SPINLOCK(vmovp_lock); @@ -2988,8 +2982,8 @@ static int __init its_compute_its_list_map(struct resource *res, * locking. Should this change, we should address * this. */ - its_number = find_first_zero_bit(&its_list_map, ITS_LIST_MAX); - if (its_number >= ITS_LIST_MAX) { + its_number = find_first_zero_bit(&its_list_map, GICv4_ITS_LIST_MAX); + if (its_number >= GICv4_ITS_LIST_MAX) { pr_err("ITS@%pa: No ITSList entry available!\n", &res->start); return -EINVAL; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 58a4d89aa82c..e26a668826e6 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -20,6 +20,12 @@ struct its_vpe; +/* + * Maximum number of ITTs when GITS_TYPER.VMOVP == 0, using the + * ITSList mechanism to perform inter-ITS synchronization. + */ +#define GICv4_ITS_LIST_MAX 16 + /* Embedded in kvm.arch */ struct its_vm { struct fwnode_handle *fwnode; -- cgit v1.2.3 From 2247e1bf70639642b1c1375aa9176ccd95736400 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 8 Oct 2017 18:50:36 +0100 Subject: irqchip/gic-v3-its: Limit scope of VPE mapping to be per ITS So far, we map all VPEs on all ITSs. 
While this is not wrong, this is quite a big hammer, as moving a VPE around requires all ITSs to be synchronized. Needless to say, this is an expensive proposition. Instead, let's switch to a mode where we issue VMAPP commands only on ITSs that are actually involved in reporting interrupts to the given VM. For that purpose, we refcount the number of interrupts that are mapped for this VM on each ITS, performing the map/unmap operations as required. This refcount then allows us to issue VMOVP only to the ITSs that need to know about this VM. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 75 ++++++++++++++++++++++++++++++++++++++ include/linux/irqchip/arm-gic-v4.h | 1 + 2 files changed, 76 insertions(+) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index b14585d6397b..dc0ece20f964 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -971,6 +971,9 @@ static void its_send_vmovp(struct its_vpe *vpe) if (!its->is_v4) continue; + if (!vpe->its_vm->vlpi_count[its->list_nr]) + continue; + desc.its_vmovp_cmd.col = &its->collections[col_id]; its_send_single_vcommand(its, its_build_vmovp_cmd, &desc); } @@ -1154,6 +1157,58 @@ static int its_irq_set_irqchip_state(struct irq_data *d, return 0; } +static void its_map_vm(struct its_node *its, struct its_vm *vm) +{ + unsigned long flags; + + /* Not using the ITS list? Everything is always mapped. */ + if (!its_list_map) + return; + + raw_spin_lock_irqsave(&vmovp_lock, flags); + + /* + * If the VM wasn't mapped yet, iterate over the vpes and get + * them mapped now. + */ + vm->vlpi_count[its->list_nr]++; + + if (vm->vlpi_count[its->list_nr] == 1) { + int i; + + for (i = 0; i < vm->nr_vpes; i++) { + struct its_vpe *vpe = vm->vpes[i]; + + /* Map the VPE to the first possible CPU */ + vpe->col_idx = cpumask_first(cpu_online_mask); + its_send_vmapp(its, vpe, true); + its_send_vinvall(its, vpe); + } + } + + raw_spin_unlock_irqrestore(&vmovp_lock, flags); +} + +static void its_unmap_vm(struct its_node *its, struct its_vm *vm) +{ + unsigned long flags; + + /* Not using the ITS list? Everything is always mapped. */ + if (!its_list_map) + return; + + raw_spin_lock_irqsave(&vmovp_lock, flags); + + if (!--vm->vlpi_count[its->list_nr]) { + int i; + + for (i = 0; i < vm->nr_vpes; i++) + its_send_vmapp(its, vm->vpes[i], false); + } + + raw_spin_unlock_irqrestore(&vmovp_lock, flags); +} + static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); @@ -1189,6 +1244,9 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) /* Already mapped, move it around */ its_send_vmovi(its_dev, event); } else { + /* Ensure all the VPEs are mapped on this ITS */ + its_map_vm(its_dev->its, info->map->vm); + /* Drop the physical mapping */ + its_send_discard(its_dev, event); @@ -1250,6 +1308,9 @@ static int its_vlpi_unmap(struct irq_data *d) LPI_PROP_ENABLED | LPI_PROP_GROUP1)); + /* Potentially unmap the VM from this ITS */ + its_unmap_vm(its_dev->its, its_dev->event_map.vm); + /* * Drop the refcount and make the device available again if * this was the last VLPI.
@@ -2463,6 +2524,9 @@ static void its_vpe_invall(struct its_vpe *vpe) if (!its->is_v4) continue; + if (its_list_map && !vpe->its_vm->vlpi_count[its->list_nr]) + continue; + its_send_vinvall(its, vpe); } } @@ -2713,6 +2777,10 @@ static int its_vpe_irq_domain_activate(struct irq_domain *domain, struct its_vpe *vpe = irq_data_get_irq_chip_data(d); struct its_node *its; + /* If we use the list map, we issue VMAPP on demand... */ + if (its_list_map) + return true; + /* Map the VPE to the first possible CPU */ vpe->col_idx = cpumask_first(cpu_online_mask); @@ -2733,6 +2801,13 @@ static void its_vpe_irq_domain_deactivate(struct irq_domain *domain, struct its_vpe *vpe = irq_data_get_irq_chip_data(d); struct its_node *its; + /* + * If we use the list map, we unmap the VPE once no VLPIs are + * associated with the VM. + */ + if (its_list_map) + return; + list_for_each_entry(its, &its_nodes, entry) { if (!its->is_v4) continue; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index e26a668826e6..43cde15f221b 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -36,6 +36,7 @@ struct its_vm { irq_hw_number_t db_lpi_base; unsigned long *db_bitmap; int nr_db_lpis; + u32 vlpi_count[GICv4_ITS_LIST_MAX]; }; /* Embedded in kvm_vcpu.arch */ -- cgit v1.2.3 From 7a0947e755084b918e33242fd558e55cb443408e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 17 Oct 2017 17:16:52 -0700 Subject: dql: make dql_init return void dql_init always returned 0, and the only place that uses it in network core code didn't care about the return value anyway. Signed-off-by: Stephen Hemminger Acked-by: Hiroaki SHIMODA Signed-off-by: David S. Miller --- include/linux/dynamic_queue_limits.h | 2 +- lib/dynamic_queue_limits.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index a4be70398ce1..f69f98541953 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -98,7 +98,7 @@ void dql_completed(struct dql *dql, unsigned int count); void dql_reset(struct dql *dql); /* Initialize dql state */ -int dql_init(struct dql *dql, unsigned hold_time); +void dql_init(struct dql *dql, unsigned int hold_time); #endif /* _KERNEL_ */ diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index f346715e2255..dbe61c4c2a97 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -127,12 +127,11 @@ void dql_reset(struct dql *dql) } EXPORT_SYMBOL(dql_reset); -int dql_init(struct dql *dql, unsigned hold_time) +void dql_init(struct dql *dql, unsigned int hold_time) { dql->max_limit = DQL_MAX_LIMIT; dql->min_limit = 0; dql->slack_hold_time = hold_time; dql_reset(dql); - return 0; } EXPORT_SYMBOL(dql_init); -- cgit v1.2.3 From 93b138dd9d6e01cd360f347512b6c5bf6f0c4ce2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Sep 2017 00:01:44 +0900 Subject: printk: simplify no_printk() Commit 069f0cd00df0 ("printk: Make the printk*once() variants return a value") surrounded the macro implementation with ({ ... }). Now, the inner do { ... } while (0); is redundant. 
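The point of the surviving if (0) is worth spelling out: the call is compiled away, yet the format string and arguments are still type-checked, and the whole statement expression evaluates to 0. A user-space approximation (a sketch relying on the same GNU C statement-expression extension; no_printf() is a made-up name, not a real API):

#include <stdio.h>

#define no_printf(fmt, ...)                     \
({                                              \
        if (0)                                  \
                printf(fmt, ##__VA_ARGS__);     \
        0;                                      \
})

int main(void)
{
        /* The printf() is eliminated, but "%d" vs. 42 is still checked. */
        int ret = no_printf("value: %d\n", 42);

        return ret;
}

With optimization enabled the macro compiles down to a constant 0, while passing a mismatched argument still triggers the usual -Wformat warning, which is exactly the behavior no_printk() preserves.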
Link: http://lkml.kernel.org/r/1505660504-11059-1-git-send-email-yamada.masahiro@socionext.com Cc: Andrew Morton Cc: Ingo Molnar Cc: Masahiro Yamada Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Masahiro Yamada Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- include/linux/printk.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index e10f27468322..7911f7364346 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -131,10 +131,8 @@ struct va_format { */ #define no_printk(fmt, ...) \ ({ \ - do { \ - if (0) \ - printk(fmt, ##__VA_ARGS__); \ - } while (0); \ + if (0) \ + printk(fmt, ##__VA_ARGS__); \ 0; \ }) -- cgit v1.2.3 From aa293583f0fe8b1634aeadbea06b4d0d04c30a95 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Mon, 16 Oct 2017 17:18:40 +0200 Subject: configfs: make ci_type field, some pointers and function arguments const The ci_type field of the config_item structure do not modify the fields of the config_item_type structure it points to. And the other pointers initialized with ci_type do not modify the fields as well. So, make the ci_type field and the pointers initialized with ci_type as const. Make the struct config_item_type *type function argument of functions config_{item/group}_init_type_name const as the argument in both the functions is only stored in the ci_type field of a config_item structure which is now made const. Make the argument of configfs_register_default_group const as it is only passed to the argument of the function config_group_init_type_name which is now const. Signed-off-by: Bhumika Goyal Acked-by: Greg Kroah-Hartman Signed-off-by: Christoph Hellwig --- fs/configfs/dir.c | 10 +++++----- fs/configfs/item.c | 6 +++--- fs/configfs/symlink.c | 4 ++-- include/linux/configfs.h | 8 ++++---- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 56fb26127fef..577cff24707b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -584,7 +584,7 @@ static void detach_attrs(struct config_item * item) static int populate_attrs(struct config_item *item) { - struct config_item_type *t = item->ci_type; + const struct config_item_type *t = item->ci_type; struct configfs_attribute *attr; struct configfs_bin_attribute *bin_attr; int error = 0; @@ -901,7 +901,7 @@ static void configfs_detach_group(struct config_item *item) static void client_disconnect_notify(struct config_item *parent_item, struct config_item *item) { - struct config_item_type *type; + const struct config_item_type *type; type = parent_item->ci_type; BUG_ON(!type); @@ -920,7 +920,7 @@ static void client_disconnect_notify(struct config_item *parent_item, static void client_drop_item(struct config_item *parent_item, struct config_item *item) { - struct config_item_type *type; + const struct config_item_type *type; type = parent_item->ci_type; BUG_ON(!type); @@ -1260,7 +1260,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct config_item *parent_item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; - struct config_item_type *type; + const struct config_item_type *type; struct module *subsys_owner = NULL, *new_item_owner = NULL; char *name; @@ -1810,7 +1810,7 @@ EXPORT_SYMBOL(configfs_unregister_group); struct config_group * configfs_register_default_group(struct config_group *parent_group, const char *name, - struct 
config_item_type *item_type) + const struct config_item_type *item_type) { int ret; struct config_group *group; diff --git a/fs/configfs/item.c b/fs/configfs/item.c index a66f6624d899..88f266efc09b 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL(config_item_set_name); void config_item_init_type_name(struct config_item *item, const char *name, - struct config_item_type *type) + const struct config_item_type *type) { config_item_set_name(item, "%s", name); item->ci_type = type; @@ -122,7 +122,7 @@ void config_item_init_type_name(struct config_item *item, EXPORT_SYMBOL(config_item_init_type_name); void config_group_init_type_name(struct config_group *group, const char *name, - struct config_item_type *type) + const struct config_item_type *type) { config_item_set_name(&group->cg_item, "%s", name); group->cg_item.ci_type = type; @@ -148,7 +148,7 @@ EXPORT_SYMBOL(config_item_get_unless_zero); static void config_item_cleanup(struct config_item *item) { - struct config_item_type *t = item->ci_type; + const struct config_item_type *t = item->ci_type; struct config_group *s = item->ci_group; struct config_item *parent = item->ci_parent; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index c8aabba502f6..78ffc2699993 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -138,7 +138,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna struct configfs_dirent *sd; struct config_item *parent_item; struct config_item *target_item = NULL; - struct config_item_type *type; + const struct config_item_type *type; sd = dentry->d_parent->d_fsdata; /* @@ -186,7 +186,7 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) struct configfs_dirent *sd = dentry->d_fsdata; struct configfs_symlink *sl; struct config_item *parent_item; - struct config_item_type *type; + const struct config_item_type *type; int ret; ret = -EPERM; /* What lack-of-symlink returns */ diff --git a/include/linux/configfs.h b/include/linux/configfs.h index c96709049683..90b90f8baf99 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -58,7 +58,7 @@ struct config_item { struct list_head ci_entry; struct config_item *ci_parent; struct config_group *ci_group; - struct config_item_type *ci_type; + const struct config_item_type *ci_type; struct dentry *ci_dentry; }; @@ -72,7 +72,7 @@ static inline char *config_item_name(struct config_item * item) extern void config_item_init_type_name(struct config_item *item, const char *name, - struct config_item_type *type); + const struct config_item_type *type); extern struct config_item *config_item_get(struct config_item *); extern struct config_item *config_item_get_unless_zero(struct config_item *); @@ -101,7 +101,7 @@ struct config_group { extern void config_group_init(struct config_group *group); extern void config_group_init_type_name(struct config_group *group, const char *name, - struct config_item_type *type); + const struct config_item_type *type); static inline struct config_group *to_config_group(struct config_item *item) { @@ -261,7 +261,7 @@ void configfs_remove_default_groups(struct config_group *group); struct config_group * configfs_register_default_group(struct config_group *parent_group, const char *name, - struct config_item_type *item_type); + const struct config_item_type *item_type); void configfs_unregister_default_group(struct config_group *group); /* These functions can sleep and can alloc with GFP_KERNEL */ -- cgit v1.2.3 From 
612a462acbb9f3bfc7c8433b44118b0a156560da Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Mon, 16 Oct 2017 17:18:43 +0200 Subject: iio: make function argument and some structures const Make the argument of the functions iio_sw{d/t}_group_init_type_name const as they are only passed to the function config_group_init_type_name having the argument as const. Make the config_item_type structures const as they are either passed to the functions having the argument as const or they are stored in the const "ci_type" field of a config_item structure. Signed-off-by: Bhumika Goyal Acked-by: Jonathan Cameron Signed-off-by: Christoph Hellwig --- drivers/iio/dummy/iio_simple_dummy.c | 2 +- drivers/iio/industrialio-configfs.c | 2 +- drivers/iio/industrialio-sw-device.c | 6 +++--- drivers/iio/industrialio-sw-trigger.c | 6 +++--- drivers/iio/trigger/iio-trig-hrtimer.c | 2 +- drivers/iio/trigger/iio-trig-loop.c | 2 +- include/linux/iio/sw_device.h | 2 +- include/linux/iio/sw_trigger.h | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/dummy/iio_simple_dummy.c b/drivers/iio/dummy/iio_simple_dummy.c index ad3410e528b6..e177a0862b2b 100644 --- a/drivers/iio/dummy/iio_simple_dummy.c +++ b/drivers/iio/dummy/iio_simple_dummy.c @@ -26,7 +26,7 @@ #include #include "iio_simple_dummy.h" -static struct config_item_type iio_dummy_type = { +static const struct config_item_type iio_dummy_type = { .ct_owner = THIS_MODULE, }; diff --git a/drivers/iio/industrialio-configfs.c b/drivers/iio/industrialio-configfs.c index 45ce2bc47180..5a0aae119369 100644 --- a/drivers/iio/industrialio-configfs.c +++ b/drivers/iio/industrialio-configfs.c @@ -17,7 +17,7 @@ #include #include -static struct config_item_type iio_root_group_type = { +static const struct config_item_type iio_root_group_type = { .ct_owner = THIS_MODULE, }; diff --git a/drivers/iio/industrialio-sw-device.c b/drivers/iio/industrialio-sw-device.c index 81b49cfca452..90df97c542f6 100644 --- a/drivers/iio/industrialio-sw-device.c +++ b/drivers/iio/industrialio-sw-device.c @@ -19,9 +19,9 @@ #include static struct config_group *iio_devices_group; -static struct config_item_type iio_device_type_group_type; +static const struct config_item_type iio_device_type_group_type; -static struct config_item_type iio_devices_group_type = { +static const struct config_item_type iio_devices_group_type = { .ct_owner = THIS_MODULE, }; @@ -156,7 +156,7 @@ static struct configfs_group_operations device_ops = { .drop_item = &device_drop_group, }; -static struct config_item_type iio_device_type_group_type = { +static const struct config_item_type iio_device_type_group_type = { .ct_group_ops = &device_ops, .ct_owner = THIS_MODULE, }; diff --git a/drivers/iio/industrialio-sw-trigger.c b/drivers/iio/industrialio-sw-trigger.c index 8d24fb159cc9..bc6b7fb43e3a 100644 --- a/drivers/iio/industrialio-sw-trigger.c +++ b/drivers/iio/industrialio-sw-trigger.c @@ -19,9 +19,9 @@ #include static struct config_group *iio_triggers_group; -static struct config_item_type iio_trigger_type_group_type; +static const struct config_item_type iio_trigger_type_group_type; -static struct config_item_type iio_triggers_group_type = { +static const struct config_item_type iio_triggers_group_type = { .ct_owner = THIS_MODULE, }; @@ -156,7 +156,7 @@ static struct configfs_group_operations trigger_ops = { .drop_item = &trigger_drop_group, }; -static struct config_item_type iio_trigger_type_group_type = { +static const struct config_item_type 
iio_trigger_type_group_type = { .ct_group_ops = &trigger_ops, .ct_owner = THIS_MODULE, }; diff --git a/drivers/iio/trigger/iio-trig-hrtimer.c b/drivers/iio/trigger/iio-trig-hrtimer.c index a1cad6cc2e0f..ce7d120b348e 100644 --- a/drivers/iio/trigger/iio-trig-hrtimer.c +++ b/drivers/iio/trigger/iio-trig-hrtimer.c @@ -30,7 +30,7 @@ struct iio_hrtimer_info { ktime_t period; }; -static struct config_item_type iio_hrtimer_type = { +static const struct config_item_type iio_hrtimer_type = { .ct_owner = THIS_MODULE, }; diff --git a/drivers/iio/trigger/iio-trig-loop.c b/drivers/iio/trigger/iio-trig-loop.c index dc6be28f96fe..745a01f04a27 100644 --- a/drivers/iio/trigger/iio-trig-loop.c +++ b/drivers/iio/trigger/iio-trig-loop.c @@ -36,7 +36,7 @@ struct iio_loop_info { struct task_struct *task; }; -static struct config_item_type iio_loop_type = { +static const struct config_item_type iio_loop_type = { .ct_owner = THIS_MODULE, }; diff --git a/include/linux/iio/sw_device.h b/include/linux/iio/sw_device.h index fa7931933067..8642b91a7577 100644 --- a/include/linux/iio/sw_device.h +++ b/include/linux/iio/sw_device.h @@ -60,7 +60,7 @@ void iio_sw_device_type_configfs_unregister(struct iio_sw_device_type *dt); static inline void iio_swd_group_init_type_name(struct iio_sw_device *d, const char *name, - struct config_item_type *type) + const struct config_item_type *type) { #if IS_ENABLED(CONFIG_CONFIGFS_FS) config_group_init_type_name(&d->group, name, type); diff --git a/include/linux/iio/sw_trigger.h b/include/linux/iio/sw_trigger.h index c97eab67558f..0c43738a9e24 100644 --- a/include/linux/iio/sw_trigger.h +++ b/include/linux/iio/sw_trigger.h @@ -60,7 +60,7 @@ void iio_sw_trigger_type_configfs_unregister(struct iio_sw_trigger_type *tt); static inline void iio_swt_group_init_type_name(struct iio_sw_trigger *t, const char *name, - struct config_item_type *type) + const struct config_item_type *type) { #if IS_ENABLED(CONFIG_CONFIGFS_FS) config_group_init_type_name(&t->group, name, type); -- cgit v1.2.3 From d89e2378a97fafdc74cbf997e7c88af75b81610a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 12 Oct 2017 16:56:14 +0100 Subject: drivers: flag buses which demand DMA configuration We do not want the common dma_configure() pathway to apply indiscriminately to all devices, since there are plenty of buses which do not have DMA capability, and if their child devices were used for DMA API calls it would only be indicative of a driver bug. However, there are a number of buses for which DMA is implicitly expected even when not described by firmware - those we whitelist with an automatic opt-in to dma_configure(), assuming that the DMA address space and the physical address space are equivalent if not otherwise specified. Commit 723288836628 ("of: restrict DMA configuration") introduced a short-term fix by comparing explicit bus types, but this approach is far from pretty, doesn't scale well, and fails to cope at all with bus drivers which may be built as modules, like host1x. Let's refine things by making that opt-in a property of the bus type, which neatly addresses those problems and lets the decision of whether firmware description of DMA capability should be optional or mandatory stay internal to the bus drivers themselves. 
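For illustration, this is roughly what the opt-in looks like from a bus driver's point of view; the foo bus below is entirely hypothetical, but the .force_dma field matches the flag this patch adds to struct bus_type:

#include <linux/device.h>
#include <linux/module.h>

static int foo_bus_match(struct device *dev, struct device_driver *drv)
{
        /* A real bus would compare device and driver IDs here. */
        return 1;
}

/*
 * Devices on this bus are assumed DMA capable even when firmware is
 * silent, so dma_configure() will give them a default 1:1 setup.
 */
struct bus_type foo_bus_type = {
        .name           = "foo",
        .match          = foo_bus_match,
        .force_dma      = true,
};

static int __init foo_bus_init(void)
{
        return bus_register(&foo_bus_type);
}
module_init(foo_bus_init);

MODULE_LICENSE("GPL");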
Signed-off-by: Robin Murphy Acked-by: Rob Herring Acked-by: Greg Kroah-Hartman Acked-by: Thierry Reding Signed-off-by: Christoph Hellwig --- drivers/amba/bus.c | 1 + drivers/base/platform.c | 1 + drivers/gpu/host1x/bus.c | 1 + drivers/of/device.c | 8 +------- drivers/pci/pci-driver.c | 1 + include/linux/device.h | 4 ++++ 6 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index e0f74ddc22b7..594c228d2f02 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -195,6 +195,7 @@ struct bus_type amba_bustype = { .match = amba_match, .uevent = amba_uevent, .pm = &amba_pm, + .force_dma = true, }; static int __init amba_init(void) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 9045c5f3734e..c203fb90c1a0 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1143,6 +1143,7 @@ struct bus_type platform_bus_type = { .match = platform_match, .uevent = platform_uevent, .pm = &platform_dev_pm_ops, + .force_dma = true, }; EXPORT_SYMBOL_GPL(platform_bus_type); diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index f9cde03030fd..ed03b3243195 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -320,6 +320,7 @@ struct bus_type host1x_bus_type = { .name = "host1x", .match = host1x_device_match, .pm = &host1x_device_pm_ops, + .force_dma = true, }; static void __host1x_device_del(struct host1x_device *device) diff --git a/drivers/of/device.c b/drivers/of/device.c index 64b710265d39..25bddf9c9fe1 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -9,9 +9,7 @@ #include #include #include -#include #include -#include #include #include "of_private.h" @@ -101,11 +99,7 @@ int of_dma_configure(struct device *dev, struct device_node *np) * DMA configuration regardless of whether "dma-ranges" is * correctly specified or not. */ - if (!dev_is_pci(dev) && -#ifdef CONFIG_ARM_AMBA - dev->bus != &amba_bustype && -#endif - dev->bus != &platform_bus_type) + if (!dev->bus->force_dma) return ret == -ENODEV ? 0 : ret; dma_addr = offset = 0; diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 11bd267fc137..38bdb97b6dc6 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1466,6 +1466,7 @@ struct bus_type pci_bus_type = { .drv_groups = pci_drv_groups, .pm = PCI_PM_OPS_PTR, .num_vf = pci_bus_num_vf, + .force_dma = true, }; EXPORT_SYMBOL(pci_bus_type); diff --git a/include/linux/device.h b/include/linux/device.h index 66fe271c2544..d60f7701e245 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -97,6 +97,8 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *); * @p: The private data of the driver core, only the driver core can * touch this. * @lock_key: Lock class key for use by the lock validator + * @force_dma: Assume devices on this bus should be set up by dma_configure() + * even if DMA capability is not explicitly described by firmware. * * A bus is a channel between the processor and one or more devices. 
For the * purposes of the device model, all devices are connected via a bus, even if @@ -135,6 +137,8 @@ struct bus_type { struct subsys_private *p; struct lock_class_key lock_key; + + bool force_dma; }; extern int __must_check bus_register(struct bus_type *bus); -- cgit v1.2.3 From c9eb6172c328dde7e14812f94f8da87b691e41b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 27 Aug 2017 10:37:15 +0200 Subject: dma-mapping: turn dma_cache_sync into a dma_map_ops method After we removed all the dead wood it turns out only two architectures actually implement dma_cache_sync as a real op: mips and parisc. Add a cache_sync method to struct dma_map_ops and implement it for the mips defualt DMA ops, and the parisc pa11 ops. Note that arm, arc and openrisc support DMA_ATTR_NON_CONSISTENT, but never provided a functional dma_cache_sync implementations, which seems somewhat odd. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- arch/alpha/include/asm/dma-mapping.h | 2 -- arch/cris/include/asm/dma-mapping.h | 6 ------ arch/frv/include/asm/dma-mapping.h | 6 ------ arch/hexagon/include/asm/dma-mapping.h | 3 --- arch/ia64/include/asm/dma-mapping.h | 6 ------ arch/m32r/include/asm/dma-mapping.h | 5 ----- arch/m68k/include/asm/dma-mapping.h | 6 ------ arch/metag/include/asm/dma-mapping.h | 10 ---------- arch/microblaze/include/asm/dma-mapping.h | 5 ----- arch/mips/include/asm/dma-mapping.h | 3 --- arch/mips/mm/dma-default.c | 7 +++---- arch/mn10300/include/asm/dma-mapping.h | 6 ------ arch/nios2/include/asm/dma-mapping.h | 9 --------- arch/parisc/include/asm/dma-mapping.h | 8 -------- arch/parisc/kernel/pci-dma.c | 8 ++++++++ arch/powerpc/include/asm/dma-mapping.h | 5 ----- arch/s390/include/asm/dma-mapping.h | 5 ----- arch/sh/include/asm/dma-mapping.h | 6 ------ arch/sparc/include/asm/dma-mapping.h | 8 -------- arch/tile/include/asm/dma-mapping.h | 9 --------- arch/unicore32/include/asm/dma-mapping.h | 5 ----- arch/x86/include/asm/dma-mapping.h | 6 ------ arch/xtensa/include/asm/dma-mapping.h | 5 ----- include/linux/dma-mapping.h | 13 +++++++++++++ 24 files changed, 24 insertions(+), 128 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 5d53666935e6..399a4f49355e 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -8,6 +8,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return dma_ops; } -#define dma_cache_sync(dev, va, size, dir) ((void)0) - #endif /* _ALPHA_DMA_MAPPING_H */ diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h index 256169de3743..e30adde42beb 100644 --- a/arch/cris/include/asm/dma-mapping.h +++ b/arch/cris/include/asm/dma-mapping.h @@ -16,10 +16,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) } #endif -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - #endif diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h index da0e5c9744c4..da24ae943f02 100644 --- a/arch/frv/include/asm/dma-mapping.h +++ b/arch/frv/include/asm/dma-mapping.h @@ -14,10 +14,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &frv_dma_ops; } -static inline -void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - #endif /* _ASM_DMA_MAPPING_H */ diff --git 
a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index 463dbc18f853..5208de242e79 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -37,9 +37,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return dma_ops; } -extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction); - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 99dfc1aa9d3c..9e5b5df76ff8 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -44,10 +44,4 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) return daddr; } -static inline void -dma_cache_sync (struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ -} - #endif /* _ASM_IA64_DMA_MAPPING_H */ diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h index aff3ae8b62f7..9e993daed7a0 100644 --- a/arch/m32r/include/asm/dma-mapping.h +++ b/arch/m32r/include/asm/dma-mapping.h @@ -13,11 +13,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &dma_noop_ops; } -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/m68k/include/asm/dma-mapping.h b/arch/m68k/include/asm/dma-mapping.h index 9210e470771b..9a0d559fcc13 100644 --- a/arch/m68k/include/asm/dma-mapping.h +++ b/arch/m68k/include/asm/dma-mapping.h @@ -8,10 +8,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &m68k_dma_ops; } -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - /* we use coherent allocation, so not much to do here. */ -} - #endif /* _M68K_DMA_MAPPING_H */ diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h index ea573be2b6d0..340265dcf839 100644 --- a/arch/metag/include/asm/dma-mapping.h +++ b/arch/metag/include/asm/dma-mapping.h @@ -8,14 +8,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &metag_dma_ops; } -/* - * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to - * do any flushing here. 
- */ -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - #endif diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h index ad448e4aedb6..6b9ea39405b8 100644 --- a/arch/microblaze/include/asm/dma-mapping.h +++ b/arch/microblaze/include/asm/dma-mapping.h @@ -25,9 +25,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &dma_direct_ops; } -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - #endif /* _ASM_MICROBLAZE_DMA_MAPPING_H */ diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index aba71385f9d1..6ea1439430a2 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -26,9 +26,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline void dma_mark_clean(void *addr, size_t size) {} -extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction); - #define arch_setup_dma_ops arch_setup_dma_ops static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index c01bd20d0208..2e2514e00720 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -383,7 +383,7 @@ static int mips_dma_supported(struct device *dev, u64 mask) return plat_dma_supported(dev, mask); } -void dma_cache_sync(struct device *dev, void *vaddr, size_t size, +static void mips_dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); @@ -392,8 +392,6 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, __dma_sync_virtual(vaddr, size, direction); } -EXPORT_SYMBOL(dma_cache_sync); - static const struct dma_map_ops mips_default_dma_map_ops = { .alloc = mips_dma_alloc_coherent, .free = mips_dma_free_coherent, @@ -407,7 +405,8 @@ static const struct dma_map_ops mips_default_dma_map_ops = { .sync_sg_for_cpu = mips_dma_sync_sg_for_cpu, .sync_sg_for_device = mips_dma_sync_sg_for_device, .mapping_error = mips_dma_mapping_error, - .dma_supported = mips_dma_supported + .dma_supported = mips_dma_supported, + .cache_sync = mips_dma_cache_sync, }; const struct dma_map_ops *mips_dma_map_ops = &mips_default_dma_map_ops; diff --git a/arch/mn10300/include/asm/dma-mapping.h b/arch/mn10300/include/asm/dma-mapping.h index dc24163b190f..439e474ed6d7 100644 --- a/arch/mn10300/include/asm/dma-mapping.h +++ b/arch/mn10300/include/asm/dma-mapping.h @@ -18,10 +18,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &mn10300_dma_ops; } -static inline -void dma_cache_sync(void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - #endif diff --git a/arch/nios2/include/asm/dma-mapping.h b/arch/nios2/include/asm/dma-mapping.h index f8dc62222741..6ceb92251da0 100644 --- a/arch/nios2/include/asm/dma-mapping.h +++ b/arch/nios2/include/asm/dma-mapping.h @@ -17,13 +17,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &nios2_dma_ops; } -/* - * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to - * do any flushing here. 
- */ -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - #endif /* _ASM_NIOS2_DMA_MAPPING_H */ diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h index 2b16282add69..cb26bbd71d8a 100644 --- a/arch/parisc/include/asm/dma-mapping.h +++ b/arch/parisc/include/asm/dma-mapping.h @@ -32,14 +32,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return hppa_dma_ops; } -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ - if (hppa_dma_ops->sync_single_for_cpu) - flush_kernel_dcache_range((unsigned long)vaddr, size); -} - static inline void * parisc_walk_tree(struct device *dev) { diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index bd4c0a7471d3..f87c34cb3b43 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -571,6 +571,12 @@ static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist * flush_kernel_vmap_range(sg_virt(sg), sg->length); } +static void pa11_dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction direction) +{ + flush_kernel_dcache_range((unsigned long)vaddr, size); +} + const struct dma_map_ops pcxl_dma_ops = { .dma_supported = pa11_dma_supported, .alloc = pa11_dma_alloc, @@ -583,6 +589,7 @@ const struct dma_map_ops pcxl_dma_ops = { .sync_single_for_device = pa11_dma_sync_single_for_device, .sync_sg_for_cpu = pa11_dma_sync_sg_for_cpu, .sync_sg_for_device = pa11_dma_sync_sg_for_device, + .cache_sync = pa11_dma_cache_sync, }; static void *pcx_dma_alloc(struct device *dev, size_t size, @@ -619,4 +626,5 @@ const struct dma_map_ops pcx_dma_ops = { .sync_single_for_device = pa11_dma_sync_single_for_device, .sync_sg_for_cpu = pa11_dma_sync_sg_for_cpu, .sync_sg_for_device = pa11_dma_sync_sg_for_device, + .cache_sync = pa11_dma_cache_sync, }; diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 320846442bfb..2e43c2ef7632 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -141,10 +141,5 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) #define ARCH_HAS_DMA_MMAP_COHERENT -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - #endif /* __KERNEL__ */ #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h index 512ad0eaa11a..b17304b13de5 100644 --- a/arch/s390/include/asm/dma-mapping.h +++ b/arch/s390/include/asm/dma-mapping.h @@ -15,11 +15,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &dma_noop_ops; } -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h index b46194ecef17..e89df111c017 100644 --- a/arch/sh/include/asm/dma-mapping.h +++ b/arch/sh/include/asm/dma-mapping.h @@ -9,12 +9,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return dma_ops; } -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ -} - -/* arch/sh/mm/consistent.c 
*/ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, unsigned long attrs); diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 60bf1633d554..b298ed45cb23 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -5,14 +5,6 @@ #include #include -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - /* Since dma_{alloc,free}_noncoherent() allocated coherent memory, this - * routine can be a nop. - */ -} - extern const struct dma_map_ops *dma_ops; extern const struct dma_map_ops pci32_dma_ops; diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h index 7061dc8af43a..97ad62878290 100644 --- a/arch/tile/include/asm/dma-mapping.h +++ b/arch/tile/include/asm/dma-mapping.h @@ -67,13 +67,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) #define HAVE_ARCH_DMA_SET_MASK 1 int dma_set_mask(struct device *dev, u64 mask); -/* - * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to - * do any flushing here. - */ -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - #endif /* _ASM_TILE_DMA_MAPPING_H */ diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h index e949855bb794..ac608c2f6af6 100644 --- a/arch/unicore32/include/asm/dma-mapping.h +++ b/arch/unicore32/include/asm/dma-mapping.h @@ -45,10 +45,5 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) static inline void dma_mark_clean(void *addr, size_t size) {} -static inline void dma_cache_sync(struct device *dev, void *vaddr, - size_t size, enum dma_data_direction direction) -{ -} - #endif /* __KERNEL__ */ #endif diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index c6d3367be916..c26747340ba5 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -67,12 +67,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) } #endif /* CONFIG_X86_DMA_REMAP */ -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ -} - static inline unsigned long dma_alloc_coherent_mask(struct device *dev, gfp_t gfp) { diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h index 353e0314d6ba..153bf2370988 100644 --- a/arch/xtensa/include/asm/dma-mapping.h +++ b/arch/xtensa/include/asm/dma-mapping.h @@ -23,11 +23,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return &xtensa_dma_map_ops; } -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction) -{ -} - static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { return (dma_addr_t)paddr; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 29ce9815da87..028a375d240d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -126,6 +126,8 @@ struct dma_map_ops { void (*sync_sg_for_device)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir); + void (*cache_sync)(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction direction); int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); int 
(*dma_supported)(struct device *dev, u64 mask); #ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK @@ -436,6 +438,17 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) +static inline void +dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->cache_sync) + ops->cache_sync(dev, vaddr, size, dir); +} + extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size); -- cgit v1.2.3 From 2940bc4333707a05e69b3ffd737bda0dc0c3004f Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 19 Oct 2017 19:05:18 +0800 Subject: perf: hisi: Add support for HiSilicon SoC L3C PMU driver This patch adds support for the L3C PMU driver in the HiSilicon SoC chip. Each L3C has its own control, counter and interrupt registers and is a separate PMU. Each L3C PMU has eight programmable counters, and each counter is free-running. An interrupt is supported to handle counter (48-bit) overflow. Acked-by: Mark Rutland Reviewed-by: Jonathan Cameron Signed-off-by: Shaokun Zhang Signed-off-by: Anurup M Signed-off-by: Will Deacon --- drivers/perf/hisilicon/Makefile | 2 +- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 463 +++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 3 files changed, 465 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c (limited to 'include/linux') diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index 2783bb31e72b..4a3d3e6002a6 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1 +1 @@ -obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o +obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c new file mode 100644 index 000000000000..0bde5d919b2e --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -0,0 +1,463 @@ +/* + * HiSilicon SoC L3C uncore Hardware event counters support + * + * Copyright (C) 2017 Hisilicon Limited + * Author: Anurup M + * Shaokun Zhang + * + * This code is based on the uncore PMUs like arm-cci and arm-ccn. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +/* L3C register definition */ +#define L3C_PERF_CTRL 0x0408 +#define L3C_INT_MASK 0x0800 +#define L3C_INT_STATUS 0x0808 +#define L3C_INT_CLEAR 0x080c +#define L3C_EVENT_CTRL 0x1c00 +#define L3C_EVENT_TYPE0 0x1d00 +/* + * Each counter is 48-bits and [48:63] are reserved + * which are Read-As-Zero and Writes-Ignored.
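+ * A readq() of a counter register therefore returns the 48-bit count
+ * zero-extended to 64 bits.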
+ */ +#define L3C_CNTR0_LOWER 0x1e00 + +/* L3C has 8-counters */ +#define L3C_NR_COUNTERS 0x8 + +#define L3C_PERF_CTRL_EN 0x20000 +#define L3C_EVTYPE_NONE 0xff + +/* + * Select the counter register offset using the counter index + */ +static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx) +{ + return (L3C_CNTR0_LOWER + (cntr_idx * 8)); +} + +static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu, + struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + + if (!hisi_uncore_pmu_counter_valid(l3c_pmu, idx)) { + dev_err(l3c_pmu->dev, "Unsupported event index:%d!\n", idx); + return 0; + } + + /* Read 64-bits and the upper 16 bits are RAZ */ + return readq(l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(idx)); +} + +static void hisi_l3c_pmu_write_counter(struct hisi_pmu *l3c_pmu, + struct hw_perf_event *hwc, u64 val) +{ + u32 idx = hwc->idx; + + if (!hisi_uncore_pmu_counter_valid(l3c_pmu, idx)) { + dev_err(l3c_pmu->dev, "Unsupported event index:%d!\n", idx); + return; + } + + /* Write 64-bits and the upper 16 bits are WI */ + writeq(val, l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(idx)); +} + +static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, + u32 type) +{ + u32 reg, reg_idx, shift, val; + + /* + * Select the appropriate event select register(L3C_EVENT_TYPE0/1). + * There are 2 event select registers for the 8 hardware counters. + * Event code is 8-bits and for the former 4 hardware counters, + * L3C_EVENT_TYPE0 is chosen. For the latter 4 hardware counters, + * L3C_EVENT_TYPE1 is chosen. + */ + reg = L3C_EVENT_TYPE0 + (idx / 4) * 4; + reg_idx = idx % 4; + shift = 8 * reg_idx; + + /* Write event code to L3C_EVENT_TYPEx Register */ + val = readl(l3c_pmu->base + reg); + val &= ~(L3C_EVTYPE_NONE << shift); + val |= (type << shift); + writel(val, l3c_pmu->base + reg); +} + +static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu) +{ + u32 val; + + /* + * Set perf_enable bit in L3C_PERF_CTRL register to start counting + * for all enabled counters. + */ + val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val |= L3C_PERF_CTRL_EN; + writel(val, l3c_pmu->base + L3C_PERF_CTRL); +} + +static void hisi_l3c_pmu_stop_counters(struct hisi_pmu *l3c_pmu) +{ + u32 val; + + /* + * Clear perf_enable bit in L3C_PERF_CTRL register to stop counting + * for all enabled counters. 
+ */ + val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val &= ~(L3C_PERF_CTRL_EN); + writel(val, l3c_pmu->base + L3C_PERF_CTRL); +} + +static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Enable counter index in L3C_EVENT_CTRL register */ + val = readl(l3c_pmu->base + L3C_EVENT_CTRL); + val |= (1 << hwc->idx); + writel(val, l3c_pmu->base + L3C_EVENT_CTRL); +} + +static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Clear counter index in L3C_EVENT_CTRL register */ + val = readl(l3c_pmu->base + L3C_EVENT_CTRL); + val &= ~(1 << hwc->idx); + writel(val, l3c_pmu->base + L3C_EVENT_CTRL); +} + +static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + val = readl(l3c_pmu->base + L3C_INT_MASK); + /* Write 0 to enable interrupt */ + val &= ~(1 << hwc->idx); + writel(val, l3c_pmu->base + L3C_INT_MASK); +} + +static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + val = readl(l3c_pmu->base + L3C_INT_MASK); + /* Write 1 to mask interrupt */ + val |= (1 << hwc->idx); + writel(val, l3c_pmu->base + L3C_INT_MASK); +} + +static irqreturn_t hisi_l3c_pmu_isr(int irq, void *dev_id) +{ + struct hisi_pmu *l3c_pmu = dev_id; + struct perf_event *event; + unsigned long overflown; + int idx; + + /* Read L3C_INT_STATUS register */ + overflown = readl(l3c_pmu->base + L3C_INT_STATUS); + if (!overflown) + return IRQ_NONE; + + /* + * Find the counter index which overflowed if the bit was set + * and handle it. + */ + for_each_set_bit(idx, &overflown, L3C_NR_COUNTERS) { + /* Write 1 to clear the IRQ status flag */ + writel((1 << idx), l3c_pmu->base + L3C_INT_CLEAR); + + /* Get the corresponding event struct */ + event = l3c_pmu->pmu_events.hw_events[idx]; + if (!event) + continue; + + hisi_uncore_pmu_event_update(event); + hisi_uncore_pmu_set_event_period(event); + } + + return IRQ_HANDLED; +} + +static int hisi_l3c_pmu_init_irq(struct hisi_pmu *l3c_pmu, + struct platform_device *pdev) +{ + int irq, ret; + + /* Read and init IRQ */ + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(&pdev->dev, "L3C PMU get irq fail; irq:%d\n", irq); + return irq; + } + + ret = devm_request_irq(&pdev->dev, irq, hisi_l3c_pmu_isr, + IRQF_NOBALANCING | IRQF_NO_THREAD, + dev_name(&pdev->dev), l3c_pmu); + if (ret < 0) { + dev_err(&pdev->dev, + "Fail to request IRQ:%d ret:%d\n", irq, ret); + return ret; + } + + l3c_pmu->irq = irq; + + return 0; +} + +static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = { + { "HISI0213", }, + {}, +}; +MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match); + +static int hisi_l3c_pmu_init_data(struct platform_device *pdev, + struct hisi_pmu *l3c_pmu) +{ + unsigned long long id; + struct resource *res; + acpi_status status; + + status = acpi_evaluate_integer(ACPI_HANDLE(&pdev->dev), + "_UID", NULL, &id); + if (ACPI_FAILURE(status)) + return -EINVAL; + + l3c_pmu->index_id = id; + + /* + * Use the SCCL_ID and CCL_ID to identify the L3C PMU, while + * SCCL_ID is in MPIDR[aff2] and CCL_ID is in MPIDR[aff1]. 
+ */ + if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id", + &l3c_pmu->sccl_id)) { + dev_err(&pdev->dev, "Can not read l3c sccl-id!\n"); + return -EINVAL; + } + + if (device_property_read_u32(&pdev->dev, "hisilicon,ccl-id", + &l3c_pmu->ccl_id)) { + dev_err(&pdev->dev, "Can not read l3c ccl-id!\n"); + return -EINVAL; + } + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + l3c_pmu->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(l3c_pmu->base)) { + dev_err(&pdev->dev, "ioremap failed for l3c_pmu resource\n"); + return PTR_ERR(l3c_pmu->base); + } + + return 0; +} + +static struct attribute *hisi_l3c_pmu_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + NULL, +}; + +static const struct attribute_group hisi_l3c_pmu_format_group = { + .name = "format", + .attrs = hisi_l3c_pmu_format_attr, +}; + +static struct attribute *hisi_l3c_pmu_events_attr[] = { + HISI_PMU_EVENT_ATTR(rd_cpipe, 0x00), + HISI_PMU_EVENT_ATTR(wr_cpipe, 0x01), + HISI_PMU_EVENT_ATTR(rd_hit_cpipe, 0x02), + HISI_PMU_EVENT_ATTR(wr_hit_cpipe, 0x03), + HISI_PMU_EVENT_ATTR(victim_num, 0x04), + HISI_PMU_EVENT_ATTR(rd_spipe, 0x20), + HISI_PMU_EVENT_ATTR(wr_spipe, 0x21), + HISI_PMU_EVENT_ATTR(rd_hit_spipe, 0x22), + HISI_PMU_EVENT_ATTR(wr_hit_spipe, 0x23), + HISI_PMU_EVENT_ATTR(back_invalid, 0x29), + HISI_PMU_EVENT_ATTR(retry_cpu, 0x40), + HISI_PMU_EVENT_ATTR(retry_ring, 0x41), + HISI_PMU_EVENT_ATTR(prefetch_drop, 0x42), + NULL, +}; + +static const struct attribute_group hisi_l3c_pmu_events_group = { + .name = "events", + .attrs = hisi_l3c_pmu_events_attr, +}; + +static DEVICE_ATTR(cpumask, 0444, hisi_cpumask_sysfs_show, NULL); + +static struct attribute *hisi_l3c_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static const struct attribute_group hisi_l3c_pmu_cpumask_attr_group = { + .attrs = hisi_l3c_pmu_cpumask_attrs, +}; + +static const struct attribute_group *hisi_l3c_pmu_attr_groups[] = { + &hisi_l3c_pmu_format_group, + &hisi_l3c_pmu_events_group, + &hisi_l3c_pmu_cpumask_attr_group, + NULL, +}; + +static const struct hisi_uncore_ops hisi_uncore_l3c_ops = { + .write_evtype = hisi_l3c_pmu_write_evtype, + .get_event_idx = hisi_uncore_pmu_get_event_idx, + .start_counters = hisi_l3c_pmu_start_counters, + .stop_counters = hisi_l3c_pmu_stop_counters, + .enable_counter = hisi_l3c_pmu_enable_counter, + .disable_counter = hisi_l3c_pmu_disable_counter, + .enable_counter_int = hisi_l3c_pmu_enable_counter_int, + .disable_counter_int = hisi_l3c_pmu_disable_counter_int, + .write_counter = hisi_l3c_pmu_write_counter, + .read_counter = hisi_l3c_pmu_read_counter, +}; + +static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev, + struct hisi_pmu *l3c_pmu) +{ + int ret; + + ret = hisi_l3c_pmu_init_data(pdev, l3c_pmu); + if (ret) + return ret; + + ret = hisi_l3c_pmu_init_irq(l3c_pmu, pdev); + if (ret) + return ret; + + l3c_pmu->num_counters = L3C_NR_COUNTERS; + l3c_pmu->counter_bits = 48; + l3c_pmu->ops = &hisi_uncore_l3c_ops; + l3c_pmu->dev = &pdev->dev; + l3c_pmu->on_cpu = -1; + l3c_pmu->check_event = 0x59; + + return 0; +} + +static int hisi_l3c_pmu_probe(struct platform_device *pdev) +{ + struct hisi_pmu *l3c_pmu; + char *name; + int ret; + + l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*l3c_pmu), GFP_KERNEL); + if (!l3c_pmu) + return -ENOMEM; + + platform_set_drvdata(pdev, l3c_pmu); + + ret = hisi_l3c_pmu_dev_probe(pdev, l3c_pmu); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, + &l3c_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d 
registering hotplug\n", ret); + return ret; + } + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%u_l3c%u", + l3c_pmu->sccl_id, l3c_pmu->index_id); + l3c_pmu->pmu = (struct pmu) { + .name = name, + .task_ctx_nr = perf_invalid_context, + .event_init = hisi_uncore_pmu_event_init, + .pmu_enable = hisi_uncore_pmu_enable, + .pmu_disable = hisi_uncore_pmu_disable, + .add = hisi_uncore_pmu_add, + .del = hisi_uncore_pmu_del, + .start = hisi_uncore_pmu_start, + .stop = hisi_uncore_pmu_stop, + .read = hisi_uncore_pmu_read, + .attr_groups = hisi_l3c_pmu_attr_groups, + }; + + ret = perf_pmu_register(&l3c_pmu->pmu, name, -1); + if (ret) { + dev_err(l3c_pmu->dev, "L3C PMU register failed!\n"); + cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, + &l3c_pmu->node); + } + + return ret; +} + +static int hisi_l3c_pmu_remove(struct platform_device *pdev) +{ + struct hisi_pmu *l3c_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&l3c_pmu->pmu); + cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, + &l3c_pmu->node); + + return 0; +} + +static struct platform_driver hisi_l3c_pmu_driver = { + .driver = { + .name = "hisi_l3c_pmu", + .acpi_match_table = ACPI_PTR(hisi_l3c_pmu_acpi_match), + }, + .probe = hisi_l3c_pmu_probe, + .remove = hisi_l3c_pmu_remove, +}; + +static int __init hisi_l3c_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, + "AP_PERF_ARM_HISI_L3_ONLINE", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret) { + pr_err("L3C PMU: Error setup hotplug, ret = %d\n", ret); + return ret; + } + + ret = platform_driver_register(&hisi_l3c_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE); + + return ret; +} +module_init(hisi_l3c_pmu_module_init); + +static void __exit hisi_l3c_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_l3c_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE); +} +module_exit(hisi_l3c_pmu_module_exit); + +MODULE_DESCRIPTION("HiSilicon SoC L3C uncore PMU driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Anurup M "); +MODULE_AUTHOR("Shaokun Zhang "); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6d508767e144..9621efdbb3bd 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -153,6 +153,7 @@ enum cpuhp_state { CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, + CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, -- cgit v1.2.3 From 2bab3cf9104c5ab80a1b9c706d81d997548401e4 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 19 Oct 2017 19:05:19 +0800 Subject: perf: hisi: Add support for HiSilicon SoC HHA PMU driver L3 cache coherence is maintained by Hydra Home Agent (HHA) in HiSilicon SoC. This patch adds support for HHA PMU driver, Each HHA has own control, counter and interrupt registers and is an separate PMU. For each HHA PMU, it has 16-programable counters and each counter is free-running. Interrupt is supported to handle counter (48-bits) overflow. 
Acked-by: Mark Rutland Reviewed-by: Jonathan Cameron Signed-off-by: Shaokun Zhang Signed-off-by: Anurup M Signed-off-by: Will Deacon --- drivers/perf/hisilicon/Makefile | 2 +- drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 473 +++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 3 files changed, 475 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_hha_pmu.c (limited to 'include/linux') diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index 4a3d3e6002a6..a72afe84d034 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1 +1 @@ -obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o +obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o hisi_uncore_hha_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c new file mode 100644 index 000000000000..443906e0aff3 --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -0,0 +1,473 @@ +/* + * HiSilicon SoC HHA uncore Hardware event counters support + * + * Copyright (C) 2017 Hisilicon Limited + * Author: Shaokun Zhang + * Anurup M + * + * This code is based on the uncore PMUs like arm-cci and arm-ccn. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +/* HHA register definition */ +#define HHA_INT_MASK 0x0804 +#define HHA_INT_STATUS 0x0808 +#define HHA_INT_CLEAR 0x080C +#define HHA_PERF_CTRL 0x1E00 +#define HHA_EVENT_CTRL 0x1E04 +#define HHA_EVENT_TYPE0 0x1E80 +/* + * Each counter is 48-bits and [48:63] are reserved + * which are Read-As-Zero and Writes-Ignored. + */ +#define HHA_CNT0_LOWER 0x1F00 + +/* HHA has 16-counters */ +#define HHA_NR_COUNTERS 0x10 + +#define HHA_PERF_CTRL_EN 0x1 +#define HHA_EVTYPE_NONE 0xff + +/* + * Select the counter register offset using the counter index + * each counter is 48-bits. + */ +static u32 hisi_hha_pmu_get_counter_offset(int cntr_idx) +{ + return (HHA_CNT0_LOWER + (cntr_idx * 8)); +} + +static u64 hisi_hha_pmu_read_counter(struct hisi_pmu *hha_pmu, + struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + + if (!hisi_uncore_pmu_counter_valid(hha_pmu, idx)) { + dev_err(hha_pmu->dev, "Unsupported event index:%d!\n", idx); + return 0; + } + + /* Read 64 bits and like L3C, top 16 bits are RAZ */ + return readq(hha_pmu->base + hisi_hha_pmu_get_counter_offset(idx)); +} + +static void hisi_hha_pmu_write_counter(struct hisi_pmu *hha_pmu, + struct hw_perf_event *hwc, u64 val) +{ + u32 idx = hwc->idx; + + if (!hisi_uncore_pmu_counter_valid(hha_pmu, idx)) { + dev_err(hha_pmu->dev, "Unsupported event index:%d!\n", idx); + return; + } + + /* Write 64 bits and like L3C, top 16 bits are WI */ + writeq(val, hha_pmu->base + hisi_hha_pmu_get_counter_offset(idx)); +} + +static void hisi_hha_pmu_write_evtype(struct hisi_pmu *hha_pmu, int idx, + u32 type) +{ + u32 reg, reg_idx, shift, val; + + /* + * Select the appropriate event select register(HHA_EVENT_TYPEx). + * There are 4 event select registers for the 16 hardware counters. + * Event code is 8-bits and for the first 4 hardware counters, + * HHA_EVENT_TYPE0 is chosen. For the next 4 hardware counters, + * HHA_EVENT_TYPE1 is chosen and so on. 
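+	 * For example, counter index 5 selects HHA_EVENT_TYPE1 and its
+	 * event code occupies bits [15:8] of that register.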
+ */ + reg = HHA_EVENT_TYPE0 + 4 * (idx / 4); + reg_idx = idx % 4; + shift = 8 * reg_idx; + + /* Write event code to HHA_EVENT_TYPEx register */ + val = readl(hha_pmu->base + reg); + val &= ~(HHA_EVTYPE_NONE << shift); + val |= (type << shift); + writel(val, hha_pmu->base + reg); +} + +static void hisi_hha_pmu_start_counters(struct hisi_pmu *hha_pmu) +{ + u32 val; + + /* + * Set perf_enable bit in HHA_PERF_CTRL to start event + * counting for all enabled counters. + */ + val = readl(hha_pmu->base + HHA_PERF_CTRL); + val |= HHA_PERF_CTRL_EN; + writel(val, hha_pmu->base + HHA_PERF_CTRL); +} + +static void hisi_hha_pmu_stop_counters(struct hisi_pmu *hha_pmu) +{ + u32 val; + + /* + * Clear perf_enable bit in HHA_PERF_CTRL to stop event + * counting for all enabled counters. + */ + val = readl(hha_pmu->base + HHA_PERF_CTRL); + val &= ~(HHA_PERF_CTRL_EN); + writel(val, hha_pmu->base + HHA_PERF_CTRL); +} + +static void hisi_hha_pmu_enable_counter(struct hisi_pmu *hha_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Enable counter index in HHA_EVENT_CTRL register */ + val = readl(hha_pmu->base + HHA_EVENT_CTRL); + val |= (1 << hwc->idx); + writel(val, hha_pmu->base + HHA_EVENT_CTRL); +} + +static void hisi_hha_pmu_disable_counter(struct hisi_pmu *hha_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Clear counter index in HHA_EVENT_CTRL register */ + val = readl(hha_pmu->base + HHA_EVENT_CTRL); + val &= ~(1 << hwc->idx); + writel(val, hha_pmu->base + HHA_EVENT_CTRL); +} + +static void hisi_hha_pmu_enable_counter_int(struct hisi_pmu *hha_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Write 0 to enable interrupt */ + val = readl(hha_pmu->base + HHA_INT_MASK); + val &= ~(1 << hwc->idx); + writel(val, hha_pmu->base + HHA_INT_MASK); +} + +static void hisi_hha_pmu_disable_counter_int(struct hisi_pmu *hha_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Write 1 to mask interrupt */ + val = readl(hha_pmu->base + HHA_INT_MASK); + val |= (1 << hwc->idx); + writel(val, hha_pmu->base + HHA_INT_MASK); +} + +static irqreturn_t hisi_hha_pmu_isr(int irq, void *dev_id) +{ + struct hisi_pmu *hha_pmu = dev_id; + struct perf_event *event; + unsigned long overflown; + int idx; + + /* Read HHA_INT_STATUS register */ + overflown = readl(hha_pmu->base + HHA_INT_STATUS); + if (!overflown) + return IRQ_NONE; + + /* + * Find the counter index which overflowed if the bit was set + * and handle it + */ + for_each_set_bit(idx, &overflown, HHA_NR_COUNTERS) { + /* Write 1 to clear the IRQ status flag */ + writel((1 << idx), hha_pmu->base + HHA_INT_CLEAR); + + /* Get the corresponding event struct */ + event = hha_pmu->pmu_events.hw_events[idx]; + if (!event) + continue; + + hisi_uncore_pmu_event_update(event); + hisi_uncore_pmu_set_event_period(event); + } + + return IRQ_HANDLED; +} + +static int hisi_hha_pmu_init_irq(struct hisi_pmu *hha_pmu, + struct platform_device *pdev) +{ + int irq, ret; + + /* Read and init IRQ */ + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(&pdev->dev, "HHA PMU get irq fail; irq:%d\n", irq); + return irq; + } + + ret = devm_request_irq(&pdev->dev, irq, hisi_hha_pmu_isr, + IRQF_NOBALANCING | IRQF_NO_THREAD, + dev_name(&pdev->dev), hha_pmu); + if (ret < 0) { + dev_err(&pdev->dev, + "Fail to request IRQ:%d ret:%d\n", irq, ret); + return ret; + } + + hha_pmu->irq = irq; + + return 0; +} + +static const struct acpi_device_id hisi_hha_pmu_acpi_match[] = { + { "HISI0243", }, + {}, +}; +MODULE_DEVICE_TABLE(acpi, hisi_hha_pmu_acpi_match); + +static int 
hisi_hha_pmu_init_data(struct platform_device *pdev, + struct hisi_pmu *hha_pmu) +{ + unsigned long long id; + struct resource *res; + acpi_status status; + + status = acpi_evaluate_integer(ACPI_HANDLE(&pdev->dev), + "_UID", NULL, &id); + if (ACPI_FAILURE(status)) + return -EINVAL; + + hha_pmu->index_id = id; + + /* + * Use SCCL_ID and UID to identify the HHA PMU, while + * SCCL_ID is in MPIDR[aff2]. + */ + if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id", + &hha_pmu->sccl_id)) { + dev_err(&pdev->dev, "Can not read hha sccl-id!\n"); + return -EINVAL; + } + /* HHA PMUs only share the same SCCL */ + hha_pmu->ccl_id = -1; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + hha_pmu->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(hha_pmu->base)) { + dev_err(&pdev->dev, "ioremap failed for hha_pmu resource\n"); + return PTR_ERR(hha_pmu->base); + } + + return 0; +} + +static struct attribute *hisi_hha_pmu_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + NULL, +}; + +static const struct attribute_group hisi_hha_pmu_format_group = { + .name = "format", + .attrs = hisi_hha_pmu_format_attr, +}; + +static struct attribute *hisi_hha_pmu_events_attr[] = { + HISI_PMU_EVENT_ATTR(rx_ops_num, 0x00), + HISI_PMU_EVENT_ATTR(rx_outer, 0x01), + HISI_PMU_EVENT_ATTR(rx_sccl, 0x02), + HISI_PMU_EVENT_ATTR(rx_ccix, 0x03), + HISI_PMU_EVENT_ATTR(rx_wbi, 0x04), + HISI_PMU_EVENT_ATTR(rx_wbip, 0x05), + HISI_PMU_EVENT_ATTR(rx_wtistash, 0x11), + HISI_PMU_EVENT_ATTR(rd_ddr_64b, 0x1c), + HISI_PMU_EVENT_ATTR(wr_dr_64b, 0x1d), + HISI_PMU_EVENT_ATTR(rd_ddr_128b, 0x1e), + HISI_PMU_EVENT_ATTR(wr_ddr_128b, 0x1f), + HISI_PMU_EVENT_ATTR(spill_num, 0x20), + HISI_PMU_EVENT_ATTR(spill_success, 0x21), + HISI_PMU_EVENT_ATTR(bi_num, 0x23), + HISI_PMU_EVENT_ATTR(mediated_num, 0x32), + HISI_PMU_EVENT_ATTR(tx_snp_num, 0x33), + HISI_PMU_EVENT_ATTR(tx_snp_outer, 0x34), + HISI_PMU_EVENT_ATTR(tx_snp_ccix, 0x35), + HISI_PMU_EVENT_ATTR(rx_snprspdata, 0x38), + HISI_PMU_EVENT_ATTR(rx_snprsp_outer, 0x3c), + HISI_PMU_EVENT_ATTR(sdir-lookup, 0x40), + HISI_PMU_EVENT_ATTR(edir-lookup, 0x41), + HISI_PMU_EVENT_ATTR(sdir-hit, 0x42), + HISI_PMU_EVENT_ATTR(edir-hit, 0x43), + HISI_PMU_EVENT_ATTR(sdir-home-migrate, 0x4c), + HISI_PMU_EVENT_ATTR(edir-home-migrate, 0x4d), + NULL, +}; + +static const struct attribute_group hisi_hha_pmu_events_group = { + .name = "events", + .attrs = hisi_hha_pmu_events_attr, +}; + +static DEVICE_ATTR(cpumask, 0444, hisi_cpumask_sysfs_show, NULL); + +static struct attribute *hisi_hha_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static const struct attribute_group hisi_hha_pmu_cpumask_attr_group = { + .attrs = hisi_hha_pmu_cpumask_attrs, +}; + +static const struct attribute_group *hisi_hha_pmu_attr_groups[] = { + &hisi_hha_pmu_format_group, + &hisi_hha_pmu_events_group, + &hisi_hha_pmu_cpumask_attr_group, + NULL, +}; + +static const struct hisi_uncore_ops hisi_uncore_hha_ops = { + .write_evtype = hisi_hha_pmu_write_evtype, + .get_event_idx = hisi_uncore_pmu_get_event_idx, + .start_counters = hisi_hha_pmu_start_counters, + .stop_counters = hisi_hha_pmu_stop_counters, + .enable_counter = hisi_hha_pmu_enable_counter, + .disable_counter = hisi_hha_pmu_disable_counter, + .enable_counter_int = hisi_hha_pmu_enable_counter_int, + .disable_counter_int = hisi_hha_pmu_disable_counter_int, + .write_counter = hisi_hha_pmu_write_counter, + .read_counter = hisi_hha_pmu_read_counter, +}; + +static int hisi_hha_pmu_dev_probe(struct platform_device *pdev, + struct hisi_pmu *hha_pmu) +{ 
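+	/*
+	 * Resolve the MMIO region and IRQ first, then describe the
+	 * 16 48-bit counters and hook up the HHA uncore callbacks.
+	 */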
+ int ret; + + ret = hisi_hha_pmu_init_data(pdev, hha_pmu); + if (ret) + return ret; + + ret = hisi_hha_pmu_init_irq(hha_pmu, pdev); + if (ret) + return ret; + + hha_pmu->num_counters = HHA_NR_COUNTERS; + hha_pmu->counter_bits = 48; + hha_pmu->ops = &hisi_uncore_hha_ops; + hha_pmu->dev = &pdev->dev; + hha_pmu->on_cpu = -1; + hha_pmu->check_event = 0x65; + + return 0; +} + +static int hisi_hha_pmu_probe(struct platform_device *pdev) +{ + struct hisi_pmu *hha_pmu; + char *name; + int ret; + + hha_pmu = devm_kzalloc(&pdev->dev, sizeof(*hha_pmu), GFP_KERNEL); + if (!hha_pmu) + return -ENOMEM; + + platform_set_drvdata(pdev, hha_pmu); + + ret = hisi_hha_pmu_dev_probe(pdev, hha_pmu); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, + &hha_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d registering hotplug\n", ret); + return ret; + } + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%u_hha%u", + hha_pmu->sccl_id, hha_pmu->index_id); + hha_pmu->pmu = (struct pmu) { + .name = name, + .task_ctx_nr = perf_invalid_context, + .event_init = hisi_uncore_pmu_event_init, + .pmu_enable = hisi_uncore_pmu_enable, + .pmu_disable = hisi_uncore_pmu_disable, + .add = hisi_uncore_pmu_add, + .del = hisi_uncore_pmu_del, + .start = hisi_uncore_pmu_start, + .stop = hisi_uncore_pmu_stop, + .read = hisi_uncore_pmu_read, + .attr_groups = hisi_hha_pmu_attr_groups, + }; + + ret = perf_pmu_register(&hha_pmu->pmu, name, -1); + if (ret) { + dev_err(hha_pmu->dev, "HHA PMU register failed!\n"); + cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, + &hha_pmu->node); + } + + return ret; +} + +static int hisi_hha_pmu_remove(struct platform_device *pdev) +{ + struct hisi_pmu *hha_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&hha_pmu->pmu); + cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, + &hha_pmu->node); + + return 0; +} + +static struct platform_driver hisi_hha_pmu_driver = { + .driver = { + .name = "hisi_hha_pmu", + .acpi_match_table = ACPI_PTR(hisi_hha_pmu_acpi_match), + }, + .probe = hisi_hha_pmu_probe, + .remove = hisi_hha_pmu_remove, +}; + +static int __init hisi_hha_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, + "AP_PERF_ARM_HISI_HHA_ONLINE", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret) { + pr_err("HHA PMU: Error setup hotplug, ret = %d;\n", ret); + return ret; + } + + ret = platform_driver_register(&hisi_hha_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE); + + return ret; +} +module_init(hisi_hha_pmu_module_init); + +static void __exit hisi_hha_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_hha_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE); +} +module_exit(hisi_hha_pmu_module_exit); + +MODULE_DESCRIPTION("HiSilicon SoC HHA uncore PMU driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Shaokun Zhang "); +MODULE_AUTHOR("Anurup M "); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 9621efdbb3bd..cd63c52e7d93 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -153,6 +153,7 @@ enum cpuhp_state { CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, + CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, -- cgit v1.2.3 From 904dcf03f086a2e3b9d1e02cb57c43ea2e588c8c Mon Sep 17 00:00:00 2001 From: 
Shaokun Zhang Date: Thu, 19 Oct 2017 19:05:20 +0800 Subject: perf: hisi: Add support for HiSilicon SoC DDRC PMU driver This patch adds support for the DDRC PMU driver in the HiSilicon SoC chip. Each DDRC has its own control, counter and interrupt registers and is a separate PMU. Each DDRC PMU has eight fixed-purpose counters which are mapped to eight events by hardware, so the driver assumes that the counter index is equal to the event code (0 - 7). An interrupt is supported to handle counter (32-bit) overflow. Acked-by: Mark Rutland Reviewed-by: Jonathan Cameron Signed-off-by: Shaokun Zhang Signed-off-by: Anurup M Signed-off-by: Will Deacon --- drivers/perf/hisilicon/Makefile | 2 +- drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 463 ++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 3 files changed, 465 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c (limited to 'include/linux') diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index a72afe84d034..2621d51ae87a 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1 +1 @@ -obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o hisi_uncore_hha_pmu.o +obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c new file mode 100644 index 000000000000..1b10ea05a914 --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -0,0 +1,463 @@ +/* + * HiSilicon SoC DDRC uncore Hardware event counters support + * + * Copyright (C) 2017 Hisilicon Limited + * Author: Shaokun Zhang + * Anurup M + * + * This code is based on the uncore PMUs like arm-cci and arm-ccn. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +/* DDRC register definition */ +#define DDRC_PERF_CTRL 0x010 +#define DDRC_FLUX_WR 0x380 +#define DDRC_FLUX_RD 0x384 +#define DDRC_FLUX_WCMD 0x388 +#define DDRC_FLUX_RCMD 0x38c +#define DDRC_PRE_CMD 0x3c0 +#define DDRC_ACT_CMD 0x3c4 +#define DDRC_BNK_CHG 0x3c8 +#define DDRC_RNK_CHG 0x3cc +#define DDRC_EVENT_CTRL 0x6C0 +#define DDRC_INT_MASK 0x6c8 +#define DDRC_INT_STATUS 0x6cc +#define DDRC_INT_CLEAR 0x6d0 + +/* DDRC has 8-counters */ +#define DDRC_NR_COUNTERS 0x8 +#define DDRC_PERF_CTRL_EN 0x2 + +/* + * For the DDRC PMU there are eight events, and every event has been mapped + * to a fixed-purpose counter whose register offset is not consistent. + * Therefore there is no write event type, and we assume that the event + * code (0 to 7) is equal to the counter index in the PMU driver. + */ +#define GET_DDRC_EVENTID(hwc) (hwc->config_base & 0x7) + +static const u32 ddrc_reg_off[] = { + DDRC_FLUX_WR, DDRC_FLUX_RD, DDRC_FLUX_WCMD, DDRC_FLUX_RCMD, + DDRC_PRE_CMD, DDRC_ACT_CMD, DDRC_BNK_CHG, DDRC_RNK_CHG +}; + +/* + * Select the counter register offset using the counter index. + * In the DDRC there are no programmable counters; the count + * is read from the statistics counter register itself.
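+ * For example, event code 1 (flux_rd) is counted by the DDRC_FLUX_RD
+ * register.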
+ */ +static u32 hisi_ddrc_pmu_get_counter_offset(int cntr_idx) +{ + return ddrc_reg_off[cntr_idx]; +} + +static u64 hisi_ddrc_pmu_read_counter(struct hisi_pmu *ddrc_pmu, + struct hw_perf_event *hwc) +{ + /* Use event code as counter index */ + u32 idx = GET_DDRC_EVENTID(hwc); + + if (!hisi_uncore_pmu_counter_valid(ddrc_pmu, idx)) { + dev_err(ddrc_pmu->dev, "Unsupported event index:%d!\n", idx); + return 0; + } + + return readl(ddrc_pmu->base + hisi_ddrc_pmu_get_counter_offset(idx)); +} + +static void hisi_ddrc_pmu_write_counter(struct hisi_pmu *ddrc_pmu, + struct hw_perf_event *hwc, u64 val) +{ + u32 idx = GET_DDRC_EVENTID(hwc); + + if (!hisi_uncore_pmu_counter_valid(ddrc_pmu, idx)) { + dev_err(ddrc_pmu->dev, "Unsupported event index:%d!\n", idx); + return; + } + + writel((u32)val, + ddrc_pmu->base + hisi_ddrc_pmu_get_counter_offset(idx)); +} + +/* + * For DDRC PMU, event has been mapped to fixed-purpose counter by hardware, + * so there is no need to write event type. + */ +static void hisi_ddrc_pmu_write_evtype(struct hisi_pmu *hha_pmu, int idx, + u32 type) +{ +} + +static void hisi_ddrc_pmu_start_counters(struct hisi_pmu *ddrc_pmu) +{ + u32 val; + + /* Set perf_enable in DDRC_PERF_CTRL to start event counting */ + val = readl(ddrc_pmu->base + DDRC_PERF_CTRL); + val |= DDRC_PERF_CTRL_EN; + writel(val, ddrc_pmu->base + DDRC_PERF_CTRL); +} + +static void hisi_ddrc_pmu_stop_counters(struct hisi_pmu *ddrc_pmu) +{ + u32 val; + + /* Clear perf_enable in DDRC_PERF_CTRL to stop event counting */ + val = readl(ddrc_pmu->base + DDRC_PERF_CTRL); + val &= ~DDRC_PERF_CTRL_EN; + writel(val, ddrc_pmu->base + DDRC_PERF_CTRL); +} + +static void hisi_ddrc_pmu_enable_counter(struct hisi_pmu *ddrc_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Set counter index(event code) in DDRC_EVENT_CTRL register */ + val = readl(ddrc_pmu->base + DDRC_EVENT_CTRL); + val |= (1 << GET_DDRC_EVENTID(hwc)); + writel(val, ddrc_pmu->base + DDRC_EVENT_CTRL); +} + +static void hisi_ddrc_pmu_disable_counter(struct hisi_pmu *ddrc_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Clear counter index(event code) in DDRC_EVENT_CTRL register */ + val = readl(ddrc_pmu->base + DDRC_EVENT_CTRL); + val &= ~(1 << GET_DDRC_EVENTID(hwc)); + writel(val, ddrc_pmu->base + DDRC_EVENT_CTRL); +} + +static int hisi_ddrc_pmu_get_event_idx(struct perf_event *event) +{ + struct hisi_pmu *ddrc_pmu = to_hisi_pmu(event->pmu); + unsigned long *used_mask = ddrc_pmu->pmu_events.used_mask; + struct hw_perf_event *hwc = &event->hw; + /* For DDRC PMU, we use event code as counter index */ + int idx = GET_DDRC_EVENTID(hwc); + + if (test_bit(idx, used_mask)) + return -EAGAIN; + + set_bit(idx, used_mask); + + return idx; +} + +static void hisi_ddrc_pmu_enable_counter_int(struct hisi_pmu *ddrc_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Write 0 to enable interrupt */ + val = readl(ddrc_pmu->base + DDRC_INT_MASK); + val &= ~(1 << GET_DDRC_EVENTID(hwc)); + writel(val, ddrc_pmu->base + DDRC_INT_MASK); +} + +static void hisi_ddrc_pmu_disable_counter_int(struct hisi_pmu *ddrc_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Write 1 to mask interrupt */ + val = readl(ddrc_pmu->base + DDRC_INT_MASK); + val |= (1 << GET_DDRC_EVENTID(hwc)); + writel(val, ddrc_pmu->base + DDRC_INT_MASK); +} + +static irqreturn_t hisi_ddrc_pmu_isr(int irq, void *dev_id) +{ + struct hisi_pmu *ddrc_pmu = dev_id; + struct perf_event *event; + unsigned long overflown; + int idx; + + /* Read the DDRC_INT_STATUS register */ + overflown = readl(ddrc_pmu->base + 
DDRC_INT_STATUS); + if (!overflown) + return IRQ_NONE; + + /* + * Find the counter index which overflowed if the bit was set + * and handle it + */ + for_each_set_bit(idx, &overflown, DDRC_NR_COUNTERS) { + /* Write 1 to clear the IRQ status flag */ + writel((1 << idx), ddrc_pmu->base + DDRC_INT_CLEAR); + + /* Get the corresponding event struct */ + event = ddrc_pmu->pmu_events.hw_events[idx]; + if (!event) + continue; + + hisi_uncore_pmu_event_update(event); + hisi_uncore_pmu_set_event_period(event); + } + + return IRQ_HANDLED; +} + +static int hisi_ddrc_pmu_init_irq(struct hisi_pmu *ddrc_pmu, + struct platform_device *pdev) +{ + int irq, ret; + + /* Read and init IRQ */ + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(&pdev->dev, "DDRC PMU get irq fail; irq:%d\n", irq); + return irq; + } + + ret = devm_request_irq(&pdev->dev, irq, hisi_ddrc_pmu_isr, + IRQF_NOBALANCING | IRQF_NO_THREAD, + dev_name(&pdev->dev), ddrc_pmu); + if (ret < 0) { + dev_err(&pdev->dev, + "Fail to request IRQ:%d ret:%d\n", irq, ret); + return ret; + } + + ddrc_pmu->irq = irq; + + return 0; +} + +static const struct acpi_device_id hisi_ddrc_pmu_acpi_match[] = { + { "HISI0233", }, + {}, +}; +MODULE_DEVICE_TABLE(acpi, hisi_ddrc_pmu_acpi_match); + +static int hisi_ddrc_pmu_init_data(struct platform_device *pdev, + struct hisi_pmu *ddrc_pmu) +{ + struct resource *res; + + /* + * Use the SCCL_ID and DDRC channel ID to identify the + * DDRC PMU, while SCCL_ID is in MPIDR[aff2]. + */ + if (device_property_read_u32(&pdev->dev, "hisilicon,ch-id", + &ddrc_pmu->index_id)) { + dev_err(&pdev->dev, "Can not read ddrc channel-id!\n"); + return -EINVAL; + } + + if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id", + &ddrc_pmu->sccl_id)) { + dev_err(&pdev->dev, "Can not read ddrc sccl-id!\n"); + return -EINVAL; + } + /* DDRC PMUs only share the same SCCL */ + ddrc_pmu->ccl_id = -1; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ddrc_pmu->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ddrc_pmu->base)) { + dev_err(&pdev->dev, "ioremap failed for ddrc_pmu resource\n"); + return PTR_ERR(ddrc_pmu->base); + } + + return 0; +} + +static struct attribute *hisi_ddrc_pmu_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-4"), + NULL, +}; + +static const struct attribute_group hisi_ddrc_pmu_format_group = { + .name = "format", + .attrs = hisi_ddrc_pmu_format_attr, +}; + +static struct attribute *hisi_ddrc_pmu_events_attr[] = { + HISI_PMU_EVENT_ATTR(flux_wr, 0x00), + HISI_PMU_EVENT_ATTR(flux_rd, 0x01), + HISI_PMU_EVENT_ATTR(flux_wcmd, 0x02), + HISI_PMU_EVENT_ATTR(flux_rcmd, 0x03), + HISI_PMU_EVENT_ATTR(pre_cmd, 0x04), + HISI_PMU_EVENT_ATTR(act_cmd, 0x05), + HISI_PMU_EVENT_ATTR(rnk_chg, 0x06), + HISI_PMU_EVENT_ATTR(rw_chg, 0x07), + NULL, +}; + +static const struct attribute_group hisi_ddrc_pmu_events_group = { + .name = "events", + .attrs = hisi_ddrc_pmu_events_attr, +}; + +static DEVICE_ATTR(cpumask, 0444, hisi_cpumask_sysfs_show, NULL); + +static struct attribute *hisi_ddrc_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static const struct attribute_group hisi_ddrc_pmu_cpumask_attr_group = { + .attrs = hisi_ddrc_pmu_cpumask_attrs, +}; + +static const struct attribute_group *hisi_ddrc_pmu_attr_groups[] = { + &hisi_ddrc_pmu_format_group, + &hisi_ddrc_pmu_events_group, + &hisi_ddrc_pmu_cpumask_attr_group, + NULL, +}; + +static const struct hisi_uncore_ops hisi_uncore_ddrc_ops = { + .write_evtype = hisi_ddrc_pmu_write_evtype, + .get_event_idx = hisi_ddrc_pmu_get_event_idx, + 
.start_counters = hisi_ddrc_pmu_start_counters, + .stop_counters = hisi_ddrc_pmu_stop_counters, + .enable_counter = hisi_ddrc_pmu_enable_counter, + .disable_counter = hisi_ddrc_pmu_disable_counter, + .enable_counter_int = hisi_ddrc_pmu_enable_counter_int, + .disable_counter_int = hisi_ddrc_pmu_disable_counter_int, + .write_counter = hisi_ddrc_pmu_write_counter, + .read_counter = hisi_ddrc_pmu_read_counter, +}; + +static int hisi_ddrc_pmu_dev_probe(struct platform_device *pdev, + struct hisi_pmu *ddrc_pmu) +{ + int ret; + + ret = hisi_ddrc_pmu_init_data(pdev, ddrc_pmu); + if (ret) + return ret; + + ret = hisi_ddrc_pmu_init_irq(ddrc_pmu, pdev); + if (ret) + return ret; + + ddrc_pmu->num_counters = DDRC_NR_COUNTERS; + ddrc_pmu->counter_bits = 32; + ddrc_pmu->ops = &hisi_uncore_ddrc_ops; + ddrc_pmu->dev = &pdev->dev; + ddrc_pmu->on_cpu = -1; + ddrc_pmu->check_event = 7; + + return 0; +} + +static int hisi_ddrc_pmu_probe(struct platform_device *pdev) +{ + struct hisi_pmu *ddrc_pmu; + char *name; + int ret; + + ddrc_pmu = devm_kzalloc(&pdev->dev, sizeof(*ddrc_pmu), GFP_KERNEL); + if (!ddrc_pmu) + return -ENOMEM; + + platform_set_drvdata(pdev, ddrc_pmu); + + ret = hisi_ddrc_pmu_dev_probe(pdev, ddrc_pmu); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, + &ddrc_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d registering hotplug;\n", ret); + return ret; + } + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%u_ddrc%u", + ddrc_pmu->sccl_id, ddrc_pmu->index_id); + ddrc_pmu->pmu = (struct pmu) { + .name = name, + .task_ctx_nr = perf_invalid_context, + .event_init = hisi_uncore_pmu_event_init, + .pmu_enable = hisi_uncore_pmu_enable, + .pmu_disable = hisi_uncore_pmu_disable, + .add = hisi_uncore_pmu_add, + .del = hisi_uncore_pmu_del, + .start = hisi_uncore_pmu_start, + .stop = hisi_uncore_pmu_stop, + .read = hisi_uncore_pmu_read, + .attr_groups = hisi_ddrc_pmu_attr_groups, + }; + + ret = perf_pmu_register(&ddrc_pmu->pmu, name, -1); + if (ret) { + dev_err(ddrc_pmu->dev, "DDRC PMU register failed!\n"); + cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, + &ddrc_pmu->node); + } + + return ret; +} + +static int hisi_ddrc_pmu_remove(struct platform_device *pdev) +{ + struct hisi_pmu *ddrc_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&ddrc_pmu->pmu); + cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, + &ddrc_pmu->node); + + return 0; +} + +static struct platform_driver hisi_ddrc_pmu_driver = { + .driver = { + .name = "hisi_ddrc_pmu", + .acpi_match_table = ACPI_PTR(hisi_ddrc_pmu_acpi_match), + }, + .probe = hisi_ddrc_pmu_probe, + .remove = hisi_ddrc_pmu_remove, +}; + +static int __init hisi_ddrc_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, + "AP_PERF_ARM_HISI_DDRC_ONLINE", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret) { + pr_err("DDRC PMU: setup hotplug, ret = %d\n", ret); + return ret; + } + + ret = platform_driver_register(&hisi_ddrc_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE); + + return ret; +} +module_init(hisi_ddrc_pmu_module_init); + +static void __exit hisi_ddrc_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_ddrc_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE); + +} +module_exit(hisi_ddrc_pmu_module_exit); + +MODULE_DESCRIPTION("HiSilicon SoC DDRC uncore PMU driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Shaokun Zhang "); 
+MODULE_AUTHOR("Anurup M "); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index cd63c52e7d93..587006de6f82 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -153,6 +153,7 @@ enum cpuhp_state { CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, + CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, CPUHP_AP_PERF_ARM_L2X0_ONLINE, -- cgit v1.2.3 From 7d7363e403ce959941f80684cc5f33e747afff17 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2017 16:32:51 -0700 Subject: documentation: kernel-api: add more info on bitmap functions There are some good comments about bitmap operations in lib/bitmap.c and include/linux/bitmap.h, so format them for document generation and pull them into core-api/kernel-api.rst. I converted the "tables" of functions from using tabs to using spaces so that they are more readable in the source file and in the generated output. Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 12 ++++ include/linux/bitmap.h | 105 ++++++++++++++++++---------------- lib/bitmap.c | 4 +- 3 files changed, 72 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 9c6902ba6e67..81754add8a49 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -53,6 +53,18 @@ The Linux kernel provides more basic utility functions. Bitmap Operations ----------------- +.. kernel-doc:: lib/bitmap.c + :doc: bitmap introduction + +.. kernel-doc:: include/linux/bitmap.h + :doc: declare bitmap + +.. kernel-doc:: include/linux/bitmap.h + :doc: bitmap overview + +.. kernel-doc:: include/linux/bitmap.h + :doc: bitmap bitops + .. kernel-doc:: lib/bitmap.c :export: diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5c4178016b1e..d9974c7a0a61 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -21,65 +21,74 @@ * See lib/bitmap.c for more details. */ -/* +/** + * DOC: bitmap overview + * * The available bitmap operations and their rough meaning in the * case that the bitmap is a single unsigned long are thus: * * Note that nbits should be always a compile time evaluable constant. * Otherwise many inlines will generate horrible code. * - * bitmap_zero(dst, nbits) *dst = 0UL - * bitmap_fill(dst, nbits) *dst = ~0UL - * bitmap_copy(dst, src, nbits) *dst = *src - * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 - * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 - * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 - * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) - * bitmap_complement(dst, src, nbits) *dst = ~(*src) - * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? - * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? - * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? - * bitmap_empty(src, nbits) Are all bits zero in *src? - * bitmap_full(src, nbits) Are all bits set in *src? 
- * bitmap_weight(src, nbits) Hamming Weight: number set bits - * bitmap_set(dst, pos, nbits) Set specified bit area - * bitmap_clear(dst, pos, nbits) Clear specified bit area - * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area - * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above - * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n - * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n - * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) - * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) - * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap - * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz - * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf - * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf - * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf - * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf - * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region - * bitmap_release_region(bitmap, pos, order) Free specified bit region - * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region - * bitmap_from_u32array(dst, nbits, buf, nwords) *dst = *buf (nwords 32b words) - * bitmap_to_u32array(buf, nwords, src, nbits) *buf = *dst (nwords 32b words) + * :: + * + * bitmap_zero(dst, nbits) *dst = 0UL + * bitmap_fill(dst, nbits) *dst = ~0UL + * bitmap_copy(dst, src, nbits) *dst = *src + * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 + * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 + * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 + * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) + * bitmap_complement(dst, src, nbits) *dst = ~(*src) + * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? + * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? + * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? + * bitmap_empty(src, nbits) Are all bits zero in *src? + * bitmap_full(src, nbits) Are all bits set in *src? 
+ * bitmap_weight(src, nbits) Hamming Weight: number set bits + * bitmap_set(dst, pos, nbits) Set specified bit area + * bitmap_clear(dst, pos, nbits) Clear specified bit area + * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area + * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above + * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n + * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n + * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) + * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) + * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap + * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz + * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf + * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf + * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf + * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf + * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region + * bitmap_release_region(bitmap, pos, order) Free specified bit region + * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region + * bitmap_from_u32array(dst, nbits, buf, nwords) *dst = *buf (nwords 32b words) + * bitmap_to_u32array(buf, nwords, src, nbits) *buf = *dst (nwords 32b words) + * */ -/* - * Also the following operations in asm/bitops.h apply to bitmaps. +/** + * DOC: bitmap bitops + * + * Also the following operations in asm/bitops.h apply to bitmaps.:: + * + * set_bit(bit, addr) *addr |= bit + * clear_bit(bit, addr) *addr &= ~bit + * change_bit(bit, addr) *addr ^= bit + * test_bit(bit, addr) Is bit set in *addr? + * test_and_set_bit(bit, addr) Set bit and return old value + * test_and_clear_bit(bit, addr) Clear bit and return old value + * test_and_change_bit(bit, addr) Change bit and return old value + * find_first_zero_bit(addr, nbits) Position first zero bit in *addr + * find_first_bit(addr, nbits) Position first set bit in *addr + * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit + * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit * - * set_bit(bit, addr) *addr |= bit - * clear_bit(bit, addr) *addr &= ~bit - * change_bit(bit, addr) *addr ^= bit - * test_bit(bit, addr) Is bit set in *addr? - * test_and_set_bit(bit, addr) Set bit and return old value - * test_and_clear_bit(bit, addr) Clear bit and return old value - * test_and_change_bit(bit, addr) Change bit and return old value - * find_first_zero_bit(addr, nbits) Position first zero bit in *addr - * find_first_bit(addr, nbits) Position first set bit in *addr - * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit - * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit */ -/* +/** + * DOC: declare bitmap * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used * to declare an array named 'name' of just enough unsigned longs to * contain all bit positions from 0 to 'bits' - 1. diff --git a/lib/bitmap.c b/lib/bitmap.c index c82c61b66e16..d8f0c094b18e 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -18,7 +18,9 @@ #include -/* +/** + * DOC: bitmap introduction + * * bitmaps provide an array of bits, implemented using an an * array of unsigned longs. 
The number of valid bits in a given bitmap does _not_ need to be an exact multiple of -- cgit v1.2.3 From f628ba9e22e0fa32ac6270a0d696fe1509889540 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Mon, 9 Oct 2017 17:16:05 -0500 Subject: gpiolib: drop irq_base field from gpio_chip struct Since the last user of the irq_base field was removed by commit b4c495f03ae3 ("gpio: mockup: use irq_sim"), it can now be removed safely. Signed-off-by: Grygorii Strashko Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index c97f8325e8bf..6bbda879fb8b 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -83,7 +83,6 @@ struct module; * @irqchip: GPIO IRQ chip impl, provided by GPIO driver * @irqdomain: Interrupt translation domain; responsible for mapping * between GPIO hwirq number and linux irq number - * @irq_base: first linux IRQ number assigned to GPIO IRQ chip (deprecated) * @irq_handler: the irq handler to use (often a predefined irq core function) * for GPIO IRQs, provided by GPIO driver * @irq_default_type: default IRQ triggering type applied during GPIO driver @@ -165,7 +164,6 @@ struct gpio_chip { */ struct irq_chip *irqchip; struct irq_domain *irqdomain; - unsigned int irq_base; irq_flow_handler_t irq_handler; unsigned int irq_default_type; unsigned int irq_chained_parent; -- cgit v1.2.3 From 5307e2ad69ab3b0e0622fdf8b254c1d4565eb924 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 12 Oct 2017 12:40:10 +0200 Subject: bitops: Introduce assign_bit() A common idiom is to assign a value to a bit with: if (value) set_bit(nr, addr); else clear_bit(nr, addr); Likewise common is the one-line expression variant: value ? set_bit(nr, addr) : clear_bit(nr, addr); Commit 9a8ac3ae682e ("dm mpath: cleanup QUEUE_IF_NO_PATH bit manipulation by introducing assign_bit()") introduced assign_bit() to the md subsystem for brevity. Make it available to others, specifically gpiolib and the upcoming driver for Maxim MAX3191x industrial serializer chips. As requested by Peter Zijlstra, change the argument order to reflect traditional "dst = src" in C, hence "assign_bit(nr, addr, value)". Cc: Bart Van Assche Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Linus Walleij Cc: Neil Brown Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Theodore Ts'o Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Denys Vlasenko Acked-by: Andrew Morton Signed-off-by: Lukas Wunner Signed-off-by: Linus Walleij --- drivers/md/dm-mpath.c | 22 +++++++--------------- include/linux/bitops.h | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 11f273d2f018..0e211de9bc54 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -641,14 +641,6 @@ static void process_queued_bios(struct work_struct *work) blk_finish_plug(&plug); } -static void assign_bit(bool value, long nr, unsigned long *addr) -{ - if (value) - set_bit(nr, addr); - else - clear_bit(nr, addr); -} - /* * If we run out of usable paths, should we queue I/O or error it? 
 */ @@ -658,11 +650,11 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, unsigned long flags; spin_lock_irqsave(&m->lock, flags); - assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || - (!save_old_value && queue_if_no_path), - MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti), - MPATHF_QUEUE_IF_NO_PATH, &m->flags); + assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, + (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || + (!save_old_value && queue_if_no_path)); + assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, + queue_if_no_path || dm_noflush_suspending(m->ti)); spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -1588,8 +1580,8 @@ static void multipath_resume(struct dm_target *ti) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags), - MPATHF_QUEUE_IF_NO_PATH, &m->flags); + assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); spin_unlock_irqrestore(&m->lock, flags); } diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 8fbe259b197c..9a874deee6e2 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -227,6 +227,30 @@ static inline unsigned long __ffs64(u64 word) return __ffs((unsigned long)word); } +/** + * assign_bit - Assign value to a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * @value: the value to assign + */ +static __always_inline void assign_bit(long nr, volatile unsigned long *addr, + bool value) +{ + if (value) + set_bit(nr, addr); + else + clear_bit(nr, addr); +} + +static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, + bool value) +{ + if (value) + __set_bit(nr, addr); + else + __clear_bit(nr, addr); +} + #ifdef __KERNEL__ #ifndef set_mask_bits -- cgit v1.2.3 From eec1d566cdf94b57e8f5ba9fe60eea214929bcfc Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 12 Oct 2017 12:40:10 +0200 Subject: gpio: Introduce ->get_multiple callback SPI-attached GPIO controllers typically read out all inputs in one go. If callers desire the values of multiple inputs, ideally a single readout should take place to return the desired values. However, the current driver API only offers a ->get callback but no ->get_multiple (unlike ->set_multiple, which is present). Thus, to read multiple inputs, a full readout needs to be performed for every single value (barring driver-internal caching), which is inefficient. In fact, the lack of a ->get_multiple callback has been bemoaned repeatedly by the gpio subsystem maintainer: http://www.spinics.net/lists/linux-gpio/msg10571.html http://www.spinics.net/lists/devicetree/msg121734.html Introduce the missing callback. Add corresponding consumer functions such as gpiod_get_array_value(). Amend linehandle_ioctl() to take advantage of the newly added infrastructure. Update the documentation.
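As an illustrative sketch (an addition to this document, not from the original changelog; "my" is a placeholder con_id and error handling is abbreviated), a consumer can then fetch a whole group of inputs in one call:

	struct gpio_descs *descs = gpiod_get_array(dev, "my", GPIOD_IN);
	int values[8];

	/* Reads all lines at once when the chip provides ->get_multiple */
	if (!IS_ERR(descs) && descs->ndescs <= ARRAY_SIZE(values))
		gpiod_get_array_value(descs->ndescs, descs->desc, values);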
Cc: Rojhalat Ibrahim Signed-off-by: Lukas Wunner Signed-off-by: Linus Walleij --- Documentation/gpio/consumer.txt | 41 ++++++--- drivers/gpio/gpiolib.c | 179 +++++++++++++++++++++++++++++++++++++--- drivers/gpio/gpiolib.h | 4 + include/linux/gpio/consumer.h | 43 ++++++++++ include/linux/gpio/driver.h | 5 ++ 5 files changed, 250 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/Documentation/gpio/consumer.txt b/Documentation/gpio/consumer.txt index ddbfa775a78a..63e1bd1d88e3 100644 --- a/Documentation/gpio/consumer.txt +++ b/Documentation/gpio/consumer.txt @@ -295,9 +295,22 @@ as possible, especially by drivers which should not care about the actual physical line level and worry about the logical value instead. -Set multiple GPIO outputs with a single function call ------------------------------------------------------ -The following functions set the output values of an array of GPIOs: +Access multiple GPIOs with a single function call +------------------------------------------------- +The following functions get or set the values of an array of GPIOs: + + int gpiod_get_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); + int gpiod_get_raw_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); + int gpiod_get_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); + int gpiod_get_raw_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); void gpiod_set_array_value(unsigned int array_size, struct gpio_desc **desc_array, @@ -312,34 +325,40 @@ The following functions set the output values of an array of GPIOs: struct gpio_desc **desc_array, int *value_array) -The array can be an arbitrary set of GPIOs. The functions will try to set +The array can be an arbitrary set of GPIOs. The functions will try to access GPIOs belonging to the same bank or chip simultaneously if supported by the corresponding chip driver. In that case a significantly improved performance -can be expected. If simultaneous setting is not possible the GPIOs will be set -sequentially. +can be expected. If simultaneous access is not possible the GPIOs will be +accessed sequentially. -The gpiod_set_array() functions take three arguments: +The functions take three arguments: * array_size - the number of array elements * desc_array - an array of GPIO descriptors - * value_array - an array of values to assign to the GPIOs + * value_array - an array to store the GPIOs' values (get) or + an array of values to assign to the GPIOs (set) The descriptor array can be obtained using the gpiod_get_array() function or one of its variants. If the group of descriptors returned by that function -matches the desired group of GPIOs, those GPIOs can be set by simply using +matches the desired group of GPIOs, those GPIOs can be accessed by simply using the struct gpio_descs returned by gpiod_get_array(): struct gpio_descs *my_gpio_descs = gpiod_get_array(...); gpiod_set_array_value(my_gpio_descs->ndescs, my_gpio_descs->desc, my_gpio_values); -It is also possible to set a completely arbitrary array of descriptors. The +It is also possible to access a completely arbitrary array of descriptors. The descriptors may be obtained using any combination of gpiod_get() and gpiod_get_array(). Afterwards the array of descriptors has to be setup -manually before it can be used with gpiod_set_array(). +manually before it can be passed to one of the above functions. 
Note that for optimal performance GPIOs belonging to the same chip should be contiguous within the array of descriptors. +The return value of gpiod_get_array_value() and its variants is 0 on success +or negative on error. Note the difference to gpiod_get_value(), which returns +0 or 1 on success to convey the GPIO value. With the array functions, the GPIO +values are stored in value_array rather than passed back as return value. + GPIOs mapped to IRQs -------------------- diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 8113929eb2bd..ff46a1b6299e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -365,28 +365,28 @@ static long linehandle_ioctl(struct file *filep, unsigned int cmd, struct linehandle_state *lh = filep->private_data; void __user *ip = (void __user *)arg; struct gpiohandle_data ghd; + int vals[GPIOHANDLES_MAX]; int i; if (cmd == GPIOHANDLE_GET_LINE_VALUES_IOCTL) { - int val; + /* TODO: check if descriptors are really input */ + int ret = gpiod_get_array_value_complex(false, + true, + lh->numdescs, + lh->descs, + vals); + if (ret) + return ret; memset(&ghd, 0, sizeof(ghd)); - - /* TODO: check if descriptors are really input */ - for (i = 0; i < lh->numdescs; i++) { - val = gpiod_get_value_cansleep(lh->descs[i]); - if (val < 0) - return val; - ghd.values[i] = val; - } + for (i = 0; i < lh->numdescs; i++) + ghd.values[i] = vals[i]; if (copy_to_user(ip, &ghd, sizeof(ghd))) return -EFAULT; return 0; } else if (cmd == GPIOHANDLE_SET_LINE_VALUES_IOCTL) { - int vals[GPIOHANDLES_MAX]; - /* TODO: check if descriptors are really output */ if (copy_from_user(&ghd, ip, sizeof(ghd))) return -EFAULT; @@ -2466,6 +2466,71 @@ static int gpiod_get_raw_value_commit(const struct gpio_desc *desc) return value; } +static int gpio_chip_get_multiple(struct gpio_chip *chip, + unsigned long *mask, unsigned long *bits) +{ + if (chip->get_multiple) { + return chip->get_multiple(chip, mask, bits); + } else if (chip->get) { + int i, value; + + for_each_set_bit(i, mask, chip->ngpio) { + value = chip->get(chip, i); + if (value < 0) + return value; + __assign_bit(i, bits, value); + } + return 0; + } + return -EIO; +} + +int gpiod_get_array_value_complex(bool raw, bool can_sleep, + unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) +{ + int i = 0; + + while (i < array_size) { + struct gpio_chip *chip = desc_array[i]->gdev->chip; + unsigned long mask[BITS_TO_LONGS(chip->ngpio)]; + unsigned long bits[BITS_TO_LONGS(chip->ngpio)]; + int first, j, ret; + + if (!can_sleep) + WARN_ON(chip->can_sleep); + + /* collect all inputs belonging to the same chip */ + first = i; + memset(mask, 0, sizeof(mask)); + do { + const struct gpio_desc *desc = desc_array[i]; + int hwgpio = gpio_chip_hwgpio(desc); + + __set_bit(hwgpio, mask); + i++; + } while ((i < array_size) && + (desc_array[i]->gdev->chip == chip)); + + ret = gpio_chip_get_multiple(chip, mask, bits); + if (ret) + return ret; + + for (j = first; j < i; j++) { + const struct gpio_desc *desc = desc_array[j]; + int hwgpio = gpio_chip_hwgpio(desc); + int value = test_bit(hwgpio, bits); + + if (!raw && test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + value = !value; + value_array[j] = value; + trace_gpio_value(desc_to_gpio(desc), 1, value); + } + } + return 0; +} + /** * gpiod_get_raw_value() - return a gpio's raw value * @desc: gpio whose value will be returned @@ -2514,6 +2579,51 @@ int gpiod_get_value(const struct gpio_desc *desc) } EXPORT_SYMBOL_GPL(gpiod_get_value); +/** + * gpiod_get_raw_array_value() - read raw 
values from an array of GPIOs + * @array_size: number of elements in the descriptor / value arrays + * @desc_array: array of GPIO descriptors whose values will be read + * @value_array: array to store the read values + * + * Read the raw values of the GPIOs, i.e. the values of the physical lines + * without regard for their ACTIVE_LOW status. Return 0 in case of success, + * else an error code. + * + * This function should be called from contexts where we cannot sleep, + * and it will complain if the GPIO chip functions potentially sleep. + */ +int gpiod_get_raw_array_value(unsigned int array_size, + struct gpio_desc **desc_array, int *value_array) +{ + if (!desc_array) + return -EINVAL; + return gpiod_get_array_value_complex(true, false, array_size, + desc_array, value_array); +} +EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value); + +/** + * gpiod_get_array_value() - read values from an array of GPIOs + * @array_size: number of elements in the descriptor / value arrays + * @desc_array: array of GPIO descriptors whose values will be read + * @value_array: array to store the read values + * + * Read the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status + * into account. Return 0 in case of success, else an error code. + * + * This function should be called from contexts where we cannot sleep, + * and it will complain if the GPIO chip functions potentially sleep. + */ +int gpiod_get_array_value(unsigned int array_size, + struct gpio_desc **desc_array, int *value_array) +{ + if (!desc_array) + return -EINVAL; + return gpiod_get_array_value_complex(false, false, array_size, + desc_array, value_array); +} +EXPORT_SYMBOL_GPL(gpiod_get_array_value); + /* * gpio_set_open_drain_value_commit() - Set the open drain gpio's value. * @desc: gpio descriptor whose state need to be set. @@ -2942,6 +3052,53 @@ int gpiod_get_value_cansleep(const struct gpio_desc *desc) } EXPORT_SYMBOL_GPL(gpiod_get_value_cansleep); +/** + * gpiod_get_raw_array_value_cansleep() - read raw values from an array of GPIOs + * @array_size: number of elements in the descriptor / value arrays + * @desc_array: array of GPIO descriptors whose values will be read + * @value_array: array to store the read values + * + * Read the raw values of the GPIOs, i.e. the values of the physical lines + * without regard for their ACTIVE_LOW status. Return 0 in case of success, + * else an error code. + * + * This function is to be called from contexts that can sleep. + */ +int gpiod_get_raw_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) +{ + might_sleep_if(extra_checks); + if (!desc_array) + return -EINVAL; + return gpiod_get_array_value_complex(true, true, array_size, + desc_array, value_array); +} +EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value_cansleep); + +/** + * gpiod_get_array_value_cansleep() - read values from an array of GPIOs + * @array_size: number of elements in the descriptor / value arrays + * @desc_array: array of GPIO descriptors whose values will be read + * @value_array: array to store the read values + * + * Read the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status + * into account. Return 0 in case of success, else an error code. + * + * This function is to be called from contexts that can sleep. 
+ */ +int gpiod_get_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) +{ + might_sleep_if(extra_checks); + if (!desc_array) + return -EINVAL; + return gpiod_get_array_value_complex(false, true, array_size, + desc_array, value_array); +} +EXPORT_SYMBOL_GPL(gpiod_get_array_value_cansleep); + /** * gpiod_set_raw_value_cansleep() - assign a gpio's raw value * @desc: gpio whose value will be assigned diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index d003ccb12781..10a48caf8477 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -180,6 +180,10 @@ static inline bool acpi_can_fallback_to_crs(struct acpi_device *adev, #endif struct gpio_desc *gpiochip_get_desc(struct gpio_chip *chip, u16 hwnum); +int gpiod_get_array_value_complex(bool raw, bool can_sleep, + unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); void gpiod_set_array_value_complex(bool raw, bool can_sleep, unsigned int array_size, struct gpio_desc **desc_array, diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 8f702fcbe485..d4920ec1f1da 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -99,10 +99,15 @@ int gpiod_direction_output_raw(struct gpio_desc *desc, int value); /* Value get/set from non-sleeping context */ int gpiod_get_value(const struct gpio_desc *desc); +int gpiod_get_array_value(unsigned int array_size, + struct gpio_desc **desc_array, int *value_array); void gpiod_set_value(struct gpio_desc *desc, int value); void gpiod_set_array_value(unsigned int array_size, struct gpio_desc **desc_array, int *value_array); int gpiod_get_raw_value(const struct gpio_desc *desc); +int gpiod_get_raw_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); void gpiod_set_raw_value(struct gpio_desc *desc, int value); void gpiod_set_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, @@ -110,11 +115,17 @@ void gpiod_set_raw_array_value(unsigned int array_size, /* Value get/set from sleeping context */ int gpiod_get_value_cansleep(const struct gpio_desc *desc); +int gpiod_get_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); void gpiod_set_value_cansleep(struct gpio_desc *desc, int value); void gpiod_set_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, int *value_array); int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc); +int gpiod_get_raw_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array); void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value); void gpiod_set_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, @@ -305,6 +316,14 @@ static inline int gpiod_get_value(const struct gpio_desc *desc) WARN_ON(1); return 0; } +static inline int gpiod_get_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) +{ + /* GPIO can never have been requested */ + WARN_ON(1); + return 0; +} static inline void gpiod_set_value(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ @@ -323,6 +342,14 @@ static inline int gpiod_get_raw_value(const struct gpio_desc *desc) WARN_ON(1); return 0; } +static inline int gpiod_get_raw_array_value(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) +{ + /* GPIO can never have been requested */ + WARN_ON(1); + return 0; +} static inline void 
gpiod_set_raw_value(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ @@ -342,6 +369,14 @@ static inline int gpiod_get_value_cansleep(const struct gpio_desc *desc) WARN_ON(1); return 0; } +static inline int gpiod_get_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) +{ + /* GPIO can never have been requested */ + WARN_ON(1); + return 0; +} static inline void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ @@ -360,6 +395,14 @@ static inline int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc) WARN_ON(1); return 0; } +static inline int gpiod_get_raw_array_value_cansleep(unsigned int array_size, + struct gpio_desc **desc_array, + int *value_array) +{ + /* GPIO can never have been requested */ + WARN_ON(1); + return 0; +} static inline void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value) { diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 6bbda879fb8b..bda95a9b7b8c 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -35,6 +35,8 @@ struct module; * @direction_input: configures signal "offset" as input, or returns error * @direction_output: configures signal "offset" as output, or returns error * @get: returns value for signal "offset", 0=low, 1=high, or negative error + * @get_multiple: reads values for multiple signals defined by "mask" and + * stores them in "bits", returns 0 on success or negative error * @set: assigns output value for signal "offset" * @set_multiple: assigns output values for multiple signals defined by "mask" * @set_config: optional hook for all kinds of settings. Uses the same @@ -125,6 +127,9 @@ struct gpio_chip { unsigned offset, int value); int (*get)(struct gpio_chip *chip, unsigned offset); + int (*get_multiple)(struct gpio_chip *chip, + unsigned long *mask, + unsigned long *bits); void (*set)(struct gpio_chip *chip, unsigned offset, int value); void (*set_multiple)(struct gpio_chip *chip, -- cgit v1.2.3 From c5053e695d621f8a8a3992c5e93dd3be088fa267 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 19 Oct 2017 16:19:03 -0700 Subject: Input: gpio_mouse - kill off platform data This is not used much: git grep gpio_mouse_platform_data shows that absolutely nothing in the kernel defines this platform data. It could be argued that the driver should be deleted. But that is a bit harsh, I think, since it seems generally useful. So this patch starts a series which repurposes it to be used with hardware nodes from device tree or ACPI. This first patch simply localizes the platform data header and allocates dummy platform data. Yes: this patch leaves the driver in a pretty useless state, but since nothing is instantiating this driver, it doesn't make it more useless than it already is. Later patches make use of the driver.
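As a purely illustrative sketch of where the series is heading (the "up" con_id and the helper name are invented here; no claim is made about the actual follow-up patches), a descriptor-based probe might fetch its lines roughly like this:

	/* Hypothetical sketch, not the actual conversion: fetch one
	 * direction line from a DT/ACPI-described device node. */
	#include <linux/err.h>
	#include <linux/gpio/consumer.h>

	static int gpio_mouse_get_lines_sketch(struct device *dev)
	{
		struct gpio_desc *up;

		up = devm_gpiod_get(dev, "up", GPIOD_IN);
		if (IS_ERR(up))
			return PTR_ERR(up);

		/* ...repeat for down/left/right and the three buttons... */
		return 0;
	}

The devm_ variant keeps teardown automatic, which fits the driver's move away from board-file platform data.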
Acked-by: Hans-Christian Noren Egtvedt Signed-off-by: Linus Walleij Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/gpio_mouse.c | 59 +++++++++++++++++++++++++++++++++----- include/linux/gpio_mouse.h | 61 ---------------------------------------- 2 files changed, 52 insertions(+), 68 deletions(-) delete mode 100644 include/linux/gpio_mouse.h (limited to 'include/linux') diff --git a/drivers/input/mouse/gpio_mouse.c b/drivers/input/mouse/gpio_mouse.c index ced07391304b..dcaba1e4fffb 100644 --- a/drivers/input/mouse/gpio_mouse.c +++ b/drivers/input/mouse/gpio_mouse.c @@ -12,8 +12,54 @@ #include #include #include -#include +#define GPIO_MOUSE_POLARITY_ACT_HIGH 0x00 +#define GPIO_MOUSE_POLARITY_ACT_LOW 0x01 + +#define GPIO_MOUSE_PIN_UP 0 +#define GPIO_MOUSE_PIN_DOWN 1 +#define GPIO_MOUSE_PIN_LEFT 2 +#define GPIO_MOUSE_PIN_RIGHT 3 +#define GPIO_MOUSE_PIN_BLEFT 4 +#define GPIO_MOUSE_PIN_BMIDDLE 5 +#define GPIO_MOUSE_PIN_BRIGHT 6 +#define GPIO_MOUSE_PIN_MAX 7 + +/** + * struct gpio_mouse_platform_data + * @scan_ms: integer in ms specifying the scan periode. + * @polarity: Pin polarity, active high or low. + * @up: GPIO line for up value. + * @down: GPIO line for down value. + * @left: GPIO line for left value. + * @right: GPIO line for right value. + * @bleft: GPIO line for left button. + * @bmiddle: GPIO line for middle button. + * @bright: GPIO line for right button. + * @pins: GPIO line numbers used for the mouse. + * + * This struct must be added to the platform_device in the board code. + * It is used by the gpio_mouse driver to setup GPIO lines and to + * calculate mouse movement. + */ +struct gpio_mouse_platform_data { + int scan_ms; + int polarity; + + union { + struct { + int up; + int down; + int left; + int right; + + int bleft; + int bmiddle; + int bright; + }; + int pins[GPIO_MOUSE_PIN_MAX]; + }; +}; /* * Timer function which is run every scan_ms ms when the device is opened. @@ -47,17 +93,16 @@ static void gpio_mouse_scan(struct input_polled_dev *dev) static int gpio_mouse_probe(struct platform_device *pdev) { - struct gpio_mouse_platform_data *pdata = dev_get_platdata(&pdev->dev); + struct device *dev = &pdev->dev; + struct gpio_mouse_platform_data *pdata; struct input_polled_dev *input_poll; struct input_dev *input; int pin, i; int error; - if (!pdata) { - dev_err(&pdev->dev, "no platform data\n"); - error = -ENXIO; - goto out; - } + pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return -ENOMEM; if (pdata->scan_ms < 0) { dev_err(&pdev->dev, "invalid scan time\n"); diff --git a/include/linux/gpio_mouse.h b/include/linux/gpio_mouse.h deleted file mode 100644 index 44ed7aa14d85..000000000000 --- a/include/linux/gpio_mouse.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Driver for simulating a mouse on GPIO lines. - * - * Copyright (C) 2007 Atmel Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _GPIO_MOUSE_H -#define _GPIO_MOUSE_H - -#define GPIO_MOUSE_POLARITY_ACT_HIGH 0x00 -#define GPIO_MOUSE_POLARITY_ACT_LOW 0x01 - -#define GPIO_MOUSE_PIN_UP 0 -#define GPIO_MOUSE_PIN_DOWN 1 -#define GPIO_MOUSE_PIN_LEFT 2 -#define GPIO_MOUSE_PIN_RIGHT 3 -#define GPIO_MOUSE_PIN_BLEFT 4 -#define GPIO_MOUSE_PIN_BMIDDLE 5 -#define GPIO_MOUSE_PIN_BRIGHT 6 -#define GPIO_MOUSE_PIN_MAX 7 - -/** - * struct gpio_mouse_platform_data - * @scan_ms: integer in ms specifying the scan periode. 
- * @polarity: Pin polarity, active high or low. - * @up: GPIO line for up value. - * @down: GPIO line for down value. - * @left: GPIO line for left value. - * @right: GPIO line for right value. - * @bleft: GPIO line for left button. - * @bmiddle: GPIO line for middle button. - * @bright: GPIO line for right button. - * - * This struct must be added to the platform_device in the board code. - * It is used by the gpio_mouse driver to setup GPIO lines and to - * calculate mouse movement. - */ -struct gpio_mouse_platform_data { - int scan_ms; - int polarity; - - union { - struct { - int up; - int down; - int left; - int right; - - int bleft; - int bmiddle; - int bright; - }; - int pins[GPIO_MOUSE_PIN_MAX]; - }; -}; - -#endif /* _GPIO_MOUSE_H */ -- cgit v1.2.3 From 2cbfca66ba5e00606bb0f24aba1e9cd8efe58849 Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Fri, 20 Oct 2017 13:27:58 +1030 Subject: gpio: Fix loose spelling Literally. I expect "lose" was meant here, rather than "loose", though you could feasibly use a somewhat uncommon definition of "loose" to mean what would be meant by "lose": "Loose the hounds" for instance, as in "Release the hounds". Substituting in "value" for "hounds" gives "release the value", and makes some sense, but further substituting back to loose gives "loose the value" which overall just seems a bit anachronistic. Instead, use modern, pragmatic English and save a character. Cc: Russell Currey Signed-off-by: Andrew Jeffery Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-of.c | 4 ++-- drivers/gpio/gpiolib.c | 6 +++--- drivers/gpio/gpiolib.h | 2 +- include/dt-bindings/gpio/gpio.h | 2 +- include/linux/gpio/machine.h | 2 +- include/linux/of_gpio.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index bfcd20699ec8..e0d59e61b52f 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -153,8 +153,8 @@ struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id, *flags |= GPIO_OPEN_SOURCE; } - if (of_flags & OF_GPIO_SLEEP_MAY_LOOSE_VALUE) - *flags |= GPIO_SLEEP_MAY_LOOSE_VALUE; + if (of_flags & OF_GPIO_SLEEP_MAY_LOSE_VALUE) + *flags |= GPIO_SLEEP_MAY_LOSE_VALUE; return desc; } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index e7372093d968..3827f0767101 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3010,7 +3010,7 @@ bool gpiochip_line_is_persistent(struct gpio_chip *chip, unsigned int offset) if (offset >= chip->ngpio) return false; - return !test_bit(FLAG_SLEEP_MAY_LOOSE_VALUE, + return !test_bit(FLAG_SLEEP_MAY_LOSE_VALUE, &chip->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_persistent); @@ -3435,8 +3435,8 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, set_bit(FLAG_OPEN_DRAIN, &desc->flags); if (lflags & GPIO_OPEN_SOURCE) set_bit(FLAG_OPEN_SOURCE, &desc->flags); - if (lflags & GPIO_SLEEP_MAY_LOOSE_VALUE) - set_bit(FLAG_SLEEP_MAY_LOOSE_VALUE, &desc->flags); + if (lflags & GPIO_SLEEP_MAY_LOSE_VALUE) + set_bit(FLAG_SLEEP_MAY_LOSE_VALUE, &desc->flags); /* No particular flag request, return here...
*/ if (!(dflags & GPIOD_FLAGS_BIT_DIR_SET)) { diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 10a48caf8477..af48322839c3 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -205,7 +205,7 @@ struct gpio_desc { #define FLAG_OPEN_SOURCE 8 /* Gpio is open source type */ #define FLAG_USED_AS_IRQ 9 /* GPIO is connected to an IRQ */ #define FLAG_IS_HOGGED 11 /* GPIO is hogged */ -#define FLAG_SLEEP_MAY_LOOSE_VALUE 12 /* GPIO may loose value in sleep */ +#define FLAG_SLEEP_MAY_LOSE_VALUE 12 /* GPIO may lose value in sleep */ /* Connection label */ const char *label; diff --git a/include/dt-bindings/gpio/gpio.h b/include/dt-bindings/gpio/gpio.h index c5074584561d..70de5b7a6c9b 100644 --- a/include/dt-bindings/gpio/gpio.h +++ b/include/dt-bindings/gpio/gpio.h @@ -30,6 +30,6 @@ /* Bit 3 express GPIO suspend/resume persistence */ #define GPIO_SLEEP_MAINTAIN_VALUE 0 -#define GPIO_SLEEP_MAY_LOOSE_VALUE 8 +#define GPIO_SLEEP_MAY_LOSE_VALUE 8 #endif diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index ba4ccfd900f9..5e9f294c29eb 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -10,7 +10,7 @@ enum gpio_lookup_flags { GPIO_OPEN_DRAIN = (1 << 1), GPIO_OPEN_SOURCE = (1 << 2), GPIO_SLEEP_MAINTAIN_VALUE = (0 << 3), - GPIO_SLEEP_MAY_LOOSE_VALUE = (1 << 3), + GPIO_SLEEP_MAY_LOSE_VALUE = (1 << 3), }; /** diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index ca10f43564de..1fe205582111 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -31,7 +31,7 @@ enum of_gpio_flags { OF_GPIO_ACTIVE_LOW = 0x1, OF_GPIO_SINGLE_ENDED = 0x2, OF_GPIO_OPEN_DRAIN = 0x4, - OF_GPIO_SLEEP_MAY_LOOSE_VALUE = 0x8, + OF_GPIO_SLEEP_MAY_LOSE_VALUE = 0x8, }; #ifdef CONFIG_OF_GPIO -- cgit v1.2.3 From 590c845930457d25d27dc1fdd964a1ce18ef2d7d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Oct 2017 08:14:37 +0900 Subject: kprobes: Disable the jprobes APIs Disable the jprobes APIs and comment out the jprobes API function code. This is in preparation for removing all jprobes-related code (including kprobe's break_handler). Nowadays ftrace and other tracing features are mature enough to replace the jprobes use cases. Users can safely use ftrace and perf probe, etc., for their use cases. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Ian McDonald Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vlad Yasevich Link: http://lkml.kernel.org/r/150724527741.5014.15465541485637899227.stgit@devbox Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 40 ++++++++++++++++++---------------------- kernel/kprobes.c | 2 ++ 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index bd2684700b74..56b2e698dbad 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -391,10 +391,6 @@ int register_kprobes(struct kprobe **kps, int num); void unregister_kprobes(struct kprobe **kps, int num); int setjmp_pre_handler(struct kprobe *, struct pt_regs *); int longjmp_break_handler(struct kprobe *, struct pt_regs *); -int register_jprobe(struct jprobe *p); -void unregister_jprobe(struct jprobe *p); -int register_jprobes(struct jprobe **jps, int num); -void unregister_jprobes(struct jprobe **jps, int num); void jprobe_return(void); unsigned long arch_deref_entry_point(void *); @@ -443,20 +439,6 @@ static inline void unregister_kprobe(struct kprobe *p) static inline void unregister_kprobes(struct kprobe **kps, int num) { } -static inline int register_jprobe(struct jprobe *p) -{ - return -ENOSYS; -} -static inline int register_jprobes(struct jprobe **jps, int num) -{ - return -ENOSYS; -} -static inline void unregister_jprobe(struct jprobe *p) -{ -} -static inline void unregister_jprobes(struct jprobe **jps, int num) -{ -} static inline void jprobe_return(void) { } @@ -486,6 +468,20 @@ static inline int enable_kprobe(struct kprobe *kp) return -ENOSYS; } #endif /* CONFIG_KPROBES */ +static inline int __deprecated register_jprobe(struct jprobe *p) +{ + return -ENOSYS; +} +static inline int __deprecated register_jprobes(struct jprobe **jps, int num) +{ + return -ENOSYS; +} +static inline void __deprecated unregister_jprobe(struct jprobe *p) +{ +} +static inline void __deprecated unregister_jprobes(struct jprobe **jps, int num) +{ +} static inline int disable_kretprobe(struct kretprobe *rp) { return disable_kprobe(&rp->kp); @@ -494,13 +490,13 @@ static inline int enable_kretprobe(struct kretprobe *rp) { return enable_kprobe(&rp->kp); } -static inline int disable_jprobe(struct jprobe *jp) +static inline int __deprecated disable_jprobe(struct jprobe *jp) { - return disable_kprobe(&jp->kp); + return -ENOSYS; } -static inline int enable_jprobe(struct jprobe *jp) +static inline int __deprecated enable_jprobe(struct jprobe *jp) { - return enable_kprobe(&jp->kp); + return -ENOSYS; } #ifndef CONFIG_KPROBES diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a8fc1492b308..da2ccf142358 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1771,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } +#if 0 int register_jprobes(struct jprobe **jps, int num) { int ret = 0, i; @@ -1839,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num) } } EXPORT_SYMBOL_GPL(unregister_jprobes); +#endif #ifdef CONFIG_KRETPROBES /* -- cgit v1.2.3 From 52f737c2da40259ac9962170ce608b6fb1b55ee4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Oct 2017 13:28:38 -0700 Subject: timer: Provide wrappers safe for use with LOCKDEP Under LOCKDEP, the timer lock_class_key (set up in __setup_timer) needs to be tied to the caller's context, so an inline for timer_setup() won't work. 
We do, however, want to keep the inline version around for argument type checking, so this provides macro wrappers in the LOCKDEP case. This fixes the case of different timers sharing the same LOCKDEP instance, and producing a false positive warning: [ 580.840858] ====================================================== [ 580.842299] WARNING: possible circular locking dependency detected [ 580.843684] 4.14.0-rc4+ #17 Not tainted [ 580.844554] ------------------------------------------------------ [ 580.845945] swapper/9/0 is trying to acquire lock: [ 580.847024] (slock-AF_INET){+.-.}, at: [] tcp_write_timer+0x24/0xd0 [ 580.848834] but task is already holding lock: [ 580.850107] ((timer)#2){+.-.}, at: [] call_timer_fn+0x0/0x300 [ 580.851663] which lock already depends on the new lock. [ 580.853439] the existing dependency chain (in reverse order) is: [ 580.855311] -> #1 ((timer)#2){+.-.}: [ 580.856538] __lock_acquire+0x114d/0x11a0 [ 580.857506] lock_acquire+0xb0/0x1d0 [ 580.858373] del_timer_sync+0x3c/0xb0 [ 580.859260] inet_csk_reqsk_queue_drop+0x7f/0x1b0 ... -> #0 (slock-AF_INET){+.-.}: [ 580.884980] check_prev_add+0x666/0x700 [ 580.885790] __lock_acquire+0x114d/0x11a0 [ 580.886575] lock_acquire+0xb0/0x1d0 [ 580.887289] _raw_spin_lock+0x2c/0x40 [ 580.888021] tcp_write_timer+0x24/0xd0 ... [ 580.900055] Possible unsafe locking scenario: [ 580.901043] CPU0 CPU1 [ 580.901797] ---- ---- [ 580.902540] lock((timer)#2); [ 580.903046] lock(slock-AF_INET); [ 580.904006] lock((timer)#2); [ 580.904915] lock(slock-AF_INET); [ 580.905502] In this report, del_timer_sync() is from: inet_csk_reqsk_queue_drop() reqsk_queue_unlink() del_timer_sync(&req->rsk_timer) but tcp_write_timer()'s timer is attached to icsk_retransmit_timer. Both had the same lock_class_key, since they were using timer_setup(). Switching to a macro allows for a separate context, avoiding the false positive. Fixes: 686fef928bba ("timer: Prepare to change timer callback argument type") Reported-by: Craig Gallek Suggested-by: Eric Dumazet Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: netdev@vger.kernel.org Cc: "David S. Miller" Link: https://lkml.kernel.org/r/20171019202838.GA43223@beast --- include/linux/timer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 10685c33e679..09950482309b 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -150,6 +150,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define TIMER_DATA_TYPE unsigned long #define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) +#ifndef CONFIG_LOCKDEP static inline void timer_setup(struct timer_list *timer, void (*callback)(struct timer_list *), unsigned int flags) @@ -165,6 +166,19 @@ static inline void timer_setup_on_stack(struct timer_list *timer, __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback, (TIMER_DATA_TYPE)timer, flags); } +#else +/* + * Under LOCKDEP, the timer lock_class_key (set up in __init_timer) needs + * to be tied to the caller's context, so an inline (above) won't work. We + * do want to keep the inline for argument type checking, though.
+ */ +# define timer_setup(timer, callback, flags) \ + __setup_timer(timer, (TIMER_FUNC_TYPE)callback, \ + (TIMER_DATA_TYPE)timer, flags) +# define timer_setup_on_stack(timer, callback, flags) \ + __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback,\ + (TIMER_DATA_TYPE)timer, flags) +#endif #define from_timer(var, callback_timer, timer_fieldname) \ container_of(callback_timer, typeof(*var), timer_fieldname) -- cgit v1.2.3 From de95e04791a03de5cb681980a3880db6919e3b4a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:54 -0700 Subject: net: Add extack to validator_info structs used for address notifier Add extack to in_validator_info and in6_validator_info. Update the one user of each, ipvlan, to return an error message for failures. Only manual configuration of an address is plumbed in the IPv6 code path. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 10 ++++++++-- include/linux/inetdevice.h | 1 + include/net/addrconf.h | 1 + net/ipv4/devinet.c | 8 +++++--- net/ipv6/addrconf.c | 22 ++++++++++++---------- 5 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 6842739b6679..f0ab55df57f1 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -847,8 +847,11 @@ static int ipvlan_addr6_validator_event(struct notifier_block *unused, switch (event) { case NETDEV_UP: - if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) + if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { + NL_SET_ERR_MSG(i6vi->extack, + "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); + } break; } @@ -917,8 +920,11 @@ static int ipvlan_addr4_validator_event(struct notifier_block *unused, switch (event) { case NETDEV_UP: - if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) + if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { + NL_SET_ERR_MSG(ivi->extack, + "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); + } break; } diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 751d051f0bc7..681dff30940b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -154,6 +154,7 @@ struct in_ifaddr { struct in_validator_info { __be32 ivi_addr; struct in_device *ivi_dev; + struct netlink_ext_ack *extack; }; int register_inetaddr_notifier(struct notifier_block *nb); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 87981cd63180..b8b16437c6d5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -55,6 +55,7 @@ struct prefix_info { struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; + struct netlink_ext_ack *extack; }; #define IN6_ADDR_HSIZE_SHIFT 4 diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e1e2ec0525e6..a4573bccd6da 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -444,7 +444,7 @@ static void check_lifetime(struct work_struct *work); static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -489,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = 
ifa->ifa_dev; + ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); @@ -521,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { - return __inet_insert_ifa(ifa, NULL, 0); + return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) @@ -902,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, + extack); } else { inet_free_ifa(ifa); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dd9c0c435f71..93f9c0a61911 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -987,7 +987,7 @@ static struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, int scope, u32 flags, u32 valid_lft, u32 prefered_lft, - bool can_block) + bool can_block, struct netlink_ext_ack *extack) { gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); @@ -1019,6 +1019,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, struct in6_validator_info i6vi = { .i6vi_addr = *addr, .i6vi_dev = idev, + .extack = extack, }; err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); @@ -1356,7 +1357,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft, true); + tmp_valid_lft, tmp_prefered_lft, true, NULL); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -2040,7 +2041,7 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, scope, flags, valid_lft, - preferred_lft, false); + preferred_lft, false, NULL); if (IS_ERR(ifp2)) goto lock_errdad; @@ -2498,7 +2499,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, addr_flags, valid_lft, - prefered_lft, false); + prefered_lft, false, NULL); if (IS_ERR_OR_NULL(ifp)) return -1; @@ -2808,7 +2809,8 @@ static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx, const struct in6_addr *peer_pfx, unsigned int plen, __u32 ifa_flags, - __u32 prefered_lft, __u32 valid_lft) + __u32 prefered_lft, __u32 valid_lft, + struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; @@ -2867,7 +2869,7 @@ static int inet6_addr_add(struct net *net, int ifindex, } ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, - valid_lft, prefered_lft, true); + valid_lft, prefered_lft, true, extack); if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -2952,7 +2954,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) rtnl_lock(); err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, ireq.ifr6_prefixlen, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL); rtnl_unlock(); return err; } @@ -2983,7 +2985,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifp = ipv6_add_addr(idev, addr, NULL, plen, scope, IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, - true); + true, NULL); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -3083,7 +3085,7 @@ void 
addrconf_add_linklocal(struct inet6_dev *idev, #endif ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp); @@ -4586,7 +4588,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, */ return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, ifm->ifa_prefixlen, ifa_flags, - preferred_lft, valid_lft); + preferred_lft, valid_lft, extack); } if (nlh->nlmsg_flags & NLM_F_EXCL || -- cgit v1.2.3 From 617dd7cc490b72345277e2666c8ed34d4f47f0da Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 30 Aug 2017 12:48:31 +0200 Subject: gpu: host1x: syncpt: Request syncpoints per client Rather than request syncpoints for a struct device *, request them for a struct host1x_client *. This is important because subsequent patches are going to break the assumption that host1x will always be the parent for devices requesting a syncpoint. It's also a more natural choice because host1x clients are really the only ones that will know how to deal with syncpoints. Note that host1x clients are always guaranteed to be children of host1x, regardless of their location in the device tree. Signed-off-by: Thierry Reding --- drivers/gpu/drm/tegra/dc.c | 2 +- drivers/gpu/drm/tegra/gr2d.c | 2 +- drivers/gpu/drm/tegra/gr3d.c | 2 +- drivers/gpu/drm/tegra/vic.c | 2 +- drivers/gpu/host1x/syncpt.c | 16 ++++++++-------- drivers/gpu/host1x/syncpt.h | 2 +- include/linux/host1x.h | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 4df39112e38e..bc51eb855bf1 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -1756,7 +1756,7 @@ static int tegra_dc_init(struct host1x_client *client) struct drm_plane *cursor = NULL; int err; - dc->syncpt = host1x_syncpt_request(dc->dev, flags); + dc->syncpt = host1x_syncpt_request(client, flags); if (!dc->syncpt) dev_warn(dc->dev, "failed to allocate syncpoint\n"); diff --git a/drivers/gpu/drm/tegra/gr2d.c b/drivers/gpu/drm/tegra/gr2d.c index 6ea070da7718..9a8ea93016a9 100644 --- a/drivers/gpu/drm/tegra/gr2d.c +++ b/drivers/gpu/drm/tegra/gr2d.c @@ -36,7 +36,7 @@ static int gr2d_init(struct host1x_client *client) if (!gr2d->channel) return -ENOMEM; - client->syncpts[0] = host1x_syncpt_request(client->dev, flags); + client->syncpts[0] = host1x_syncpt_request(client, flags); if (!client->syncpts[0]) { host1x_channel_put(gr2d->channel); return -ENOMEM; diff --git a/drivers/gpu/drm/tegra/gr3d.c b/drivers/gpu/drm/tegra/gr3d.c index cee2ab645cde..28c4ef63065b 100644 --- a/drivers/gpu/drm/tegra/gr3d.c +++ b/drivers/gpu/drm/tegra/gr3d.c @@ -46,7 +46,7 @@ static int gr3d_init(struct host1x_client *client) if (!gr3d->channel) return -ENOMEM; - client->syncpts[0] = host1x_syncpt_request(client->dev, flags); + client->syncpts[0] = host1x_syncpt_request(client, flags); if (!client->syncpts[0]) { host1x_channel_put(gr3d->channel); return -ENOMEM; diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c index 2448229fa653..52899fba15e6 100644 --- a/drivers/gpu/drm/tegra/vic.c +++ b/drivers/gpu/drm/tegra/vic.c @@ -167,7 +167,7 @@ static int vic_init(struct host1x_client *client) goto detach_device; } - client->syncpts[0] = host1x_syncpt_request(client->dev, 0); + client->syncpts[0] = host1x_syncpt_request(client, 0); if 
(!client->syncpts[0]) { err = -ENOMEM; goto free_channel; diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c index 048ac9e344ce..fcba94cbf4ed 100644 --- a/drivers/gpu/host1x/syncpt.c +++ b/drivers/gpu/host1x/syncpt.c @@ -54,7 +54,7 @@ static void host1x_syncpt_base_free(struct host1x_syncpt_base *base) } static struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host, - struct device *dev, + struct host1x_client *client, unsigned long flags) { int i; @@ -76,11 +76,11 @@ static struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host, } name = kasprintf(GFP_KERNEL, "%02u-%s", sp->id, - dev ? dev_name(dev) : NULL); + client ? dev_name(client->dev) : NULL); if (!name) goto free_base; - sp->dev = dev; + sp->client = client; sp->name = name; if (flags & HOST1X_SYNCPT_CLIENT_MANAGED) @@ -419,7 +419,7 @@ int host1x_syncpt_init(struct host1x *host) /** * host1x_syncpt_request() - request a syncpoint - * @dev: device requesting the syncpoint + * @client: client requesting the syncpoint * @flags: flags * * host1x client drivers can use this function to allocate a syncpoint for @@ -427,12 +427,12 @@ int host1x_syncpt_init(struct host1x *host) * use by the client exclusively. When no longer using a syncpoint, a host1x * client driver needs to release it using host1x_syncpt_free(). */ -struct host1x_syncpt *host1x_syncpt_request(struct device *dev, +struct host1x_syncpt *host1x_syncpt_request(struct host1x_client *client, unsigned long flags) { - struct host1x *host = dev_get_drvdata(dev->parent); + struct host1x *host = dev_get_drvdata(client->parent->parent); - return host1x_syncpt_alloc(host, dev, flags); + return host1x_syncpt_alloc(host, client, flags); } EXPORT_SYMBOL(host1x_syncpt_request); @@ -456,7 +456,7 @@ void host1x_syncpt_free(struct host1x_syncpt *sp) host1x_syncpt_base_free(sp->base); kfree(sp->name); sp->base = NULL; - sp->dev = NULL; + sp->client = NULL; sp->name = NULL; sp->client_managed = false; diff --git a/drivers/gpu/host1x/syncpt.h b/drivers/gpu/host1x/syncpt.h index f719205105ac..9d88d37c2397 100644 --- a/drivers/gpu/host1x/syncpt.h +++ b/drivers/gpu/host1x/syncpt.h @@ -44,7 +44,7 @@ struct host1x_syncpt { const char *name; bool client_managed; struct host1x *host; - struct device *dev; + struct host1x_client *client; struct host1x_syncpt_base *base; /* interrupt data */ diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 630b1a98ab58..ddf7f9ca86cc 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -157,7 +157,7 @@ int host1x_syncpt_incr(struct host1x_syncpt *sp); u32 host1x_syncpt_incr_max(struct host1x_syncpt *sp, u32 incrs); int host1x_syncpt_wait(struct host1x_syncpt *sp, u32 thresh, long timeout, u32 *value); -struct host1x_syncpt *host1x_syncpt_request(struct device *dev, +struct host1x_syncpt *host1x_syncpt_request(struct host1x_client *client, unsigned long flags); void host1x_syncpt_free(struct host1x_syncpt *sp); -- cgit v1.2.3 From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:22 -0700 Subject: bpf: Add file mode configuration into bpf maps Introduce the map read/write flags to the eBPF syscalls that return the map fd. The flags are used to set up the file mode when constructing a new file descriptor for bpf maps. To not break backward compatibility, the f_flags is set to O_RDWR if the flag passed by the syscall is 0. Otherwise it should be O_RDONLY or O_WRONLY.
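For illustration, a hedged userspace sketch of requesting a read-only map fd with the new flag; the pin path is hypothetical and error handling is trimmed:

	/* Illustrative only: fetch a pinned map read-only via BPF_OBJ_GET. */
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static int bpf_obj_get_rdonly(const char *path)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.pathname = (__u64)(unsigned long)path;
		attr.file_flags = BPF_F_RDONLY;	/* new in this patch */

		/* The returned fd permits lookup/get_next_key; update and
		 * delete fail with -EPERM per the FMODE_CAN_WRITE checks
		 * added in kernel/bpf/syscall.c. */
		return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
	}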
When userspace wants to modify or read the map content, the kernel checks the file mode to see whether the operation is allowed. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 8 +++-- include/uapi/linux/bpf.h | 6 ++++ kernel/bpf/arraymap.c | 6 +++- kernel/bpf/devmap.c | 5 ++- kernel/bpf/hashtab.c | 5 +-- kernel/bpf/inode.c | 15 ++++++--- kernel/bpf/lpm_trie.c | 3 +- kernel/bpf/sockmap.c | 5 ++- kernel/bpf/stackmap.c | 5 ++- kernel/bpf/syscall.c | 88 ++++++++++++++++++++++++++++++++++++++++++------ net/netfilter/xt_bpf.c | 2 +- 11 files changed, 122 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d67ccdc0099f..3e5508f2fa87 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -315,11 +315,11 @@ void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; -int bpf_map_new_fd(struct bpf_map *map); +int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); -int bpf_obj_get_user(const char __user *pathname); +int bpf_obj_get_user(const char __user *pathname, int flags); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); @@ -338,6 +338,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); +int bpf_get_file_flag(int flags); + /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only.
No barriers here, since it _will_ race with concurrent @@ -421,7 +423,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) { } -static inline int bpf_obj_get_user(const char __user *pathname) +static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4303fb6c3817..d83f95ea6a1b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -218,6 +218,10 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U +/* Flags for accessing BPF object */ +#define BPF_F_RDONLY (1U << 3) +#define BPF_F_WRONLY (1U << 4) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -260,6 +264,7 @@ union bpf_attr { struct { /* anonymous struct used by BPF_OBJ_* commands */ __aligned_u64 pathname; __u32 bpf_fd; + __u32 file_flags; }; struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ @@ -287,6 +292,7 @@ union bpf_attr { __u32 map_id; }; __u32 next_id; + __u32 open_flags; }; struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 68d866628be0..988c04c91e10 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -19,6 +19,9 @@ #include "map_in_map.h" +#define ARRAY_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || + attr->value_size == 0 || + attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e093d9a2c4dd..e5d3de7cff2e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -50,6 +50,9 @@ #include #include +#define DEV_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_dtab_netdev { struct net_device *dev; struct bpf_dtab *dtab; @@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); dtab = kzalloc(sizeof(*dtab), GFP_USER); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 431126f31ea3..919955236e63 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,8 +18,9 @@ #include "bpf_lru_list.h" #include "map_in_map.h" -#define HTAB_CREATE_FLAG_MASK \ - (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) +#define HTAB_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) struct bucket { struct hlist_nulls_head head; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be1dde967208..01aaef1a77c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -295,7 +295,7 @@ out: } static void *bpf_obj_do_get(const struct filename *pathname, - enum bpf_type *type) + enum bpf_type *type, int flags) { struct inode *inode; struct path path; @@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname, return ERR_PTR(ret); inode = d_backing_inode(path.dentry); - 
ret = inode_permission(inode, MAY_WRITE); + ret = inode_permission(inode, ACC_MODE(flags)); if (ret) goto out; @@ -326,18 +326,23 @@ out: return ERR_PTR(ret); } -int bpf_obj_get_user(const char __user *pathname) +int bpf_obj_get_user(const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; struct filename *pname; int ret = -ENOENT; + int f_flags; void *raw; + f_flags = bpf_get_file_flag(flags); + if (f_flags < 0) + return f_flags; + pname = getname(pathname); if (IS_ERR(pname)) return PTR_ERR(pname); - raw = bpf_obj_do_get(pname, &type); + raw = bpf_obj_do_get(pname, &type, f_flags); if (IS_ERR(raw)) { ret = PTR_ERR(raw); goto out; @@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname) if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) - ret = bpf_map_new_fd(raw); + ret = bpf_map_new_fd(raw, f_flags); else goto out; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 34d8a690ea05..885e45479680 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -495,7 +495,8 @@ out: #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) -#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) +#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) static struct bpf_map *trie_alloc(union bpf_attr *attr) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a298d6666698..86ec846f2d5e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -40,6 +40,9 @@ #include #include +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_stab { struct bpf_map map; struct sock **sock_map; @@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 135be433e9a0..a15bc636cc98 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,9 @@ #include #include "percpu_freelist.h" +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; @@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_flags & ~BPF_F_NUMA_NODE) + if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0e893cac6795..676a06e6b322 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -34,6 +34,8 @@ #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) + DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); @@ -294,17 +296,48 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) } #endif +static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, + loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_READ. 
+ */ + return -EINVAL; +} + +static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, + size_t siz, loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_WRITE. + */ + return -EINVAL; +} + static const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif .release = bpf_map_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; -int bpf_map_new_fd(struct bpf_map *map) +int bpf_map_new_fd(struct bpf_map *map, int flags) { return anon_inode_getfd("bpf-map", &bpf_map_fops, map, - O_RDWR | O_CLOEXEC); + flags | O_CLOEXEC); +} + +int bpf_get_file_flag(int flags) +{ + if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) + return -EINVAL; + if (flags & BPF_F_RDONLY) + return O_RDONLY; + if (flags & BPF_F_WRONLY) + return O_WRONLY; + return O_RDWR; } /* helper macro to check that unused fields 'union bpf_attr' are zero */ @@ -344,12 +377,17 @@ static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map *map; + int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) + return f_flags; + if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) @@ -375,7 +413,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - err = bpf_map_new_fd(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. * bpf_map_put() is needed because the above @@ -490,6 +528,11 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -570,6 +613,11 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -659,6 +707,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -702,6 +755,11 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + if (ukey) { key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { @@ -908,6 +966,8 @@ static const struct file_operations bpf_prog_fops = { .show_fdinfo = bpf_prog_show_fdinfo, #endif .release = bpf_prog_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; int bpf_prog_new_fd(struct bpf_prog *prog) @@ -1117,11 +1177,11 @@ free_prog_nouncharge: return err; } -#define BPF_OBJ_LAST_FIELD bpf_fd +#define BPF_OBJ_LAST_FIELD file_flags static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ)) + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) return -EINVAL; return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); @@ -1129,10 +1189,12 @@ static int bpf_obj_pin(const union bpf_attr *attr) static int bpf_obj_get(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || + attr->file_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; 
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), + attr->file_flags); } #ifdef CONFIG_CGROUP_BPF @@ -1392,20 +1454,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } -#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags static int bpf_map_get_fd_by_id(const union bpf_attr *attr) { struct bpf_map *map; u32 id = attr->map_id; + int f_flags; int fd; - if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || + attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + f_flags = bpf_get_file_flag(attr->open_flags); + if (f_flags < 0) + return f_flags; + spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) @@ -1417,7 +1485,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - fd = bpf_map_new_fd(map); + fd = bpf_map_new_fd(map, f_flags); if (fd < 0) bpf_map_put(map); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 29123934887b..041da0d9c06f 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -56,7 +56,7 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) int retval, fd; set_fs(KERNEL_DS); - fd = bpf_obj_get_user(path); + fd = bpf_obj_get_user(path, 0); set_fs(oldfs); if (fd < 0) return fd; -- cgit v1.2.3 From afdb09c720b62b8090584c11151d856df330e57d Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:24 -0700 Subject: security: bpf: Add LSM hooks for bpf object related syscall Introduce several LSM hooks for the syscalls that allow userspace to access eBPF objects such as eBPF programs and eBPF maps. The security checks are aimed at enforcing per-object protection for eBPF objects, so that only processes with the right privileges can read/write a specific map or use a specific eBPF program. Besides that, a general security hook is added before the multiplexer of the bpf syscall to check the cmd and the attribute used for the command. The actual security module can decide which commands need to be checked and how the cmd should be checked. Signed-off-by: Chenbo Feng Acked-by: James Morris Signed-off-by: David S. Miller --- include/linux/bpf.h | 6 ++++++ include/linux/lsm_hooks.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/security.h | 45 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 +++++++++++++++++++++++++++-- security/security.c | 32 ++++++++++++++++++++++++++++ 5 files changed, 169 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3e5508f2fa87..84c192da3e0b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -57,6 +57,9 @@ struct bpf_map { atomic_t usercnt; struct bpf_map *inner_map_meta; char name[BPF_OBJ_NAME_LEN]; +#ifdef CONFIG_SECURITY + void *security; +#endif }; /* function argument constraints */ @@ -193,6 +196,9 @@ struct bpf_prog_aux { struct user_struct *user; u64 load_time; /* ns since boottime */ char name[BPF_OBJ_NAME_LEN]; +#ifdef CONFIG_SECURITY + void *security; +#endif union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c9258124e417..7161d8e7ee79 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1351,6 +1351,40 @@ * @inode we wish to get the security context of.
* @ctx is a pointer in which to place the allocated security context. * @ctxlen points to the place to put the length of @ctx. + * + * Security hooks for using the eBPF map and program functionalities through + * eBPF syscalls. + * + * @bpf: + * Do an initial check for all bpf syscalls after the attribute is copied + * into the kernel. The actual security module can implement its own + * rules to check the specific commands it needs. + * + * @bpf_map: + * Do a check when the kernel generates and returns a file descriptor for + * eBPF maps. + * + * @map: bpf map that we want to access + * @mask: the access flags + * + * @bpf_prog: + * Do a check when the kernel generates and returns a file descriptor for + * eBPF programs. + * + * @prog: bpf prog that userspace wants to use. + * + * @bpf_map_alloc_security: + * Initialize the security field inside bpf map. + * + * @bpf_map_free_security: + * Clean up the security information stored inside bpf map. + * + * @bpf_prog_alloc_security: + * Initialize the security field inside bpf program. + * + * @bpf_prog_free_security: + * Clean up the security information stored inside bpf prog. + * */ union security_list_options { int (*binder_set_context_mgr)(struct task_struct *mgr); @@ -1682,6 +1716,17 @@ union security_list_options { struct audit_context *actx); void (*audit_rule_free)(void *lsmrule); #endif /* CONFIG_AUDIT */ + +#ifdef CONFIG_BPF_SYSCALL + int (*bpf)(int cmd, union bpf_attr *attr, + unsigned int size); + int (*bpf_map)(struct bpf_map *map, fmode_t fmode); + int (*bpf_prog)(struct bpf_prog *prog); + int (*bpf_map_alloc_security)(struct bpf_map *map); + void (*bpf_map_free_security)(struct bpf_map *map); + int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux); + void (*bpf_prog_free_security)(struct bpf_prog_aux *aux); +#endif /* CONFIG_BPF_SYSCALL */ }; struct security_hook_heads { @@ -1901,6 +1946,15 @@ struct security_hook_heads { struct list_head audit_rule_match; struct list_head audit_rule_free; #endif /* CONFIG_AUDIT */ +#ifdef CONFIG_BPF_SYSCALL + struct list_head bpf; + struct list_head bpf_map; + struct list_head bpf_prog; + struct list_head bpf_map_alloc_security; + struct list_head bpf_map_free_security; + struct list_head bpf_prog_alloc_security; + struct list_head bpf_prog_free_security; +#endif /* CONFIG_BPF_SYSCALL */ } __randomize_layout; /* diff --git a/include/linux/security.h b/include/linux/security.h index ce6265960d6c..18800b0911e5 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -31,6 +31,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -1730,6 +1731,50 @@ static inline void securityfs_remove(struct dentry *dentry) #endif +#ifdef CONFIG_BPF_SYSCALL +#ifdef CONFIG_SECURITY +extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); +extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); +extern int security_bpf_prog(struct bpf_prog *prog); +extern int security_bpf_map_alloc(struct bpf_map *map); +extern void security_bpf_map_free(struct bpf_map *map); +extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); +extern void security_bpf_prog_free(struct bpf_prog_aux *aux); +#else +static inline int security_bpf(int cmd, union bpf_attr *attr, + unsigned int size) +{ + return 0; +} + +static inline int security_bpf_map(struct bpf_map *map, fmode_t fmode) +{ + return 0; +} + +static inline int security_bpf_prog(struct bpf_prog *prog) +{ + return 0; +} + +static inline int security_bpf_map_alloc(struct bpf_map *map) +{ + return 0; +} + +static
inline void security_bpf_map_free(struct bpf_map *map) +{ } + +static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +{ + return 0; +} + +static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) +{ } +#endif /* CONFIG_SECURITY */ +#endif /* CONFIG_BPF_SYSCALL */ + #ifdef CONFIG_SECURITY static inline char *alloc_secdata(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 676a06e6b322..5cb56d06b48d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -212,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); bpf_map_uncharge_memlock(map); + security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -325,6 +326,12 @@ static const struct file_operations bpf_map_fops = { int bpf_map_new_fd(struct bpf_map *map, int flags) { + int ret; + + ret = security_bpf_map(map, OPEN_FMODE(flags)); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, flags | O_CLOEXEC); } @@ -405,10 +412,14 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - err = bpf_map_charge_memlock(map); + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; + err = bpf_map_charge_memlock(map); + if (err) + goto free_map_sec; + err = bpf_map_alloc_id(map); if (err) goto free_map; @@ -430,6 +441,8 @@ static int map_create(union bpf_attr *attr) free_map: bpf_map_uncharge_memlock(map); +free_map_sec: + security_bpf_map_free(map); free_map_nouncharge: map->ops->map_free(map); return err; @@ -914,6 +927,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); + security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -972,6 +986,12 @@ static const struct file_operations bpf_prog_fops = { int bpf_prog_new_fd(struct bpf_prog *prog) { + int ret; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); } @@ -1111,10 +1131,14 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; - err = bpf_prog_charge_memlock(prog); + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_sec; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -1172,6 +1196,8 @@ free_used_maps: free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); +free_prog_sec: + security_bpf_prog_free(prog->aux); free_prog_nouncharge: bpf_prog_free(prog); return err; @@ -1640,6 +1666,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; + err = security_bpf(cmd, &attr, size); + if (err < 0) + return err; + switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr); diff --git a/security/security.c b/security/security.c index 4bf0f571b4ef..1cd8526cb0b7 100644 --- a/security/security.c +++ b/security/security.c @@ -12,6 +12,7 @@ * (at your option) any later version. 
*/ +#include #include #include #include @@ -1703,3 +1704,34 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule, actx); } #endif /* CONFIG_AUDIT */ + +#ifdef CONFIG_BPF_SYSCALL +int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) +{ + return call_int_hook(bpf, 0, cmd, attr, size); +} +int security_bpf_map(struct bpf_map *map, fmode_t fmode) +{ + return call_int_hook(bpf_map, 0, map, fmode); +} +int security_bpf_prog(struct bpf_prog *prog) +{ + return call_int_hook(bpf_prog, 0, prog); +} +int security_bpf_map_alloc(struct bpf_map *map) +{ + return call_int_hook(bpf_map_alloc_security, 0, map); +} +int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +{ + return call_int_hook(bpf_prog_alloc_security, 0, aux); +} +void security_bpf_map_free(struct bpf_map *map) +{ + call_void_hook(bpf_map_free_security, map); +} +void security_bpf_prog_free(struct bpf_prog_aux *aux) +{ + call_void_hook(bpf_prog_free_security, aux); +} +#endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From f66e448cfda021b0bcd884f26709796fe19c7cc1 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:26 -0700 Subject: selinux: bpf: Add additional check for bpf object file receive Introduce a bpf object related check when sending and receiving files through unix domain sockets as well as binder. It checks whether the receiving process has the privilege to read/write the bpf map or use the bpf program. This check is necessary because bpf maps and programs use an anonymous inode as their shared inode, so the normal way of checking files and sockets passed between processes cannot work properly on eBPF objects. This check only works when BPF_SYSCALL is configured. Signed-off-by: Chenbo Feng Acked-by: Stephen Smalley Reviewed-by: James Morris Signed-off-by: David S.
Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 4 ++-- security/selinux/hooks.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 84c192da3e0b..1e334b248ff6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -288,6 +288,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +extern const struct file_operations bpf_map_fops; +extern const struct file_operations bpf_prog_fops; + #define BPF_PROG_TYPE(_id, _name) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5cb56d06b48d..323be2473c4b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -315,7 +315,7 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, return -EINVAL; } -static const struct file_operations bpf_map_fops = { +const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif @@ -975,7 +975,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) } #endif -static const struct file_operations bpf_prog_fops = { +const struct file_operations bpf_prog_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_prog_show_fdinfo, #endif diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 12cf7de8cbed..2e3a627fc0b1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1815,6 +1815,10 @@ static inline int file_path_has_perm(const struct cred *cred, return inode_has_perm(cred, file_inode(file), av, &ad); } +#ifdef CONFIG_BPF_SYSCALL +static int bpf_fd_pass(struct file *file, u32 sid); +#endif + /* Check whether a task can use an open file descriptor to access an inode in a given way. Check access to the descriptor itself, and then use dentry_has_perm to @@ -1845,6 +1849,12 @@ static int file_has_perm(const struct cred *cred, goto out; } +#ifdef CONFIG_BPF_SYSCALL + rc = bpf_fd_pass(file, cred_sid(cred)); + if (rc) + return rc; +#endif + /* av is zero if only checking access to the descriptor. */ rc = 0; if (av) @@ -2165,6 +2175,12 @@ static int selinux_binder_transfer_file(struct task_struct *from, return rc; } +#ifdef CONFIG_BPF_SYSCALL + rc = bpf_fd_pass(file, sid); + if (rc) + return rc; +#endif + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; @@ -6288,6 +6304,39 @@ static u32 bpf_map_fmode_to_av(fmode_t fmode) return av; } +/* This function checks files passed through a unix socket or binder to see + * if they are bpf related objects, and applies the corresponding checks on the + * bpf object based on its type. The bpf maps and programs, unlike other files + * and sockets, use a shared anonymous inode inside the kernel as their inode. + * So checking that inode cannot identify whether the process has the privilege + * to access the bpf object, which is why we have to add this additional check + * in selinux_file_receive and selinux_binder_transfer_files.
+ */ +static int bpf_fd_pass(struct file *file, u32 sid) +{ + struct bpf_security_struct *bpfsec; + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (file->f_op == &bpf_map_fops) { + map = file->private_data; + bpfsec = map->security; + ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, + bpf_map_fmode_to_av(file->f_mode), NULL); + if (ret) + return ret; + } else if (file->f_op == &bpf_prog_fops) { + prog = file->private_data; + bpfsec = prog->aux->security; + ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, + BPF__PROG_RUN, NULL); + if (ret) + return ret; + } + return 0; +} + static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) { u32 sid = current_sid(); -- cgit v1.2.3 From e4d0b679a8467b6d0c169642f0b5f57d8d6eacc2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Sep 2017 14:30:06 -0700 Subject: srcu: Add parameters to SRCU docbook comments Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + kernel/rcu/srcutree.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 39af9bc0f653..62be8966e837 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -78,6 +78,7 @@ void synchronize_srcu(struct srcu_struct *sp); /** * srcu_read_lock_held - might we be in SRCU read-side critical section? + * @sp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 729a8706751d..6d5880089ff6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, /** * call_srcu() - Queue a callback for invocation after an SRCU grace period * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. + * @rhp: structure to be used for queueing the SRCU callback. * @func: function to be invoked after the SRCU grace period * * The callback function will be invoked some time after a full SRCU -- cgit v1.2.3 From 8c4083b30e56fc71b0e94c26374b32d95d5ea461 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:29 +0200 Subject: net: sched: add block bind/unbind notif. and extended block_get/put Introduce a new type of ndo_setup_tc message to propagate binding/unbinding of a block to the driver. Call this ndo whenever a qdisc gets/puts a block. Alongside this, there is a need to propagate the binder type from qdisc code down to the notifier. So introduce extended variants of block_get/put in order to pass this info. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 40 +++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 56 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 94 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf014afcb914..4de5b08ee0fb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -775,6 +775,7 @@ enum tc_setup_type { TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, + TC_SETUP_BLOCK, }; /* These structures hold the attributes of xdp state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 49a143e0fe65..41bc7d774047 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -17,13 +17,27 @@ struct tcf_walker { int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +enum tcf_block_binder_type { + TCF_BLOCK_BINDER_TYPE_UNSPEC, +}; + +struct tcf_block_ext_info { + enum tcf_block_binder_type binder_type; +}; + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei); void tcf_block_put(struct tcf_block *block); +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei); static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { @@ -46,10 +60,25 @@ int tcf_block_get(struct tcf_block **p_block, return 0; } +static inline +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + return 0; +} + static inline void tcf_block_put(struct tcf_block *block) { } +static inline +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ +} + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; @@ -434,6 +463,17 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, void *type_data, bool err_stop); +enum tc_block_command { + TC_BLOCK_BIND, + TC_BLOCK_UNBIND, +}; + +struct tc_block_offload { + enum tc_block_command command; + enum tcf_block_binder_type binder_type; + struct tcf_block *block; +}; + struct tc_cls_common_offload { u32 chain_index; __be16 protocol; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2e8e87fd9d97..92dce26d10e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -240,8 +240,36 @@ tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, chain->p_filter_chain = p_filter_chain; } -int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) +static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei, + enum tc_block_command command) +{ + struct net_device *dev = q->dev_queue->dev; + struct tc_block_offload bo = {}; + + if (!tc_can_offload(dev)) + return; + bo.command = command; + bo.binder_type = ei->binder_type; + bo.block = block; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); +} + +static void 
tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + tcf_block_offload_cmd(block, q, ei, TC_BLOCK_BIND); +} + +static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND); +} + +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); struct tcf_chain *chain; @@ -259,6 +287,7 @@ int tcf_block_get(struct tcf_block **p_block, tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); block->net = qdisc_net(q); block->q = q; + tcf_block_offload_bind(block, q, ei); *p_block = block; return 0; @@ -266,15 +295,28 @@ err_chain_create: kfree(block); return err; } +EXPORT_SYMBOL(tcf_block_get_ext); + +int tcf_block_get(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) +{ + struct tcf_block_ext_info ei = {0, }; + + return tcf_block_get_ext(p_block, p_filter_chain, q, &ei); +} EXPORT_SYMBOL(tcf_block_get); -void tcf_block_put(struct tcf_block *block) +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) { struct tcf_chain *chain, *tmp; if (!block) return; + tcf_block_offload_unbind(block, q, ei); + /* XXX: Standalone actions are not allowed to jump to any chain, and * bound actions should be all removed after flushing. However, * filters are destroyed in RCU callbacks, we have to hold the chains @@ -302,6 +344,14 @@ void tcf_block_put(struct tcf_block *block) tcf_chain_put(chain); kfree(block); } +EXPORT_SYMBOL(tcf_block_put_ext); + +void tcf_block_put(struct tcf_block *block) +{ + struct tcf_block_ext_info ei = {0, }; + + tcf_block_put_ext(block, NULL, block->q, &ei); +} EXPORT_SYMBOL(tcf_block_put); /* Main classifier routine: scans classifier chain attached -- cgit v1.2.3 From ff61b5e3f041c2f1aa8d7c700af3007889973889 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:37 +0300 Subject: drivers, net, mlx4: convert mlx4_cq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_cq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/cq.c | 8 ++++---- include/linux/mlx4/device.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index 72eb50cd5ecd..d8e9a323122e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -69,7 +69,7 @@ void mlx4_cq_tasklet_cb(unsigned long data) list_for_each_entry_safe(mcq, temp, &ctx->process_list, tasklet_ctx.list) { list_del_init(&mcq->tasklet_ctx.list); mcq->tasklet_ctx.comp(mcq); - if (atomic_dec_and_test(&mcq->refcount)) + if (refcount_dec_and_test(&mcq->refcount)) complete(&mcq->free); if (time_after(jiffies, end)) break; @@ -92,7 +92,7 @@ static void mlx4_add_cq_to_tasklet(struct mlx4_cq *cq) * still arrive. */ if (list_empty_careful(&cq->tasklet_ctx.list)) { - atomic_inc(&cq->refcount); + refcount_inc(&cq->refcount); kick = list_empty(&tasklet_ctx->list); list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); if (kick) @@ -344,7 +344,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, cq->cons_index = 0; cq->arm_sn = 1; cq->uar = uar; - atomic_set(&cq->refcount, 1); + refcount_set(&cq->refcount, 1); init_completion(&cq->free); cq->comp = mlx4_add_cq_to_tasklet; cq->tasklet_ctx.priv = @@ -386,7 +386,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) priv->eq_table.eq[MLX4_EQ_ASYNC].irq) synchronize_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq); - if (atomic_dec_and_test(&cq->refcount)) + if (refcount_dec_and_test(&cq->refcount)) complete(&cq->free); wait_for_completion(&cq->free); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b0a57e043fa3..daac2e3a1a58 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -40,7 +40,7 @@ #include #include -#include +#include #include @@ -751,7 +751,7 @@ struct mlx4_cq { int cqn; unsigned vector; - atomic_t refcount; + refcount_t refcount; struct completion free; struct { struct list_head list; -- cgit v1.2.3 From 0068895ff845c38e9e2b65c002c53c623379e436 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:38 +0300 Subject: drivers, net, mlx4: convert mlx4_qp.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_qp.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/qp.c | 8 ++++---- include/linux/mlx4/device.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 203320923340..769598f7b6c8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -55,7 +55,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) qp = __mlx4_qp_lookup(dev, qpn); if (qp) - atomic_inc(&qp->refcount); + refcount_inc(&qp->refcount); spin_unlock(&qp_table->lock); @@ -66,7 +66,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) qp->event(qp, event_type); - if (atomic_dec_and_test(&qp->refcount)) + if (refcount_dec_and_test(&qp->refcount)) complete(&qp->free); } @@ -420,7 +420,7 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) if (err) goto err_icm; - atomic_set(&qp->refcount, 1); + refcount_set(&qp->refcount, 1); init_completion(&qp->free); return 0; @@ -520,7 +520,7 @@ EXPORT_SYMBOL_GPL(mlx4_qp_remove); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp) { - if (atomic_dec_and_test(&qp->refcount)) + if (refcount_dec_and_test(&qp->refcount)) complete(&qp->free); wait_for_completion(&qp->free); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index daac2e3a1a58..b8e19c4d6caa 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -768,7 +768,7 @@ struct mlx4_qp { int qpn; - atomic_t refcount; + refcount_t refcount; struct completion free; u8 usage; }; -- cgit v1.2.3 From 17ac99b2b8d08ed40f4525491d6eff330329a6d2 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:39 +0300 Subject: drivers, net, mlx4: convert mlx4_srq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_srq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/srq.c | 8 ++++---- include/linux/mlx4/device.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c index bedf52126824..cbe4d9746ddf 100644 --- a/drivers/net/ethernet/mellanox/mlx4/srq.c +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -49,7 +49,7 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); rcu_read_unlock(); if (srq) - atomic_inc(&srq->refcount); + refcount_inc(&srq->refcount); else { mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn); return; @@ -57,7 +57,7 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) srq->event(srq, event_type); - if (atomic_dec_and_test(&srq->refcount)) + if (refcount_dec_and_test(&srq->refcount)) complete(&srq->free); } @@ -203,7 +203,7 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, if (err) goto err_radix; - atomic_set(&srq->refcount, 1); + refcount_set(&srq->refcount, 1); init_completion(&srq->free); return 0; @@ -232,7 +232,7 @@ void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq) radix_tree_delete(&srq_table->tree, srq->srqn); spin_unlock_irq(&srq_table->lock); - if (atomic_dec_and_test(&srq->refcount)) + if (refcount_dec_and_test(&srq->refcount)) complete(&srq->free); wait_for_completion(&srq->free); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b8e19c4d6caa..a9b5fed8f7c6 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -781,7 +781,7 @@ struct mlx4_srq { int max_gs; int wqe_shift; - atomic_t refcount; + refcount_t refcount; struct completion free; }; -- cgit v1.2.3 From a4b51a9f83c6d359ff8fc0c66009283b6fdeeaf8 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:40 +0300 Subject: drivers, net, mlx5: convert mlx5_cq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx5_cq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 16 ++++++++-------- include/linux/mlx5/cq.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 336d4738b807..1016e05c7ec7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -58,7 +58,7 @@ void mlx5_cq_tasklet_cb(unsigned long data) tasklet_ctx.list) { list_del_init(&mcq->tasklet_ctx.list); mcq->tasklet_ctx.comp(mcq); - if (atomic_dec_and_test(&mcq->refcount)) + if (refcount_dec_and_test(&mcq->refcount)) complete(&mcq->free); if (time_after(jiffies, end)) break; @@ -80,7 +80,7 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq) * still arrive. */ if (list_empty_careful(&cq->tasklet_ctx.list)) { - atomic_inc(&cq->refcount); + refcount_inc(&cq->refcount); list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); } spin_unlock_irqrestore(&tasklet_ctx->lock, flags); @@ -94,7 +94,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn) spin_lock(&table->lock); cq = radix_tree_lookup(&table->tree, cqn); if (likely(cq)) - atomic_inc(&cq->refcount); + refcount_inc(&cq->refcount); spin_unlock(&table->lock); if (!cq) { @@ -106,7 +106,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn) cq->comp(cq); - if (atomic_dec_and_test(&cq->refcount)) + if (refcount_dec_and_test(&cq->refcount)) complete(&cq->free); } @@ -119,7 +119,7 @@ void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type) cq = radix_tree_lookup(&table->tree, cqn); if (cq) - atomic_inc(&cq->refcount); + refcount_inc(&cq->refcount); spin_unlock(&table->lock); @@ -130,7 +130,7 @@ void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type) cq->event(cq, event_type); - if (atomic_dec_and_test(&cq->refcount)) + if (refcount_dec_and_test(&cq->refcount)) complete(&cq->free); } @@ -159,7 +159,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->cqn = MLX5_GET(create_cq_out, out, cqn); cq->cons_index = 0; cq->arm_sn = 0; - atomic_set(&cq->refcount, 1); + refcount_set(&cq->refcount, 1); init_completion(&cq->free); if (!cq->comp) cq->comp = mlx5_add_cq_to_tasklet; @@ -222,7 +222,7 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) synchronize_irq(cq->irqn); mlx5_debug_cq_remove(dev, cq); - if (atomic_dec_and_test(&cq->refcount)) + if (refcount_dec_and_test(&cq->refcount)) complete(&cq->free); wait_for_completion(&cq->free); diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 95898847c7d4..6a57ec2f1ef7 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -35,7 +35,7 @@ #include #include - +#include struct mlx5_core_cq { u32 cqn; @@ -43,7 +43,7 @@ struct mlx5_core_cq { __be32 *set_ci_db; __be32 *arm_db; struct mlx5_uars_page *uar; - atomic_t refcount; + refcount_t refcount; struct completion free; unsigned vector; unsigned int irqn; -- cgit v1.2.3 From e65f7ee39b4d7604a78b03ed35d723e1001fc241 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:49 +0300 Subject: drivers, connector: convert cn_callback_entry.refcnt from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema 
uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable cn_callback_entry.refcnt is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. Miller --- drivers/connector/cn_queue.c | 4 ++-- drivers/connector/connector.c | 2 +- include/linux/connector.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/connector/cn_queue.c b/drivers/connector/cn_queue.c index 1f8bf054d11c..9c54fdf7acea 100644 --- a/drivers/connector/cn_queue.c +++ b/drivers/connector/cn_queue.c @@ -45,7 +45,7 @@ cn_queue_alloc_callback_entry(struct cn_queue_dev *dev, const char *name, return NULL; } - atomic_set(&cbq->refcnt, 1); + refcount_set(&cbq->refcnt, 1); atomic_inc(&dev->refcnt); cbq->pdev = dev; @@ -58,7 +58,7 @@ cn_queue_alloc_callback_entry(struct cn_queue_dev *dev, const char *name, void cn_queue_release_callback(struct cn_callback_entry *cbq) { - if (!atomic_dec_and_test(&cbq->refcnt)) + if (!refcount_dec_and_test(&cbq->refcnt)) return; atomic_dec(&cbq->pdev->refcnt); diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 25693b045371..8615594bd065 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -157,7 +157,7 @@ static int cn_call_callback(struct sk_buff *skb) spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(i, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&i->id.id, &msg->id)) { - atomic_inc(&i->refcnt); + refcount_inc(&i->refcnt); cbq = i; break; } diff --git a/include/linux/connector.h b/include/linux/connector.h index f8fe8637d771..032102b19645 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -22,7 +22,7 @@ #define __CONNECTOR_H -#include +#include #include #include @@ -49,7 +49,7 @@ struct cn_callback_id { struct cn_callback_entry { struct list_head callback_entry; - atomic_t refcnt; + refcount_t refcnt; struct cn_queue_dev *pdev; struct cn_callback_id id; -- cgit v1.2.3 From eb297bc716ec2cb3147e6e99002e37058c65cba3 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 22:08:54 -0700 Subject: of: reserved_mem: Accessor for acquiring reserved_mem In some cases drivers referencing a reserved-memory region might want to remap the entire region, but when defining the reserved-memory by "size" the client driver has no means to know the associated base address of the reserved memory region. This patch adds an accessor for such drivers to acquire a handle to their associated reserved-memory for this purpose. A complicating factor for the implementation is that the reserved_mem objects are created from the flattened DeviceTree, as such we can't use the device_node address for comparison. Fortunately the name of the node will be used as "name" of the reserved_mem and will be used when building the full_name, so we can compare the "name" with the basename of the full_name to find the match. 
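[Editor's note — illustrative only, not part of the patch: a client driver with a standard "memory-region" phandle could resolve and remap its whole reserved region roughly as sketched below; the surrounding driver context (dev, chosen error codes) is hypothetical.]

	struct device_node *np;
	struct reserved_mem *rmem;
	void __iomem *base;

	np = of_parse_phandle(dev->of_node, "memory-region", 0);
	if (!np)
		return -ENODEV;
	rmem = of_reserved_mem_lookup(np);
	of_node_put(np);
	if (!rmem)
		return -EINVAL;
	/* rmem->base and rmem->size describe the region reserved at boot */
	base = devm_ioremap_wc(dev, rmem->base, rmem->size);
	if (!base)
		return -ENOMEM;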
Reviewed-by: Rob Herring Signed-off-by: Bjorn Andersson Signed-off-by: Andy Gross --- drivers/of/of_reserved_mem.c | 26 ++++++++++++++++++++++++++ include/linux/of_reserved_mem.h | 5 +++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index d507c3569a88..b108c7a1f74c 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -397,3 +397,29 @@ void of_reserved_mem_device_release(struct device *dev) rmem->ops->device_release(rmem, dev); } EXPORT_SYMBOL_GPL(of_reserved_mem_device_release); + +/** + * of_reserved_mem_lookup() - acquire reserved_mem from a device node + * @np: node pointer of the desired reserved-memory region + * + * This function allows drivers to acquire a reference to the reserved_mem + * struct based on a device node handle. + * + * Returns a reserved_mem reference, or NULL on error. + */ +struct reserved_mem *of_reserved_mem_lookup(struct device_node *np) +{ + const char *name; + int i; + + if (!np->full_name) + return NULL; + + name = kbasename(np->full_name); + for (i = 0; i < reserved_mem_count; i++) + if (!strcmp(reserved_mem[i].name, name)) + return &reserved_mem[i]; + + return NULL; +} +EXPORT_SYMBOL_GPL(of_reserved_mem_lookup); diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index f8e1992d6423..c58f780104f9 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -44,6 +44,7 @@ int early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, void fdt_init_reserved_mem(void); void fdt_reserved_mem_save_node(unsigned long node, const char *uname, phys_addr_t base, phys_addr_t size); +struct reserved_mem *of_reserved_mem_lookup(struct device_node *np); #else static inline int of_reserved_mem_device_init_by_idx(struct device *dev, struct device_node *np, int idx) @@ -55,6 +56,10 @@ static inline void of_reserved_mem_device_release(struct device *pdev) { } static inline void fdt_init_reserved_mem(void) { } static inline void fdt_reserved_mem_save_node(unsigned long node, const char *uname, phys_addr_t base, phys_addr_t size) { } +static inline struct reserved_mem *of_reserved_mem_lookup(struct device_node *np) +{ + return NULL; +} #endif /** -- cgit v1.2.3 From 176aa36012135d172394a928a03fb03dfecd83f9 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Thu, 21 Sep 2017 12:11:24 +0900 Subject: extcon: Split out extcon header file for consumer and provider device The extcon core has two types of extcon devices, as follows. - An 'extcon provider device' adds a new extcon device and detects the state/properties of an external connector. Also, it notifies the state/properties to the extcon consumer device. - An 'extcon consumer device' gets the changed state/properties from the extcon provider device. Prior to this, include/linux/extcon.h contained all exported APIs for both provider and consumer device drivers. To clarify the role of each header file and to remove wrong use-cases on the consumer side, this patch splits it into extcon.h and extcon-provider.h. [Description for include/linux/{extcon.h|extcon-provider.h}] - extcon.h includes the extcon API and data structure for extcon consumer device driver.
This header file contains the following APIs: : Register/unregister the notifier to catch the change of extcon device : Get the extcon device instance : Get the extcon device name : Get the state of each external connector : Get the property value of each external connector : Get the property capability of each external connector - extcon-provider.h includes the extcon API and data structure for extcon provider device driver. This header file contains the following APIs: : Include 'include/linux/extcon.h' : Allocate the memory for extcon device instance : Register/unregister extcon device : Set the state of each external connector : Set the property value of each external connector : Set the property capability of each external connector Signed-off-by: Chanwoo Choi Acked-by: Sebastian Reichel Acked-by: Chen-Yu Tsai Acked-by: Charles Keepax Acked-by: Lee Jones Acked-by: Felipe Balbi Acked-by: Yoshihiro Shimoda Acked-by: Kishon Vijay Abraham I --- drivers/extcon/extcon-adc-jack.c | 2 +- drivers/extcon/extcon-arizona.c | 2 +- drivers/extcon/extcon-axp288.c | 2 +- drivers/extcon/extcon-gpio.c | 2 +- drivers/extcon/extcon-intel-cht-wc.c | 2 +- drivers/extcon/extcon-intel-int3496.c | 2 +- drivers/extcon/extcon-max14577.c | 2 +- drivers/extcon/extcon-max3355.c | 2 +- drivers/extcon/extcon-max77693.c | 2 +- drivers/extcon/extcon-max77843.c | 2 +- drivers/extcon/extcon-max8997.c | 2 +- drivers/extcon/extcon-qcom-spmi-misc.c | 2 +- drivers/extcon/extcon-rt8973a.c | 2 +- drivers/extcon/extcon-sm5502.c | 2 +- drivers/extcon/extcon-usb-gpio.c | 2 +- drivers/extcon/extcon-usbc-cros-ec.c | 2 +- drivers/extcon/extcon.h | 2 +- drivers/phy/allwinner/phy-sun4i-usb.c | 2 +- drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c | 2 +- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 2 +- drivers/phy/rockchip/phy-rockchip-inno-usb2.c | 2 +- drivers/power/supply/qcom_smbb.c | 2 +- drivers/usb/gadget/udc/renesas_usb3.c | 2 +- drivers/usb/phy/phy-tahvo.c | 2 +- include/linux/extcon-provider.h | 142 ++++++++++++++++++++++++++ include/linux/extcon.h | 109 +------------------- include/linux/mfd/palmas.h | 2 +- 27 files changed, 172 insertions(+), 129 deletions(-) create mode 100644 include/linux/extcon-provider.h (limited to 'include/linux') diff --git a/drivers/extcon/extcon-adc-jack.c b/drivers/extcon/extcon-adc-jack.c index 6f6537ab0a79..3877d86c746a 100644 --- a/drivers/extcon/extcon-adc-jack.c +++ b/drivers/extcon/extcon-adc-jack.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include /** * struct adc_jack_data - internal data for adc_jack device driver diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c index f84da4a17724..da0e9bc4262f 100644 --- a/drivers/extcon/extcon-arizona.c +++ b/drivers/extcon/extcon-arizona.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include diff --git a/drivers/extcon/extcon-axp288.c b/drivers/extcon/extcon-axp288.c index f4fd03e58e37..981fba56bc18 100644 --- a/drivers/extcon/extcon-axp288.c +++ b/drivers/extcon/extcon-axp288.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/extcon/extcon-gpio.c b/drivers/extcon/extcon-gpio.c index ebed22f22d75..ab770adcca7e 100644 --- a/drivers/extcon/extcon-gpio.c +++ b/drivers/extcon/extcon-gpio.c @@ -17,7 +17,7 @@ * GNU General Public License for more details. 
*/ -#include +#include #include #include #include diff --git a/drivers/extcon/extcon-intel-cht-wc.c b/drivers/extcon/extcon-intel-cht-wc.c index 91a0023074af..7c4bc8c44c3f 100644 --- a/drivers/extcon/extcon-intel-cht-wc.c +++ b/drivers/extcon/extcon-intel-cht-wc.c @@ -15,7 +15,7 @@ * more details. */ -#include +#include #include #include #include diff --git a/drivers/extcon/extcon-intel-int3496.c b/drivers/extcon/extcon-intel-int3496.c index 1a45e745717d..c8691b5a9cb0 100644 --- a/drivers/extcon/extcon-intel-int3496.c +++ b/drivers/extcon/extcon-intel-int3496.c @@ -19,7 +19,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/extcon/extcon-max14577.c b/drivers/extcon/extcon-max14577.c index f6414b7fa5bc..6c2c9996eb71 100644 --- a/drivers/extcon/extcon-max14577.c +++ b/drivers/extcon/extcon-max14577.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #define DELAY_MS_DEFAULT 17000 /* unit: millisecond */ diff --git a/drivers/extcon/extcon-max3355.c b/drivers/extcon/extcon-max3355.c index 533e16a952b8..0aa410836f4e 100644 --- a/drivers/extcon/extcon-max3355.c +++ b/drivers/extcon/extcon-max3355.c @@ -9,7 +9,7 @@ * may be copied, distributed, and modified under those terms. */ -#include +#include #include #include #include diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c index 7a5856809047..643411066ad9 100644 --- a/drivers/extcon/extcon-max77693.c +++ b/drivers/extcon/extcon-max77693.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/extcon/extcon-max77843.c b/drivers/extcon/extcon-max77843.c index 6e722d552cf1..28f251ff0fa2 100644 --- a/drivers/extcon/extcon-max77843.c +++ b/drivers/extcon/extcon-max77843.c @@ -11,7 +11,7 @@ * (at your option) any later version. */ -#include +#include #include #include #include diff --git a/drivers/extcon/extcon-max8997.c b/drivers/extcon/extcon-max8997.c index 4a0612fb9c07..8152790d72e1 100644 --- a/drivers/extcon/extcon-max8997.c +++ b/drivers/extcon/extcon-max8997.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #define DEV_NAME "max8997-muic" diff --git a/drivers/extcon/extcon-qcom-spmi-misc.c b/drivers/extcon/extcon-qcom-spmi-misc.c index b8cde096a808..660bbf163bf5 100644 --- a/drivers/extcon/extcon-qcom-spmi-misc.c +++ b/drivers/extcon/extcon-qcom-spmi-misc.c @@ -15,7 +15,7 @@ * GNU General Public License for more details. */ -#include +#include #include #include #include diff --git a/drivers/extcon/extcon-rt8973a.c b/drivers/extcon/extcon-rt8973a.c index eaa355e7d9e4..e059bd5f2041 100644 --- a/drivers/extcon/extcon-rt8973a.c +++ b/drivers/extcon/extcon-rt8973a.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "extcon-rt8973a.h" diff --git a/drivers/extcon/extcon-sm5502.c b/drivers/extcon/extcon-sm5502.c index 106ef0297b53..0cfb5a3efdf6 100644 --- a/drivers/extcon/extcon-sm5502.c +++ b/drivers/extcon/extcon-sm5502.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include "extcon-sm5502.h" diff --git a/drivers/extcon/extcon-usb-gpio.c b/drivers/extcon/extcon-usb-gpio.c index 9c925b05b7aa..53762864a9f7 100644 --- a/drivers/extcon/extcon-usb-gpio.c +++ b/drivers/extcon/extcon-usb-gpio.c @@ -14,7 +14,7 @@ * GNU General Public License for more details. 
*/ -#include +#include #include #include #include diff --git a/drivers/extcon/extcon-usbc-cros-ec.c b/drivers/extcon/extcon-usbc-cros-ec.c index 598956f1dcae..6187f731b29d 100644 --- a/drivers/extcon/extcon-usbc-cros-ec.c +++ b/drivers/extcon/extcon-usbc-cros-ec.c @@ -14,7 +14,7 @@ * GNU General Public License for more details. */ -#include +#include #include #include #include diff --git a/drivers/extcon/extcon.h b/drivers/extcon/extcon.h index dddddcfa0587..cc1b436eb66a 100644 --- a/drivers/extcon/extcon.h +++ b/drivers/extcon/extcon.h @@ -1,7 +1,7 @@ #ifndef __LINUX_EXTCON_INTERNAL_H__ #define __LINUX_EXTCON_INTERNAL_H__ -#include +#include /** * struct extcon_dev - An extcon device represents one external connector. diff --git a/drivers/phy/allwinner/phy-sun4i-usb.c b/drivers/phy/allwinner/phy-sun4i-usb.c index 1161e11fb3cf..ef34f97f214b 100644 --- a/drivers/phy/allwinner/phy-sun4i-usb.c +++ b/drivers/phy/allwinner/phy-sun4i-usb.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c b/drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c index d099a0c8cee5..7ceea5ae2704 100644 --- a/drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c +++ b/drivers/phy/broadcom/phy-bcm-ns2-usbdrd.c @@ -12,7 +12,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 54c34298a000..b33e2994ccce 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -12,7 +12,7 @@ * published by the Free Software Foundation. */ -#include +#include #include #include #include diff --git a/drivers/phy/rockchip/phy-rockchip-inno-usb2.c b/drivers/phy/rockchip/phy-rockchip-inno-usb2.c index ee7ce5ee53f9..5049dac79bd0 100644 --- a/drivers/phy/rockchip/phy-rockchip-inno-usb2.c +++ b/drivers/phy/rockchip/phy-rockchip-inno-usb2.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/power/supply/qcom_smbb.c b/drivers/power/supply/qcom_smbb.c index f6a0d245731d..11de691b9a71 100644 --- a/drivers/power/supply/qcom_smbb.c +++ b/drivers/power/supply/qcom_smbb.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #define SMBB_CHG_VMAX 0x040 diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index df37c1e6e9d5..8b4d051fdbed 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/phy/phy-tahvo.c b/drivers/usb/phy/phy-tahvo.c index 8babd318c0ed..354e8c98af05 100644 --- a/drivers/usb/phy/phy-tahvo.c +++ b/drivers/usb/phy/phy-tahvo.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/extcon-provider.h b/include/linux/extcon-provider.h new file mode 100644 index 000000000000..2feca5881fa7 --- /dev/null +++ b/include/linux/extcon-provider.h @@ -0,0 +1,142 @@ +/* + * External Connector (extcon) framework + * - linux/include/linux/extcon-provider.h for extcon provider device driver. + * + * Copyright (C) 2017 Samsung Electronics + * Author: Chanwoo Choi + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __LINUX_EXTCON_PROVIDER_H__ +#define __LINUX_EXTCON_PROVIDER_H__ + +#include + +struct extcon_dev; + +#if IS_ENABLED(CONFIG_EXTCON) + +/* Following APIs register/unregister the extcon device. */ +extern int extcon_dev_register(struct extcon_dev *edev); +extern void extcon_dev_unregister(struct extcon_dev *edev); +extern int devm_extcon_dev_register(struct device *dev, + struct extcon_dev *edev); +extern void devm_extcon_dev_unregister(struct device *dev, + struct extcon_dev *edev); + +/* Following APIs allocate/free the memory of the extcon device. */ +extern struct extcon_dev *extcon_dev_allocate(const unsigned int *cable); +extern void extcon_dev_free(struct extcon_dev *edev); +extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, + const unsigned int *cable); +extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev); + +/* Synchronize the state and property value for each external connector. */ +extern int extcon_sync(struct extcon_dev *edev, unsigned int id); + +/* + * Following APIs set the connected state of each external connector. + * The 'id' argument indicates the defined external connector. + */ +extern int extcon_set_state(struct extcon_dev *edev, unsigned int id, + bool state); +extern int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, + bool state); + +/* + * Following APIs set the property of each external connector. + * The 'id' argument indicates the defined external connector + * and the 'prop' indicates the extcon property. + * + * And extcon_set_property_capability() set the capability of the property + * for each external connector. They are used to set the capability of the + * property of each external connector based on the id and property. 
+ */ +extern int extcon_set_property(struct extcon_dev *edev, unsigned int id, + unsigned int prop, + union extcon_property_value prop_val); +extern int extcon_set_property_sync(struct extcon_dev *edev, unsigned int id, + unsigned int prop, + union extcon_property_value prop_val); +extern int extcon_set_property_capability(struct extcon_dev *edev, + unsigned int id, unsigned int prop); + +#else /* CONFIG_EXTCON */ +static inline int extcon_dev_register(struct extcon_dev *edev) +{ + return 0; +} + +static inline void extcon_dev_unregister(struct extcon_dev *edev) { } + +static inline int devm_extcon_dev_register(struct device *dev, + struct extcon_dev *edev) +{ + return -EINVAL; +} + +static inline void devm_extcon_dev_unregister(struct device *dev, + struct extcon_dev *edev) { } + +static inline struct extcon_dev *extcon_dev_allocate(const unsigned int *cable) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void extcon_dev_free(struct extcon_dev *edev) { } + +static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, + const unsigned int *cable) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void devm_extcon_dev_free(struct extcon_dev *edev) { } + + +static inline int extcon_set_state(struct extcon_dev *edev, unsigned int id, + bool state) +{ + return 0; +} + +static inline int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, + bool state) +{ + return 0; +} + +static inline int extcon_sync(struct extcon_dev *edev, unsigned int id) +{ + return 0; +} + +static inline int extcon_set_property(struct extcon_dev *edev, unsigned int id, + unsigned int prop, + union extcon_property_value prop_val) +{ + return 0; +} + +static inline int extcon_set_property_sync(struct extcon_dev *edev, + unsigned int id, unsigned int prop, + union extcon_property_value prop_val) +{ + return 0; +} + +static inline int extcon_set_property_capability(struct extcon_dev *edev, + unsigned int id, unsigned int prop) +{ + return 0; +} +#endif /* CONFIG_EXTCON */ +#endif /* __LINUX_EXTCON_PROVIDER_H__ */ diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 744d60ca80c3..6d94e82c8ad9 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -1,5 +1,6 @@ /* * External Connector (extcon) framework + * - linux/include/linux/extcon.h for extcon consumer device driver. * * Copyright (C) 2015 Samsung Electronics * Author: Chanwoo Choi @@ -170,61 +171,29 @@ union extcon_property_value { int intval; /* type : integer (intval) */ }; -struct extcon_cable; struct extcon_dev; #if IS_ENABLED(CONFIG_EXTCON) - -/* Following APIs register/unregister the extcon device. */ -extern int extcon_dev_register(struct extcon_dev *edev); -extern void extcon_dev_unregister(struct extcon_dev *edev); -extern int devm_extcon_dev_register(struct device *dev, - struct extcon_dev *edev); -extern void devm_extcon_dev_unregister(struct device *dev, - struct extcon_dev *edev); - -/* Following APIs allocate/free the memory of the extcon device. */ -extern struct extcon_dev *extcon_dev_allocate(const unsigned int *cable); -extern void extcon_dev_free(struct extcon_dev *edev); -extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const unsigned int *cable); -extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev); - -/* Synchronize the state and property value for each external connector. */ -extern int extcon_sync(struct extcon_dev *edev, unsigned int id); - /* - * Following APIs get/set the connected state of each external connector. 
+ * Following APIs get the connected state of each external connector. * The 'id' argument indicates the defined external connector. */ extern int extcon_get_state(struct extcon_dev *edev, unsigned int id); -extern int extcon_set_state(struct extcon_dev *edev, unsigned int id, - bool state); -extern int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, - bool state); /* - * Following APIs get/set the property of each external connector. + * Following APIs get the property of each external connector. * The 'id' argument indicates the defined external connector * and the 'prop' indicates the extcon property. * - * And extcon_get/set_property_capability() set the capability of the property - * for each external connector. They are used to set the capability of the + * And extcon_get_property_capability() get the capability of the property + * for each external connector. They are used to get the capability of the * property of each external connector based on the id and property. */ extern int extcon_get_property(struct extcon_dev *edev, unsigned int id, unsigned int prop, union extcon_property_value *prop_val); -extern int extcon_set_property(struct extcon_dev *edev, unsigned int id, - unsigned int prop, - union extcon_property_value prop_val); -extern int extcon_set_property_sync(struct extcon_dev *edev, unsigned int id, - unsigned int prop, - union extcon_property_value prop_val); extern int extcon_get_property_capability(struct extcon_dev *edev, unsigned int id, unsigned int prop); -extern int extcon_set_property_capability(struct extcon_dev *edev, - unsigned int id, unsigned int prop); /* * Following APIs register the notifier block in order to detect @@ -268,79 +237,17 @@ extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, extern const char *extcon_get_edev_name(struct extcon_dev *edev); #else /* CONFIG_EXTCON */ -static inline int extcon_dev_register(struct extcon_dev *edev) -{ - return 0; -} - -static inline void extcon_dev_unregister(struct extcon_dev *edev) { } - -static inline int devm_extcon_dev_register(struct device *dev, - struct extcon_dev *edev) -{ - return -EINVAL; -} - -static inline void devm_extcon_dev_unregister(struct device *dev, - struct extcon_dev *edev) { } - -static inline struct extcon_dev *extcon_dev_allocate(const unsigned int *cable) -{ - return ERR_PTR(-ENOSYS); -} - -static inline void extcon_dev_free(struct extcon_dev *edev) { } - -static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const unsigned int *cable) -{ - return ERR_PTR(-ENOSYS); -} - -static inline void devm_extcon_dev_free(struct extcon_dev *edev) { } - - static inline int extcon_get_state(struct extcon_dev *edev, unsigned int id) { return 0; } -static inline int extcon_set_state(struct extcon_dev *edev, unsigned int id, - bool state) -{ - return 0; -} - -static inline int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, - bool state) -{ - return 0; -} - -static inline int extcon_sync(struct extcon_dev *edev, unsigned int id) -{ - return 0; -} - static inline int extcon_get_property(struct extcon_dev *edev, unsigned int id, unsigned int prop, union extcon_property_value *prop_val) { return 0; } -static inline int extcon_set_property(struct extcon_dev *edev, unsigned int id, - unsigned int prop, - union extcon_property_value prop_val) -{ - return 0; -} - -static inline int extcon_set_property_sync(struct extcon_dev *edev, - unsigned int id, unsigned int prop, - union extcon_property_value prop_val) -{ - return 0; -} 
static inline int extcon_get_property_capability(struct extcon_dev *edev, unsigned int id, unsigned int prop) @@ -348,12 +255,6 @@ static inline int extcon_get_property_capability(struct extcon_dev *edev, return 0; } -static inline int extcon_set_property_capability(struct extcon_dev *edev, - unsigned int id, unsigned int prop) -{ - return 0; -} - static inline int extcon_register_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb) { diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h index 6dec43826303..3c8568aa82a5 100644 --- a/include/linux/mfd/palmas.h +++ b/include/linux/mfd/palmas.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 11a6e41c0ee503ffcb971d260bd07dc99b77f13a Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 4 Sep 2017 14:53:13 +0200 Subject: phy: Return NULL if the phy is optional If we're trying to get a handle to an optional phy, then the phy framework being disabled shouldn't return a hard error. Instead, return NULL just like phy_optional_get() does when there's no phy provided in the DT. Signed-off-by: Maxime Ripard Signed-off-by: Kishon Vijay Abraham I --- include/linux/phy/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index e694d4008c4a..10888a717860 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -291,7 +291,7 @@ static inline struct phy *devm_phy_get(struct device *dev, const char *string) static inline struct phy *devm_phy_optional_get(struct device *dev, const char *string) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline struct phy *devm_of_phy_get(struct device *dev, -- cgit v1.2.3 From fd3e4c98e6e7a12dc47ef98ee12c0c3c024b5ee9 Mon Sep 17 00:00:00 2001 From: Vivek Gautam Date: Thu, 12 Oct 2017 11:49:33 +0530 Subject: phy: Add UFS PHY modes A UFS phy has two modes for each high-speed generation. These modes are identified by two rates of operation: Rate A and Rate B. Add these UFS phy modes to the phy framework. Signed-off-by: Vivek Gautam Signed-off-by: Kishon Vijay Abraham I --- include/linux/phy/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 10888a717860..194d08174516 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -29,6 +29,8 @@ enum phy_mode { PHY_MODE_USB_OTG, PHY_MODE_SGMII, PHY_MODE_10GKR, + PHY_MODE_UFS_HS_A, + PHY_MODE_UFS_HS_B, }; /** -- cgit v1.2.3 From 052553af6a31b459adfdc6fd1eebced75de332fc Mon Sep 17 00:00:00 2001 From: Vivek Gautam Date: Thu, 12 Oct 2017 11:49:36 +0530 Subject: ufs/phy: qcom: Refactor to use phy_init call Refactor ufs_qcom_power_up_sequence() to get rid of the ugly exported phy APIs and use phy_init() and phy_power_on() to do the phy initialization.
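For readers following along, the consumer-side call sequence this refactoring adopts looks roughly like the sketch below. This is an illustration only, not code from the patch; the device pointer and the "ufsphy" phandle name are placeholders.

	#include <linux/phy/phy.h>

	static int example_bring_up_phy(struct device *dev)
	{
		struct phy *phy;
		int ret;

		phy = devm_phy_get(dev, "ufsphy");	/* placeholder phandle name */
		if (IS_ERR(phy))
			return PTR_ERR(phy);

		ret = phy_init(phy);		/* calibrate/prepare the phy */
		if (ret)
			return ret;

		ret = phy_power_on(phy);	/* start serdes, clocks, supplies */
		if (ret)
			phy_exit(phy);		/* undo phy_init() on failure */

		return ret;
	}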
Signed-off-by: Vivek Gautam Reviewed-by: Subhash Jadavani Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/qualcomm/phy-qcom-ufs-i.h | 3 +- drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c | 15 ++++++++-- drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c | 15 ++++++++-- drivers/phy/qualcomm/phy-qcom-ufs.c | 42 ++++++++++------------------ drivers/scsi/ufs/ufs-qcom.c | 36 ++++++++++-------------- include/linux/phy/phy-qcom-ufs.h | 3 -- 6 files changed, 55 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/drivers/phy/qualcomm/phy-qcom-ufs-i.h b/drivers/phy/qualcomm/phy-qcom-ufs-i.h index 94326ed107c3..822c83b8efcd 100644 --- a/drivers/phy/qualcomm/phy-qcom-ufs-i.h +++ b/drivers/phy/qualcomm/phy-qcom-ufs-i.h @@ -114,6 +114,7 @@ struct ufs_qcom_phy { struct ufs_qcom_phy_calibration *cached_regs; int cached_regs_table_size; bool is_powered_on; + bool is_started; struct ufs_qcom_phy_specific_ops *phy_spec_ops; enum phy_mode mode; @@ -123,7 +124,6 @@ struct ufs_qcom_phy { * struct ufs_qcom_phy_specific_ops - set of pointers to functions which have a * specific implementation per phy. Each UFS phy, should implement * those functions according to its spec and requirements - * @calibrate_phy: pointer to a function that calibrate the phy * @start_serdes: pointer to a function that starts the serdes * @is_physical_coding_sublayer_ready: pointer to a function that * checks pcs readiness. returns 0 for success and non-zero for error. @@ -132,7 +132,6 @@ struct ufs_qcom_phy { * and writes to QSERDES_RX_SIGDET_CNTRL attribute */ struct ufs_qcom_phy_specific_ops { - int (*calibrate_phy)(struct ufs_qcom_phy *phy, bool is_rate_B); void (*start_serdes)(struct ufs_qcom_phy *phy); int (*is_physical_coding_sublayer_ready)(struct ufs_qcom_phy *phy); void (*set_tx_lane_enable)(struct ufs_qcom_phy *phy, u32 val); diff --git a/drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c b/drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c index af65785230b5..ba1895b76a5d 100644 --- a/drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c +++ b/drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c @@ -44,7 +44,19 @@ void ufs_qcom_phy_qmp_14nm_advertise_quirks(struct ufs_qcom_phy *phy_common) static int ufs_qcom_phy_qmp_14nm_init(struct phy *generic_phy) { - return 0; + struct ufs_qcom_phy *phy_common = get_ufs_qcom_phy(generic_phy); + bool is_rate_B = false; + int ret; + + if (phy_common->mode == PHY_MODE_UFS_HS_B) + is_rate_B = true; + + ret = ufs_qcom_phy_qmp_14nm_phy_calibrate(phy_common, is_rate_B); + if (!ret) + /* phy calibrated, but yet to be started */ + phy_common->is_started = false; + + return ret; } static int ufs_qcom_phy_qmp_14nm_exit(struct phy *generic_phy) @@ -120,7 +132,6 @@ static const struct phy_ops ufs_qcom_phy_qmp_14nm_phy_ops = { }; static struct ufs_qcom_phy_specific_ops phy_14nm_ops = { - .calibrate_phy = ufs_qcom_phy_qmp_14nm_phy_calibrate, .start_serdes = ufs_qcom_phy_qmp_14nm_start_serdes, .is_physical_coding_sublayer_ready = ufs_qcom_phy_qmp_14nm_is_pcs_ready, .set_tx_lane_enable = ufs_qcom_phy_qmp_14nm_set_tx_lane_enable, diff --git a/drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c b/drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c index 5c18c41dbdb4..49f435c71147 100644 --- a/drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c +++ b/drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c @@ -63,7 +63,19 @@ void ufs_qcom_phy_qmp_20nm_advertise_quirks(struct ufs_qcom_phy *phy_common) static int ufs_qcom_phy_qmp_20nm_init(struct phy *generic_phy) { - return 0; + struct ufs_qcom_phy *phy_common = get_ufs_qcom_phy(generic_phy); + bool 
is_rate_B = false; + int ret; + + if (phy_common->mode == PHY_MODE_UFS_HS_B) + is_rate_B = true; + + ret = ufs_qcom_phy_qmp_20nm_phy_calibrate(phy_common, is_rate_B); + if (!ret) + /* phy calibrated, but yet to be started */ + phy_common->is_started = false; + + return ret; } static int ufs_qcom_phy_qmp_20nm_exit(struct phy *generic_phy) @@ -178,7 +190,6 @@ static const struct phy_ops ufs_qcom_phy_qmp_20nm_phy_ops = { }; static struct ufs_qcom_phy_specific_ops phy_20nm_ops = { - .calibrate_phy = ufs_qcom_phy_qmp_20nm_phy_calibrate, .start_serdes = ufs_qcom_phy_qmp_20nm_start_serdes, .is_physical_coding_sublayer_ready = ufs_qcom_phy_qmp_20nm_is_pcs_ready, .set_tx_lane_enable = ufs_qcom_phy_qmp_20nm_set_tx_lane_enable, diff --git a/drivers/phy/qualcomm/phy-qcom-ufs.c b/drivers/phy/qualcomm/phy-qcom-ufs.c index 43865ef340e2..c5ff4525edef 100644 --- a/drivers/phy/qualcomm/phy-qcom-ufs.c +++ b/drivers/phy/qualcomm/phy-qcom-ufs.c @@ -518,9 +518,8 @@ void ufs_qcom_phy_disable_iface_clk(struct ufs_qcom_phy *phy) } } -int ufs_qcom_phy_start_serdes(struct phy *generic_phy) +static int ufs_qcom_phy_start_serdes(struct ufs_qcom_phy *ufs_qcom_phy) { - struct ufs_qcom_phy *ufs_qcom_phy = get_ufs_qcom_phy(generic_phy); int ret = 0; if (!ufs_qcom_phy->phy_spec_ops->start_serdes) { @@ -533,7 +532,6 @@ int ufs_qcom_phy_start_serdes(struct phy *generic_phy) return ret; } -EXPORT_SYMBOL_GPL(ufs_qcom_phy_start_serdes); int ufs_qcom_phy_set_tx_lane_enable(struct phy *generic_phy, u32 tx_lanes) { @@ -564,31 +562,8 @@ void ufs_qcom_phy_save_controller_version(struct phy *generic_phy, } EXPORT_SYMBOL_GPL(ufs_qcom_phy_save_controller_version); -int ufs_qcom_phy_calibrate_phy(struct phy *generic_phy, bool is_rate_B) -{ - struct ufs_qcom_phy *ufs_qcom_phy = get_ufs_qcom_phy(generic_phy); - int ret = 0; - - if (!ufs_qcom_phy->phy_spec_ops->calibrate_phy) { - dev_err(ufs_qcom_phy->dev, "%s: calibrate_phy() callback is not supported\n", - __func__); - ret = -ENOTSUPP; - } else { - ret = ufs_qcom_phy->phy_spec_ops-> - calibrate_phy(ufs_qcom_phy, is_rate_B); - if (ret) - dev_err(ufs_qcom_phy->dev, "%s: calibrate_phy() failed %d\n", - __func__, ret); - } - - return ret; -} -EXPORT_SYMBOL_GPL(ufs_qcom_phy_calibrate_phy); - -int ufs_qcom_phy_is_pcs_ready(struct phy *generic_phy) +static int ufs_qcom_phy_is_pcs_ready(struct ufs_qcom_phy *ufs_qcom_phy) { - struct ufs_qcom_phy *ufs_qcom_phy = get_ufs_qcom_phy(generic_phy); - if (!ufs_qcom_phy->phy_spec_ops->is_physical_coding_sublayer_ready) { dev_err(ufs_qcom_phy->dev, "%s: is_physical_coding_sublayer_ready() callback is not supported\n", __func__); @@ -598,7 +573,6 @@ int ufs_qcom_phy_is_pcs_ready(struct phy *generic_phy) return ufs_qcom_phy->phy_spec_ops-> is_physical_coding_sublayer_ready(ufs_qcom_phy); } -EXPORT_SYMBOL_GPL(ufs_qcom_phy_is_pcs_ready); int ufs_qcom_phy_power_on(struct phy *generic_phy) { @@ -609,6 +583,18 @@ int ufs_qcom_phy_power_on(struct phy *generic_phy) if (phy_common->is_powered_on) return 0; + if (!phy_common->is_started) { + err = ufs_qcom_phy_start_serdes(phy_common); + if (err) + return err; + + err = ufs_qcom_phy_is_pcs_ready(phy_common); + if (err) + return err; + + phy_common->is_started = true; + } + err = ufs_qcom_phy_enable_vreg(dev, &phy_common->vdda_phy); if (err) { dev_err(dev, "%s enable vdda_phy failed, err=%d\n", diff --git a/drivers/scsi/ufs/ufs-qcom.c b/drivers/scsi/ufs/ufs-qcom.c index 44c21d5818ee..890eafeb8ad4 100644 --- a/drivers/scsi/ufs/ufs-qcom.c +++ b/drivers/scsi/ufs/ufs-qcom.c @@ -281,10 +281,10 @@ static int 
ufs_qcom_power_up_sequence(struct ufs_hba *hba) /* provide 1ms delay to let the reset pulse propagate */ usleep_range(1000, 1100); - ret = ufs_qcom_phy_calibrate_phy(phy, is_rate_B); - + /* phy initialization - calibrate the phy */ + ret = phy_init(phy); if (ret) { - dev_err(hba->dev, "%s: ufs_qcom_phy_calibrate_phy() failed, ret = %d\n", + dev_err(hba->dev, "%s: phy init failed, ret = %d\n", __func__, ret); goto out; } @@ -297,21 +297,22 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) * voltage, current to settle down before starting serdes. */ usleep_range(1000, 1100); - ret = ufs_qcom_phy_start_serdes(phy); + + /* power on phy - start serdes and phy's power and clocks */ + ret = phy_power_on(phy); if (ret) { - dev_err(hba->dev, "%s: ufs_qcom_phy_start_serdes() failed, ret = %d\n", + dev_err(hba->dev, "%s: phy power on failed, ret = %d\n", __func__, ret); - goto out; + goto out_disable_phy; } - ret = ufs_qcom_phy_is_pcs_ready(phy); - if (ret) - dev_err(hba->dev, - "%s: is_physical_coding_sublayer_ready() failed, ret = %d\n", - __func__, ret); - ufs_qcom_select_unipro_mode(host); + return 0; + +out_disable_phy: + ufs_qcom_assert_reset(hba); + phy_exit(phy); out: return ret; } @@ -1276,14 +1277,9 @@ static int ufs_qcom_init(struct ufs_hba *hba) ufs_qcom_phy_save_controller_version(host->generic_phy, host->hw_ver.major, host->hw_ver.minor, host->hw_ver.step); - phy_init(host->generic_phy); - err = phy_power_on(host->generic_phy); - if (err) - goto out_unregister_bus; - err = ufs_qcom_init_lane_clks(host); if (err) - goto out_disable_phy; + goto out_variant_clear; ufs_qcom_set_caps(hba); ufs_qcom_advertise_quirks(hba); @@ -1304,10 +1300,6 @@ static int ufs_qcom_init(struct ufs_hba *hba) goto out; -out_disable_phy: - phy_power_off(host->generic_phy); -out_unregister_bus: - phy_exit(host->generic_phy); out_variant_clear: ufshcd_set_variant(hba, NULL); out: diff --git a/include/linux/phy/phy-qcom-ufs.h b/include/linux/phy/phy-qcom-ufs.h index 35c070ea6ea3..0a2c18a9771d 100644 --- a/include/linux/phy/phy-qcom-ufs.h +++ b/include/linux/phy/phy-qcom-ufs.h @@ -31,10 +31,7 @@ void ufs_qcom_phy_enable_dev_ref_clk(struct phy *phy); */ void ufs_qcom_phy_disable_dev_ref_clk(struct phy *phy); -int ufs_qcom_phy_start_serdes(struct phy *phy); int ufs_qcom_phy_set_tx_lane_enable(struct phy *phy, u32 tx_lanes); -int ufs_qcom_phy_calibrate_phy(struct phy *phy, bool is_rate_B); -int ufs_qcom_phy_is_pcs_ready(struct phy *phy); void ufs_qcom_phy_save_controller_version(struct phy *phy, u8 major, u16 minor, u16 step); -- cgit v1.2.3 From 36914111e6829be36b23d1109214250b5ee1ee9c Mon Sep 17 00:00:00 2001 From: Andrzej Pietrasiewicz Date: Mon, 9 Oct 2017 14:00:50 +0200 Subject: drivers: phy: add calibrate method Some quirky UDCs (like dwc3 on Exynos) need to have their phys calibrated, e.g. to use SuperSpeed. This patch adds a new phy_calibrate() method. When the calibration should be performed depends on the actual chip. In the case of dwc3 on Exynos the calibration must happen after usb_add_hcd() (while in host mode), because certain phy parameters like Tx LOS levels and boost levels need to be calibrated further after initialization of the xHCI controller, to get SuperSpeed operation working. But an hcd must be prepared first in order to pass it to usb_add_hcd(); in particular, the dwc3 registers must be available first, and for that to happen the phys must be initialized. This poses a chicken-and-egg problem if the calibration were to be performed in phy_init().
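As a sketch of that ordering constraint (assumed host-mode bring-up code with illustrative names, not taken from this patch):

	#include <linux/interrupt.h>
	#include <linux/phy/phy.h>
	#include <linux/usb/hcd.h>

	static int example_host_start(struct usb_hcd *hcd, struct phy *phy, int irq)
	{
		int ret;

		ret = phy_init(phy);	/* needed before the controller registers work */
		if (ret)
			return ret;

		ret = phy_power_on(phy);
		if (ret)
			goto err_exit;

		ret = usb_add_hcd(hcd, irq, IRQF_SHARED);	/* xHCI comes up here */
		if (ret)
			goto err_off;

		/* only now can Tx LOS/boost levels be tuned, hence a separate call */
		return phy_calibrate(phy);

	err_off:
		phy_power_off(phy);
	err_exit:
		phy_exit(phy);
		return ret;
	}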
To break the circular dependency a separate method is added which can be called at a desired moment after phy initialization. Signed-off-by: Andrzej Pietrasiewicz Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/phy-core.c | 15 +++++++++++++++ include/linux/phy/phy.h | 10 ++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index a268f4d6f3e9..b4964b067aec 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -372,6 +372,21 @@ int phy_reset(struct phy *phy) } EXPORT_SYMBOL_GPL(phy_reset); +int phy_calibrate(struct phy *phy) +{ + int ret; + + if (!phy || !phy->ops->calibrate) + return 0; + + mutex_lock(&phy->mutex); + ret = phy->ops->calibrate(phy); + mutex_unlock(&phy->mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(phy_calibrate); + /** * _of_phy_get() - lookup and obtain a reference to a phy by phandle * @np: device_node for which to get the phy diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 194d08174516..4f8423a948d5 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -41,6 +41,7 @@ enum phy_mode { * @power_off: powering off the phy * @set_mode: set the mode of the phy * @reset: resetting the phy + * @calibrate: calibrate the phy * @owner: the module owner containing the ops */ struct phy_ops { @@ -50,6 +51,7 @@ struct phy_ops { int (*power_off)(struct phy *phy); int (*set_mode)(struct phy *phy, enum phy_mode mode); int (*reset)(struct phy *phy); + int (*calibrate)(struct phy *phy); struct module *owner; }; @@ -143,6 +145,7 @@ int phy_power_on(struct phy *phy); int phy_power_off(struct phy *phy); int phy_set_mode(struct phy *phy, enum phy_mode mode); int phy_reset(struct phy *phy); +int phy_calibrate(struct phy *phy); static inline int phy_get_bus_width(struct phy *phy) { return phy->attrs.bus_width; @@ -264,6 +267,13 @@ static inline int phy_reset(struct phy *phy) return -ENOSYS; } +static inline int phy_calibrate(struct phy *phy) +{ + if (!phy) + return 0; + return -ENOSYS; +} + static inline int phy_get_bus_width(struct phy *phy) { return -ENOSYS; -- cgit v1.2.3 From 8dd8d2c95d0252bc64aa8e061a5fb4fbcd05f826 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Wed, 18 Oct 2017 15:15:01 +0800 Subject: USB: Force disconnect Huawei 4G modem during suspend When going into S3 suspend, the Acer TravelMate P648-M and P648-G3 laptops immediately wake up 3-4 seconds later for no obvious reason. Unbinding the integrated Huawei 4G LTE modem before suspend avoids the issue, even though we are not using the modem at all (checked from rescue.target/runlevel1). The problem also occurs when the option and cdc-ether modem drivers aren't loaded; it reproduces with just the base usb driver. Under Windows the system can suspend fine. Seeking a better fix, we've tried a lot of things, including: - Check that the device's power/wakeup is disabled - Check that remote wakeup is off at the USB level - All the quirks in drivers/usb/core/quirks.c e.g. USB_QUIRK_RESET_RESUME, USB_QUIRK_RESET, USB_QUIRK_IGNORE_REMOTE_WAKEUP, USB_QUIRK_NO_LPM. but none of that makes any difference. There are no errors in the logs showing any suspend/resume-related issues. When the system wakes up due to the modem, log-wise it appears to be a normal resume. Introduce a quirk to disable the port during suspend when the modem is detected.
The modem from the P648-G3 model is: T: Bus=01 Lev=01 Prnt=01 Port=08 Cnt=04 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=ff MxPS=64 #Cfgs= 3 P: Vendor=12d1 ProdID=15c3 Rev= 1.02 S: Manufacturer=Huawei Technologies Co., Ltd. S: Product=HUAWEI Mobile S: SerialNumber=0123456789ABCDEF C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr= 2mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=06 Prot=10 Driver= E: Ad=82(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=06 Prot=13 Driver= E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=06 Prot=12 Driver= E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=06 Prot=16 Driver= E: Ad=86(I) Atr=03(Int.) MxPS= 16 Ivl=2ms I: If#= 3 Alt= 1 #EPs= 3 Cls=ff(vend.) Sub=06 Prot=16 Driver= E: Ad=86(I) Atr=03(Int.) MxPS= 16 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=06 Prot=1b Driver= E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms C:* #Ifs= 6 Cfg#= 2 Atr=a0 MxPwr= 2mA I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=06 Prot=00 Driver=cdc_ether E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=2ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=06 Prot=00 Driver=cdc_ether E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=06 Prot=10 Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=06 Prot=13 Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=06 Prot=12 Driver=option E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=06 Prot=1b Driver=option E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms C: #Ifs= 2 Cfg#= 3 Atr=a0 MxPwr= 2mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I: If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver= E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=2ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver= I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver= E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Based on an earlier patch by Chris Chiu. 
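For illustration, a quirk entry of this kind in drivers/usb/core/quirks.c takes the following shape; the 1234:abcd IDs below are hypothetical and are not part of this patch:

	/* Hypothetical device that spuriously wakes the system from S3 */
	{ USB_DEVICE(0x1234, 0xabcd), .driver_info =
			USB_QUIRK_DISCONNECT_SUSPEND },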
Signed-off-by: Daniel Drake Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 10 +++++++++- drivers/usb/core/hub.c | 13 +++++++++++++ drivers/usb/core/quirks.c | 6 ++++++ drivers/usb/core/usb.h | 1 + include/linux/usb/quirks.h | 6 ++++++ 5 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index eb87a259d55c..353993f983c8 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -1461,6 +1461,7 @@ static void choose_wakeup(struct usb_device *udev, pm_message_t msg) int usb_suspend(struct device *dev, pm_message_t msg) { struct usb_device *udev = to_usb_device(dev); + int r; unbind_no_pm_drivers_interfaces(udev); @@ -1469,7 +1470,14 @@ int usb_suspend(struct device *dev, pm_message_t msg) * so we may still need to unbind and rebind upon resume */ choose_wakeup(udev, msg); - return usb_suspend_both(udev, msg); + r = usb_suspend_both(udev, msg); + if (r) + return r; + + if (udev->quirks & USB_QUIRK_DISCONNECT_SUSPEND) + usb_port_disable(udev); + + return 0; } /* The device lock is held by the PM core */ diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index b5c733613823..941968f011df 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -4180,6 +4180,19 @@ static int hub_port_disable(struct usb_hub *hub, int port1, int set_state) return ret; } +/* + * usb_port_disable - disable a usb device's upstream port + * @udev: device to disable + * Context: @udev locked, must be able to sleep. + * + * Disables a USB device that isn't in active use. + */ +int usb_port_disable(struct usb_device *udev) +{ + struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); + + return hub_port_disable(hub, udev->portnum, 0); +} /* USB 2.0 spec, 7.1.7.3 / fig 7-29: * diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 82806e311202..746d2b19109c 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -203,6 +203,12 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x10d6, 0x2200), .driver_info = USB_QUIRK_STRING_FETCH_255 }, + /* Huawei 4G LTE module */ + { USB_DEVICE(0x12d1, 0x15bb), .driver_info = + USB_QUIRK_DISCONNECT_SUSPEND }, + { USB_DEVICE(0x12d1, 0x15c3), .driver_info = + USB_QUIRK_DISCONNECT_SUSPEND }, + /* SKYMEDI USB_DRIVE */ { USB_DEVICE(0x1516, 0x8628), .driver_info = USB_QUIRK_RESET_RESUME }, diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h index dc6949248823..f71890a2db4e 100644 --- a/drivers/usb/core/usb.h +++ b/drivers/usb/core/usb.h @@ -73,6 +73,7 @@ extern void usb_hub_cleanup(void); extern int usb_major_init(void); extern void usb_major_cleanup(void); extern int usb_device_supports_lpm(struct usb_device *udev); +extern int usb_port_disable(struct usb_device *udev); #ifdef CONFIG_PM diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index de2a722fe3cf..bdc639cc80b4 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -56,4 +56,10 @@ */ #define USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL BIT(11) +/* + * Device needs to be disconnected before suspend to prevent spurious + * wakeup. 
+ */ +#define USB_QUIRK_DISCONNECT_SUSPEND BIT(12) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From a11bc476b987925654369411dd8281a60cb5a175 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 4 Sep 2017 12:19:07 -0700 Subject: Input: uinput - fold header into the driver proper There is nothing in the uinput kernel header that is of use to anyone in the kernel besides the uinput driver itself, so let's fold it into the driver code (leaving uapi part intact). Signed-off-by: Dmitry Torokhov --- drivers/input/misc/uinput.c | 40 +++++++++++++++++++++- include/linux/uinput.h | 81 --------------------------------------------- 2 files changed, 39 insertions(+), 82 deletions(-) delete mode 100644 include/linux/uinput.h (limited to 'include/linux') diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 06f3ac67dde1..7b41aad7ec27 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -31,6 +31,7 @@ * 0.1 20/06/2002 * - first public version */ +#include #include #include #include @@ -38,10 +39,47 @@ #include #include #include -#include #include #include "../input-compat.h" +#define UINPUT_NAME "uinput" +#define UINPUT_BUFFER_SIZE 16 +#define UINPUT_NUM_REQUESTS 16 + +enum uinput_state { UIST_NEW_DEVICE, UIST_SETUP_COMPLETE, UIST_CREATED }; + +struct uinput_request { + unsigned int id; + unsigned int code; /* UI_FF_UPLOAD, UI_FF_ERASE */ + + int retval; + struct completion done; + + union { + unsigned int effect_id; + struct { + struct ff_effect *effect; + struct ff_effect *old; + } upload; + } u; +}; + +struct uinput_device { + struct input_dev *dev; + struct mutex mutex; + enum uinput_state state; + wait_queue_head_t waitq; + unsigned char ready; + unsigned char head; + unsigned char tail; + struct input_event buff[UINPUT_BUFFER_SIZE]; + unsigned int ff_effects_max; + + struct uinput_request *requests[UINPUT_NUM_REQUESTS]; + wait_queue_head_t requests_waitq; + spinlock_t requests_lock; +}; + static int uinput_dev_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { diff --git a/include/linux/uinput.h b/include/linux/uinput.h deleted file mode 100644 index 75de43da2301..000000000000 --- a/include/linux/uinput.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * User level driver support for input subsystem - * - * Heavily based on evdev.c by Vojtech Pavlik - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Author: Aristeu Sergio Rozanski Filho - * - * Changes/Revisions: - * 0.5 08/13/2015 (David Herrmann & - * Benjamin Tissoires ) - * - add UI_DEV_SETUP ioctl - * - add UI_ABS_SETUP ioctl - * - add UI_GET_VERSION ioctl - * 0.4 01/09/2014 (Benjamin Tissoires ) - * - add UI_GET_SYSNAME ioctl - * 0.3 24/05/2006 (Anssi Hannula ) - * - update ff support for the changes in kernel interface - * - add UINPUT_VERSION - * 0.2 16/10/2004 (Micah Dowty ) - * - added force feedback support - * - added UI_SET_PHYS - * 0.1 20/06/2002 - * - first public version - */ -#ifndef __UINPUT_H_ -#define __UINPUT_H_ - -#include - -#define UINPUT_NAME "uinput" -#define UINPUT_BUFFER_SIZE 16 -#define UINPUT_NUM_REQUESTS 16 - -enum uinput_state { UIST_NEW_DEVICE, UIST_SETUP_COMPLETE, UIST_CREATED }; - -struct uinput_request { - unsigned int id; - unsigned int code; /* UI_FF_UPLOAD, UI_FF_ERASE */ - - int retval; - struct completion done; - - union { - unsigned int effect_id; - struct { - struct ff_effect *effect; - struct ff_effect *old; - } upload; - } u; -}; - -struct uinput_device { - struct input_dev *dev; - struct mutex mutex; - enum uinput_state state; - wait_queue_head_t waitq; - unsigned char ready; - unsigned char head; - unsigned char tail; - struct input_event buff[UINPUT_BUFFER_SIZE]; - unsigned int ff_effects_max; - - struct uinput_request *requests[UINPUT_NUM_REQUESTS]; - wait_queue_head_t requests_waitq; - spinlock_t requests_lock; -}; -#endif /* __UINPUT_H_ */ -- cgit v1.2.3 From 7b9651103b64c8a55eb6cec4c2240584e968354c Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 18 Oct 2017 11:56:21 +0200 Subject: extcon: max77843: Add OTG power control to the MUIC driver Enabling power on the VBUS micro-USB pin is required only when a passive OTG cable is connected. Initially OTG VBUS power control was planned to be done in the charger driver. However such information is not really available from the extcon notifications, so VBUS power control has to be done directly in the MUIC driver, which has all the information about the attached accessory. For example, SmartDock is an externally powered accessory that provides OTG (USB host) functionality and uses the VBUS pin for charging the device battery, so the VBUS charging pump should be disabled in that case.
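For contrast, the consumer-side notifier that the above rules out would look roughly like this (a sketch with assumed names); it only reports attach/detach of a single cable id, without the accessory details needed to decide the VBUS policy:

	#include <linux/extcon.h>
	#include <linux/notifier.h>

	static int example_host_notifier(struct notifier_block *nb,
					 unsigned long attached, void *ptr)
	{
		/* attached is non-zero when an EXTCON_USB_HOST cable is plugged */
		return NOTIFY_OK;
	}

	static struct notifier_block example_nb = {
		.notifier_call = example_host_notifier,
	};

	/* in a charger driver's probe, edev pointing at the MUIC's extcon device: */
	ret = devm_extcon_register_notifier(dev, edev, EXTCON_USB_HOST, &example_nb);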
Signed-off-by: Marek Szyprowski Acked-by: Chanwoo Choi Acked-by: Lee Jones Signed-off-by: Chanwoo Choi --- drivers/extcon/extcon-max77843.c | 16 ++++++++++++++++ include/linux/mfd/max77843-private.h | 3 +++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/extcon/extcon-max77843.c b/drivers/extcon/extcon-max77843.c index 28f251ff0fa2..17d6bd76edb4 100644 --- a/drivers/extcon/extcon-max77843.c +++ b/drivers/extcon/extcon-max77843.c @@ -240,6 +240,21 @@ static int max77843_muic_set_path(struct max77843_muic_info *info, return 0; } +static void max77843_charger_set_otg_vbus(struct max77843_muic_info *info, + bool on) +{ + struct max77693_dev *max77843 = info->max77843; + unsigned int cnfg00; + + if (on) + cnfg00 = MAX77843_CHG_OTG_MASK | MAX77843_CHG_BOOST_MASK; + else + cnfg00 = MAX77843_CHG_ENABLE | MAX77843_CHG_BUCK_MASK; + + regmap_update_bits(max77843->regmap_chg, MAX77843_CHG_REG_CHG_CNFG_00, + MAX77843_CHG_MODE_MASK, cnfg00); +} + static int max77843_muic_get_cable_type(struct max77843_muic_info *info, enum max77843_muic_cable_group group, bool *attached) { @@ -355,6 +370,7 @@ static int max77843_muic_adc_gnd_handler(struct max77843_muic_info *info) return ret; extcon_set_state_sync(info->edev, EXTCON_USB_HOST, attached); + max77843_charger_set_otg_vbus(info, attached); break; case MAX77843_MUIC_GND_MHL_VB: case MAX77843_MUIC_GND_MHL: diff --git a/include/linux/mfd/max77843-private.h b/include/linux/mfd/max77843-private.h index c19303b0ccfd..0223cd5941c8 100644 --- a/include/linux/mfd/max77843-private.h +++ b/include/linux/mfd/max77843-private.h @@ -245,10 +245,13 @@ enum max77843_irq_muic { #define MAX77843_CHG_OVER_CURRENT_BAT (0x06 << 4) /* MAX77843 CHG_CNFG_00 register */ +#define MAX77843_CHG_MODE_MASK 0x0f #define MAX77843_CHG_DISABLE 0x00 #define MAX77843_CHG_ENABLE 0x05 #define MAX77843_CHG_MASK 0x01 +#define MAX77843_CHG_OTG_MASK 0x02 #define MAX77843_CHG_BUCK_MASK 0x04 +#define MAX77843_CHG_BOOST_MASK 0x08 /* MAX77843 CHG_CNFG_01 register */ #define MAX77843_CHG_RESTART_THRESHOLD_100 0x00 -- cgit v1.2.3 From 4a4a87146a07c866ad2ef49cc32296e6583b1cee Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 18 Oct 2017 11:56:22 +0200 Subject: extcon: max77843: Add support for SmartDock accessory SmartDock uses ADC_RESERVED_ACC_3 (0x10) ADC ID type and provides following features: 1. USB host with embedded USB hub (2-4 ports) for mice, keyboard, etc, 2. MHL for video output, 3. charging. Tested with Unitek Y-2165 MHL+OTG Hub Smart Phone Dock. 
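A consumer interested in the dock sees all the related cable states at once; a minimal check could look like this (sketch, assuming an edev handle obtained e.g. via extcon_get_edev_by_phandle()):

	#include <linux/extcon.h>

	if (extcon_get_state(edev, EXTCON_DOCK) > 0) {
		/*
		 * SmartDock attached: EXTCON_USB_HOST and EXTCON_DISP_MHL are
		 * reported as well, plus EXTCON_CHG_USB_DCP while it charges.
		 */
	}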
Signed-off-by: Marek Szyprowski Acked-by: Chanwoo Choi Acked-by: Lee Jones Signed-off-by: Chanwoo Choi --- drivers/extcon/extcon-max77843.c | 77 +++++++++++++++++++++++++++++------- include/linux/mfd/max77843-private.h | 2 + 2 files changed, 65 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/extcon/extcon-max77843.c b/drivers/extcon/extcon-max77843.c index 17d6bd76edb4..c9fcd6cd41cb 100644 --- a/drivers/extcon/extcon-max77843.c +++ b/drivers/extcon/extcon-max77843.c @@ -80,7 +80,7 @@ enum max77843_muic_accessory_type { MAX77843_MUIC_ADC_REMOTE_S12_BUTTON, MAX77843_MUIC_ADC_RESERVED_ACC_1, MAX77843_MUIC_ADC_RESERVED_ACC_2, - MAX77843_MUIC_ADC_RESERVED_ACC_3, + MAX77843_MUIC_ADC_RESERVED_ACC_3, /* SmartDock */ MAX77843_MUIC_ADC_RESERVED_ACC_4, MAX77843_MUIC_ADC_RESERVED_ACC_5, MAX77843_MUIC_ADC_AUDIO_DEVICE_TYPE2, @@ -119,6 +119,7 @@ enum max77843_muic_charger_type { MAX77843_MUIC_CHG_SPECIAL_BIAS, MAX77843_MUIC_CHG_RESERVED, MAX77843_MUIC_CHG_GND, + MAX77843_MUIC_CHG_DOCK, }; static const unsigned int max77843_extcon_cable[] = { @@ -130,6 +131,7 @@ static const unsigned int max77843_extcon_cable[] = { EXTCON_CHG_USB_FAST, EXTCON_CHG_USB_SLOW, EXTCON_DISP_MHL, + EXTCON_DOCK, EXTCON_JIG, EXTCON_NONE, }; @@ -200,7 +202,7 @@ static const struct regmap_irq_chip max77843_muic_irq_chip = { }; static int max77843_muic_set_path(struct max77843_muic_info *info, - u8 val, bool attached) + u8 val, bool attached, bool nobccomp) { struct max77693_dev *max77843 = info->max77843; int ret = 0; @@ -210,10 +212,16 @@ static int max77843_muic_set_path(struct max77843_muic_info *info, ctrl1 = val; else ctrl1 = MAX77843_MUIC_CONTROL1_SW_OPEN; + if (nobccomp) { + /* Disable BC1.2 protocol and force manual switch control */ + ctrl1 |= MAX77843_MUIC_CONTROL1_NOBCCOMP_MASK; + } ret = regmap_update_bits(max77843->regmap_muic, MAX77843_MUIC_REG_CONTROL1, - MAX77843_MUIC_CONTROL1_COM_SW, ctrl1); + MAX77843_MUIC_CONTROL1_COM_SW | + MAX77843_MUIC_CONTROL1_NOBCCOMP_MASK, + ctrl1); if (ret < 0) { dev_err(info->dev, "Cannot switch MUIC port\n"); return ret; @@ -303,6 +311,19 @@ static int max77843_muic_get_cable_type(struct max77843_muic_info *info, break; } + if (adc == MAX77843_MUIC_ADC_RESERVED_ACC_3) { /* SmartDock */ + if (chg_type == MAX77843_MUIC_CHG_NONE) { + *attached = false; + cable_type = info->prev_chg_type; + info->prev_chg_type = MAX77843_MUIC_CHG_NONE; + } else { + *attached = true; + cable_type = MAX77843_MUIC_CHG_DOCK; + info->prev_chg_type = MAX77843_MUIC_CHG_DOCK; + } + break; + } + if (chg_type == MAX77843_MUIC_CHG_NONE) { *attached = false; cable_type = info->prev_chg_type; @@ -365,7 +386,7 @@ static int max77843_muic_adc_gnd_handler(struct max77843_muic_info *info) case MAX77843_MUIC_GND_USB_HOST_VB: ret = max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_USB, - attached); + attached, false); if (ret < 0) return ret; @@ -376,7 +397,7 @@ static int max77843_muic_adc_gnd_handler(struct max77843_muic_info *info) case MAX77843_MUIC_GND_MHL: ret = max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_OPEN, - attached); + attached, false); if (ret < 0) return ret; @@ -412,7 +433,7 @@ static int max77843_muic_jig_handler(struct max77843_muic_info *info, return -EINVAL; } - ret = max77843_muic_set_path(info, path, attached); + ret = max77843_muic_set_path(info, path, attached, false); if (ret < 0) return ret; @@ -421,6 +442,26 @@ static int max77843_muic_jig_handler(struct max77843_muic_info *info, return 0; } +static int max77843_muic_dock_handler(struct 
max77843_muic_info *info, + bool attached) +{ + int ret; + + dev_dbg(info->dev, "external connector is %s (adc: 0x10)\n", + attached ? "attached" : "detached"); + + ret = max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_USB, + attached, attached); + if (ret < 0) + return ret; + + extcon_set_state_sync(info->edev, EXTCON_DISP_MHL, attached); + extcon_set_state_sync(info->edev, EXTCON_USB_HOST, attached); + extcon_set_state_sync(info->edev, EXTCON_DOCK, attached); + + return 0; +} + static int max77843_muic_adc_handler(struct max77843_muic_info *info) { int ret, cable_type; @@ -435,6 +476,11 @@ static int max77843_muic_adc_handler(struct max77843_muic_info *info) info->prev_cable_type); switch (cable_type) { + case MAX77843_MUIC_ADC_RESERVED_ACC_3: /* SmartDock */ + ret = max77843_muic_dock_handler(info, attached); + if (ret < 0) + return ret; + break; case MAX77843_MUIC_ADC_GROUND: ret = max77843_muic_adc_gnd_handler(info); if (ret < 0) @@ -462,7 +508,6 @@ static int max77843_muic_adc_handler(struct max77843_muic_info *info) case MAX77843_MUIC_ADC_REMOTE_S12_BUTTON: case MAX77843_MUIC_ADC_RESERVED_ACC_1: case MAX77843_MUIC_ADC_RESERVED_ACC_2: - case MAX77843_MUIC_ADC_RESERVED_ACC_3: case MAX77843_MUIC_ADC_RESERVED_ACC_4: case MAX77843_MUIC_ADC_RESERVED_ACC_5: case MAX77843_MUIC_ADC_AUDIO_DEVICE_TYPE2: @@ -506,7 +551,7 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info) case MAX77843_MUIC_CHG_USB: ret = max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_USB, - attached); + attached, false); if (ret < 0) return ret; @@ -517,7 +562,7 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info) case MAX77843_MUIC_CHG_DOWNSTREAM: ret = max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_OPEN, - attached); + attached, false); if (ret < 0) return ret; @@ -527,7 +572,7 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info) case MAX77843_MUIC_CHG_DEDICATED: ret = max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_OPEN, - attached); + attached, false); if (ret < 0) return ret; @@ -537,7 +582,7 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info) case MAX77843_MUIC_CHG_SPECIAL_500MA: ret = max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_OPEN, - attached); + attached, false); if (ret < 0) return ret; @@ -547,7 +592,7 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info) case MAX77843_MUIC_CHG_SPECIAL_1A: ret = max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_OPEN, - attached); + attached, false); if (ret < 0) return ret; @@ -566,6 +611,9 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info) extcon_set_state_sync(info->edev, EXTCON_CHG_USB_DCP, false); break; + case MAX77843_MUIC_CHG_DOCK: + extcon_set_state_sync(info->edev, EXTCON_CHG_USB_DCP, attached); + break; case MAX77843_MUIC_CHG_NONE: break; default: @@ -574,7 +622,7 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info) attached ? 
"attached" : "detached", chg_type); max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_OPEN, - attached); + attached, false); return -EINVAL; } @@ -814,7 +862,8 @@ static int max77843_muic_probe(struct platform_device *pdev) max77843_muic_set_debounce_time(info, MAX77843_DEBOUNCE_TIME_25MS); /* Set initial path for UART */ - max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_UART, true); + max77843_muic_set_path(info, MAX77843_MUIC_CONTROL1_SW_UART, true, + false); /* Check revision number of MUIC device */ ret = regmap_read(max77843->regmap_muic, MAX77843_MUIC_REG_ID, &id); diff --git a/include/linux/mfd/max77843-private.h b/include/linux/mfd/max77843-private.h index 0223cd5941c8..b8908bf8d315 100644 --- a/include/linux/mfd/max77843-private.h +++ b/include/linux/mfd/max77843-private.h @@ -350,6 +350,7 @@ enum max77843_irq_muic { /* MAX77843 CONTROL register */ #define MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT 0 #define MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT 3 +#define MAX77843_MUIC_CONTROL1_NOBCCOMP_SHIFT 6 #define MAX77843_MUIC_CONTROL1_IDBEN_SHIFT 7 #define MAX77843_MUIC_CONTROL2_LOWPWR_SHIFT 0 #define MAX77843_MUIC_CONTROL2_ADCEN_SHIFT 1 @@ -366,6 +367,7 @@ enum max77843_irq_muic { #define MAX77843_MUIC_CONTROL1_COMP1SW_MASK (0x7 << MAX77843_MUIC_CONTROL1_COMP1SW_SHIFT) #define MAX77843_MUIC_CONTROL1_COMP2SW_MASK (0x7 << MAX77843_MUIC_CONTROL1_COMP2SW_SHIFT) #define MAX77843_MUIC_CONTROL1_IDBEN_MASK BIT(MAX77843_MUIC_CONTROL1_IDBEN_SHIFT) +#define MAX77843_MUIC_CONTROL1_NOBCCOMP_MASK BIT(MAX77843_MUIC_CONTROL1_NOBCCOMP_SHIFT) #define MAX77843_MUIC_CONTROL2_LOWPWR_MASK BIT(MAX77843_MUIC_CONTROL2_LOWPWR_SHIFT) #define MAX77843_MUIC_CONTROL2_ADCEN_MASK BIT(MAX77843_MUIC_CONTROL2_ADCEN_SHIFT) #define MAX77843_MUIC_CONTROL2_CPEN_MASK BIT(MAX77843_MUIC_CONTROL2_CPEN_SHIFT) -- cgit v1.2.3 From c6cd924efe941ef62eb805c59e4a09e219ac5c6d Mon Sep 17 00:00:00 2001 From: Yanjiang Jin Date: Tue, 24 Oct 2017 14:23:41 +0800 Subject: cpu/hotplug: Remove obsolete notifier macros commit 530e9b76ae8f ("cpu/hotplug: Remove obsolete cpu hotplug register/unregister functions")' removed the below macros: - #define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ - #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ - #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ But "CPU_UP_CANCELED_FROZEN, CPU_DOWN_PREPARE_FROZEN and CPU_DOWN_FAILED_FROZEN" still refer to them, and nobody uses these "FROZEN" macros now, so remove them too. 
Signed-off-by: Yanjiang Jin Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: bigeasy@linutronix.de Cc: jinyanjiang@gmail.com Link: https://lkml.kernel.org/r/20171024062341.179678-1-yanjiang.jin@windriver.com --- include/linux/cpu.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ca73bc1563f4..cd4771b772c0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -72,9 +72,6 @@ struct notifier_block; #define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN) #define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN) -#define CPU_UP_CANCELED_FROZEN (CPU_UP_CANCELED | CPU_TASKS_FROZEN) -#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN) -#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) #ifdef CONFIG_SMP -- cgit v1.2.3 From e22cdc3fc5991956146b9856d36b4971fe54dcd6 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 23 Oct 2017 19:01:54 +0600 Subject: sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK cpulist_parse() uses nr_cpumask_bits as a limit to parse the passed buffer from kernel commandline. What nr_cpumask_bits represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option: - If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as NR_CPUS, which might not represent the # of CPUs that really exist (default 64). So, there's a chance of a gap between nr_cpu_ids and NR_CPUS, which ultimately leads to an invalid cpulist_parse() operation. For example, if isolcpus=9 is passed on an 8-CPU system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error that it's supposed to. This patch fixes this bug by finding the last CPU of the passed isolcpus= list and checking it against nr_cpu_ids. It also fixes the error message, where nr_cpu_ids should be nr_cpu_ids-1, since CPU numbering starts from 0. Signed-off-by: Rakib Mullick Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adobriyan@gmail.com Cc: akpm@linux-foundation.org Cc: longman@redhat.com Cc: mka@chromium.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20171023130154.9050-1-rakib.mullick@gmail.com [ Enhanced the changelog and the kernel message. ] Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 16 ++++++++++++++++ kernel/sched/topology.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index cd415b733c2a..63661de67ad4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -130,6 +130,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_last(const struct cpumask *srcp) +{ + return 0; +} + /* Valid inputs for n are -1 and 0. */ static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { @@ -178,6 +183,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_last - get the last CPU in a cpumask + * @srcp: - the cpumask pointer + * + * Returns >= nr_cpumask_bits if no CPUs set.
+ */ +static inline unsigned int cpumask_last(const struct cpumask *srcp) +{ + return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + unsigned int cpumask_next(int n, const struct cpumask *srcp); /** diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e50450c2fed8..e3d31b0880dc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -476,8 +476,8 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); + if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1); return 0; } return 1; -- cgit v1.2.3 From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 23 Oct 2017 13:22:23 -0700 Subject: tcp: Configure TFO without cookie per socket and/or per route We already allow enabling TFO without a cookie by using the fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or TFO_CLIENT_NO_COOKIE). This is safe to do in certain environments where we know that there isn't a malicious host (e.g., data centers) or when the application protocol already provides an authentication mechanism in the first flight of data. A server, however, might be providing multiple services or talking to both sides (public Internet and data center). So, this server would want to enable cookie-less TFO for certain services and/or for connections that go to the data center. This patch exposes a socket option and a per-route attribute to enable such fine-grained configurations. Signed-off-by: Christoph Paasch Reviewed-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 3 ++- include/uapi/linux/rtnetlink.h | 2 ++ include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/ipv4/tcp_fastopen.c | 20 +++++++++++++++++--- net/ipv4/tcp_input.c | 2 +- 7 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1d2c44e09e31..173a7c2f9636 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -215,7 +215,8 @@ struct tcp_sock { u8 chrono_type:2, /* current chronograph type */ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ - unused:4; + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ + unused:3; u8 nonagle : 4,/* Disable Nagle algorithm?
*/ thin_lto : 1,/* Use linear timeouts for thin streams */ unused1 : 1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c13484704cb..2392f74074e7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1567,7 +1567,8 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index dab7dad9e01a..fe6679268901 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -430,6 +430,8 @@ enum { #define RTAX_QUICKACK RTAX_QUICKACK RTAX_CC_ALGO, #define RTAX_CC_ALGO RTAX_CC_ALGO + RTAX_FASTOPEN_NO_COOKIE, +#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE __RTAX_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 69c7493e42f8..d67e1d40c6d6 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -120,6 +120,7 @@ enum { #define TCP_ULP 31 /* Attach a ULP to a TCP connection */ #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ +#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index be07e9b6dbdd..8f36277e82e9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2836,6 +2836,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EOPNOTSUPP; } break; + case TCP_FASTOPEN_NO_COOKIE: + if (val > 1 || val < 0) + err = -EINVAL; + else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + err = -EINVAL; + else + tp->fastopen_no_cookie = val; + break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; @@ -3256,6 +3264,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_connect; break; + case TCP_FASTOPEN_NO_COOKIE: + val = tp->fastopen_no_cookie; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 21075ce19cb6..e0a4b56644aa 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -310,13 +310,23 @@ static bool tcp_fastopen_queue_check(struct sock *sk) return true; } +static bool tcp_fastopen_no_cookie(const struct sock *sk, + const struct dst_entry *dst, + int flag) +{ + return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) || + tcp_sk(sk)->fastopen_no_cookie || + (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); +} + /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). 
*/ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; @@ -333,7 +343,8 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, return NULL; } - if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && + tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ @@ -370,6 +381,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { unsigned long last_syn_loss = 0; + const struct dst_entry *dst; int syn_loss = 0; tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); @@ -387,7 +399,9 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + dst = __sk_dst_get(sk); + + if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5e64d4b5839..893286db4623 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6332,7 +6332,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, -- cgit v1.2.3 From d15155824c5014803d91b829736d249c500bdda6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:46 +0100 Subject: linux/compiler.h: Split into compiler.h and compiler_types.h linux/compiler.h is included indirectly by linux/types.h via uapi/linux/types.h -> uapi/linux/posix_types.h -> linux/stddef.h -> uapi/linux/stddef.h and is needed to provide a proper definition of offsetof. Unfortunately, compiler.h requires a definition of smp_read_barrier_depends() for defining lockless_dereference() and soon for defining READ_ONCE(), which means that all users of READ_ONCE() will need to include asm/barrier.h to avoid splats such as: In file included from include/uapi/linux/stddef.h:1:0, from include/linux/stddef.h:4, from arch/h8300/kernel/asm-offsets.c:11: include/linux/list.h: In function 'list_empty': >> include/linux/compiler.h:343:2: error: implicit declaration of function 'smp_read_barrier_depends' [-Werror=implicit-function-declaration] smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ ^ A better alternative is to include asm/barrier.h in linux/compiler.h, but this requires a type definition for "bool" on some architectures (e.g. x86), which is defined later by linux/types.h. Type "bool" is also used directly in linux/compiler.h, so the whole thing is pretty fragile. This patch splits compiler.h in two: compiler_types.h contains type annotations, definitions and the compiler-specific parts, whereas compiler.h #includes compiler_types.h and additionally defines macros such as {READ,WRITE,ACCESS}_ONCE(). uapi/linux/stddef.h and linux/linkage.h are then moved over to include linux/compiler_types.h, which fixes the build for h8 and blackfin. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/arm/include/asm/ptrace.h | 3 +- arch/sparc/include/asm/ptrace.h | 1 + arch/um/include/shared/init.h | 2 +- include/linux/compiler-clang.h | 2 +- include/linux/compiler-gcc.h | 2 +- include/linux/compiler-intel.h | 2 +- include/linux/compiler.h | 265 +------------------------------------- include/linux/compiler_types.h | 274 ++++++++++++++++++++++++++++++++++++++++ include/linux/linkage.h | 2 +- include/uapi/linux/stddef.h | 2 +- scripts/headers_install.sh | 2 +- 11 files changed, 286 insertions(+), 271 deletions(-) create mode 100644 include/linux/compiler_types.h (limited to 'include/linux') diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index e9c9a117bd25..c7cdbb43ae7c 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h @@ -126,8 +126,7 @@ extern unsigned long profile_pc(struct pt_regs *regs); /* * kprobe-based event tracer support */ -#include -#include +#include #define MAX_REG_OFFSET (offsetof(struct pt_regs, ARM_ORIG_r0)) extern int regs_query_register_offset(const char *name); diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index d73428e4333c..b383484edcd3 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -6,6 +6,7 @@ #if defined(__sparc__) && defined(__arch64__) #ifndef __ASSEMBLY__ +#include #include #include diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h index 233e2593eee0..094e96ce653b 100644 --- a/arch/um/include/shared/init.h +++ b/arch/um/include/shared/init.h @@ -40,7 +40,7 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); -#include +#include /* These are for everybody (although not all archs will actually discard it in modules) */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index de179993e039..5947a3e6c0e6 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -1,4 +1,4 @@ -#ifndef __LINUX_COMPILER_H +#ifndef __LINUX_COMPILER_TYPES_H #error "Please don't include directly, include instead." #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 16d41de92ee3..ce8e965646ef 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -1,4 +1,4 @@ -#ifndef __LINUX_COMPILER_H +#ifndef __LINUX_COMPILER_TYPES_H #error "Please don't include directly, include instead." #endif diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h index d4c71132d07f..e438ac89c692 100644 --- a/include/linux/compiler-intel.h +++ b/include/linux/compiler-intel.h @@ -1,4 +1,4 @@ -#ifndef __LINUX_COMPILER_H +#ifndef __LINUX_COMPILER_TYPES_H #error "Please don't include directly, include instead." 
#endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index e95a2631e545..08083186e54f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -1,111 +1,12 @@ #ifndef __LINUX_COMPILER_H #define __LINUX_COMPILER_H -#ifndef __ASSEMBLY__ +#include -#ifdef __CHECKER__ -# define __user __attribute__((noderef, address_space(1))) -# define __kernel __attribute__((address_space(0))) -# define __safe __attribute__((safe)) -# define __force __attribute__((force)) -# define __nocast __attribute__((nocast)) -# define __iomem __attribute__((noderef, address_space(2))) -# define __must_hold(x) __attribute__((context(x,1,1))) -# define __acquires(x) __attribute__((context(x,0,1))) -# define __releases(x) __attribute__((context(x,1,0))) -# define __acquire(x) __context__(x,1) -# define __release(x) __context__(x,-1) -# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) -# define __percpu __attribute__((noderef, address_space(3))) -# define __rcu __attribute__((noderef, address_space(4))) -# define __private __attribute__((noderef)) -extern void __chk_user_ptr(const volatile void __user *); -extern void __chk_io_ptr(const volatile void __iomem *); -# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) -#else /* __CHECKER__ */ -# ifdef STRUCTLEAK_PLUGIN -# define __user __attribute__((user)) -# else -# define __user -# endif -# define __kernel -# define __safe -# define __force -# define __nocast -# define __iomem -# define __chk_user_ptr(x) (void)0 -# define __chk_io_ptr(x) (void)0 -# define __builtin_warning(x, y...) (1) -# define __must_hold(x) -# define __acquires(x) -# define __releases(x) -# define __acquire(x) (void)0 -# define __release(x) (void)0 -# define __cond_lock(x,c) (c) -# define __percpu -# define __rcu -# define __private -# define ACCESS_PRIVATE(p, member) ((p)->member) -#endif /* __CHECKER__ */ - -/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ -#define ___PASTE(a,b) a##b -#define __PASTE(a,b) ___PASTE(a,b) +#ifndef __ASSEMBLY__ #ifdef __KERNEL__ -#ifdef __GNUC__ -#include -#endif - -#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) -#define notrace __attribute__((hotpatch(0,0))) -#else -#define notrace __attribute__((no_instrument_function)) -#endif - -/* Intel compiler defines __GNUC__. So we will overwrite implementations - * coming from above header files here - */ -#ifdef __INTEL_COMPILER -# include -#endif - -/* Clang compiler defines __GNUC__. So we will overwrite implementations - * coming from above header files here - */ -#ifdef __clang__ -#include -#endif - -/* - * Generic compiler-dependent macros required for kernel - * build go below this comment. Actual compiler/compiler version - * specific implementations come from the above header files - */ - -struct ftrace_branch_data { - const char *func; - const char *file; - unsigned line; - union { - struct { - unsigned long correct; - unsigned long incorrect; - }; - struct { - unsigned long miss; - unsigned long hit; - }; - unsigned long miss_hit[2]; - }; -}; - -struct ftrace_likely_data { - struct ftrace_branch_data data; - unsigned long constant; -}; - /* * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code * to disable branch tracing on a per file basis. @@ -332,6 +233,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * with an explicit memory barrier or atomic instruction that provides the * required ordering. 
*/ +#include #define __READ_ONCE(x, check) \ ({ \ @@ -362,167 +264,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #endif /* __ASSEMBLY__ */ -#ifdef __KERNEL__ -/* - * Allow us to mark functions as 'deprecated' and have gcc emit a nice - * warning for each use, in hopes of speeding the functions removal. - * Usage is: - * int __deprecated foo(void) - */ -#ifndef __deprecated -# define __deprecated /* unimplemented */ -#endif - -#ifdef MODULE -#define __deprecated_for_modules __deprecated -#else -#define __deprecated_for_modules -#endif - -#ifndef __must_check -#define __must_check -#endif - -#ifndef CONFIG_ENABLE_MUST_CHECK -#undef __must_check -#define __must_check -#endif -#ifndef CONFIG_ENABLE_WARN_DEPRECATED -#undef __deprecated -#undef __deprecated_for_modules -#define __deprecated -#define __deprecated_for_modules -#endif - -#ifndef __malloc -#define __malloc -#endif - -/* - * Allow us to avoid 'defined but not used' warnings on functions and data, - * as well as force them to be emitted to the assembly file. - * - * As of gcc 3.4, static functions that are not marked with attribute((used)) - * may be elided from the assembly file. As of gcc 3.4, static data not so - * marked will not be elided, but this may change in a future gcc version. - * - * NOTE: Because distributions shipped with a backported unit-at-a-time - * compiler in gcc 3.3, we must define __used to be __attribute__((used)) - * for gcc >=3.3 instead of 3.4. - * - * In prior versions of gcc, such functions and data would be emitted, but - * would be warned about except with attribute((unused)). - * - * Mark functions that are referenced only in inline assembly as __used so - * the code is emitted even though it appears to be unreferenced. - */ -#ifndef __used -# define __used /* unimplemented */ -#endif - -#ifndef __maybe_unused -# define __maybe_unused /* unimplemented */ -#endif - -#ifndef __always_unused -# define __always_unused /* unimplemented */ -#endif - -#ifndef noinline -#define noinline -#endif - -/* - * Rather then using noinline to prevent stack consumption, use - * noinline_for_stack instead. For documentation reasons. - */ -#define noinline_for_stack noinline - -#ifndef __always_inline -#define __always_inline inline -#endif - -#endif /* __KERNEL__ */ - -/* - * From the GCC manual: - * - * Many functions do not examine any values except their arguments, - * and have no effects except the return value. Basically this is - * just slightly more strict class than the `pure' attribute above, - * since function is not allowed to read global memory. - * - * Note that a function that has pointer arguments and examines the - * data pointed to must _not_ be declared `const'. Likewise, a - * function that calls a non-`const' function usually must not be - * `const'. It does not make sense for a `const' function to return - * `void'. - */ -#ifndef __attribute_const__ -# define __attribute_const__ /* unimplemented */ -#endif - -#ifndef __designated_init -# define __designated_init -#endif - -#ifndef __latent_entropy -# define __latent_entropy -#endif - -#ifndef __randomize_layout -# define __randomize_layout __designated_init -#endif - -#ifndef __no_randomize_layout -# define __no_randomize_layout -#endif - -#ifndef randomized_struct_fields_start -# define randomized_struct_fields_start -# define randomized_struct_fields_end -#endif - -/* - * Tell gcc if a function is cold. The compiler will assume any path - * directly leading to the call is unlikely. 
- */ - -#ifndef __cold -#define __cold -#endif - -/* Simple shorthand for a section definition */ -#ifndef __section -# define __section(S) __attribute__ ((__section__(#S))) -#endif - -#ifndef __visible -#define __visible -#endif - -#ifndef __nostackprotector -# define __nostackprotector -#endif - -/* - * Assume alignment of return value. - */ -#ifndef __assume_aligned -#define __assume_aligned(a, ...) -#endif - - -/* Are two types/vars the same type (ignoring qualifiers)? */ -#ifndef __same_type -# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -#endif - -/* Is this type a native word size -- useful for atomic operations */ -#ifndef __native_word -# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) -#endif - /* Compile time object size, -1 for unknown */ #ifndef __compiletime_object_size # define __compiletime_object_size(obj) -1 diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h new file mode 100644 index 000000000000..6b79a9bba9a7 --- /dev/null +++ b/include/linux/compiler_types.h @@ -0,0 +1,274 @@ +#ifndef __LINUX_COMPILER_TYPES_H +#define __LINUX_COMPILER_TYPES_H + +#ifndef __ASSEMBLY__ + +#ifdef __CHECKER__ +# define __user __attribute__((noderef, address_space(1))) +# define __kernel __attribute__((address_space(0))) +# define __safe __attribute__((safe)) +# define __force __attribute__((force)) +# define __nocast __attribute__((nocast)) +# define __iomem __attribute__((noderef, address_space(2))) +# define __must_hold(x) __attribute__((context(x,1,1))) +# define __acquires(x) __attribute__((context(x,0,1))) +# define __releases(x) __attribute__((context(x,1,0))) +# define __acquire(x) __context__(x,1) +# define __release(x) __context__(x,-1) +# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) +# define __percpu __attribute__((noderef, address_space(3))) +# define __rcu __attribute__((noderef, address_space(4))) +# define __private __attribute__((noderef)) +extern void __chk_user_ptr(const volatile void __user *); +extern void __chk_io_ptr(const volatile void __iomem *); +# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) +#else /* __CHECKER__ */ +# ifdef STRUCTLEAK_PLUGIN +# define __user __attribute__((user)) +# else +# define __user +# endif +# define __kernel +# define __safe +# define __force +# define __nocast +# define __iomem +# define __chk_user_ptr(x) (void)0 +# define __chk_io_ptr(x) (void)0 +# define __builtin_warning(x, y...) (1) +# define __must_hold(x) +# define __acquires(x) +# define __releases(x) +# define __acquire(x) (void)0 +# define __release(x) (void)0 +# define __cond_lock(x,c) (c) +# define __percpu +# define __rcu +# define __private +# define ACCESS_PRIVATE(p, member) ((p)->member) +#endif /* __CHECKER__ */ + +/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +#define ___PASTE(a,b) a##b +#define __PASTE(a,b) ___PASTE(a,b) + +#ifdef __KERNEL__ + +#ifdef __GNUC__ +#include +#endif + +#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) +#define notrace __attribute__((hotpatch(0,0))) +#else +#define notrace __attribute__((no_instrument_function)) +#endif + +/* Intel compiler defines __GNUC__. So we will overwrite implementations + * coming from above header files here + */ +#ifdef __INTEL_COMPILER +# include +#endif + +/* Clang compiler defines __GNUC__. 
So we will overwrite implementations + * coming from above header files here + */ +#ifdef __clang__ +#include +#endif + +/* + * Generic compiler-dependent macros required for kernel + * build go below this comment. Actual compiler/compiler version + * specific implementations come from the above header files + */ + +struct ftrace_branch_data { + const char *func; + const char *file; + unsigned line; + union { + struct { + unsigned long correct; + unsigned long incorrect; + }; + struct { + unsigned long miss; + unsigned long hit; + }; + unsigned long miss_hit[2]; + }; +}; + +struct ftrace_likely_data { + struct ftrace_branch_data data; + unsigned long constant; +}; + +#endif /* __KERNEL__ */ + +#endif /* __ASSEMBLY__ */ + +#ifdef __KERNEL__ +/* + * Allow us to mark functions as 'deprecated' and have gcc emit a nice + * warning for each use, in hopes of speeding the functions removal. + * Usage is: + * int __deprecated foo(void) + */ +#ifndef __deprecated +# define __deprecated /* unimplemented */ +#endif + +#ifdef MODULE +#define __deprecated_for_modules __deprecated +#else +#define __deprecated_for_modules +#endif + +#ifndef __must_check +#define __must_check +#endif + +#ifndef CONFIG_ENABLE_MUST_CHECK +#undef __must_check +#define __must_check +#endif +#ifndef CONFIG_ENABLE_WARN_DEPRECATED +#undef __deprecated +#undef __deprecated_for_modules +#define __deprecated +#define __deprecated_for_modules +#endif + +#ifndef __malloc +#define __malloc +#endif + +/* + * Allow us to avoid 'defined but not used' warnings on functions and data, + * as well as force them to be emitted to the assembly file. + * + * As of gcc 3.4, static functions that are not marked with attribute((used)) + * may be elided from the assembly file. As of gcc 3.4, static data not so + * marked will not be elided, but this may change in a future gcc version. + * + * NOTE: Because distributions shipped with a backported unit-at-a-time + * compiler in gcc 3.3, we must define __used to be __attribute__((used)) + * for gcc >=3.3 instead of 3.4. + * + * In prior versions of gcc, such functions and data would be emitted, but + * would be warned about except with attribute((unused)). + * + * Mark functions that are referenced only in inline assembly as __used so + * the code is emitted even though it appears to be unreferenced. + */ +#ifndef __used +# define __used /* unimplemented */ +#endif + +#ifndef __maybe_unused +# define __maybe_unused /* unimplemented */ +#endif + +#ifndef __always_unused +# define __always_unused /* unimplemented */ +#endif + +#ifndef noinline +#define noinline +#endif + +/* + * Rather then using noinline to prevent stack consumption, use + * noinline_for_stack instead. For documentation reasons. + */ +#define noinline_for_stack noinline + +#ifndef __always_inline +#define __always_inline inline +#endif + +#endif /* __KERNEL__ */ + +/* + * From the GCC manual: + * + * Many functions do not examine any values except their arguments, + * and have no effects except the return value. Basically this is + * just slightly more strict class than the `pure' attribute above, + * since function is not allowed to read global memory. + * + * Note that a function that has pointer arguments and examines the + * data pointed to must _not_ be declared `const'. Likewise, a + * function that calls a non-`const' function usually must not be + * `const'. It does not make sense for a `const' function to return + * `void'. 
+ */ +#ifndef __attribute_const__ +# define __attribute_const__ /* unimplemented */ +#endif + +#ifndef __designated_init +# define __designated_init +#endif + +#ifndef __latent_entropy +# define __latent_entropy +#endif + +#ifndef __randomize_layout +# define __randomize_layout __designated_init +#endif + +#ifndef __no_randomize_layout +# define __no_randomize_layout +#endif + +#ifndef randomized_struct_fields_start +# define randomized_struct_fields_start +# define randomized_struct_fields_end +#endif + +/* + * Tell gcc if a function is cold. The compiler will assume any path + * directly leading to the call is unlikely. + */ + +#ifndef __cold +#define __cold +#endif + +/* Simple shorthand for a section definition */ +#ifndef __section +# define __section(S) __attribute__ ((__section__(#S))) +#endif + +#ifndef __visible +#define __visible +#endif + +#ifndef __nostackprotector +# define __nostackprotector +#endif + +/* + * Assume alignment of return value. + */ +#ifndef __assume_aligned +#define __assume_aligned(a, ...) +#endif + + +/* Are two types/vars the same type (ignoring qualifiers)? */ +#ifndef __same_type +# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +#endif + +/* Is this type a native word size -- useful for atomic operations */ +#ifndef __native_word +# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +#endif + +#endif /* __LINUX_COMPILER_TYPES_H */ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index a6a42dd02466..ebd61b80fed4 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -1,7 +1,7 @@ #ifndef _LINUX_LINKAGE_H #define _LINUX_LINKAGE_H -#include +#include #include #include #include diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 621fa8ac4425..d1f7cb732dfc 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -1,4 +1,4 @@ -#include +#include #ifndef __always_inline #define __always_inline inline diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index fdebd66f8fc1..63b8cc26456a 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -33,7 +33,7 @@ do sed -r \ -e 's/([ \t(])(__user|__force|__iomem)[ \t]/\1/g' \ -e 's/__attribute_const__([ \t]|$)/\1/g' \ - -e 's@^#include @@' \ + -e 's@^#include @@' \ -e 's/(^|[^a-zA-Z0-9])__packed([^a-zA-Z0-9_]|$)/\1__attribute__((packed))\2/g' \ -e 's/(^|[ \t(])(inline|asm|volatile)([ \t(]|$)/\1__\2__\3/g' \ -e 's@#(ifndef|define|endif[ \t]*/[*])[ \t]*_UAPI@#\1 @' \ -- cgit v1.2.3 From 76ebbe78f7390aee075a7f3768af197ded1bdfbb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:47 +0100 Subject: locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE() In preparation for the removal of lockless_dereference(), which is the same as READ_ONCE() on all architectures other than Alpha, add an implicit smp_read_barrier_depends() to READ_ONCE() so that it can be used to head dependency chains on all architectures. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 08083186e54f..7d7b77da9716 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -242,6 +242,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __read_once_size(&(x), __u.__c, sizeof(x)); \ else \ __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ + smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ __u.__val; \ }) #define READ_ONCE(x) __READ_ONCE(x, 1) -- cgit v1.2.3 From 506458efaf153c1ea480591c5602a5a3ba5a3b76 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: locking/barriers: Convert users of lockless_dereference() to READ_ONCE() READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/mmu_context.h | 4 ++-- arch/x86/kernel/ldt.c | 2 +- drivers/md/dm-mpath.c | 20 ++++++++++---------- fs/dcache.c | 4 ++-- fs/overlayfs/ovl_entry.h | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/rculist.h | 4 ++-- include/linux/rcupdate.h | 4 ++-- kernel/events/core.c | 4 ++-- kernel/seccomp.c | 2 +- kernel/task_work.c | 2 +- mm/slab.h | 2 +- 13 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 80534d3c2480..589af1eec7c1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(unsigned int segment) struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); + ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 3c856a15b98e..efc530642f7d 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -72,8 +72,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - /* lockless_dereference synchronizes with smp_store_release */ - ldt = lockless_dereference(mm->context.ldt); + /* READ_ONCE synchronizes with smp_store_release */ + ldt = READ_ONCE(mm->context.ldt); /* * Any change to mm->context.ldt is followed by an IPI to all diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index f0e64db18ac8..0a21390642c4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -101,7 +101,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) static void install_ldt(struct mm_struct *current_mm, struct ldt_struct *ldt) { - /* Synchronizes with lockless_dereference in load_mm_ldt. */ + /* Synchronizes with READ_ONCE in load_mm_ldt. */ smp_store_release(&current_mm->context.ldt, ldt); /* Activate the LDT for all CPUs using current_mm.
*/ diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 11f273d2f018..3f88c9d32f7e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, pgpath = path_to_pgpath(path); - if (unlikely(lockless_dereference(m->current_pg) != pg)) { + if (unlikely(READ_ONCE(m->current_pg) != pg)) { /* Only update current_pgpath if pg changed */ spin_lock_irqsave(&m->lock, flags); m->current_pgpath = pgpath; @@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) } /* Were we instructed to switch PG? */ - if (lockless_dereference(m->next_pg)) { + if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { @@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) /* Don't change PG until it has no remaining paths */ check_current_pg: - pg = lockless_dereference(m->current_pg); + pg = READ_ONCE(m->current_pg); if (pg) { pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) @@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request *clone; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, nr_bytes); @@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bool queue_io; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) pgpath = choose_pgpath(m, nr_bytes); @@ -1804,7 +1804,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct pgpath *current_pgpath; int r; - current_pgpath = lockless_dereference(m->current_pgpath); + current_pgpath = READ_ONCE(m->current_pgpath); if (!current_pgpath) current_pgpath = choose_pgpath(m, 0); @@ -1826,7 +1826,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } if (r == -ENOTCONN) { - if (!lockless_dereference(m->current_pg)) { + if (!READ_ONCE(m->current_pg)) { /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } @@ -1895,9 +1895,9 @@ static int multipath_busy(struct dm_target *ti) return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ - pg = lockless_dereference(m->current_pg); - next_pg = lockless_dereference(m->next_pg); - if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = READ_ONCE(m->current_pg); + next_pg = READ_ONCE(m->next_pg); + if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) pg = next_pg; if (!pg) { diff --git a/fs/dcache.c b/fs/dcache.c index f90141387f01..34c852af215c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c { /* * Be careful about RCU walk racing with rename: - * use 'lockless_dereference' to fetch the name pointer. + * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. 
The @@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - const unsigned char *cs = lockless_dereference(dentry->d_name.name); + const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 25d9b5adcd42..36b49bd09264 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) { - return lockless_dereference(oi->__upperdentry); + return READ_ONCE(oi->__upperdentry); } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0f85ee9c3268..c67a7703296b 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); - realfile = lockless_dereference(od->upperfile); + realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2bea1d5e9930..5ed091c064b2 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -274,7 +274,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(lockless_dereference(ptr), type, member) + container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? @@ -367,7 +367,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * example is when items are added to the list, but never deleted. */ #define list_entry_lockless(ptr, type, member) \ - container_of((typeof(ptr))lockless_dereference(ptr), type, member) + container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a9f70d44af9..a6ddc42f87a5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ + typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ @@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(p) ________p1 = lockless_dereference(p); \ + typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d93db81fa36..824a583079a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4231,7 +4231,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. 
*/ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4328,7 +4328,7 @@ again: * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ae832e13b97..8ac79355915b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -189,7 +189,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = - lockless_dereference(current->seccomp.filter); + READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) diff --git a/kernel/task_work.c b/kernel/task_work.c index 836a72a66fba..9a9f262fc53d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -67,7 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = lockless_dereference(*pprev))) { + while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) diff --git a/mm/slab.h b/mm/slab.h index 073362816acc..8894f811a89d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -258,7 +258,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) * memcg_caches issues a write barrier to match this (see * memcg_create_kmem_cache()). */ - cachep = lockless_dereference(arr->entries[idx]); + cachep = READ_ONCE(arr->entries[idx]); rcu_read_unlock(); return cachep; -- cgit v1.2.3 From 59ecbbe7b31cd2d86ff9a9f461a00f7e7533aedc Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:49 +0100 Subject: locking/barriers: Kill lockless_dereference() lockless_dereference() is a nice idea, but it gained little traction in kernel code since its introduction three years ago. This is partly because it's a pain to type, but also because using READ_ONCE() instead has worked correctly on all architectures apart from Alpha, which is a fully supported but somewhat niche architecture these days. Now that READ_ONCE() has been upgraded to contain an implicit smp_read_barrier_depends() and the few callers of lockless_dereference() have been converted, we can remove lockless_dereference() altogether. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-5-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 12 ------------ Documentation/translations/ko_KR/memory-barriers.txt | 12 ------------ include/linux/compiler.h | 20 -------------------- 3 files changed, 44 deletions(-) (limited to 'include/linux') diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index b759a60624fd..470a682f3fa4 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1886,18 +1886,6 @@ There are some more advanced barrier functions: See Documentation/atomic_{t,bitops}.txt for more information. 
- (*) lockless_dereference(); - - This can be thought of as a pointer-fetch wrapper around the - smp_read_barrier_depends() data-dependency barrier. - - This is also similar to rcu_dereference(), but in cases where - object lifetime is handled by some mechanism other than RCU, for - example, when the objects removed only when the system goes down. - In addition, lockless_dereference() is used in some data structures - that can be used both with and without RCU. - - (*) dma_wmb(); (*) dma_rmb(); diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index a7a813258013..ec3b46e27b7a 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -1858,18 +1858,6 @@ Mandatory 배리어들은 SMP 시스템에서도 UP 시스템에서도 SMP 효 참고하세요. - (*) lockless_dereference(); - - 이 함수는 smp_read_barrier_depends() 데이터 의존성 배리어를 사용하는 - 포인터 읽어오기 래퍼(wrapper) 함수로 생각될 수 있습니다. - - 객체의 라이프타임이 RCU 외의 메커니즘으로 관리된다는 점을 제외하면 - rcu_dereference() 와도 유사한데, 예를 들면 객체가 시스템이 꺼질 때에만 - 제거되는 경우 등입니다. 또한, lockless_dereference() 은 RCU 와 함께 - 사용될수도, RCU 없이 사용될 수도 있는 일부 데이터 구조에 사용되고 - 있습니다. - - (*) dma_wmb(); (*) dma_rmb(); diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7d7b77da9716..5a1cab48442c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -346,24 +346,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s (volatile typeof(x) *)&(x); }) #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) -/** - * lockless_dereference() - safely load a pointer for later dereference - * @p: The pointer to load - * - * Similar to rcu_dereference(), but for situations where the pointed-to - * object's lifetime is managed by something other than RCU. That - * "something other" might be reference counting or simple immortality. - * - * The seemingly unused variable ___typecheck_p validates that @p is - * indeed a pointer type by using a pointer to typeof(*p) as the type. - * Taking a pointer to typeof(*p) again is needed in case p is void *. - */ -#define lockless_dereference(p) \ -({ \ - typeof(p) _________p1 = READ_ONCE(p); \ - typeof(*(p)) *___typecheck_p __maybe_unused; \ - smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ - (_________p1); \ -}) - #endif /* __LINUX_COMPILER_H */ -- cgit v1.2.3 From 08395c7f4d9f5808b5754a0dbed969f378bde0c3 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 13 Oct 2017 12:26:44 +0100 Subject: irqdesc: Add function to identify percpu_devid irqs irq_is_percpu indicates whether an irq should only target a single cpu. PERCPU_DEVID flag indicates that an irq can be configured differently on each cpu it can target. Provide a function to check whether an irq is PERCPU_DEVID. 
Reviewed-by: Thomas Gleixner Reviewed-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Julien Thierry --- include/linux/irqdesc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 3e90a094798d..93960cf36e23 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -244,6 +244,14 @@ static inline int irq_is_percpu(unsigned int irq) return desc->status_use_accessors & IRQ_PER_CPU; } +static inline int irq_is_percpu_devid(unsigned int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + return desc->status_use_accessors & IRQ_PER_CPU_DEVID; +} + static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class) { -- cgit v1.2.3 From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:08 -0700 Subject: bpf: permit multiple bpf attachments for a single perf event This patch enables multiple bpf attachments for a single kprobe/uprobe/tracepoint trace event. Each trace_event keeps a list of attached perf events. When an event happens, all attached bpf programs will be executed based on the order of attachment. A global bpf_event_mutex lock is introduced to protect prog_array attaching and detaching. An alternative would be to introduce a mutex lock in every trace_event_call structure, but that takes a lot of extra memory. So a global bpf_event_mutex lock is a good compromise. The bpf prog detachment involves allocation of memory. If the allocation fails, a dummy do-nothing program replaces the to-be-detached program in place. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/bpf.h | 30 +++++++++++++--- include/linux/trace_events.h | 43 ++++++++++++++++++++--- include/trace/perf.h | 6 ++-- kernel/bpf/core.c | 81 ++++++++++++++++++++++++++++++++++++++++++ kernel/events/core.c | 26 +++++--------- kernel/trace/bpf_trace.c | 82 ++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace_kprobe.c | 6 ++-- kernel/trace/trace_syscalls.c | 34 ++++++++++-------- kernel/trace/trace_uprobe.c | 3 +- 9 files changed, 255 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e334b248ff6..172be7faf7ba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,18 +273,38 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt); -#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, + struct bpf_prog *old_prog); +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, + struct bpf_prog *exclude_prog, + struct bpf_prog *include_prog, + struct bpf_prog_array **new_array); + +#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog **_prog; \ + struct bpf_prog **_prog, *__prog; \ + struct bpf_prog_array *_array; \ u32 _ret = 1; \ rcu_read_lock(); \ - _prog = rcu_dereference(array)->progs; \ - for (; *_prog; _prog++) \ - _ret &= func(*_prog, ctx); \ + _array = rcu_dereference(array); \ + if (unlikely(check_non_null && !_array))\ + goto _out; \ + _prog = _array->progs; \ + while ((__prog = READ_ONCE(*_prog))) { \ + _ret &= func(__prog, ctx); \ + _prog++; \ + } \ +_out: \ rcu_read_unlock(); \ _ret; \ }) +#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ +
__BPF_PROG_RUN_ARRAY(array, ctx, func, false) + +#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ + __BPF_PROG_RUN_ARRAY(array, ctx, func, true) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2e0f22298fe9..fc6aeca945db 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -271,14 +271,37 @@ struct trace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; - struct bpf_prog *prog; - struct perf_event *bpf_prog_owner; + struct bpf_prog_array __rcu *prog_array; int (*perf_perm)(struct trace_event_call *, struct perf_event *); #endif }; +#ifdef CONFIG_PERF_EVENTS +static inline bool bpf_prog_array_valid(struct trace_event_call *call) +{ + /* + * This inline function checks whether call->prog_array + * is valid or not. The function is called in various places, + * outside rcu_read_lock/unlock, as a heuristic to speed up execution. + * + * If this function returns true, and later call->prog_array + * becomes false inside rcu_read_lock/unlock region, + * we bail out then. If this function return false, + * there is a risk that we might miss a few events if the checking + * were delayed until inside rcu_read_lock/unlock region and + * call->prog_array happened to become non-NULL then. + * + * Here, READ_ONCE() is used instead of rcu_access_pointer(). + * rcu_access_pointer() requires the actual definition of + * "struct bpf_prog_array" while READ_ONCE() only needs + * a declaration of the same type. + */ + return !!READ_ONCE(call->prog_array); +} +#endif + static inline const char * trace_event_name(struct trace_event_call *call) { @@ -435,12 +458,23 @@ trace_trigger_soft_disabled(struct trace_event_file *file) } #ifdef CONFIG_BPF_EVENTS -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); +int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +void perf_event_detach_bpf_prog(struct perf_event *event); #else -static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { return 1; } + +static inline int +perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } + #endif enum { @@ -511,6 +545,7 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, { perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event); } + #endif #endif /* _LINUX_TRACE_EVENT_H */ diff --git a/include/trace/perf.h b/include/trace/perf.h index 04fe68bbe767..14f127b6acf5 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -34,7 +34,6 @@ perf_trace_##call(void *__data, proto) \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ struct trace_event_raw_##call *entry; \ - struct bpf_prog *prog = event_call->prog; \ struct pt_regs *__regs; \ u64 __count = 1; \ struct task_struct *__task = NULL; \ @@ -46,8 +45,9 @@ perf_trace_##call(void *__data, proto) \ __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \ \ head = this_cpu_ptr(event_call->perf_events); \ - if (!prog && __builtin_constant_p(!__task) && !__task && \ - hlist_empty(head)) \ + if (!bpf_prog_array_valid(event_call) && \ + 
__builtin_constant_p(!__task) && !__task && \ + hlist_empty(head)) \ return; \ \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8e7c8bf2b687..7fe448799d76 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1394,6 +1394,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +static unsigned int __bpf_prog_ret1(const void *ctx, + const struct bpf_insn *insn) +{ + return 1; +} + +static struct bpf_prog_dummy { + struct bpf_prog prog; +} dummy_bpf_prog = { + .prog = { + .bpf_func = __bpf_prog_ret1, + }, +}; + /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() @@ -1463,6 +1477,73 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, + struct bpf_prog *old_prog) +{ + struct bpf_prog **prog = progs->progs; + + for (; *prog; prog++) + if (*prog == old_prog) { + WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + break; + } +} + +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, + struct bpf_prog *exclude_prog, + struct bpf_prog *include_prog, + struct bpf_prog_array **new_array) +{ + int new_prog_cnt, carry_prog_cnt = 0; + struct bpf_prog **existing_prog; + struct bpf_prog_array *array; + int new_prog_idx = 0; + + /* Figure out how many existing progs we need to carry over to + * the new array. + */ + if (old_array) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) { + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + carry_prog_cnt++; + if (*existing_prog == include_prog) + return -EEXIST; + } + } + + /* How many progs (not NULL) will be in the new array? */ + new_prog_cnt = carry_prog_cnt; + if (include_prog) + new_prog_cnt += 1; + + /* Do we have any prog (not NULL) in the new array? 
*/ + if (!new_prog_cnt) { + *new_array = NULL; + return 0; + } + + /* +1 as the end of prog_array is marked with NULL */ + array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); + if (!array) + return -ENOMEM; + + /* Fill in the new prog array */ + if (carry_prog_cnt) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + array->progs[new_prog_idx++] = *existing_prog; + } + if (include_prog) + array->progs[new_prog_idx++] = include_prog; + array->progs[new_prog_idx] = NULL; + *new_array = array; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9f78a6825bbe..9660ee65fbef 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,11 +7954,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { - struct bpf_prog *prog = call->prog; - - if (prog) { + if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; - if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } @@ -8147,13 +8145,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; + int ret; if (event->attr.type != PERF_TYPE_TRACEPOINT) return perf_event_set_bpf_handler(event, prog_fd); - if (event->tp_event->prog) - return -EEXIST; - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); @@ -8181,26 +8177,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EACCES; } } - event->tp_event->prog = prog; - event->tp_event->bpf_prog_owner = event; - return 0; + ret = perf_event_attach_bpf_prog(event, prog); + if (ret) + bpf_prog_put(prog); + return ret; } static void perf_event_free_bpf_prog(struct perf_event *event) { - struct bpf_prog *prog; - if (event->attr.type != PERF_TYPE_TRACEPOINT) { perf_event_free_bpf_handler(event); return; } - - prog = event->tp_event->prog; - if (prog && event->tp_event->bpf_prog_owner == event) { - event->tp_event->prog = NULL; - bpf_prog_put(prog); - } + perf_event_detach_bpf_prog(event); } #else diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3126da2f468a..b65011d320e3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,7 +17,7 @@ /** * trace_call_bpf - invoke BPF program - * @prog: BPF program + * @call: tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. 
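To make the prog_array semantics concrete: the BPF_PROG_RUN_ARRAY_CHECK() invocation used by trace_call_bpf() behaves conceptually like the simplified helper below (an illustrative sketch only; run_prog_array is a made-up name, and the RCU locking and dummy-program handling of __BPF_PROG_RUN_ARRAY() are omitted). A missing array counts as "keep the event", and any attached program returning 0 vetoes it.

/* Illustrative sketch of the array-run semantics; not part of the patch. */
static unsigned int run_prog_array(struct bpf_prog **progs, void *ctx)
{
	unsigned int ret = 1;

	if (!progs)
		return 1;	/* no programs attached: keep the event */

	for (; *progs; progs++)
		ret &= BPF_PROG_RUN(*progs, ctx);	/* any 0 drops it */

	return ret;
}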
@@ -29,7 +29,7 @@ * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; @@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) goto out; } - rcu_read_lock(); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); + /* + * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock + * to all call sites, we did a bpf_prog_array_valid() there to check + * whether call->prog_array is empty or not, which is + * a heurisitc to speed up execution. + * + * If bpf_prog_array_valid() fetched prog_array was + * non-NULL, we go into trace_call_bpf() and do the actual + * proper rcu_dereference() under RCU lock. + * If it turns out that prog_array is NULL then, we bail out. + * For the opposite, if the bpf_prog_array_valid() fetched pointer + * was NULL, you'll skip the prog_array with the risk of missing + * out of events when it was updated in between this and the + * rcu_dereference() which is accepted risk. + */ + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); out: __this_cpu_dec(bpf_prog_active); @@ -741,3 +754,62 @@ const struct bpf_verifier_ops perf_event_verifier_ops = { const struct bpf_prog_ops perf_event_prog_ops = { }; + +static DEFINE_MUTEX(bpf_event_mutex); + +int perf_event_attach_bpf_prog(struct perf_event *event, + struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret = -EEXIST; + + mutex_lock(&bpf_event_mutex); + + if (event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + if (ret < 0) + goto out; + + /* set the new array to event->tp_event and set event->prog */ + event->prog = prog; + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + +out: + mutex_unlock(&bpf_event_mutex); + return ret; +} + +void perf_event_detach_bpf_prog(struct perf_event *event) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret; + + mutex_lock(&bpf_event_mutex); + + if (!event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + + ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret < 0) { + bpf_prog_array_delete_safe(old_array, event->prog); + } else { + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + } + + bpf_prog_put(event->prog); + event->prog = NULL; + +out: + mutex_unlock(&bpf_event_mutex); +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8a907e12b6b9..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,13 +1174,12 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); @@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { 
struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..71a6af34d7a9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,9 +559,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) { +static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -573,7 +574,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) param.args[i] = rec->args[i]; - return trace_call_bpf(prog, &param); + return trace_call_bpf(call, &param); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) @@ -581,7 +582,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -596,9 +597,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; - prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -614,7 +615,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + if ((valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; @@ -659,8 +661,9 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } -static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_trace_exit *rec) { +static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_trace_exit *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -670,7 +673,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, *(struct pt_regs **)&param = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; - return trace_call_bpf(prog, &param); + return trace_call_bpf(call, &param); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) @@ -678,7 +681,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int
syscall_nr; int rctx; int size; @@ -693,9 +696,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); + if (!valid_prog_array && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -709,7 +712,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + if ((valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4525e0271a53..153c0e411461 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; - struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); -- cgit v1.2.3 From 4df714be4dcf40bfb0d4af0f851a6e1977afa02e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:48 +0100 Subject: locking/atomic: Add atomic_cond_read_acquire() smp_cond_load_acquire() provides a way to spin on a variable with acquire semantics until some conditional expression involving the variable is satisfied. Architectures such as arm64 can potentially enter a low-power state, waking up only when the value of the variable changes, which reduces the system impact of tight polling loops. This patch makes the same interface available to users of atomic_t, atomic64_t and atomic_long_t, rather than require messy accesses to the structure internals. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Boqun Feng Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/asm-generic/atomic-long.h | 3 +++ include/linux/atomic.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h index 288cc9e96395..f2d97b782031 100644 --- a/include/asm-generic/atomic-long.h +++ b/include/asm-generic/atomic-long.h @@ -243,4 +243,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) #define atomic_long_inc_not_zero(l) \ ATOMIC_LONG_PFX(_inc_not_zero)((ATOMIC_LONG_PFX(_t) *)(l)) +#define atomic_long_cond_read_acquire(v, c) \ + ATOMIC_LONG_PFX(_cond_read_acquire)((ATOMIC_LONG_PFX(_t) *)(v), (c)) + #endif /* _ASM_GENERIC_ATOMIC_LONG_H */ diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 40d6bfec0e0d..0aeb2b3f4578 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -653,6 +653,8 @@ static inline int atomic_dec_if_positive(atomic_t *v) } #endif +#define atomic_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c)) + #ifdef CONFIG_GENERIC_ATOMIC64 #include #endif @@ -1072,6 +1074,8 @@ static inline long long atomic64_fetch_andnot_release(long long i, atomic64_t *v } #endif +#define atomic64_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c)) + #include #endif /* _LINUX_ATOMIC_H */ -- cgit v1.2.3 From 66702eb59064f1077e89cce8e7e3cd48ec4b486c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:14 -0700 Subject: locking/atomics, fs/dcache: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the dcache code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. 
McKenney Cc: Al Viro Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-4-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- fs/dcache.c | 18 +++++++++--------- include/linux/dcache.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 34c852af215c..bcc9f6981569 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -630,7 +630,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry) rcu_read_lock(); spin_unlock(&dentry->d_lock); again: - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); spin_lock(&parent->d_lock); /* * We can't blindly lock dentry until we are sure @@ -721,7 +721,7 @@ static inline bool fast_dput(struct dentry *dentry) * around with a zero refcount. */ smp_rmb(); - d_flags = ACCESS_ONCE(dentry->d_flags); + d_flags = READ_ONCE(dentry->d_flags); d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ @@ -850,11 +850,11 @@ struct dentry *dget_parent(struct dentry *dentry) * locking. */ rcu_read_lock(); - ret = ACCESS_ONCE(dentry->d_parent); + ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { - if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + if (likely(ret == READ_ONCE(dentry->d_parent))) return ret; dput(ret); } @@ -3040,7 +3040,7 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * @buflen: allocated length of the buffer * @name: name string and length qstr structure * - * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to + * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * The length cannot be trusted, we need to copy it byte-by-byte until @@ -3054,8 +3054,8 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) */ static int prepend_name(char **buffer, int *buflen, const struct qstr *name) { - const char *dname = ACCESS_ONCE(name->name); - u32 dlen = ACCESS_ONCE(name->len); + const char *dname = READ_ONCE(name->name); + u32 dlen = READ_ONCE(name->len); char *p; smp_read_barrier_depends(); @@ -3120,7 +3120,7 @@ restart: struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + struct mount *parent = READ_ONCE(mnt->mnt_parent); /* Escaped? */ if (dentry != vfsmnt->mnt_root) { bptr = *buffer; @@ -3130,7 +3130,7 @@ restart: } /* Global root? 
*/ if (mnt != parent) { - dentry = ACCESS_ONCE(mnt->mnt_mountpoint); + dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = parent; vfsmnt = &mnt->mnt; continue; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ed1a7cf6923a..1d8f5818f647 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -519,7 +519,7 @@ static inline struct inode *d_inode(const struct dentry *dentry) } /** - * d_inode_rcu - Get the actual inode of this dentry with ACCESS_ONCE() + * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes @@ -527,7 +527,7 @@ static inline struct inode *d_inode(const struct dentry *dentry) */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { - return ACCESS_ONCE(dentry->d_inode); + return READ_ONCE(dentry->d_inode); } /** -- cgit v1.2.3 From 14cd5d4a0125f643350e7fa12f5384f1fc2d3e9d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:17 -0700 Subject: locking/atomics, net/netlink/netfilter: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts netlink and netfilter code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: David S. Miller Cc: Florian Westphal Cc: Jozsef Kadlecsik Cc: Linus Torvalds Cc: Pablo Neira Ayuso Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-7-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/genetlink.h | 2 +- include/linux/netfilter/nfnetlink.h | 2 +- include/linux/rtnetlink.h | 2 +- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/ipvs/ip_vs_sync.c | 2 +- net/netfilter/nfnetlink_queue.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h index a4c61cbce777..0e694cf62414 100644 --- a/include/linux/genetlink.h +++ b/include/linux/genetlink.h @@ -30,7 +30,7 @@ extern wait_queue_head_t genl_sk_destructing_waitq; * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because + * both the smp_read_barrier_depends() and the READ_ONCE(), because * caller holds genl mutex. 
*/ #define genl_dereference(p) \ diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 41d04e9d088a..0f47a4aa7fc4 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -66,7 +66,7 @@ static inline bool lockdep_nfnl_is_held(__u8 subsys_id) * @ss: The nfnetlink subsystem ID * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because + * both the smp_read_barrier_depends() and the READ_ONCE(), because * caller holds the NFNL subsystem mutex. */ #define nfnl_dereference(p, ss) \ diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index dea59c8eec54..765f7b915475 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -67,7 +67,7 @@ static inline bool lockdep_rtnl_is_held(void) * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because + * both the smp_read_barrier_depends() and the READ_ONCE(), because * caller holds RTNL. */ #define rtnl_dereference(p) \ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0f5b12a4ad09..5c68e279eaea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1164,8 +1164,8 @@ static inline u8 nft_genmask_next(const struct net *net) static inline u8 nft_genmask_cur(const struct net *net) { - /* Use ACCESS_ONCE() to prevent refetching the value for atomicity */ - return 1 << ACCESS_ONCE(net->nft.gencursor); + /* Use READ_ONCE() to prevent refetching the value for atomicity */ + return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 0e5b64a75da0..1cfffd42d1e2 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -457,7 +457,7 @@ static inline bool in_persistence(struct ip_vs_conn *cp) static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long orig = READ_ONCE(cp->sync_endtime); unsigned long now = jiffies; unsigned long n = (now + cp->timeout) & ~3UL; unsigned int sync_refresh_period; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index c9796629858f..a16356cacec3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -401,7 +401,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, outdev = entry->state.out; - switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { + switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: break; @@ -412,7 +412,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, skb_checksum_help(entskb)) return NULL; - data_len = ACCESS_ONCE(queue->copy_range); + data_len = READ_ONCE(queue->copy_range); if (data_len > entskb->len) data_len = entskb->len; -- cgit v1.2.3 From ef4d9af62f47e3070b00c3307a4d8eb5092bb9a2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:19 -0700 Subject: locking/atomics, net/average: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. 
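For illustration, here is a minimal sketch of the conversion pattern itself; this snippet is not taken from any patch in this log, and "shared" and "example" are hypothetical names:

----
static unsigned long shared; /* hypothetical shared variable */

static void example(void)
{
	unsigned long val;

	/* Old, direction-agnostic accessor: */
	val = ACCESS_ONCE(shared);
	ACCESS_ONCE(shared) = val + 1;

	/* New accessors, with the load and the store spelled out: */
	val = READ_ONCE(shared);
	WRITE_ONCE(shared, val + 1);
}
----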
So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't pick up some uses, including those in <linux/average.h>. As a preparatory step, this patch converts the file to use {READ,WRITE}_ONCE() consistently. At the same time, this patch adds missing includes necessary for {READ,WRITE}_ONCE(), *BUG_ON*(), and ilog2(). ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Reviewed-by: Johannes Berg Cc: David S. Miller Cc: Johannes Berg Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-9-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/average.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/average.h b/include/linux/average.h index 7ddaf340d2ac..3f462292269c 100644 --- a/include/linux/average.h +++ b/include/linux/average.h @@ -1,6 +1,10 @@ #ifndef _LINUX_AVERAGE_H #define _LINUX_AVERAGE_H +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/log2.h> + /* * Exponentially weighted moving average (EWMA) * @@ -48,7 +52,7 @@ static inline void ewma_##name##_add(struct ewma_##name *e, \ unsigned long val) \ { \ - unsigned long internal = ACCESS_ONCE(e->internal); \ + unsigned long internal = READ_ONCE(e->internal); \ unsigned long weight_rcp = ilog2(_weight_rcp); \ unsigned long precision = _precision; \ \ @@ -57,10 +61,10 @@ BUILD_BUG_ON((_precision) > 30); \ BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ \ - ACCESS_ONCE(e->internal) = internal ? \ + WRITE_ONCE(e->internal, internal ? \ (((internal << weight_rcp) - internal) + \ (val << precision)) >> weight_rcp : \ - (val << precision); \ + (val << precision)); \ } #endif /* _LINUX_AVERAGE_H */ -- cgit v1.2.3 From 6aa7de059173a986114ac43b8f50b297a86f09a8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:29 -0700 Subject: locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used.
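A rough sketch of why the split matters for instrumentation; these are simplified forms, not the kernel's exact definitions (the real macros in <linux/compiler.h> dispatch on access size through helper functions), so treat the bodies below as illustrative assumptions:

----
/* Simplified sketches, NOT the kernel's exact definitions: */
#define READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

/*
 * ACCESS_ONCE() is a single expression used for both directions, so
 * an instrumentation pass hooking the accessor cannot tell a load
 * from a store:
 */
#define ACCESS_ONCE(x)		(*(volatile typeof(x) *)&(x))
----

Because READ_ONCE() and WRITE_ONCE() are distinct entry points (the real implementations of this era route through separate __read_once_size()/__write_once_size() helpers), a checker can hook loads and stores independently, which is the read/write distinction described above.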
This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/arc/kernel/smp.c | 2 +- arch/arm/include/asm/spinlock.h | 2 +- arch/arm/mach-tegra/cpuidle-tegra20.c | 2 +- arch/arm/vdso/vgettimeofday.c | 2 +- arch/ia64/include/asm/spinlock.h | 8 ++--- arch/mips/include/asm/vdso.h | 2 +- arch/mips/kernel/pm-cps.c | 2 +- arch/mn10300/kernel/mn10300-serial.c | 4 +-- arch/parisc/include/asm/atomic.h | 2 +- arch/powerpc/platforms/powernv/opal-msglog.c | 2 +- arch/s390/include/asm/spinlock.h | 6 ++-- arch/s390/lib/spinlock.c | 16 +++++----- arch/sparc/include/asm/atomic_32.h | 2 +- arch/tile/gxio/dma_queue.c | 4 +-- arch/tile/include/gxio/dma_queue.h | 2 +- arch/tile/kernel/ptrace.c | 2 +- arch/x86/entry/common.c | 2 +- arch/x86/entry/vdso/vclock_gettime.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/include/asm/vgtod.h | 2 +- arch/x86/kernel/espfix_64.c | 6 ++-- arch/x86/kernel/nmi.c | 2 +- arch/x86/kvm/mmu.c | 4 +-- arch/x86/kvm/page_track.c | 2 +- arch/x86/xen/p2m.c | 2 +- arch/xtensa/platforms/xtfpga/lcd.c | 14 ++++----- block/blk-wbt.c | 2 +- drivers/base/core.c | 2 +- drivers/base/power/runtime.c | 4 +-- drivers/char/random.c | 4 +-- drivers/clocksource/bcm2835_timer.c | 2 +- drivers/crypto/caam/jr.c | 4 +-- drivers/crypto/nx/nx-842-powernv.c | 2 +- drivers/firewire/ohci.c | 10 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 4 +-- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 2 +- drivers/gpu/drm/radeon/radeon_gem.c | 4 +-- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 2 +- drivers/infiniband/hw/hfi1/file_ops.c | 2 +- drivers/infiniband/hw/hfi1/pio.c | 6 ++-- drivers/infiniband/hw/hfi1/ruc.c | 2 +- drivers/infiniband/hw/hfi1/sdma.c | 8 ++--- drivers/infiniband/hw/hfi1/sdma.h | 2 +- drivers/infiniband/hw/hfi1/uc.c | 4 +-- drivers/infiniband/hw/hfi1/ud.c | 4 +-- drivers/infiniband/hw/hfi1/user_sdma.c | 8 ++--- drivers/infiniband/hw/qib/qib_ruc.c | 2 +- drivers/infiniband/hw/qib/qib_uc.c | 4 +-- drivers/infiniband/hw/qib/qib_ud.c | 4 +-- drivers/infiniband/sw/rdmavt/qp.c | 6 ++-- drivers/input/misc/regulator-haptic.c | 2 +- drivers/md/dm-bufio.c | 10 +++--- drivers/md/dm-kcopyd.c | 4 +-- drivers/md/dm-stats.c | 36 +++++++++++----------- drivers/md/dm-switch.c | 2 +- drivers/md/dm-thin.c | 2 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm.c | 4 +-- drivers/md/md.c | 2 +- drivers/md/raid5.c | 2 +- drivers/misc/mic/scif/scif_rb.c | 8 ++--- drivers/misc/mic/scif/scif_rma_list.c | 2 +- drivers/net/bonding/bond_alb.c | 2 +- drivers/net/bonding/bond_main.c | 6 ++-- drivers/net/ethernet/chelsio/cxgb4/sge.c | 4 +-- drivers/net/ethernet/emulex/benet/be_main.c | 2 +- 
drivers/net/ethernet/hisilicon/hip04_eth.c | 4 +-- drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 4 +-- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 +-- drivers/net/ethernet/intel/i40e/i40e_ptp.c | 4 +-- drivers/net/ethernet/intel/igb/e1000_regs.h | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_common.h | 4 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++--- drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 4 +-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/ethernet/intel/ixgbevf/vf.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 12 ++++---- drivers/net/ethernet/neterion/vxge/vxge-main.c | 2 +- drivers/net/ethernet/sfc/ef10.c | 10 +++--- drivers/net/ethernet/sfc/efx.c | 4 +-- drivers/net/ethernet/sfc/falcon/efx.c | 4 +-- drivers/net/ethernet/sfc/falcon/falcon.c | 4 +-- drivers/net/ethernet/sfc/falcon/farch.c | 8 ++--- drivers/net/ethernet/sfc/falcon/nic.h | 6 ++-- drivers/net/ethernet/sfc/falcon/tx.c | 6 ++-- drivers/net/ethernet/sfc/farch.c | 8 ++--- drivers/net/ethernet/sfc/nic.h | 6 ++-- drivers/net/ethernet/sfc/ptp.c | 10 +++--- drivers/net/ethernet/sfc/tx.c | 6 ++-- drivers/net/ethernet/sun/niu.c | 4 +-- drivers/net/tap.c | 2 +- drivers/net/tun.c | 4 +-- drivers/net/wireless/ath/ath5k/desc.c | 8 ++--- .../wireless/broadcom/brcm80211/brcmfmac/sdio.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 4 +-- drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 2 +- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 10 +++--- drivers/net/wireless/mac80211_hwsim.c | 4 +-- drivers/scsi/qla2xxx/qla_target.c | 2 +- drivers/target/target_core_user.c | 2 +- drivers/usb/class/cdc-wdm.c | 2 +- drivers/usb/core/devio.c | 2 +- drivers/usb/core/sysfs.c | 4 +-- drivers/usb/gadget/udc/gr_udc.c | 4 +-- drivers/usb/host/ohci-hcd.c | 2 +- drivers/usb/host/uhci-hcd.h | 4 +-- drivers/vfio/vfio.c | 2 +- drivers/vhost/scsi.c | 2 +- fs/aio.c | 2 +- fs/buffer.c | 3 +- fs/crypto/keyinfo.c | 2 +- fs/direct-io.c | 2 +- fs/exec.c | 2 +- fs/fcntl.c | 2 +- fs/fs_pin.c | 4 +-- fs/fuse/dev.c | 2 +- fs/inode.c | 2 +- fs/namei.c | 4 +-- fs/namespace.c | 2 +- fs/nfs/dir.c | 8 ++--- fs/proc/array.c | 2 +- fs/proc_namespace.c | 2 +- fs/splice.c | 2 +- fs/userfaultfd.c | 8 ++--- fs/xfs/xfs_log_priv.h | 4 +-- include/linux/bitops.h | 4 +-- include/linux/dynamic_queue_limits.h | 2 +- include/linux/huge_mm.h | 2 +- include/linux/if_team.h | 2 +- include/linux/llist.h | 2 +- include/linux/pm_runtime.h | 2 +- include/net/ip_vs.h | 6 ++-- kernel/acct.c | 4 +-- kernel/events/core.c | 6 ++-- kernel/events/ring_buffer.c | 2 +- kernel/exit.c | 2 +- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_stack.c | 2 +- kernel/user_namespace.c | 2 +- lib/assoc_array.c | 20 ++++++------ lib/dynamic_queue_limits.c | 2 +- lib/llist.c | 2 +- lib/vsprintf.c | 4 +-- mm/huge_memory.c | 2 +- net/core/dev.c | 2 +- net/core/pktgen.c | 2 +- net/ipv4/inet_fragment.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/udp.c | 4 +-- net/ipv6/ip6_tunnel.c | 8 ++--- net/ipv6/udp.c | 4 +-- net/llc/llc_input.c | 4 +-- net/mac80211/sta_info.c | 2 +- net/netlabel/netlabel_calipso.c | 2 +- net/wireless/nl80211.c | 2 +- sound/firewire/amdtp-am824.c | 6 ++-- sound/firewire/amdtp-stream.c | 23 +++++++------- sound/firewire/amdtp-stream.h | 2 +- sound/firewire/digi00x/amdtp-dot.c | 6 ++-- sound/firewire/fireface/amdtp-ff.c | 4 +-- 
sound/firewire/fireface/ff-midi.c | 10 +++--- sound/firewire/fireface/ff-transaction.c | 8 ++--- sound/firewire/isight.c | 18 +++++------ sound/firewire/motu/amdtp-motu.c | 4 +-- sound/firewire/oxfw/oxfw-scs1x.c | 12 ++++---- sound/firewire/tascam/amdtp-tascam.c | 4 +-- sound/firewire/tascam/tascam-transaction.c | 6 ++-- sound/soc/xtensa/xtfpga-i2s.c | 6 ++-- sound/usb/bcd2000/bcd2000.c | 4 +-- tools/arch/x86/include/asm/atomic.h | 2 +- tools/include/asm-generic/atomic-gcc.h | 2 +- tools/perf/util/auxtrace.h | 4 +-- tools/perf/util/session.h | 2 +- virt/kvm/kvm_main.c | 2 +- 180 files changed, 383 insertions(+), 385 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index f46267153ec2..94cabe73664b 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -245,7 +245,7 @@ static void ipi_send_msg_one(int cpu, enum ipi_msg_type msg) * and read back old value */ do { - new = old = ACCESS_ONCE(*ipi_data_ptr); + new = old = READ_ONCE(*ipi_data_ptr); new |= 1U << msg; } while (cmpxchg(ipi_data_ptr, old, new) != old); diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index daa87212c9a1..77f50ae0aeb4 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -71,7 +71,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) while (lockval.tickets.next != lockval.tickets.owner) { wfe(); - lockval.tickets.owner = ACCESS_ONCE(lock->tickets.owner); + lockval.tickets.owner = READ_ONCE(lock->tickets.owner); } smp_mb(); diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c index 76e4c83cd5c8..3f24addd7972 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra20.c +++ b/arch/arm/mach-tegra/cpuidle-tegra20.c @@ -179,7 +179,7 @@ static int tegra20_idle_lp2_coupled(struct cpuidle_device *dev, bool entered_lp2 = false; if (tegra_pending_sgi()) - ACCESS_ONCE(abort_flag) = true; + WRITE_ONCE(abort_flag, true); cpuidle_coupled_parallel_barrier(dev, &abort_barrier); diff --git a/arch/arm/vdso/vgettimeofday.c b/arch/arm/vdso/vgettimeofday.c index 79214d5ff097..a9dd619c6c29 100644 --- a/arch/arm/vdso/vgettimeofday.c +++ b/arch/arm/vdso/vgettimeofday.c @@ -35,7 +35,7 @@ static notrace u32 __vdso_read_begin(const struct vdso_data *vdata) { u32 seq; repeat: - seq = ACCESS_ONCE(vdata->seq_count); + seq = READ_ONCE(vdata->seq_count); if (seq & 1) { cpu_relax(); goto repeat; diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 35b31884863b..e98775be112d 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -61,7 +61,7 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { - int tmp = ACCESS_ONCE(lock->lock); + int tmp = READ_ONCE(lock->lock); if (!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK)) return ia64_cmpxchg(acq, &lock->lock, tmp, tmp + 1, sizeof (tmp)) == tmp; @@ -73,19 +73,19 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) unsigned short *p = (unsigned short *)&lock->lock + 1, tmp; asm volatile ("ld2.bias %0=[%1]" : "=r"(tmp) : "r"(p)); - ACCESS_ONCE(*p) = (tmp + 2) & ~1; + WRITE_ONCE(*p, (tmp + 2) & ~1); } static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { - long tmp = ACCESS_ONCE(lock->lock); + long tmp = READ_ONCE(lock->lock); return !!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK); } static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { - long tmp = 
ACCESS_ONCE(lock->lock); + long tmp = READ_ONCE(lock->lock); return ((tmp - (tmp >> TICKET_SHIFT)) & TICKET_MASK) > 1; } diff --git a/arch/mips/include/asm/vdso.h b/arch/mips/include/asm/vdso.h index b7cd6cf77b83..91bf0c2c265c 100644 --- a/arch/mips/include/asm/vdso.h +++ b/arch/mips/include/asm/vdso.h @@ -99,7 +99,7 @@ static inline u32 vdso_data_read_begin(const union mips_vdso_data *data) u32 seq; while (true) { - seq = ACCESS_ONCE(data->seq_count); + seq = READ_ONCE(data->seq_count); if (likely(!(seq & 1))) { /* Paired with smp_wmb() in vdso_data_write_*(). */ smp_rmb(); diff --git a/arch/mips/kernel/pm-cps.c b/arch/mips/kernel/pm-cps.c index 4655017f2377..1d2996cd58da 100644 --- a/arch/mips/kernel/pm-cps.c +++ b/arch/mips/kernel/pm-cps.c @@ -166,7 +166,7 @@ int cps_pm_enter_state(enum cps_pm_state state) nc_core_ready_count = nc_addr; /* Ensure ready_count is zero-initialised before the assembly runs */ - ACCESS_ONCE(*nc_core_ready_count) = 0; + WRITE_ONCE(*nc_core_ready_count, 0); coupled_barrier(&per_cpu(pm_barrier, core), online); /* Run the generated entry code */ diff --git a/arch/mn10300/kernel/mn10300-serial.c b/arch/mn10300/kernel/mn10300-serial.c index 7ecf69879e2d..d7ef1232a82a 100644 --- a/arch/mn10300/kernel/mn10300-serial.c +++ b/arch/mn10300/kernel/mn10300-serial.c @@ -543,7 +543,7 @@ static void mn10300_serial_receive_interrupt(struct mn10300_serial_port *port) try_again: /* pull chars out of the hat */ - ix = ACCESS_ONCE(port->rx_outp); + ix = READ_ONCE(port->rx_outp); if (CIRC_CNT(port->rx_inp, ix, MNSC_BUFFER_SIZE) == 0) { if (push && !tport->low_latency) tty_flip_buffer_push(tport); @@ -1724,7 +1724,7 @@ static int mn10300_serial_poll_get_char(struct uart_port *_port) if (mn10300_serial_int_tbl[port->rx_irq].port != NULL) { do { /* pull chars out of the hat */ - ix = ACCESS_ONCE(port->rx_outp); + ix = READ_ONCE(port->rx_outp); if (CIRC_CNT(port->rx_inp, ix, MNSC_BUFFER_SIZE) == 0) return NO_POLL_CHAR; diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 17b98a87e5e2..c57d4e8307f2 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -260,7 +260,7 @@ atomic64_set(atomic64_t *v, s64 i) static __inline__ s64 atomic64_read(const atomic64_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } #define atomic64_inc(v) (atomic64_add( 1,(v))) diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index 7a9cde0cfbd1..acd3206dfae3 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -43,7 +43,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) if (!opal_memcons) return -ENODEV; - out_pos = be32_to_cpu(ACCESS_ONCE(opal_memcons->out_pos)); + out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos)); /* Now we've read out_pos, put a barrier in before reading the new * data it points to in conbuf. 
*/ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 9fa855f91e55..66f4160010ef 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -117,14 +117,14 @@ extern int _raw_write_trylock_retry(arch_rwlock_t *lp); static inline int arch_read_trylock_once(arch_rwlock_t *rw) { - int old = ACCESS_ONCE(rw->lock); + int old = READ_ONCE(rw->lock); return likely(old >= 0 && __atomic_cmpxchg_bool(&rw->lock, old, old + 1)); } static inline int arch_write_trylock_once(arch_rwlock_t *rw) { - int old = ACCESS_ONCE(rw->lock); + int old = READ_ONCE(rw->lock); return likely(old == 0 && __atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000)); } @@ -211,7 +211,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) int old; do { - old = ACCESS_ONCE(rw->lock); + old = READ_ONCE(rw->lock); } while (!__atomic_cmpxchg_bool(&rw->lock, old, old - 1)); } diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index b12663d653d8..34e30b9ea234 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -162,8 +162,8 @@ void _raw_read_lock_wait(arch_rwlock_t *rw) smp_yield_cpu(~owner); count = spin_retry; } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); + old = READ_ONCE(rw->lock); + owner = READ_ONCE(rw->owner); if (old < 0) continue; if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1)) @@ -178,7 +178,7 @@ int _raw_read_trylock_retry(arch_rwlock_t *rw) int old; while (count-- > 0) { - old = ACCESS_ONCE(rw->lock); + old = READ_ONCE(rw->lock); if (old < 0) continue; if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1)) @@ -202,8 +202,8 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, int prev) smp_yield_cpu(~owner); count = spin_retry; } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); + old = READ_ONCE(rw->lock); + owner = READ_ONCE(rw->owner); smp_mb(); if (old >= 0) { prev = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR); @@ -230,8 +230,8 @@ void _raw_write_lock_wait(arch_rwlock_t *rw) smp_yield_cpu(~owner); count = spin_retry; } - old = ACCESS_ONCE(rw->lock); - owner = ACCESS_ONCE(rw->owner); + old = READ_ONCE(rw->lock); + owner = READ_ONCE(rw->owner); if (old >= 0 && __atomic_cmpxchg_bool(&rw->lock, old, old | 0x80000000)) prev = old; @@ -251,7 +251,7 @@ int _raw_write_trylock_retry(arch_rwlock_t *rw) int old; while (count-- > 0) { - old = ACCESS_ONCE(rw->lock); + old = READ_ONCE(rw->lock); if (old) continue; if (__atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000)) diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 7643e979e333..e2f398e9456c 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -31,7 +31,7 @@ void atomic_set(atomic_t *, int); #define atomic_set_release(v, i) atomic_set((v), (i)) -#define atomic_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) #define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) #define atomic_sub(i, v) ((void)atomic_add_return(-(int)(i), (v))) diff --git a/arch/tile/gxio/dma_queue.c b/arch/tile/gxio/dma_queue.c index baa60357f8ba..b7ba577d82ca 100644 --- a/arch/tile/gxio/dma_queue.c +++ b/arch/tile/gxio/dma_queue.c @@ -163,14 +163,14 @@ int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue, int64_t completion_slot, int update) { if (update) { - if (ACCESS_ONCE(dma_queue->hw_complete_count) > + if (READ_ONCE(dma_queue->hw_complete_count) > completion_slot) return 1; __gxio_dma_queue_update_credits(dma_queue); } - return 
ACCESS_ONCE(dma_queue->hw_complete_count) > completion_slot; + return READ_ONCE(dma_queue->hw_complete_count) > completion_slot; } EXPORT_SYMBOL_GPL(__gxio_dma_queue_is_complete); diff --git a/arch/tile/include/gxio/dma_queue.h b/arch/tile/include/gxio/dma_queue.h index b9e45e37649e..c8fd47edba30 100644 --- a/arch/tile/include/gxio/dma_queue.h +++ b/arch/tile/include/gxio/dma_queue.h @@ -121,7 +121,7 @@ static inline int64_t __gxio_dma_queue_reserve(__gxio_dma_queue_t *dma_queue, * if the result is LESS than "hw_complete_count". */ uint64_t complete; - complete = ACCESS_ONCE(dma_queue->hw_complete_count); + complete = READ_ONCE(dma_queue->hw_complete_count); slot |= (complete & 0xffffffffff000000); if (slot < complete) slot += 0x1000000; diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c index e1a078e6828e..d516d61751c2 100644 --- a/arch/tile/kernel/ptrace.c +++ b/arch/tile/kernel/ptrace.c @@ -255,7 +255,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, int do_syscall_trace_enter(struct pt_regs *regs) { - u32 work = ACCESS_ONCE(current_thread_info()->flags); + u32 work = READ_ONCE(current_thread_info()->flags); if ((work & _TIF_SYSCALL_TRACE) && tracehook_report_syscall_entry(regs)) { diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03505ffbe1b6..eaa0ba66cf96 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -75,7 +75,7 @@ static long syscall_trace_enter(struct pt_regs *regs) if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; if (unlikely(work & _TIF_SYSCALL_EMU)) emulated = true; diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index fa8dbfcf7ed3..11b13c4b43d5 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -318,7 +318,7 @@ int gettimeofday(struct timeval *, struct timezone *) notrace time_t __vdso_time(time_t *t) { /* This is atomic on x86 so we don't need any locks. */ - time_t result = ACCESS_ONCE(gtod->wall_time_sec); + time_t result = READ_ONCE(gtod->wall_time_sec); if (t) *t = result; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 589af1eec7c1..140d33288e78 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2118,7 +2118,7 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } - if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + if (READ_ONCE(x86_pmu.attr_rdpmc)) event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; return err; diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 022e59714562..53dd162576a8 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -48,7 +48,7 @@ static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) unsigned ret; repeat: - ret = ACCESS_ONCE(s->seq); + ret = READ_ONCE(s->seq); if (unlikely(ret & 1)) { cpu_relax(); goto repeat; diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 9c4e7ba6870c..7d7715dde901 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -155,14 +155,14 @@ void init_espfix_ap(int cpu) page = cpu/ESPFIX_STACKS_PER_PAGE; /* Did another CPU already set this up? */ - stack_page = ACCESS_ONCE(espfix_pages[page]); + stack_page = READ_ONCE(espfix_pages[page]); if (likely(stack_page)) goto done; mutex_lock(&espfix_init_mutex); /* Did we race on the lock? 
*/ - stack_page = ACCESS_ONCE(espfix_pages[page]); + stack_page = READ_ONCE(espfix_pages[page]); if (stack_page) goto unlock_done; @@ -200,7 +200,7 @@ void init_espfix_ap(int cpu) set_pte(&pte_p[n*PTE_STRIDE], pte); /* Job is done for this CPU and any CPU which shares this page */ - ACCESS_ONCE(espfix_pages[page]) = stack_page; + WRITE_ONCE(espfix_pages[page], stack_page); unlock_done: mutex_unlock(&espfix_init_mutex); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 35aafc95e4b8..18bc9b51ac9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -105,7 +105,7 @@ static void nmi_max_handler(struct irq_work *w) { struct nmiaction *a = container_of(w, struct nmiaction, irq_work); int remainder_ns, decimal_msecs; - u64 whole_msecs = ACCESS_ONCE(a->max_duration); + u64 whole_msecs = READ_ONCE(a->max_duration); remainder_ns = do_div(whole_msecs, (1000 * 1000)); decimal_msecs = remainder_ns / 1000; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7a69cf053711..a119b361b8b7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -443,7 +443,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) static u64 __get_spte_lockless(u64 *sptep) { - return ACCESS_ONCE(*sptep); + return READ_ONCE(*sptep); } #else union split_spte { @@ -4819,7 +4819,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * If we don't have indirect shadow pages, it means no page is * write-protected, so we can exit simply. */ - if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; remote_flush = local_flush = false; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index ea67dc876316..01c1371f39f8 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -157,7 +157,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return false; index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); - return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); + return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } void kvm_page_track_cleanup(struct kvm *kvm) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 6083ba462f35..13b4f19b9131 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -547,7 +547,7 @@ int xen_alloc_p2m_entry(unsigned long pfn) if (p2m_top_mfn && pfn < MAX_P2M_PFN) { topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); + mid_mfn = READ_ONCE(p2m_top_mfn_p[topidx]); BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); diff --git a/arch/xtensa/platforms/xtfpga/lcd.c b/arch/xtensa/platforms/xtfpga/lcd.c index 4dc0c1b43f4b..2f7eb66c23ec 100644 --- a/arch/xtensa/platforms/xtfpga/lcd.c +++ b/arch/xtensa/platforms/xtfpga/lcd.c @@ -34,23 +34,23 @@ static void lcd_put_byte(u8 *addr, u8 data) { #ifdef CONFIG_XTFPGA_LCD_8BIT_ACCESS - ACCESS_ONCE(*addr) = data; + WRITE_ONCE(*addr, data); #else - ACCESS_ONCE(*addr) = data & 0xf0; - ACCESS_ONCE(*addr) = (data << 4) & 0xf0; + WRITE_ONCE(*addr, data & 0xf0); + WRITE_ONCE(*addr, (data << 4) & 0xf0); #endif } static int __init lcd_init(void) { - ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT; + WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT); mdelay(5); - ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT; + WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT); udelay(200); - ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT; + WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT); udelay(50); #ifndef CONFIG_XTFPGA_LCD_8BIT_ACCESS - 
ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE4BIT; + WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE4BIT); udelay(50); lcd_put_byte(LCD_INSTR_ADDR, LCD_DISPLAY_MODE4BIT); udelay(50); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6a9a0f03a67b..d822530e6aea 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -261,7 +261,7 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { - u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + u64 now, issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; diff --git a/drivers/base/core.c b/drivers/base/core.c index 12ebd055724c..4b8ba2a75a4d 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -668,7 +668,7 @@ const char *dev_driver_string(const struct device *dev) * so be careful about accessing it. dev->bus and dev->class should * never change once they are set, so they don't need special care. */ - drv = ACCESS_ONCE(dev->driver); + drv = READ_ONCE(dev->driver); return drv ? drv->name : (dev->bus ? dev->bus->name : (dev->class ? dev->class->name : "")); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 7bcf80fa9ada..41d7c2b99f69 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -134,11 +134,11 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev) if (!dev->power.use_autosuspend) goto out; - autosuspend_delay = ACCESS_ONCE(dev->power.autosuspend_delay); + autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay); if (autosuspend_delay < 0) goto out; - last_busy = ACCESS_ONCE(dev->power.last_busy); + last_busy = READ_ONCE(dev->power.last_busy); elapsed = jiffies - last_busy; if (elapsed < 0) goto out; /* jiffies has wrapped around. */ diff --git a/drivers/char/random.c b/drivers/char/random.c index 8ad92707e45f..6c7ccac2679e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -641,7 +641,7 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) return; retry: - entropy_count = orig = ACCESS_ONCE(r->entropy_count); + entropy_count = orig = READ_ONCE(r->entropy_count); if (nfrac < 0) { /* Debit */ entropy_count += nfrac; @@ -1265,7 +1265,7 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min, /* Can we pull enough? 
*/ retry: - entropy_count = orig = ACCESS_ONCE(r->entropy_count); + entropy_count = orig = READ_ONCE(r->entropy_count); ibytes = nbytes; /* never pull more than available */ have_bytes = entropy_count >> (ENTROPY_SHIFT + 3); diff --git a/drivers/clocksource/bcm2835_timer.c b/drivers/clocksource/bcm2835_timer.c index 39e489a96ad7..60da2537bef9 100644 --- a/drivers/clocksource/bcm2835_timer.c +++ b/drivers/clocksource/bcm2835_timer.c @@ -71,7 +71,7 @@ static irqreturn_t bcm2835_time_interrupt(int irq, void *dev_id) if (readl_relaxed(timer->control) & timer->match_mask) { writel_relaxed(timer->match_mask, timer->control); - event_handler = ACCESS_ONCE(timer->evt.event_handler); + event_handler = READ_ONCE(timer->evt.event_handler); if (event_handler) event_handler(&timer->evt); return IRQ_HANDLED; diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c index d258953ff488..f4f258075b89 100644 --- a/drivers/crypto/caam/jr.c +++ b/drivers/crypto/caam/jr.c @@ -172,7 +172,7 @@ static void caam_jr_dequeue(unsigned long devarg) while (rd_reg32(&jrp->rregs->outring_used)) { - head = ACCESS_ONCE(jrp->head); + head = READ_ONCE(jrp->head); spin_lock(&jrp->outlock); @@ -341,7 +341,7 @@ int caam_jr_enqueue(struct device *dev, u32 *desc, spin_lock_bh(&jrp->inplock); head = jrp->head; - tail = ACCESS_ONCE(jrp->tail); + tail = READ_ONCE(jrp->tail); if (!rd_reg32(&jrp->rregs->inpring_avail) || CIRC_SPACE(head, tail, JOBR_DEPTH) <= 0) { diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c index 874ddf5e9087..0f20f5ec9617 100644 --- a/drivers/crypto/nx/nx-842-powernv.c +++ b/drivers/crypto/nx/nx-842-powernv.c @@ -193,7 +193,7 @@ static int wait_for_csb(struct nx842_workmem *wmem, ktime_t start = wmem->start, now = ktime_get(); ktime_t timeout = ktime_add_ms(start, CSB_WAIT_MAX); - while (!(ACCESS_ONCE(csb->flags) & CSB_V)) { + while (!(READ_ONCE(csb->flags) & CSB_V)) { cpu_relax(); now = ktime_get(); if (ktime_after(now, timeout)) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 8bf89267dc25..ccf52368a073 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -734,7 +734,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx, __le16 res_count, next_res_count; i = ar_first_buffer_index(ctx); - res_count = ACCESS_ONCE(ctx->descriptors[i].res_count); + res_count = READ_ONCE(ctx->descriptors[i].res_count); /* A buffer that is not yet completely filled must be the last one. */ while (i != last && res_count == 0) { @@ -742,8 +742,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx, /* Peek at the next descriptor. */ next_i = ar_next_buffer_index(i); rmb(); /* read descriptors in order */ - next_res_count = ACCESS_ONCE( - ctx->descriptors[next_i].res_count); + next_res_count = READ_ONCE(ctx->descriptors[next_i].res_count); /* * If the next descriptor is still empty, we must stop at this * descriptor. 
@@ -759,8 +758,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx, if (MAX_AR_PACKET_SIZE > PAGE_SIZE && i != last) { next_i = ar_next_buffer_index(next_i); rmb(); - next_res_count = ACCESS_ONCE( - ctx->descriptors[next_i].res_count); + next_res_count = READ_ONCE(ctx->descriptors[next_i].res_count); if (next_res_count != cpu_to_le16(PAGE_SIZE)) goto next_buffer_is_active; } @@ -2812,7 +2810,7 @@ static int handle_ir_buffer_fill(struct context *context, u32 buffer_dma; req_count = le16_to_cpu(last->req_count); - res_count = le16_to_cpu(ACCESS_ONCE(last->res_count)); + res_count = le16_to_cpu(READ_ONCE(last->res_count)); completed = req_count - res_count; buffer_dma = le32_to_cpu(last->data_address); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 333bad749067..303b5e099a98 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -260,7 +260,7 @@ static void amdgpu_fence_fallback(unsigned long arg) */ int amdgpu_fence_wait_empty(struct amdgpu_ring *ring) { - uint64_t seq = ACCESS_ONCE(ring->fence_drv.sync_seq); + uint64_t seq = READ_ONCE(ring->fence_drv.sync_seq); struct dma_fence *fence, **ptr; int r; @@ -300,7 +300,7 @@ unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring) amdgpu_fence_process(ring); emitted = 0x100000000ull; emitted -= atomic_read(&ring->fence_drv.last_seq); - emitted += ACCESS_ONCE(ring->fence_drv.sync_seq); + emitted += READ_ONCE(ring->fence_drv.sync_seq); return lower_32_bits(emitted); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 7171968f261e..6149a47fe63d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -788,11 +788,11 @@ static int amdgpu_debugfs_gem_bo_info(int id, void *ptr, void *data) seq_printf(m, "\t0x%08x: %12ld byte %s", id, amdgpu_bo_size(bo), placement); - offset = ACCESS_ONCE(bo->tbo.mem.start); + offset = READ_ONCE(bo->tbo.mem.start); if (offset != AMDGPU_BO_INVALID_OFFSET) seq_printf(m, " @ 0x%010Lx", offset); - pin_count = ACCESS_ONCE(bo->pin_count); + pin_count = READ_ONCE(bo->pin_count); if (pin_count) seq_printf(m, " pin count %d", pin_count); seq_printf(m, "\n"); diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 38cea6fb25a8..a25f6c72f219 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -187,7 +187,7 @@ static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity) if (kfifo_is_empty(&entity->job_queue)) return false; - if (ACCESS_ONCE(entity->dependency)) + if (READ_ONCE(entity->dependency)) return false; return true; diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 3386452bd2f0..cf3deb283da5 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -451,7 +451,7 @@ int radeon_gem_busy_ioctl(struct drm_device *dev, void *data, else r = 0; - cur_placement = ACCESS_ONCE(robj->tbo.mem.mem_type); + cur_placement = READ_ONCE(robj->tbo.mem.mem_type); args->domain = radeon_mem_type_to_domain(cur_placement); drm_gem_object_put_unlocked(gobj); return r; @@ -481,7 +481,7 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data, r = ret; /* Flush HDP cache via MMIO if necessary */ - cur_placement = ACCESS_ONCE(robj->tbo.mem.mem_type); + cur_placement = READ_ONCE(robj->tbo.mem.mem_type); if 
(rdev->asic->mmio_hdp_flush && radeon_mem_type_to_domain(cur_placement) == RADEON_GEM_DOMAIN_VRAM) robj->rdev->asic->mmio_hdp_flush(rdev); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index a552e4ea5440..6ac094ee8983 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -904,7 +904,7 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv, if (unlikely(drm_is_render_client(file_priv))) require_exist = true; - if (ACCESS_ONCE(vmw_fpriv(file_priv)->locked_master)) { + if (READ_ONCE(vmw_fpriv(file_priv)->locked_master)) { DRM_ERROR("Locked master refused legacy " "surface reference.\n"); return -EACCES; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index d9a1e9893136..97bea2e1aa6a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -380,7 +380,7 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, if (sc->flags & SCF_FROZEN) { wait_event_interruptible_timeout( dd->event_queue, - !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN), + !(READ_ONCE(dd->flags) & HFI1_FROZEN), msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); if (dd->flags & HFI1_FROZEN) return -ENOLCK; diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 7108a4b5e94c..75e740780285 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1423,14 +1423,14 @@ retry: goto done; } /* copy from receiver cache line and recalculate */ - sc->alloc_free = ACCESS_ONCE(sc->free); + sc->alloc_free = READ_ONCE(sc->free); avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free); if (blocks > avail) { /* still no room, actively update */ sc_release_update(sc); - sc->alloc_free = ACCESS_ONCE(sc->free); + sc->alloc_free = READ_ONCE(sc->free); trycount++; goto retry; } @@ -1667,7 +1667,7 @@ void sc_release_update(struct send_context *sc) /* call sent buffer callbacks */ code = -1; /* code not yet set */ - head = ACCESS_ONCE(sc->sr_head); /* snapshot the head */ + head = READ_ONCE(sc->sr_head); /* snapshot the head */ tail = sc->sr_tail; while (head != tail) { pbuf = &sc->sr[tail].pbuf; diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index b3291f0fde9a..a7fc664f0d4e 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -363,7 +363,7 @@ static void ruc_loopback(struct rvt_qp *sqp) again: smp_read_barrier_depends(); /* see post_one_send() */ - if (sqp->s_last == ACCESS_ONCE(sqp->s_head)) + if (sqp->s_last == READ_ONCE(sqp->s_head)) goto clr_busy; wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 6781bcdb10b3..08346d25441c 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1725,7 +1725,7 @@ retry: swhead = sde->descq_head & sde->sdma_mask; /* this code is really bad for cache line trading */ - swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; cnt = sde->descq_cnt; if (swhead < swtail) @@ -1872,7 +1872,7 @@ retry: if ((status & sde->idle_mask) && !idle_check_done) { u16 swtail; - swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; if (swtail != hwhead) { hwhead = (u16)read_sde_csr(sde, SD(HEAD)); idle_check_done = 1; @@ -2222,7 +2222,7 @@ void sdma_seqfile_dump_sde(struct seq_file *s, struct 
sdma_engine *sde) u16 len; head = sde->descq_head & sde->sdma_mask; - tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + tail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; seq_printf(s, SDE_FMT, sde->this_idx, sde->cpu, sdma_state_name(sde->state.current_state), @@ -3305,7 +3305,7 @@ int sdma_ahg_alloc(struct sdma_engine *sde) return -EINVAL; } while (1) { - nr = ffz(ACCESS_ONCE(sde->ahg_bits)); + nr = ffz(READ_ONCE(sde->ahg_bits)); if (nr > 31) { trace_hfi1_ahg_allocate(sde, -ENOSPC); return -ENOSPC; diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 107011d8613b..374c59784950 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -445,7 +445,7 @@ static inline u16 sdma_descq_freecnt(struct sdma_engine *sde) { return sde->descq_cnt - (sde->descq_tail - - ACCESS_ONCE(sde->descq_head)) - 1; + READ_ONCE(sde->descq_head)) - 1; } static inline u16 sdma_descq_inprocess(struct sdma_engine *sde) diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 0b646173ca22..9a31c585427f 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -80,7 +80,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; /* We are in the error state, flush the work request. */ smp_read_barrier_depends(); /* see post_one_send() */ - if (qp->s_last == ACCESS_ONCE(qp->s_head)) + if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ if (iowait_sdma_pending(&priv->s_iowait)) { @@ -121,7 +121,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; /* Check if send work queue is empty. */ smp_read_barrier_depends(); /* see post_one_send() */ - if (qp->s_cur == ACCESS_ONCE(qp->s_head)) { + if (qp->s_cur == READ_ONCE(qp->s_head)) { clear_ahg(qp); goto bail; } diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 2ba74fdd6f15..7fec6b984e3e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -487,7 +487,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; /* We are in the error state, flush the work request. */ smp_read_barrier_depends(); /* see post_one_send */ - if (qp->s_last == ACCESS_ONCE(qp->s_head)) + if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ if (iowait_sdma_pending(&priv->s_iowait)) { @@ -501,7 +501,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* see post_one_send() */ smp_read_barrier_depends(); - if (qp->s_cur == ACCESS_ONCE(qp->s_head)) + if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; wqe = rvt_get_swqe_ptr(qp, qp->s_cur); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index c0c0e0445cbf..8ec6e8a8d6f7 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -276,7 +276,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, /* Wait until all requests have been freed. 
*/ wait_event_interruptible( pq->wait, - (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); + (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); kfree(pq->reqs); kfree(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); @@ -591,7 +591,7 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, if (ret != -EBUSY) { req->status = ret; WRITE_ONCE(req->has_error, 1); - if (ACCESS_ONCE(req->seqcomp) == + if (READ_ONCE(req->seqcomp) == req->seqsubmitted - 1) goto free_req; return ret; @@ -825,7 +825,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) */ if (req->data_len) { iovec = &req->iovs[req->iov_idx]; - if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) { + if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { if (++req->iov_idx == req->data_iovs) { ret = -EFAULT; goto free_txreq; @@ -1390,7 +1390,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) } else { if (status != SDMA_TXREQ_S_OK) req->status = status; - if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) && + if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) && (READ_ONCE(req->done) || READ_ONCE(req->has_error))) { user_sdma_free_request(req, false); diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 53efbb0b40c4..9a37e844d4c8 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -368,7 +368,7 @@ static void qib_ruc_loopback(struct rvt_qp *sqp) again: smp_read_barrier_depends(); /* see post_one_send() */ - if (sqp->s_last == ACCESS_ONCE(sqp->s_head)) + if (sqp->s_last == READ_ONCE(sqp->s_head)) goto clr_busy; wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 498e2202e72c..bddcc37ace44 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -61,7 +61,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; /* We are in the error state, flush the work request. */ smp_read_barrier_depends(); /* see post_one_send() */ - if (qp->s_last == ACCESS_ONCE(qp->s_head)) + if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ if (atomic_read(&priv->s_dma_busy)) { @@ -91,7 +91,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; /* Check if send work queue is empty. */ smp_read_barrier_depends(); /* see post_one_send() */ - if (qp->s_cur == ACCESS_ONCE(qp->s_head)) + if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; /* * Start a new request. diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index be4907453ac4..15962ed193ce 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -253,7 +253,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) goto bail; /* We are in the error state, flush the work request. */ smp_read_barrier_depends(); /* see post_one_send */ - if (qp->s_last == ACCESS_ONCE(qp->s_head)) + if (qp->s_last == READ_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. 
*/ if (atomic_read(&priv->s_dma_busy)) { @@ -267,7 +267,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) /* see post_one_send() */ smp_read_barrier_depends(); - if (qp->s_cur == ACCESS_ONCE(qp->s_head)) + if (qp->s_cur == READ_ONCE(qp->s_head)) goto bail; wqe = rvt_get_swqe_ptr(qp, qp->s_cur); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 22df09ae809e..b670cb9d2006 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1073,7 +1073,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) rdi->driver_f.notify_error_qp(qp); /* Schedule the sending tasklet to drain the send work queue. */ - if (ACCESS_ONCE(qp->s_last) != qp->s_head) + if (READ_ONCE(qp->s_last) != qp->s_head) rdi->driver_f.schedule_send(qp); rvt_clear_mr_refs(qp, 0); @@ -1686,7 +1686,7 @@ static inline int rvt_qp_is_avail( if (likely(qp->s_avail)) return 0; smp_read_barrier_depends(); /* see rc.c */ - slast = ACCESS_ONCE(qp->s_last); + slast = READ_ONCE(qp->s_last); if (qp->s_head >= slast) avail = qp->s_size - (qp->s_head - slast); else @@ -1917,7 +1917,7 @@ int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, * ahead and kick the send engine into gear. Otherwise we will always * just schedule the send to happen later. */ - call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next; + call_send = qp->s_head == READ_ONCE(qp->s_last) && !wr->next; for (; wr; wr = wr->next) { err = rvt_post_one_wr(qp, wr, &call_send); diff --git a/drivers/input/misc/regulator-haptic.c b/drivers/input/misc/regulator-haptic.c index 2e8f801932be..a1db1e5040dc 100644 --- a/drivers/input/misc/regulator-haptic.c +++ b/drivers/input/misc/regulator-haptic.c @@ -233,7 +233,7 @@ static int __maybe_unused regulator_haptic_resume(struct device *dev) haptic->suspended = false; - magnitude = ACCESS_ONCE(haptic->magnitude); + magnitude = READ_ONCE(haptic->magnitude); if (magnitude) regulator_haptic_set_voltage(haptic, magnitude); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index d216a8f7bc22..33bb074d6941 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -347,7 +347,7 @@ static void __cache_size_refresh(void) BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); BUG_ON(dm_bufio_client_count < 0); - dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size); + dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size); /* * Use default if set to 0 and report the actual cache size used. 
@@ -960,7 +960,7 @@ static void __get_memory_limit(struct dm_bufio_client *c, { unsigned long buffers; - if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { + if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { if (mutex_trylock(&dm_bufio_clients_lock)) { __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); @@ -1600,7 +1600,7 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); } @@ -1647,7 +1647,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); - return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]); + return READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]); } /* @@ -1818,7 +1818,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); static unsigned get_max_age_hz(void) { - unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); + unsigned max_age = READ_ONCE(dm_bufio_max_age); if (max_age > UINT_MAX / HZ) max_age = UINT_MAX / HZ; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index cf2c67e35eaf..eb45cc3df31d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -107,7 +107,7 @@ static void io_job_start(struct dm_kcopyd_throttle *t) try_again: spin_lock_irq(&throttle_spinlock); - throttle = ACCESS_ONCE(t->throttle); + throttle = READ_ONCE(t->throttle); if (likely(throttle >= 100)) goto skip_limit; @@ -157,7 +157,7 @@ static void io_job_finish(struct dm_kcopyd_throttle *t) t->num_io_jobs--; - if (likely(ACCESS_ONCE(t->throttle) >= 100)) + if (likely(READ_ONCE(t->throttle) >= 100)) goto skip_limit; if (!t->num_io_jobs) { diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 6028d8247f58..a1a5eec783cc 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -431,7 +431,7 @@ do_sync_free: synchronize_rcu_expedited(); dm_stat_free(&s->rcu_head); } else { - ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; + WRITE_ONCE(dm_stat_need_rcu_barrier, 1); call_rcu(&s->rcu_head, dm_stat_free); } return 0; @@ -639,12 +639,12 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, */ last = raw_cpu_ptr(stats->last); stats_aux->merged = - (bi_sector == (ACCESS_ONCE(last->last_sector) && + (bi_sector == (READ_ONCE(last->last_sector) && ((bi_rw == WRITE) == - (ACCESS_ONCE(last->last_rw) == WRITE)) + (READ_ONCE(last->last_rw) == WRITE)) )); - ACCESS_ONCE(last->last_sector) = end_sector; - ACCESS_ONCE(last->last_rw) = bi_rw; + WRITE_ONCE(last->last_sector, end_sector); + WRITE_ONCE(last->last_rw, bi_rw); } rcu_read_lock(); @@ -693,22 +693,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared for_each_possible_cpu(cpu) { p = &s->stat_percpu[cpu][x]; - shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); - shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); - shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); - shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); - shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); - shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); - shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); - shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); - 
shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); - shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); - shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); - shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); + shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]); + shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]); + shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]); + shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]); + shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]); + shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]); + shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]); + shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]); + shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]); + shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]); + shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total); + shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue); if (s->n_histogram_entries) { unsigned i; for (i = 0; i < s->n_histogram_entries + 1; i++) - shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); + shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]); } } } diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 4c8de1ff78ca..8d0ba879777e 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -144,7 +144,7 @@ static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long switch_get_position(sctx, region_nr, ®ion_index, &bit); - return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + return (READ_ONCE(sctx->region_table[region_index]) >> bit) & ((1 << sctx->region_table_entry_bits) - 1); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 1e25705209c2..89e5dff9b4cf 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2431,7 +2431,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) struct pool_c *pt = pool->ti->private; bool needs_check = dm_pool_metadata_needs_check(pool->pmd); enum pool_mode old_mode = get_pool_mode(pool); - unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ; + unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ; /* * Never allow the pool to transition to PM_WRITE mode if user diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index bda3caca23ca..fba93237a780 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -589,7 +589,7 @@ static void verity_prefetch_io(struct work_struct *work) verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { - unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); + unsigned cluster = READ_ONCE(dm_verity_prefetch_cluster); cluster >>= v->data_dev_block_bits; if (unlikely(!cluster)) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4be85324f44d..8aaffa19b29a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -114,7 +114,7 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; static int __dm_get_module_param_int(int *module_param, int min, int max) { - int param = ACCESS_ONCE(*module_param); + int param = READ_ONCE(*module_param); int modified_param = 0; bool modified = true; @@ -136,7 +136,7 @@ static int __dm_get_module_param_int(int *module_param, int min, int max) unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { - unsigned param = ACCESS_ONCE(*module_param); + unsigned param = 
READ_ONCE(*module_param); unsigned modified_param = 0; if (!param) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0ff1bbf6c90e..447ddcbc9566 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2651,7 +2651,7 @@ state_show(struct md_rdev *rdev, char *page) { char *sep = ","; size_t len = 0; - unsigned long flags = ACCESS_ONCE(rdev->flags); + unsigned long flags = READ_ONCE(rdev->flags); if (test_bit(Faulty, &flags) || (!test_bit(ExternalBbl, &flags) && diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 928e24a07133..7d9a50eed9db 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6072,7 +6072,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n */ rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; diff --git a/drivers/misc/mic/scif/scif_rb.c b/drivers/misc/mic/scif/scif_rb.c index 637cc4686742..b665757ca89a 100644 --- a/drivers/misc/mic/scif/scif_rb.c +++ b/drivers/misc/mic/scif/scif_rb.c @@ -138,7 +138,7 @@ void scif_rb_commit(struct scif_rb *rb) * the read barrier in scif_rb_count(..) */ wmb(); - ACCESS_ONCE(*rb->write_ptr) = rb->current_write_offset; + WRITE_ONCE(*rb->write_ptr, rb->current_write_offset); #ifdef CONFIG_INTEL_MIC_CARD /* * X100 Si bug: For the case where a Core is performing an EXT_WR @@ -147,7 +147,7 @@ void scif_rb_commit(struct scif_rb *rb) * This way, if ordering is violated for the Interrupt Message, it will * fall just behind the first Posted associated with the first EXT_WR. */ - ACCESS_ONCE(*rb->write_ptr) = rb->current_write_offset; + WRITE_ONCE(*rb->write_ptr, rb->current_write_offset); #endif } @@ -210,7 +210,7 @@ void scif_rb_update_read_ptr(struct scif_rb *rb) * scif_rb_space(..) */ mb(); - ACCESS_ONCE(*rb->read_ptr) = new_offset; + WRITE_ONCE(*rb->read_ptr, new_offset); #ifdef CONFIG_INTEL_MIC_CARD /* * X100 Si Bug: For the case where a Core is performing an EXT_WR @@ -219,7 +219,7 @@ void scif_rb_update_read_ptr(struct scif_rb *rb) * This way, if ordering is violated for the Interrupt Message, it will * fall just behind the first Posted associated with the first EXT_WR. */ - ACCESS_ONCE(*rb->read_ptr) = new_offset; + WRITE_ONCE(*rb->read_ptr, new_offset); #endif } diff --git a/drivers/misc/mic/scif/scif_rma_list.c b/drivers/misc/mic/scif/scif_rma_list.c index e1ef8daedd5a..a036dbb4101e 100644 --- a/drivers/misc/mic/scif/scif_rma_list.c +++ b/drivers/misc/mic/scif/scif_rma_list.c @@ -277,7 +277,7 @@ retry: * Need to restart list traversal if there has been * an asynchronous list entry deletion. */ - if (ACCESS_ONCE(ep->rma_info.async_list_del)) + if (READ_ONCE(ep->rma_info.async_list_del)) goto retry; } mutex_unlock(&ep->rma_info.rma_lock); diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index c02cc817a490..1ed9529e7bd1 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1378,7 +1378,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) unsigned int count; slaves = rcu_dereference(bond->slave_arr); - count = slaves ? ACCESS_ONCE(slaves->count) : 0; + count = slaves ? 
READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[hash_index % count]; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c99dc59d729b..af51b90cecbb 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1167,7 +1167,7 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) slave = bond_slave_get_rcu(skb->dev); bond = slave->bond; - recv_probe = ACCESS_ONCE(bond->recv_probe); + recv_probe = READ_ONCE(bond->recv_probe); if (recv_probe) { ret = recv_probe(skb, bond, slave); if (ret == RX_HANDLER_CONSUMED) { @@ -3810,7 +3810,7 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev else bond_xmit_slave_id(bond, skb, 0); } else { - int slave_cnt = ACCESS_ONCE(bond->slave_cnt); + int slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { slave_id = bond_rr_gen_slave_id(bond); @@ -3972,7 +3972,7 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev) unsigned int count; slaves = rcu_dereference(bond->slave_arr); - count = slaves ? ACCESS_ONCE(slaves->count) : 0; + count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) { slave = slaves->arr[bond_xmit_hash(bond, skb) % count]; bond_dev_queue_xmit(bond, skb, slave->dev); diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 4ef68f69b58c..43f52a8fe708 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -405,7 +405,7 @@ void free_tx_desc(struct adapter *adap, struct sge_txq *q, */ static inline int reclaimable(const struct sge_txq *q) { - int hw_cidx = ntohs(ACCESS_ONCE(q->stat->cidx)); + int hw_cidx = ntohs(READ_ONCE(q->stat->cidx)); hw_cidx -= q->cidx; return hw_cidx < 0 ? 
hw_cidx + q->size : hw_cidx; } @@ -1375,7 +1375,7 @@ out_free: dev_kfree_skb_any(skb); */ static inline void reclaim_completed_tx_imm(struct sge_txq *q) { - int hw_cidx = ntohs(ACCESS_ONCE(q->stat->cidx)); + int hw_cidx = ntohs(READ_ONCE(q->stat->cidx)); int reclaim = hw_cidx - q->cidx; if (reclaim < 0) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 0e3d9f39a807..c6e859a27ee6 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -605,7 +605,7 @@ static void accumulate_16bit_val(u32 *acc, u16 val) if (wrapped) newacc += 65536; - ACCESS_ONCE(*acc) = newacc; + WRITE_ONCE(*acc, newacc); } static void populate_erx_stats(struct be_adapter *adapter, diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index 0cec06bec63e..340e28211135 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -373,7 +373,7 @@ static int hip04_tx_reclaim(struct net_device *ndev, bool force) unsigned int count; smp_rmb(); - count = tx_count(ACCESS_ONCE(priv->tx_head), tx_tail); + count = tx_count(READ_ONCE(priv->tx_head), tx_tail); if (count == 0) goto out; @@ -431,7 +431,7 @@ static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev) dma_addr_t phys; smp_rmb(); - count = tx_count(tx_head, ACCESS_ONCE(priv->tx_tail)); + count = tx_count(tx_head, READ_ONCE(priv->tx_tail)); if (count == (TX_DESC_NUM - 1)) { netif_stop_queue(ndev); return NETDEV_TX_BUSY; diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 8f326f87a815..2cb9539c931e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -264,7 +264,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) vsi->rx_buf_failed, vsi->rx_page_failed); rcu_read_lock(); for (i = 0; i < vsi->num_queue_pairs; i++) { - struct i40e_ring *rx_ring = ACCESS_ONCE(vsi->rx_rings[i]); + struct i40e_ring *rx_ring = READ_ONCE(vsi->rx_rings[i]); if (!rx_ring) continue; @@ -320,7 +320,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) ITR_IS_DYNAMIC(rx_ring->rx_itr_setting) ? 
"dynamic" : "fixed"); } for (i = 0; i < vsi->num_queue_pairs; i++) { - struct i40e_ring *tx_ring = ACCESS_ONCE(vsi->tx_rings[i]); + struct i40e_ring *tx_ring = READ_ONCE(vsi->tx_rings[i]); if (!tx_ring) continue; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 05e89864f781..e9e04a485e0a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1570,7 +1570,7 @@ static void i40e_get_ethtool_stats(struct net_device *netdev, } rcu_read_lock(); for (j = 0; j < vsi->num_queue_pairs; j++) { - tx_ring = ACCESS_ONCE(vsi->tx_rings[j]); + tx_ring = READ_ONCE(vsi->tx_rings[j]); if (!tx_ring) continue; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6498da8806cb..de1fcac7834d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -455,7 +455,7 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev, u64 bytes, packets; unsigned int start; - tx_ring = ACCESS_ONCE(vsi->tx_rings[i]); + tx_ring = READ_ONCE(vsi->tx_rings[i]); if (!tx_ring) continue; i40e_get_netdev_stats_struct_tx(tx_ring, stats); @@ -791,7 +791,7 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) rcu_read_lock(); for (q = 0; q < vsi->num_queue_pairs; q++) { /* locate Tx ring */ - p = ACCESS_ONCE(vsi->tx_rings[q]); + p = READ_ONCE(vsi->tx_rings[q]); do { start = u64_stats_fetch_begin_irq(&p->syncp); diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index d8456c381c99..97381238eb7c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -130,7 +130,7 @@ static int i40e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) } smp_mb(); /* Force any pending update before accessing. */ - adj = ACCESS_ONCE(pf->ptp_base_adj); + adj = READ_ONCE(pf->ptp_base_adj); freq = adj; freq *= ppb; @@ -499,7 +499,7 @@ void i40e_ptp_set_increment(struct i40e_pf *pf) wr32(hw, I40E_PRTTSYN_INC_H, incval >> 32); /* Update the base adjustement value. */ - ACCESS_ONCE(pf->ptp_base_adj) = incval; + WRITE_ONCE(pf->ptp_base_adj, incval); smp_mb(); /* Force the above update. 
*/ } diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h index 58adbf234e07..31a3f09df9f7 100644 --- a/drivers/net/ethernet/intel/igb/e1000_regs.h +++ b/drivers/net/ethernet/intel/igb/e1000_regs.h @@ -375,7 +375,7 @@ u32 igb_rd32(struct e1000_hw *hw, u32 reg); /* write operations, indexed using DWORDS */ #define wr32(reg, val) \ do { \ - u8 __iomem *hw_addr = ACCESS_ONCE((hw)->hw_addr); \ + u8 __iomem *hw_addr = READ_ONCE((hw)->hw_addr); \ if (!E1000_REMOVED(hw_addr)) \ writel((val), &hw_addr[(reg)]); \ } while (0) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index fd4a46b03cc8..6bccc2be2b91 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -750,7 +750,7 @@ static void igb_cache_ring_register(struct igb_adapter *adapter) u32 igb_rd32(struct e1000_hw *hw, u32 reg) { struct igb_adapter *igb = container_of(hw, struct igb_adapter, hw); - u8 __iomem *hw_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *hw_addr = READ_ONCE(hw->hw_addr); u32 value = 0; if (E1000_REMOVED(hw_addr)) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h index e083732adf64..a01409e2e06c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h @@ -161,7 +161,7 @@ static inline bool ixgbe_removed(void __iomem *addr) static inline void ixgbe_write_reg(struct ixgbe_hw *hw, u32 reg, u32 value) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); if (ixgbe_removed(reg_addr)) return; @@ -180,7 +180,7 @@ static inline void writeq(u64 val, void __iomem *addr) static inline void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); if (ixgbe_removed(reg_addr)) return; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4d76afd13868..2224e691ee07 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -380,7 +380,7 @@ static void ixgbe_check_remove(struct ixgbe_hw *hw, u32 reg) */ u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); u32 value; if (ixgbe_removed(reg_addr)) @@ -8630,7 +8630,7 @@ static void ixgbe_get_stats64(struct net_device *netdev, rcu_read_lock(); for (i = 0; i < adapter->num_rx_queues; i++) { - struct ixgbe_ring *ring = ACCESS_ONCE(adapter->rx_ring[i]); + struct ixgbe_ring *ring = READ_ONCE(adapter->rx_ring[i]); u64 bytes, packets; unsigned int start; @@ -8646,12 +8646,12 @@ static void ixgbe_get_stats64(struct net_device *netdev, } for (i = 0; i < adapter->num_tx_queues; i++) { - struct ixgbe_ring *ring = ACCESS_ONCE(adapter->tx_ring[i]); + struct ixgbe_ring *ring = READ_ONCE(adapter->tx_ring[i]); ixgbe_get_ring_stats64(stats, ring); } for (i = 0; i < adapter->num_xdp_queues; i++) { - struct ixgbe_ring *ring = ACCESS_ONCE(adapter->xdp_ring[i]); + struct ixgbe_ring *ring = READ_ONCE(adapter->xdp_ring[i]); ixgbe_get_ring_stats64(stats, ring); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index 86d6924a2b71..ae312c45696a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -378,7 +378,7 @@ static int ixgbe_ptp_adjfreq_82599(struct ptp_clock_info *ptp, s32 ppb) } smp_mb(); - incval = ACCESS_ONCE(adapter->base_incval); + incval = READ_ONCE(adapter->base_incval); freq = incval; freq *= ppb; @@ -1159,7 +1159,7 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter) } /* update the base incval used to calculate frequency adjustment */ - ACCESS_ONCE(adapter->base_incval) = incval; + WRITE_ONCE(adapter->base_incval, incval); smp_mb(); /* need lock to prevent incorrect read while modifying cyclecounter */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 032f8ac06357..cacb30682434 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -164,7 +164,7 @@ static void ixgbevf_check_remove(struct ixgbe_hw *hw, u32 reg) u32 ixgbevf_read_reg(struct ixgbe_hw *hw, u32 reg) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); u32 value; if (IXGBE_REMOVED(reg_addr)) diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.h b/drivers/net/ethernet/intel/ixgbevf/vf.h index 04d8d4ee4f04..c651fefcc3d2 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.h +++ b/drivers/net/ethernet/intel/ixgbevf/vf.h @@ -182,7 +182,7 @@ struct ixgbevf_info { static inline void ixgbe_write_reg(struct ixgbe_hw *hw, u32 reg, u32 value) { - u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr); + u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr); if (IXGBE_REMOVED(reg_addr)) return; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 8a32a8f7f9c0..3541a7f9d12e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -414,8 +414,8 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, index = cons_index & size_mask; cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; - last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb); - ring_cons = ACCESS_ONCE(ring->cons); + last_nr_txbb = READ_ONCE(ring->last_nr_txbb); + ring_cons = READ_ONCE(ring->cons); ring_index = ring_cons & size_mask; stamp_index = ring_index; @@ -479,8 +479,8 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, wmb(); /* we want to dirty this cache line once */ - ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb; - ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped; + WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb); + WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped); if (cq->type == TX_XDP) return done < budget; @@ -858,7 +858,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_drop; /* fetch ring->cons far ahead before needing it to avoid stall */ - ring_cons = ACCESS_ONCE(ring->cons); + ring_cons = READ_ONCE(ring->cons); real_size = get_real_size(skb, shinfo, dev, &lso_header_size, &inline_ok, &fragptr); @@ -1066,7 +1066,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) */ smp_rmb(); - ring_cons = ACCESS_ONCE(ring->cons); + ring_cons = READ_ONCE(ring->cons); if (unlikely(!mlx4_en_is_tx_ring_full(ring))) { netif_tx_wake_queue(ring->tx_queue); ring->wake_queue++; diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c index 50ea69d88480..5dd5f61e1114 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-main.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c @@ -2629,7 +2629,7 @@ static void 
vxge_poll_vp_lockup(unsigned long data) ring = &vdev->vpaths[i].ring; /* Truncated to machine word size number of frames */ - rx_frms = ACCESS_ONCE(ring->stats.rx_frms); + rx_frms = READ_ONCE(ring->stats.rx_frms); /* Did this vpath received any packets */ if (ring->stats.prev_rx_frms == rx_frms) { diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 13f72f5b18d2..a95a46bcd339 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -2073,7 +2073,7 @@ static irqreturn_t efx_ef10_msi_interrupt(int irq, void *dev_id) netif_vdbg(efx, intr, efx->net_dev, "IRQ %d on CPU %d\n", irq, raw_smp_processor_id()); - if (likely(ACCESS_ONCE(efx->irq_soft_enabled))) { + if (likely(READ_ONCE(efx->irq_soft_enabled))) { /* Note test interrupts */ if (context->index == efx->irq_level) efx->last_irq_cpu = raw_smp_processor_id(); @@ -2088,7 +2088,7 @@ static irqreturn_t efx_ef10_msi_interrupt(int irq, void *dev_id) static irqreturn_t efx_ef10_legacy_interrupt(int irq, void *dev_id) { struct efx_nic *efx = dev_id; - bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled); + bool soft_enabled = READ_ONCE(efx->irq_soft_enabled); struct efx_channel *channel; efx_dword_t reg; u32 queues; @@ -3291,7 +3291,7 @@ static int efx_ef10_handle_rx_event(struct efx_channel *channel, bool rx_cont; u16 flags = 0; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return 0; /* Basic packet information */ @@ -3428,7 +3428,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) unsigned int tx_ev_q_label; int tx_descs = 0; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return 0; if (unlikely(EFX_QWORD_FIELD(*event, ESF_DZ_TX_DROP_EVENT))) @@ -5316,7 +5316,7 @@ static void efx_ef10_filter_remove_old(struct efx_nic *efx) int i; for (i = 0; i < HUNT_FILTER_TBL_ROWS; i++) { - if (ACCESS_ONCE(table->entry[i].spec) & + if (READ_ONCE(table->entry[i].spec) & EFX_EF10_FILTER_FLAG_AUTO_OLD) { rc = efx_ef10_filter_remove_internal(efx, 1U << EFX_FILTER_PRI_AUTO, i, true); diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index b9cb697b2818..016616a63880 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2809,7 +2809,7 @@ static void efx_reset_work(struct work_struct *data) unsigned long pending; enum reset_type method; - pending = ACCESS_ONCE(efx->reset_pending); + pending = READ_ONCE(efx->reset_pending); method = fls(pending) - 1; if (method == RESET_TYPE_MC_BIST) @@ -2874,7 +2874,7 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type) /* If we're not READY then just leave the flags set as the cue * to abort probing or reschedule the reset later. 
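(The conversion just below turns the check into a single snapshot of efx->state, so the comparison cannot be split across two loads of a field another thread may change. A sketch of the snapshot-then-branch idiom, with a hypothetical state enum:

    #include <linux/compiler.h>

    enum dev_state { ST_READY, ST_DISABLED, ST_RECOVERING };

    static bool may_schedule_reset(const enum dev_state *state)
    {
    	/* one volatile load; the branch sees one coherent value */
    	return READ_ONCE(*state) == ST_READY;
    }
)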
*/ - if (ACCESS_ONCE(efx->state) != STATE_READY) + if (READ_ONCE(efx->state) != STATE_READY) return; /* efx_process_channel() will no longer read events once a diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index 29614da91cbf..7263275fde4a 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -2545,7 +2545,7 @@ static void ef4_reset_work(struct work_struct *data) unsigned long pending; enum reset_type method; - pending = ACCESS_ONCE(efx->reset_pending); + pending = READ_ONCE(efx->reset_pending); method = fls(pending) - 1; if ((method == RESET_TYPE_RECOVER_OR_DISABLE || @@ -2605,7 +2605,7 @@ void ef4_schedule_reset(struct ef4_nic *efx, enum reset_type type) /* If we're not READY then just leave the flags set as the cue * to abort probing or reschedule the reset later. */ - if (ACCESS_ONCE(efx->state) != STATE_READY) + if (READ_ONCE(efx->state) != STATE_READY) return; queue_work(reset_workqueue, &efx->reset_work); diff --git a/drivers/net/ethernet/sfc/falcon/falcon.c b/drivers/net/ethernet/sfc/falcon/falcon.c index 93c713c1f627..cd8bb472d758 100644 --- a/drivers/net/ethernet/sfc/falcon/falcon.c +++ b/drivers/net/ethernet/sfc/falcon/falcon.c @@ -452,7 +452,7 @@ static irqreturn_t falcon_legacy_interrupt_a1(int irq, void *dev_id) "IRQ %d on CPU %d status " EF4_OWORD_FMT "\n", irq, raw_smp_processor_id(), EF4_OWORD_VAL(*int_ker)); - if (!likely(ACCESS_ONCE(efx->irq_soft_enabled))) + if (!likely(READ_ONCE(efx->irq_soft_enabled))) return IRQ_HANDLED; /* Check to see if we have a serious error condition */ @@ -1372,7 +1372,7 @@ static void falcon_reconfigure_mac_wrapper(struct ef4_nic *efx) ef4_oword_t reg; int link_speed, isolate; - isolate = !!ACCESS_ONCE(efx->reset_pending); + isolate = !!READ_ONCE(efx->reset_pending); switch (link_state->speed) { case 10000: link_speed = 3; break; diff --git a/drivers/net/ethernet/sfc/falcon/farch.c b/drivers/net/ethernet/sfc/falcon/farch.c index 05916c710d8c..494884f6af4a 100644 --- a/drivers/net/ethernet/sfc/falcon/farch.c +++ b/drivers/net/ethernet/sfc/falcon/farch.c @@ -834,7 +834,7 @@ ef4_farch_handle_tx_event(struct ef4_channel *channel, ef4_qword_t *event) struct ef4_nic *efx = channel->efx; int tx_packets = 0; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return 0; if (likely(EF4_QWORD_FIELD(*event, FSF_AZ_TX_EV_COMP))) { @@ -990,7 +990,7 @@ ef4_farch_handle_rx_event(struct ef4_channel *channel, const ef4_qword_t *event) struct ef4_rx_queue *rx_queue; struct ef4_nic *efx = channel->efx; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return; rx_ev_cont = EF4_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT); @@ -1504,7 +1504,7 @@ irqreturn_t ef4_farch_fatal_interrupt(struct ef4_nic *efx) irqreturn_t ef4_farch_legacy_interrupt(int irq, void *dev_id) { struct ef4_nic *efx = dev_id; - bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled); + bool soft_enabled = READ_ONCE(efx->irq_soft_enabled); ef4_oword_t *int_ker = efx->irq_status.addr; irqreturn_t result = IRQ_NONE; struct ef4_channel *channel; @@ -1596,7 +1596,7 @@ irqreturn_t ef4_farch_msi_interrupt(int irq, void *dev_id) "IRQ %d on CPU %d status " EF4_OWORD_FMT "\n", irq, raw_smp_processor_id(), EF4_OWORD_VAL(*int_ker)); - if (!likely(ACCESS_ONCE(efx->irq_soft_enabled))) + if (!likely(READ_ONCE(efx->irq_soft_enabled))) return IRQ_HANDLED; /* Handle non-event-queue sources */ diff --git 
a/drivers/net/ethernet/sfc/falcon/nic.h b/drivers/net/ethernet/sfc/falcon/nic.h index a4c4592f6023..54ca457cdb15 100644 --- a/drivers/net/ethernet/sfc/falcon/nic.h +++ b/drivers/net/ethernet/sfc/falcon/nic.h @@ -83,7 +83,7 @@ static inline struct ef4_tx_queue *ef4_tx_queue_partner(struct ef4_tx_queue *tx_ static inline bool __ef4_nic_tx_is_empty(struct ef4_tx_queue *tx_queue, unsigned int write_count) { - unsigned int empty_read_count = ACCESS_ONCE(tx_queue->empty_read_count); + unsigned int empty_read_count = READ_ONCE(tx_queue->empty_read_count); if (empty_read_count == 0) return false; @@ -464,11 +464,11 @@ irqreturn_t ef4_farch_fatal_interrupt(struct ef4_nic *efx); static inline int ef4_nic_event_test_irq_cpu(struct ef4_channel *channel) { - return ACCESS_ONCE(channel->event_test_cpu); + return READ_ONCE(channel->event_test_cpu); } static inline int ef4_nic_irq_test_irq_cpu(struct ef4_nic *efx) { - return ACCESS_ONCE(efx->last_irq_cpu); + return READ_ONCE(efx->last_irq_cpu); } /* Global Resources */ diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index 6a75f4140a4b..6486814e97dc 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -134,8 +134,8 @@ static void ef4_tx_maybe_stop_queue(struct ef4_tx_queue *txq1) */ netif_tx_stop_queue(txq1->core_txq); smp_mb(); - txq1->old_read_count = ACCESS_ONCE(txq1->read_count); - txq2->old_read_count = ACCESS_ONCE(txq2->read_count); + txq1->old_read_count = READ_ONCE(txq1->read_count); + txq2->old_read_count = READ_ONCE(txq2->read_count); fill_level = max(txq1->insert_count - txq1->old_read_count, txq2->insert_count - txq2->old_read_count); @@ -524,7 +524,7 @@ void ef4_xmit_done(struct ef4_tx_queue *tx_queue, unsigned int index) /* Check whether the hardware queue is now empty */ if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) { - tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count); + tx_queue->old_write_count = READ_ONCE(tx_queue->write_count); if (tx_queue->read_count == tx_queue->old_write_count) { smp_mb(); tx_queue->empty_read_count = diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c index ba45150f53c7..86454d25a405 100644 --- a/drivers/net/ethernet/sfc/farch.c +++ b/drivers/net/ethernet/sfc/farch.c @@ -827,7 +827,7 @@ efx_farch_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) struct efx_nic *efx = channel->efx; int tx_packets = 0; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return 0; if (likely(EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_COMP))) { @@ -979,7 +979,7 @@ efx_farch_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event) struct efx_rx_queue *rx_queue; struct efx_nic *efx = channel->efx; - if (unlikely(ACCESS_ONCE(efx->reset_pending))) + if (unlikely(READ_ONCE(efx->reset_pending))) return; rx_ev_cont = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT); @@ -1520,7 +1520,7 @@ irqreturn_t efx_farch_fatal_interrupt(struct efx_nic *efx) irqreturn_t efx_farch_legacy_interrupt(int irq, void *dev_id) { struct efx_nic *efx = dev_id; - bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled); + bool soft_enabled = READ_ONCE(efx->irq_soft_enabled); efx_oword_t *int_ker = efx->irq_status.addr; irqreturn_t result = IRQ_NONE; struct efx_channel *channel; @@ -1612,7 +1612,7 @@ irqreturn_t efx_farch_msi_interrupt(int irq, void *dev_id) "IRQ %d on CPU %d status " EFX_OWORD_FMT "\n", irq, raw_smp_processor_id(), EFX_OWORD_VAL(*int_ker)); - 
if (!likely(ACCESS_ONCE(efx->irq_soft_enabled))) + if (!likely(READ_ONCE(efx->irq_soft_enabled))) return IRQ_HANDLED; /* Handle non-event-queue sources */ diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index 4d7fb8af880d..7b51b6371724 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -81,7 +81,7 @@ static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue) static inline bool __efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue, unsigned int write_count) { - unsigned int empty_read_count = ACCESS_ONCE(tx_queue->empty_read_count); + unsigned int empty_read_count = READ_ONCE(tx_queue->empty_read_count); if (empty_read_count == 0) return false; @@ -617,11 +617,11 @@ irqreturn_t efx_farch_fatal_interrupt(struct efx_nic *efx); static inline int efx_nic_event_test_irq_cpu(struct efx_channel *channel) { - return ACCESS_ONCE(channel->event_test_cpu); + return READ_ONCE(channel->event_test_cpu); } static inline int efx_nic_irq_test_irq_cpu(struct efx_nic *efx) { - return ACCESS_ONCE(efx->last_irq_cpu); + return READ_ONCE(efx->last_irq_cpu); } /* Global Resources */ diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 60cdb97f58e2..56c2db398def 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -658,7 +658,7 @@ static void efx_ptp_send_times(struct efx_nic *efx, /* Write host time for specified period or until MC is done */ while ((timespec64_compare(&now.ts_real, &limit) < 0) && - ACCESS_ONCE(*mc_running)) { + READ_ONCE(*mc_running)) { struct timespec64 update_time; unsigned int host_time; @@ -668,7 +668,7 @@ static void efx_ptp_send_times(struct efx_nic *efx, do { pps_get_ts(&now); } while ((timespec64_compare(&now.ts_real, &update_time) < 0) && - ACCESS_ONCE(*mc_running)); + READ_ONCE(*mc_running)); /* Synchronise NIC with single word of time only */ host_time = (now.ts_real.tv_sec << MC_NANOSECOND_BITS | @@ -832,14 +832,14 @@ static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings) ptp->start.dma_addr); /* Clear flag that signals MC ready */ - ACCESS_ONCE(*start) = 0; + WRITE_ONCE(*start, 0); rc = efx_mcdi_rpc_start(efx, MC_CMD_PTP, synch_buf, MC_CMD_PTP_IN_SYNCHRONIZE_LEN); EFX_WARN_ON_ONCE_PARANOID(rc); /* Wait for start from MCDI (or timeout) */ timeout = jiffies + msecs_to_jiffies(MAX_SYNCHRONISE_WAIT_MS); - while (!ACCESS_ONCE(*start) && (time_before(jiffies, timeout))) { + while (!READ_ONCE(*start) && (time_before(jiffies, timeout))) { udelay(20); /* Usually start MCDI execution quickly */ loops++; } @@ -849,7 +849,7 @@ static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings) if (!time_before(jiffies, timeout)) ++ptp->sync_timeouts; - if (ACCESS_ONCE(*start)) + if (READ_ONCE(*start)) efx_ptp_send_times(efx, &last_time); /* Collect results */ diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 32bf1fecf864..efb66ea21f27 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -136,8 +136,8 @@ static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1) */ netif_tx_stop_queue(txq1->core_txq); smp_mb(); - txq1->old_read_count = ACCESS_ONCE(txq1->read_count); - txq2->old_read_count = ACCESS_ONCE(txq2->read_count); + txq1->old_read_count = READ_ONCE(txq1->read_count); + txq2->old_read_count = READ_ONCE(txq2->read_count); fill_level = max(txq1->insert_count - txq1->old_read_count, txq2->insert_count - txq2->old_read_count); @@ -752,7 +752,7 @@ void 
efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) /* Check whether the hardware queue is now empty */ if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) { - tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count); + tx_queue->old_write_count = READ_ONCE(tx_queue->write_count); if (tx_queue->read_count == tx_queue->old_write_count) { smp_mb(); tx_queue->empty_read_count = diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 6a4e8e1bbd90..8ab0fb6892d5 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -6245,7 +6245,7 @@ static void niu_get_rx_stats(struct niu *np, pkts = dropped = errors = bytes = 0; - rx_rings = ACCESS_ONCE(np->rx_rings); + rx_rings = READ_ONCE(np->rx_rings); if (!rx_rings) goto no_rings; @@ -6276,7 +6276,7 @@ static void niu_get_tx_stats(struct niu *np, pkts = errors = bytes = 0; - tx_rings = ACCESS_ONCE(np->tx_rings); + tx_rings = READ_ONCE(np->tx_rings); if (!tx_rings) goto no_rings; diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 21b71ae947fd..b55b29b90b88 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -257,7 +257,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, * and validate that the result isn't NULL - in case we are * racing against queue removal. */ - int numvtaps = ACCESS_ONCE(tap->numvtaps); + int numvtaps = READ_ONCE(tap->numvtaps); __u32 rxq; if (!numvtaps) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e21bf90b819f..27cd50c5bc9e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -469,7 +469,7 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, u32 numqueues = 0; rcu_read_lock(); - numqueues = ACCESS_ONCE(tun->numqueues); + numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); if (txq) { @@ -864,7 +864,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); - numqueues = ACCESS_ONCE(tun->numqueues); + numqueues = READ_ONCE(tun->numqueues); /* Drop packet if interface is not attached */ if (txq >= numqueues) diff --git a/drivers/net/wireless/ath/ath5k/desc.c b/drivers/net/wireless/ath/ath5k/desc.c index bd8d4392d68b..80f75139495f 100644 --- a/drivers/net/wireless/ath/ath5k/desc.c +++ b/drivers/net/wireless/ath/ath5k/desc.c @@ -500,13 +500,13 @@ ath5k_hw_proc_4word_tx_status(struct ath5k_hw *ah, tx_status = &desc->ud.ds_tx5212.tx_stat; - txstat1 = ACCESS_ONCE(tx_status->tx_status_1); + txstat1 = READ_ONCE(tx_status->tx_status_1); /* No frame has been send or error */ if (unlikely(!(txstat1 & AR5K_DESC_TX_STATUS1_DONE))) return -EINPROGRESS; - txstat0 = ACCESS_ONCE(tx_status->tx_status_0); + txstat0 = READ_ONCE(tx_status->tx_status_0); /* * Get descriptor status @@ -700,14 +700,14 @@ ath5k_hw_proc_5212_rx_status(struct ath5k_hw *ah, u32 rxstat0, rxstat1; rx_status = &desc->ud.ds_rx.rx_stat; - rxstat1 = ACCESS_ONCE(rx_status->rx_status_1); + rxstat1 = READ_ONCE(rx_status->rx_status_1); /* No frame received / not ready */ if (unlikely(!(rxstat1 & AR5K_5212_RX_DESC_STATUS1_DONE))) return -EINPROGRESS; memset(rs, 0, sizeof(struct ath5k_rx_status)); - rxstat0 = ACCESS_ONCE(rx_status->rx_status_0); + rxstat0 = READ_ONCE(rx_status->rx_status_0); /* * Frame receive status diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 613caca7dc02..785a0f33b7e6 100644 --- 
a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -3628,7 +3628,7 @@ static void brcmf_sdio_dataworker(struct work_struct *work) bus->dpc_running = true; wmb(); - while (ACCESS_ONCE(bus->dpc_triggered)) { + while (READ_ONCE(bus->dpc_triggered)) { bus->dpc_triggered = false; brcmf_sdio_dpc(bus); bus->idlecount = 0; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 231878969332..0f45f34e39d3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -1118,7 +1118,7 @@ void iwl_mvm_set_hw_ctkill_state(struct iwl_mvm *mvm, bool state) static bool iwl_mvm_set_hw_rfkill_state(struct iwl_op_mode *op_mode, bool state) { struct iwl_mvm *mvm = IWL_OP_MODE_GET_MVM(op_mode); - bool calibrating = ACCESS_ONCE(mvm->calibrating); + bool calibrating = READ_ONCE(mvm->calibrating); if (state) set_bit(IWL_MVM_STATUS_HW_RFKILL, &mvm->status); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 6f2e2af23219..6e9d3289b9d0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -652,7 +652,7 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) return -1; } else if (info.control.vif->type == NL80211_IFTYPE_STATION && is_multicast_ether_addr(hdr->addr1)) { - u8 ap_sta_id = ACCESS_ONCE(mvmvif->ap_sta_id); + u8 ap_sta_id = READ_ONCE(mvmvif->ap_sta_id); if (ap_sta_id != IWL_MVM_INVALID_STA) sta_id = ap_sta_id; @@ -700,7 +700,7 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb, snap_ip_tcp = 8 + skb_transport_header(skb) - skb_network_header(skb) + tcp_hdrlen(skb); - dbg_max_amsdu_len = ACCESS_ONCE(mvm->max_amsdu_len); + dbg_max_amsdu_len = READ_ONCE(mvm->max_amsdu_len); if (!sta->max_amsdu_len || !ieee80211_is_data_qos(hdr->frame_control) || diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c index a06b6612b658..f25ce3a1ea50 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c @@ -1247,7 +1247,7 @@ restart: spin_lock(&rxq->lock); /* uCode's read index (stored in shared DRAM) indicates the last Rx * buffer that the driver may process (last buffer filled by ucode). 
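(In the iwlwifi hunk below, the index lives in DRAM that the device firmware updates behind the driver's back; READ_ONCE() forces one load of the raw little-endian value, and the byte swap and mask are then applied to that snapshot rather than to a possibly refetched location. A sketch with an illustrative helper name:

    #include <linux/compiler.h>
    #include <linux/types.h>
    #include <asm/byteorder.h>

    static u16 snapshot_closed_idx(const __le16 *closed_rb_num)
    {
    	__le16 raw = READ_ONCE(*closed_rb_num);	/* single load of shared DRAM */

    	return le16_to_cpu(raw) & 0x0FFF;	/* derive from the snapshot */
    }
)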
*/ - r = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; + r = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; i = rxq->read; /* W/A 9000 device step A0 wrap-around bug */ diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 2e3e013ec95a..9ad3f4fe5894 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -2076,12 +2076,12 @@ static int iwl_trans_pcie_wait_txq_empty(struct iwl_trans *trans, int txq_idx) IWL_DEBUG_TX_QUEUES(trans, "Emptying queue %d...\n", txq_idx); txq = trans_pcie->txq[txq_idx]; - wr_ptr = ACCESS_ONCE(txq->write_ptr); + wr_ptr = READ_ONCE(txq->write_ptr); - while (txq->read_ptr != ACCESS_ONCE(txq->write_ptr) && + while (txq->read_ptr != READ_ONCE(txq->write_ptr) && !time_after(jiffies, now + msecs_to_jiffies(IWL_FLUSH_WAIT_MS))) { - u8 write_ptr = ACCESS_ONCE(txq->write_ptr); + u8 write_ptr = READ_ONCE(txq->write_ptr); if (WARN_ONCE(wr_ptr != write_ptr, "WR pointer moved while flushing %d -> %d\n", @@ -2553,7 +2553,7 @@ static u32 iwl_trans_pcie_dump_rbs(struct iwl_trans *trans, spin_lock(&rxq->lock); - r = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; + r = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; for (i = rxq->read, j = 0; i != r && j < allocated_rb_nums; @@ -2814,7 +2814,7 @@ static struct iwl_trans_dump_data /* Dump RBs is supported only for pre-9000 devices (1 queue) */ struct iwl_rxq *rxq = &trans_pcie->rxq[0]; /* RBs */ - num_rbs = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num)) + num_rbs = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF; num_rbs = (num_rbs - rxq->read) & RX_QUEUE_MASK; len += num_rbs * (sizeof(*data) + diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 6467ffac9811..d2b3d6177a55 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1380,7 +1380,7 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, mac80211_hwsim_monitor_rx(hw, skb, channel); /* wmediumd mode check */ - _portid = ACCESS_ONCE(data->wmediumd); + _portid = READ_ONCE(data->wmediumd); if (_portid) return mac80211_hwsim_tx_frame_nl(hw, skb, _portid); @@ -1477,7 +1477,7 @@ static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw, struct ieee80211_channel *chan) { struct mac80211_hwsim_data *data = hw->priv; - u32 _pid = ACCESS_ONCE(data->wmediumd); + u32 _pid = READ_ONCE(data->wmediumd); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) { struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb); diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index f05cfc83c9c8..f946bf889015 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -996,7 +996,7 @@ static void qlt_free_session_done(struct work_struct *work) if (logout_started) { bool traced = false; - while (!ACCESS_ONCE(sess->logout_completed)) { + while (!READ_ONCE(sess->logout_completed)) { if (!traced) { ql_dbg(ql_dbg_tgt_mgt, vha, 0xf086, "%s: waiting for sess %p logout\n", diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 942d094269fb..9469695f5871 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -985,7 +985,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) mb = udev->mb_addr; tcmu_flush_dcache_range(mb, sizeof(*mb)); - while (udev->cmdr_last_cleaned != 
ACCESS_ONCE(mb->cmd_tail)) { + while (udev->cmdr_last_cleaned != READ_ONCE(mb->cmd_tail)) { struct tcmu_cmd_entry *entry = (void *) mb + CMDR_OFF + udev->cmdr_last_cleaned; struct tcmu_cmd *cmd; diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 3e865dbf878c..fbaa2a90d25d 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -483,7 +483,7 @@ static ssize_t wdm_read if (rv < 0) return -ERESTARTSYS; - cntr = ACCESS_ONCE(desc->length); + cntr = READ_ONCE(desc->length); if (cntr == 0) { desc->read = 0; retry: diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index e9326f31db8d..4ae667d8c238 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -150,7 +150,7 @@ static int usbfs_increase_memory_usage(u64 amount) { u64 lim; - lim = ACCESS_ONCE(usbfs_memory_mb); + lim = READ_ONCE(usbfs_memory_mb); lim <<= 20; atomic64_add(amount, &usbfs_memory_usage); diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c index d930bfda4010..58d59c5f8592 100644 --- a/drivers/usb/core/sysfs.c +++ b/drivers/usb/core/sysfs.c @@ -973,7 +973,7 @@ static ssize_t interface_show(struct device *dev, struct device_attribute *attr, char *string; intf = to_usb_interface(dev); - string = ACCESS_ONCE(intf->cur_altsetting->string); + string = READ_ONCE(intf->cur_altsetting->string); if (!string) return 0; return sprintf(buf, "%s\n", string); @@ -989,7 +989,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, intf = to_usb_interface(dev); udev = interface_to_usbdev(intf); - alt = ACCESS_ONCE(intf->cur_altsetting); + alt = READ_ONCE(intf->cur_altsetting); return sprintf(buf, "usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02X" "ic%02Xisc%02Xip%02Xin%02X\n", diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c index 1f9941145746..0b59fa50aa30 100644 --- a/drivers/usb/gadget/udc/gr_udc.c +++ b/drivers/usb/gadget/udc/gr_udc.c @@ -1261,7 +1261,7 @@ static int gr_handle_in_ep(struct gr_ep *ep) if (!req->last_desc) return 0; - if (ACCESS_ONCE(req->last_desc->ctrl) & GR_DESC_IN_CTRL_EN) + if (READ_ONCE(req->last_desc->ctrl) & GR_DESC_IN_CTRL_EN) return 0; /* Not put in hardware buffers yet */ if (gr_read32(&ep->regs->epstat) & (GR_EPSTAT_B1 | GR_EPSTAT_B0)) @@ -1290,7 +1290,7 @@ static int gr_handle_out_ep(struct gr_ep *ep) if (!req->curr_desc) return 0; - ctrl = ACCESS_ONCE(req->curr_desc->ctrl); + ctrl = READ_ONCE(req->curr_desc->ctrl); if (ctrl & GR_DESC_OUT_CTRL_EN) return 0; /* Not received yet */ diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 44924824fa41..c86f89babd57 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -785,7 +785,7 @@ static void io_watchdog_func(unsigned long _ohci) } /* find the last TD processed by the controller. */ - head = hc32_to_cpu(ohci, ACCESS_ONCE(ed->hwHeadP)) & TD_MASK; + head = hc32_to_cpu(ohci, READ_ONCE(ed->hwHeadP)) & TD_MASK; td_start = td; td_next = list_prepare_entry(td, &ed->td_list, td_list); list_for_each_entry_continue(td_next, &ed->td_list, td_list) { diff --git a/drivers/usb/host/uhci-hcd.h b/drivers/usb/host/uhci-hcd.h index 91b22b2ea3aa..09a2a259941b 100644 --- a/drivers/usb/host/uhci-hcd.h +++ b/drivers/usb/host/uhci-hcd.h @@ -186,7 +186,7 @@ struct uhci_qh { * We need a special accessor for the element pointer because it is * subject to asynchronous updates by the controller. 
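(Wrapping the marked access in an accessor macro, as the uhci change below does, means every caller gets the volatile-load semantics without having to remember them. A generic sketch of the idiom; hw_desc and its field are placeholders for a descriptor the controller writes, not uhci's real types:

    #include <linux/compiler.h>
    #include <linux/types.h>

    struct hw_desc {
    	u32 status;	/* updated asynchronously by the controller */
    };

    /* every caller gets exactly one load per invocation */
    #define desc_status(d)	READ_ONCE((d)->status)
)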
*/ -#define qh_element(qh) ACCESS_ONCE((qh)->element) +#define qh_element(qh) READ_ONCE((qh)->element) #define LINK_TO_QH(uhci, qh) (UHCI_PTR_QH((uhci)) | \ cpu_to_hc32((uhci), (qh)->dma_handle)) @@ -274,7 +274,7 @@ struct uhci_td { * subject to asynchronous updates by the controller. */ #define td_status(uhci, td) hc32_to_cpu((uhci), \ - ACCESS_ONCE((td)->status)) + READ_ONCE((td)->status)) #define LINK_TO_TD(uhci, td) (cpu_to_hc32((uhci), (td)->dma_handle)) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index f5a86f651f38..2bc3705a99bd 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -665,7 +665,7 @@ static int vfio_dev_viable(struct device *dev, void *data) { struct vfio_group *group = data; struct vfio_device *device; - struct device_driver *drv = ACCESS_ONCE(dev->driver); + struct device_driver *drv = READ_ONCE(dev->driver); struct vfio_unbound_dev *unbound; int ret = -EINVAL; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 046f6d280af5..35e929f132e8 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -929,7 +929,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) continue; } - tpg = ACCESS_ONCE(vs_tpg[*target]); + tpg = READ_ONCE(vs_tpg[*target]); if (unlikely(!tpg)) { /* Target does not exist, fail the request */ vhost_scsi_send_bad_target(vs, vq, head, out); diff --git a/fs/aio.c b/fs/aio.c index 5a2487217072..e6de7715228c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -576,7 +576,7 @@ static int kiocb_cancel(struct aio_kiocb *kiocb) * actually has a cancel function, hence the cmpxchg() */ - cancel = ACCESS_ONCE(kiocb->ki_cancel); + cancel = READ_ONCE(kiocb->ki_cancel); do { if (!cancel || cancel == KIOCB_CANCELLED) return -EINVAL; diff --git a/fs/buffer.c b/fs/buffer.c index 170df856bdb9..32ce01f0f95f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1692,7 +1692,8 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits), + b_state); return page_buffers(page); } diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 8e704d12a1cf..0083bd4fcaa5 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -373,7 +373,7 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) struct fscrypt_info *prev; if (ci == NULL) - ci = ACCESS_ONCE(inode->i_crypt_info); + ci = READ_ONCE(inode->i_crypt_info); if (ci == NULL) return; diff --git a/fs/direct-io.c b/fs/direct-io.c index b53e66d9abd7..98fe1325da9d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1152,7 +1152,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; diff --git a/fs/exec.c b/fs/exec.c index 3e14ba25f678..1d6243d9f2b6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1911,7 +1911,7 @@ void set_dumpable(struct mm_struct *mm, int value) return; do { - old = ACCESS_ONCE(mm->flags); + old = READ_ONCE(mm->flags); new = (old & ~MMF_DUMPABLE_MASK) | value; } while (cmpxchg(&mm->flags, old, new) != old); } diff --git a/fs/fcntl.c b/fs/fcntl.c index 448a1119f0be..57bf2964bb83 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -724,7 
+724,7 @@ static void send_sigio_to_task(struct task_struct *p, * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ - int signum = ACCESS_ONCE(fown->signum); + int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; diff --git a/fs/fs_pin.c b/fs/fs_pin.c index e747b3d720ee..2d07f292b625 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -78,7 +78,7 @@ void mnt_pin_kill(struct mount *m) while (1) { struct hlist_node *p; rcu_read_lock(); - p = ACCESS_ONCE(m->mnt_pins.first); + p = READ_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; @@ -92,7 +92,7 @@ void group_pin_kill(struct hlist_head *p) while (1) { struct hlist_node *q; rcu_read_lock(); - q = ACCESS_ONCE(p->first); + q = READ_ONCE(p->first); if (!q) { rcu_read_unlock(); break; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 13c65dd2d37d..a42d89371748 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -33,7 +33,7 @@ static struct fuse_dev *fuse_get_dev(struct file *file) * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return ACCESS_ONCE(file->private_data); + return READ_ONCE(file->private_data); } static void fuse_request_init(struct fuse_req *req, struct page **pages, diff --git a/fs/inode.c b/fs/inode.c index d1e35b53bb23..fd401028a309 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2090,7 +2090,7 @@ void inode_set_flags(struct inode *inode, unsigned int flags, WARN_ON_ONCE(flags & ~mask); do { - old_flags = ACCESS_ONCE(inode->i_flags); + old_flags = READ_ONCE(inode->i_flags); new_flags = (old_flags & ~mask) | flags; } while (unlikely(cmpxchg(&inode->i_flags, old_flags, new_flags) != old_flags)); diff --git a/fs/namei.c b/fs/namei.c index c75ea03ca147..40a0f34bf990 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1209,7 +1209,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see * the components of that value change under us */ - while (managed = ACCESS_ONCE(path->dentry->d_flags), + while (managed = READ_ONCE(path->dentry->d_flags), managed &= DCACHE_MANAGED_DENTRY, unlikely(managed != 0)) { /* Allow the filesystem to manage the transit without i_mutex @@ -1394,7 +1394,7 @@ int follow_down(struct path *path) unsigned managed; int ret; - while (managed = ACCESS_ONCE(path->dentry->d_flags), + while (managed = READ_ONCE(path->dentry->d_flags), unlikely(managed & DCACHE_MANAGED_DENTRY)) { /* Allow the filesystem to manage the transit without i_mutex * being held. diff --git a/fs/namespace.c b/fs/namespace.c index d18deb4c410b..e158ec6b527b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -353,7 +353,7 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. 
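(The loop below is the textbook case for the conversion: without a marked access the compiler may hoist the load of mnt_flags out of the loop and spin forever on a stale register copy, while READ_ONCE() forces a fresh load on every pass. A minimal sketch, with illustrative parameter names:

    #include <linux/compiler.h>
    #include <asm/processor.h>

    static void wait_for_flag_clear(const unsigned int *flags, unsigned int hold)
    {
    	while (READ_ONCE(*flags) & hold)	/* reload each iteration */
    		cpu_relax();			/* be polite to the sibling thread */
    }
)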
*/ smp_mb(); - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5ceaeb1f6fb6..f439f1c45008 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1081,7 +1081,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) int error; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1168,7 +1168,7 @@ out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: if (flags & LOOKUP_RCU) { - if (parent != ACCESS_ONCE(dentry->d_parent)) + if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else dput(parent); @@ -1582,7 +1582,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1596,7 +1596,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) ret = -ECHILD; if (!(flags & LOOKUP_RCU)) dput(parent); - else if (parent != ACCESS_ONCE(dentry->d_parent)) + else if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; goto out; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 77a8eacbe032..375e8bf0dd24 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -453,7 +453,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; - rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); /* add up live thread stats at the group level */ if (whole) { diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 99dff222fe67..03afd5150916 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -27,7 +27,7 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); - event = ACCESS_ONCE(ns->event); + event = READ_ONCE(ns->event); if (m->poll_event != event) { m->poll_event = event; res |= POLLERR | POLLPRI; diff --git a/fs/splice.c b/fs/splice.c index f3084cce0ea6..39e2dc01ac12 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -253,7 +253,7 @@ EXPORT_SYMBOL(add_to_pipe); */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { - unsigned int buffers = ACCESS_ONCE(pipe->buffers); + unsigned int buffers = READ_ONCE(pipe->buffers); spd->nr_pages_max = buffers; if (buffers <= PIPE_DEF_BUFFERS) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1c713fd5b3e6..f46d133c0949 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -381,7 +381,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * in __get_user_pages if userfaultfd_release waits on the * caller of handle_userfault to release the mmap_sem. */ - if (unlikely(ACCESS_ONCE(ctx->released))) { + if (unlikely(READ_ONCE(ctx->released))) { /* * Don't return VM_FAULT_SIGBUS in this case, so a non * cooperative manager can close the uffd after the @@ -477,7 +477,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) vmf->flags, reason); up_read(&mm->mmap_sem); - if (likely(must_wait && !ACCESS_ONCE(ctx->released) && + if (likely(must_wait && !READ_ONCE(ctx->released) && (return_to_userland ? 
!signal_pending(current) : !fatal_signal_pending(current)))) { wake_up_poll(&ctx->fd_wqh, POLLIN); @@ -586,7 +586,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, set_current_state(TASK_KILLABLE); if (ewq->msg.event == 0) break; - if (ACCESS_ONCE(ctx->released) || + if (READ_ONCE(ctx->released) || fatal_signal_pending(current)) { /* * &ewq->wq may be queued in fork_event, but @@ -833,7 +833,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - ACCESS_ONCE(ctx->released) = true; + WRITE_ONCE(ctx->released, true); if (!mmget_not_zero(mm)) goto wakeup; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 51bf7b827387..129975970d99 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -592,9 +592,9 @@ xlog_valid_lsn( * a transiently forward state. Instead, we can see the LSN in a * transiently behind state if we happen to race with a cycle wrap. */ - cur_cycle = ACCESS_ONCE(log->l_curr_cycle); + cur_cycle = READ_ONCE(log->l_curr_cycle); smp_rmb(); - cur_block = ACCESS_ONCE(log->l_curr_block); + cur_block = READ_ONCE(log->l_curr_block); if ((CYCLE_LSN(lsn) > cur_cycle) || (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 8fbe259b197c..0a7ce668f8e0 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -236,7 +236,7 @@ static inline unsigned long __ffs64(u64 word) typeof(*ptr) old, new; \ \ do { \ - old = ACCESS_ONCE(*ptr); \ + old = READ_ONCE(*ptr); \ new = (old & ~mask) | bits; \ } while (cmpxchg(ptr, old, new) != old); \ \ @@ -251,7 +251,7 @@ static inline unsigned long __ffs64(u64 word) typeof(*ptr) old, new; \ \ do { \ - old = ACCESS_ONCE(*ptr); \ + old = READ_ONCE(*ptr); \ new = old & ~clear; \ } while (!(old & test) && \ cmpxchg(ptr, old, new) != old); \ diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index a4be70398ce1..36dd4ffb5715 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -88,7 +88,7 @@ static inline void dql_queued(struct dql *dql, unsigned int count) /* Returns how many objects can be queued, < 0 indicates over limit. */ static inline int dql_avail(const struct dql *dql) { - return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued); + return READ_ONCE(dql->adj_limit) - READ_ONCE(dql->num_queued); } /* Record number of completed objects and recalculate the limit. 
*/ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 14bc21c2ee7f..785a00ca4628 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -221,7 +221,7 @@ extern struct page *huge_zero_page; static inline bool is_huge_zero_page(struct page *page) { - return ACCESS_ONCE(huge_zero_page) == page; + return READ_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 30294603526f..d95cae09dea0 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -247,7 +247,7 @@ static inline struct team_port *team_get_port_by_index(struct team *team, static inline int team_num_to_port_index(struct team *team, unsigned int num) { - int en_port_count = ACCESS_ONCE(team->en_port_count); + int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; diff --git a/include/linux/llist.h b/include/linux/llist.h index 1957635e6d5f..85abc2915e8d 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -198,7 +198,7 @@ static inline void init_llist_head(struct llist_head *list) */ static inline bool llist_empty(const struct llist_head *head) { - return ACCESS_ONCE(head->first) == NULL; + return READ_ONCE(head->first) == NULL; } static inline struct llist_node *llist_next(struct llist_node *node) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 2efb08a60e63..f0fc4700b6ff 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -105,7 +105,7 @@ static inline bool pm_runtime_callbacks_present(struct device *dev) static inline void pm_runtime_mark_last_busy(struct device *dev) { - ACCESS_ONCE(dev->power.last_busy) = jiffies; + WRITE_ONCE(dev->power.last_busy, jiffies); } static inline bool pm_runtime_is_irq_safe(struct device *dev) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4f4f786255ef..3fadb6f9982b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -983,12 +983,12 @@ static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_threshold[1]); + return READ_ONCE(ipvs->sysctl_sync_threshold[1]); } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_refresh_period); + return READ_ONCE(ipvs->sysctl_sync_refresh_period); } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) @@ -1013,7 +1013,7 @@ static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_ports); + return READ_ONCE(ipvs->sysctl_sync_ports); } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) diff --git a/kernel/acct.c b/kernel/acct.c index 5e72af29ab73..21eedd0dd81a 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -146,7 +146,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) again: smp_rmb(); rcu_read_lock(); - res = to_acct(ACCESS_ONCE(ns->bacct)); + res = to_acct(READ_ONCE(ns->bacct)); if (!res) { rcu_read_unlock(); return NULL; @@ -158,7 +158,7 @@ again: } rcu_read_unlock(); mutex_lock(&res->lock); - if (res != to_acct(ACCESS_ONCE(ns->bacct))) { + if (res != to_acct(READ_ONCE(ns->bacct))) { mutex_unlock(&res->lock); acct_put(res); goto again; diff --git a/kernel/events/core.c b/kernel/events/core.c index 824a583079a1..8fd2f2d1358a 100644 --- a/kernel/events/core.c +++ 
b/kernel/events/core.c @@ -1200,7 +1200,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); - ctx = ACCESS_ONCE(event->ctx); + ctx = READ_ONCE(event->ctx); if (!atomic_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; @@ -5302,8 +5302,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto aux_unlock; - aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); - aux_size = ACCESS_ONCE(rb->user_page->aux_size); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f684d8e5fa2b..f3e37971c842 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * (B) <-> (C) ordering is still observed by the pmu driver. */ if (!rb->aux_overwrite) { - aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + aux_tail = READ_ONCE(rb->user_page->aux_tail); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); diff --git a/kernel/exit.c b/kernel/exit.c index f6cad39f35df..6b4298a41167 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1339,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ - int exit_state = ACCESS_ONCE(p->exit_state); + int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 81279c6602ff..845f3805c73d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2724,7 +2724,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, * if it happened, we have to fail the write. 
*/ barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { local_dec(&cpu_buffer->committing); local_dec(&cpu_buffer->commits); return NULL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 652c682707cd..9050c8b3ccde 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1459,7 +1459,7 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr, static inline void *event_file_data(struct file *filp) { - return ACCESS_ONCE(file_inode(filp)->i_private); + return READ_ONCE(file_inode(filp)->i_private); } extern struct mutex event_mutex; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 49cb41412eec..780262210c9a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -77,7 +77,7 @@ check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; - int frame_size = ACCESS_ONCE(tracer_frame); + int frame_size = READ_ONCE(tracer_frame); int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..d32b45662fb6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -894,7 +894,7 @@ static bool new_idmap_permitted(const struct file *file, int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; - unsigned long userns_flags = ACCESS_ONCE(ns->flags); + unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 155c55d8db5f..fe7953aead82 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -39,7 +39,7 @@ begin_node: /* Descend through a shortcut */ shortcut = assoc_array_ptr_to_shortcut(cursor); smp_read_barrier_depends(); - cursor = ACCESS_ONCE(shortcut->next_node); + cursor = READ_ONCE(shortcut->next_node); } node = assoc_array_ptr_to_node(cursor); @@ -55,7 +55,7 @@ begin_node: */ has_meta = 0; for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = ACCESS_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); has_meta |= (unsigned long)ptr; if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer @@ -89,7 +89,7 @@ continue_node: smp_read_barrier_depends(); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = ACCESS_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr)) { cursor = ptr; goto begin_node; @@ -98,7 +98,7 @@ continue_node: finished_node: /* Move up to the parent (may need to skip back over a shortcut) */ - parent = ACCESS_ONCE(node->back_pointer); + parent = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (parent == stop) return 0; @@ -107,7 +107,7 @@ finished_node: shortcut = assoc_array_ptr_to_shortcut(parent); smp_read_barrier_depends(); cursor = parent; - parent = ACCESS_ONCE(shortcut->back_pointer); + parent = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; if (parent == stop) return 0; @@ -147,7 +147,7 @@ int assoc_array_iterate(const struct assoc_array *array, void *iterator_data), void *iterator_data) { - struct assoc_array_ptr *root = ACCESS_ONCE(array->root); + struct assoc_array_ptr *root = READ_ONCE(array->root); if (!root) return 0; @@ -194,7 +194,7 @@ assoc_array_walk(const struct assoc_array *array, pr_devel("-->%s()\n", __func__); - cursor = ACCESS_ONCE(array->root); 
+ cursor = READ_ONCE(array->root); if (!cursor) return assoc_array_walk_tree_empty; @@ -220,7 +220,7 @@ consider_node: slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); slot &= ASSOC_ARRAY_FAN_MASK; - ptr = ACCESS_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); pr_devel("consider slot %x [ix=%d type=%lu]\n", slot, level, (unsigned long)ptr & 3); @@ -294,7 +294,7 @@ follow_shortcut: } while (sc_level < shortcut->skip_to_level); /* The shortcut matches the leaf's index to this point. */ - cursor = ACCESS_ONCE(shortcut->next_node); + cursor = READ_ONCE(shortcut->next_node); if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { level = sc_level; goto jumped; @@ -337,7 +337,7 @@ void *assoc_array_find(const struct assoc_array *array, * the terminal node. */ for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { - ptr = ACCESS_ONCE(node->slots[slot]); + ptr = READ_ONCE(node->slots[slot]); if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer * and dereferencing the pointer - but only if we are diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index f346715e2255..81770a55cb16 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -20,7 +20,7 @@ void dql_completed(struct dql *dql, unsigned int count) unsigned int ovlimit, completed, num_queued; bool all_prev_completed; - num_queued = ACCESS_ONCE(dql->num_queued); + num_queued = READ_ONCE(dql->num_queued); /* Can't complete more than what's in queue */ BUG_ON(count > num_queued - dql->num_completed); diff --git a/lib/llist.c b/lib/llist.c index ae5872b1df0c..7062e931a7bb 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -41,7 +41,7 @@ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_node *first; do { - new_last->next = first = ACCESS_ONCE(head->first); + new_last->next = first = READ_ONCE(head->first); } while (cmpxchg(&head->first, first, new_first) != first); return !first; diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 86c3385b9eb3..1746bae94d41 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -620,8 +620,8 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp rcu_read_lock(); for (i = 0; i < depth; i++, d = p) { - p = ACCESS_ONCE(d->d_parent); - array[i] = ACCESS_ONCE(d->d_name.name); + p = READ_ONCE(d->d_parent); + array[i] = READ_ONCE(d->d_name.name); if (p == d) { if (i) array[i] = ""; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 269b5df58543..c3bf907a03ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2715,7 +2715,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); - return ACCESS_ONCE(pgdata->split_queue_len); + return READ_ONCE(pgdata->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, diff --git a/net/core/dev.c b/net/core/dev.c index 11596a302a26..61559ca3980b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3725,7 +3725,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; - cpu = ACCESS_ONCE(rflow->cpu); + cpu = READ_ONCE(rflow->cpu); if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6e1e10ff433a..3b2034f6d49d 
100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3377,7 +3377,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { - unsigned int burst = ACCESS_ONCE(pkt_dev->burst); + unsigned int burst = READ_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; struct sk_buff *skb; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..f9597ba26599 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -164,7 +164,7 @@ static void inet_frag_worker(struct work_struct *work) local_bh_disable(); - for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + for (i = READ_ONCE(f->next_bucket); budget; --budget) { evicted += inet_evict_bucket(f, &f->hash[i]); i = (i + 1) & (INETFRAGS_HASHSZ - 1); if (evicted > INETFRAGS_EVICT_MAX) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3d9f1c2f81c5..c0864562083b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -495,7 +495,7 @@ u32 ip_idents_reserve(u32 hash, int segs) { u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); + u32 old = READ_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 new, delta = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc9e46a5369..48531da1aba6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1908,7 +1908,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + win_divisor = READ_ONCE(sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ebfbccae62fd..02ec9a349303 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1853,7 +1853,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - encap_rcv = ACCESS_ONCE(up->encap_rcv); + encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; @@ -2298,7 +2298,7 @@ void udp_destroy_sock(struct sock *sk) unlock_sock_fast(sk, slow); if (static_key_false(&udp_encap_needed) && up->encap_type) { void (*encap_destroy)(struct sock *sk); - encap_destroy = ACCESS_ONCE(up->encap_destroy); + encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a1c24443cd9e..dab946554157 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -490,7 +490,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (!t) goto out; - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; @@ -899,7 +899,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); if (t) { - u8 tproto = ACCESS_ONCE(t->parms.proto); + u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; @@ -1233,7 +1233,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if (tproto != IPPROTO_IPIP && tproto != 0) return -1; @@ -1303,7 +1303,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; 
int err; - tproto = ACCESS_ONCE(t->parms.proto); + tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || ip6_tnl_addr_conflict(t, ipv6h)) return -1; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 40d7234c27b9..3f30fa313bf2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -606,7 +606,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - encap_rcv = ACCESS_ONCE(up->encap_rcv); + encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; @@ -1432,7 +1432,7 @@ void udpv6_destroy_sock(struct sock *sk) if (static_key_false(&udpv6_encap_needed) && up->encap_type) { void (*encap_destroy)(struct sock *sk); - encap_destroy = ACCESS_ONCE(up->encap_destroy); + encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index dd3e83328ad5..82cb93f66b9b 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -193,7 +193,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); - sap_handler = dest ? ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL; + sap_handler = dest ? READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); @@ -214,7 +214,7 @@ drop: kfree_skb(skb); goto out; handle_station: - sta_handler = ACCESS_ONCE(llc_station_handler); + sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 69615016d5bf..214d2ba02877 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2008,7 +2008,7 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { - u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate); + u16 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index d177dd066504..4d748975117d 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -393,7 +393,7 @@ EXPORT_SYMBOL(netlbl_calipso_ops_register); static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) { - return ACCESS_ONCE(calipso_ops); + return READ_ONCE(calipso_ops); } /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d396cb61a280..eb866647a27a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14201,7 +14201,7 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct sk_buff *msg; void *hdr; - u32 nlportid = ACCESS_ONCE(wdev->ap_unexpected_nlportid); + u32 nlportid = READ_ONCE(wdev->ap_unexpected_nlportid); if (!nlportid) return false; diff --git a/sound/firewire/amdtp-am824.c b/sound/firewire/amdtp-am824.c index 23ccddb20de1..4210e5c6262e 100644 --- a/sound/firewire/amdtp-am824.c +++ b/sound/firewire/amdtp-am824.c @@ -247,7 +247,7 @@ void amdtp_am824_midi_trigger(struct amdtp_stream *s, unsigned int port, struct amdtp_am824 *p = s->protocol; if (port < p->midi_ports) - ACCESS_ONCE(p->midi[port]) = midi; + WRITE_ONCE(p->midi[port], midi); } EXPORT_SYMBOL_GPL(amdtp_am824_midi_trigger); @@ -336,7 +336,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, __be32 
*buffe unsigned int data_blocks, unsigned int *syt) { struct amdtp_am824 *p = s->protocol; - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); unsigned int pcm_frames; if (pcm) { @@ -357,7 +357,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, __be32 *buffe unsigned int data_blocks, unsigned int *syt) { struct amdtp_am824 *p = s->protocol; - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); unsigned int pcm_frames; if (pcm) { diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index 3fc581a5ad62..4a1dc145327b 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -376,7 +376,7 @@ static void update_pcm_pointers(struct amdtp_stream *s, ptr = s->pcm_buffer_pointer + frames; if (ptr >= pcm->runtime->buffer_size) ptr -= pcm->runtime->buffer_size; - ACCESS_ONCE(s->pcm_buffer_pointer) = ptr; + WRITE_ONCE(s->pcm_buffer_pointer, ptr); s->pcm_period_pointer += frames; if (s->pcm_period_pointer >= pcm->runtime->period_size) { @@ -388,7 +388,7 @@ static void update_pcm_pointers(struct amdtp_stream *s, static void pcm_period_tasklet(unsigned long data) { struct amdtp_stream *s = (void *)data; - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); if (pcm) snd_pcm_period_elapsed(pcm); @@ -453,7 +453,7 @@ static int handle_out_packet(struct amdtp_stream *s, s->data_block_counter = (s->data_block_counter + data_blocks) & 0xff; - buffer[0] = cpu_to_be32(ACCESS_ONCE(s->source_node_id_field) | + buffer[0] = cpu_to_be32(READ_ONCE(s->source_node_id_field) | (s->data_block_quadlets << CIP_DBS_SHIFT) | ((s->sph << CIP_SPH_SHIFT) & CIP_SPH_MASK) | s->data_block_counter); @@ -472,7 +472,7 @@ static int handle_out_packet(struct amdtp_stream *s, if (queue_out_packet(s, payload_length) < 0) return -EIO; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm && pcm_frames > 0) update_pcm_pointers(s, pcm, pcm_frames); @@ -504,7 +504,7 @@ static int handle_out_packet_without_header(struct amdtp_stream *s, if (queue_out_packet(s, payload_length) < 0) return -EIO; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm && pcm_frames > 0) update_pcm_pointers(s, pcm, pcm_frames); @@ -621,7 +621,7 @@ end: if (queue_in_packet(s) < 0) return -EIO; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm && pcm_frames > 0) update_pcm_pointers(s, pcm, pcm_frames); @@ -649,7 +649,7 @@ static int handle_in_packet_without_header(struct amdtp_stream *s, if (queue_in_packet(s) < 0) return -EIO; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm && pcm_frames > 0) update_pcm_pointers(s, pcm, pcm_frames); @@ -947,7 +947,7 @@ unsigned long amdtp_stream_pcm_pointer(struct amdtp_stream *s) if (!in_interrupt() && amdtp_stream_running(s)) fw_iso_context_flush_completions(s->context); - return ACCESS_ONCE(s->pcm_buffer_pointer); + return READ_ONCE(s->pcm_buffer_pointer); } EXPORT_SYMBOL(amdtp_stream_pcm_pointer); @@ -977,9 +977,8 @@ EXPORT_SYMBOL(amdtp_stream_pcm_ack); void amdtp_stream_update(struct amdtp_stream *s) { /* Precomputing. 
*/ - ACCESS_ONCE(s->source_node_id_field) = - (fw_parent_device(s->unit)->card->node_id << CIP_SID_SHIFT) & - CIP_SID_MASK; + WRITE_ONCE(s->source_node_id_field, + (fw_parent_device(s->unit)->card->node_id << CIP_SID_SHIFT) & CIP_SID_MASK); } EXPORT_SYMBOL(amdtp_stream_update); @@ -1022,7 +1021,7 @@ void amdtp_stream_pcm_abort(struct amdtp_stream *s) { struct snd_pcm_substream *pcm; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) snd_pcm_stop_xrun(pcm); } diff --git a/sound/firewire/amdtp-stream.h b/sound/firewire/amdtp-stream.h index ed6eafd10992..f9abd8b07ce6 100644 --- a/sound/firewire/amdtp-stream.h +++ b/sound/firewire/amdtp-stream.h @@ -220,7 +220,7 @@ static inline bool amdtp_stream_pcm_running(struct amdtp_stream *s) static inline void amdtp_stream_pcm_trigger(struct amdtp_stream *s, struct snd_pcm_substream *pcm) { - ACCESS_ONCE(s->pcm) = pcm; + WRITE_ONCE(s->pcm, pcm); } static inline bool cip_sfc_is_base_44100(enum cip_sfc sfc) diff --git a/sound/firewire/digi00x/amdtp-dot.c b/sound/firewire/digi00x/amdtp-dot.c index 1453c34ce99f..4a884a335248 100644 --- a/sound/firewire/digi00x/amdtp-dot.c +++ b/sound/firewire/digi00x/amdtp-dot.c @@ -327,7 +327,7 @@ void amdtp_dot_midi_trigger(struct amdtp_stream *s, unsigned int port, struct amdtp_dot *p = s->protocol; if (port < MAX_MIDI_PORTS) - ACCESS_ONCE(p->midi[port]) = midi; + WRITE_ONCE(p->midi[port], midi); } static unsigned int process_tx_data_blocks(struct amdtp_stream *s, @@ -338,7 +338,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, struct snd_pcm_substream *pcm; unsigned int pcm_frames; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) { read_pcm_s32(s, pcm, buffer, data_blocks); pcm_frames = data_blocks; @@ -359,7 +359,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, struct snd_pcm_substream *pcm; unsigned int pcm_frames; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) { write_pcm_s32(s, pcm, buffer, data_blocks); pcm_frames = data_blocks; diff --git a/sound/firewire/fireface/amdtp-ff.c b/sound/firewire/fireface/amdtp-ff.c index 780da9deb2f0..77c7598b61ab 100644 --- a/sound/firewire/fireface/amdtp-ff.c +++ b/sound/firewire/fireface/amdtp-ff.c @@ -108,7 +108,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, unsigned int data_blocks, unsigned int *syt) { - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); unsigned int pcm_frames; if (pcm) { @@ -127,7 +127,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, unsigned int data_blocks, unsigned int *syt) { - struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm); + struct snd_pcm_substream *pcm = READ_ONCE(s->pcm); unsigned int pcm_frames; if (pcm) { diff --git a/sound/firewire/fireface/ff-midi.c b/sound/firewire/fireface/ff-midi.c index 949ee56b4e0e..6a49611ee462 100644 --- a/sound/firewire/fireface/ff-midi.c +++ b/sound/firewire/fireface/ff-midi.c @@ -22,7 +22,7 @@ static int midi_playback_open(struct snd_rawmidi_substream *substream) ff->running_status[substream->number] = 0; ff->rx_midi_error[substream->number] = false; - ACCESS_ONCE(ff->rx_midi_substreams[substream->number]) = substream; + WRITE_ONCE(ff->rx_midi_substreams[substream->number], substream); return 0; } @@ -38,7 +38,7 @@ static int midi_playback_close(struct snd_rawmidi_substream *substream) struct snd_ff *ff = substream->rmidi->private_data; cancel_work_sync(&ff->rx_midi_work[substream->number]); - 
ACCESS_ONCE(ff->rx_midi_substreams[substream->number]) = NULL; + WRITE_ONCE(ff->rx_midi_substreams[substream->number], NULL); return 0; } @@ -52,10 +52,10 @@ static void midi_capture_trigger(struct snd_rawmidi_substream *substream, spin_lock_irqsave(&ff->lock, flags); if (up) - ACCESS_ONCE(ff->tx_midi_substreams[substream->number]) = - substream; + WRITE_ONCE(ff->tx_midi_substreams[substream->number], + substream); else - ACCESS_ONCE(ff->tx_midi_substreams[substream->number]) = NULL; + WRITE_ONCE(ff->tx_midi_substreams[substream->number], NULL); spin_unlock_irqrestore(&ff->lock, flags); } diff --git a/sound/firewire/fireface/ff-transaction.c b/sound/firewire/fireface/ff-transaction.c index dd6c8e839647..332b29f8ed75 100644 --- a/sound/firewire/fireface/ff-transaction.c +++ b/sound/firewire/fireface/ff-transaction.c @@ -12,7 +12,7 @@ static void finish_transmit_midi_msg(struct snd_ff *ff, unsigned int port, int rcode) { struct snd_rawmidi_substream *substream = - ACCESS_ONCE(ff->rx_midi_substreams[port]); + READ_ONCE(ff->rx_midi_substreams[port]); if (rcode_is_permanent_error(rcode)) { ff->rx_midi_error[port] = true; @@ -60,7 +60,7 @@ static inline void fill_midi_buf(struct snd_ff *ff, unsigned int port, static void transmit_midi_msg(struct snd_ff *ff, unsigned int port) { struct snd_rawmidi_substream *substream = - ACCESS_ONCE(ff->rx_midi_substreams[port]); + READ_ONCE(ff->rx_midi_substreams[port]); u8 *buf = (u8 *)ff->msg_buf[port]; int i, len; @@ -159,7 +159,7 @@ static void handle_midi_msg(struct fw_card *card, struct fw_request *request, */ index = (quad >> 8) & 0xff; if (index > 0) { - substream = ACCESS_ONCE(ff->tx_midi_substreams[0]); + substream = READ_ONCE(ff->tx_midi_substreams[0]); if (substream != NULL) { byte = quad & 0xff; snd_rawmidi_receive(substream, &byte, 1); @@ -169,7 +169,7 @@ static void handle_midi_msg(struct fw_card *card, struct fw_request *request, /* Message in second port. 
*/ index = (quad >> 24) & 0xff; if (index > 0) { - substream = ACCESS_ONCE(ff->tx_midi_substreams[1]); + substream = READ_ONCE(ff->tx_midi_substreams[1]); if (substream != NULL) { byte = (quad >> 16) & 0xff; snd_rawmidi_receive(substream, &byte, 1); diff --git a/sound/firewire/isight.c b/sound/firewire/isight.c index 5826aa8362f1..46092fa3ff9b 100644 --- a/sound/firewire/isight.c +++ b/sound/firewire/isight.c @@ -96,7 +96,7 @@ static void isight_update_pointers(struct isight *isight, unsigned int count) ptr += count; if (ptr >= runtime->buffer_size) ptr -= runtime->buffer_size; - ACCESS_ONCE(isight->buffer_pointer) = ptr; + WRITE_ONCE(isight->buffer_pointer, ptr); isight->period_counter += count; if (isight->period_counter >= runtime->period_size) { @@ -111,7 +111,7 @@ static void isight_samples(struct isight *isight, struct snd_pcm_runtime *runtime; unsigned int count1; - if (!ACCESS_ONCE(isight->pcm_running)) + if (!READ_ONCE(isight->pcm_running)) return; runtime = isight->pcm->runtime; @@ -131,7 +131,7 @@ static void isight_samples(struct isight *isight, static void isight_pcm_abort(struct isight *isight) { - if (ACCESS_ONCE(isight->pcm_active)) + if (READ_ONCE(isight->pcm_active)) snd_pcm_stop_xrun(isight->pcm); } @@ -141,7 +141,7 @@ static void isight_dropped_samples(struct isight *isight, unsigned int total) u32 dropped; unsigned int count1; - if (!ACCESS_ONCE(isight->pcm_running)) + if (!READ_ONCE(isight->pcm_running)) return; runtime = isight->pcm->runtime; @@ -293,7 +293,7 @@ static int isight_hw_params(struct snd_pcm_substream *substream, if (err < 0) return err; - ACCESS_ONCE(isight->pcm_active) = true; + WRITE_ONCE(isight->pcm_active, true); return 0; } @@ -331,7 +331,7 @@ static int isight_hw_free(struct snd_pcm_substream *substream) { struct isight *isight = substream->private_data; - ACCESS_ONCE(isight->pcm_active) = false; + WRITE_ONCE(isight->pcm_active, false); mutex_lock(&isight->mutex); isight_stop_streaming(isight); @@ -424,10 +424,10 @@ static int isight_trigger(struct snd_pcm_substream *substream, int cmd) switch (cmd) { case SNDRV_PCM_TRIGGER_START: - ACCESS_ONCE(isight->pcm_running) = true; + WRITE_ONCE(isight->pcm_running, true); break; case SNDRV_PCM_TRIGGER_STOP: - ACCESS_ONCE(isight->pcm_running) = false; + WRITE_ONCE(isight->pcm_running, false); break; default: return -EINVAL; @@ -439,7 +439,7 @@ static snd_pcm_uframes_t isight_pointer(struct snd_pcm_substream *substream) { struct isight *isight = substream->private_data; - return ACCESS_ONCE(isight->buffer_pointer); + return READ_ONCE(isight->buffer_pointer); } static int isight_create_pcm(struct isight *isight) diff --git a/sound/firewire/motu/amdtp-motu.c b/sound/firewire/motu/amdtp-motu.c index 96f0091144bb..f0555a24d90e 100644 --- a/sound/firewire/motu/amdtp-motu.c +++ b/sound/firewire/motu/amdtp-motu.c @@ -310,7 +310,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, if (p->midi_ports) read_midi_messages(s, buffer, data_blocks); - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (data_blocks > 0 && pcm) read_pcm_s32(s, pcm->runtime, buffer, data_blocks); @@ -374,7 +374,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, if (p->midi_ports) write_midi_messages(s, buffer, data_blocks); - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) write_pcm_s32(s, pcm->runtime, buffer, data_blocks); else diff --git a/sound/firewire/oxfw/oxfw-scs1x.c b/sound/firewire/oxfw/oxfw-scs1x.c index 02d595665898..f33497cdc706 100644 --- 
a/sound/firewire/oxfw/oxfw-scs1x.c +++ b/sound/firewire/oxfw/oxfw-scs1x.c @@ -112,7 +112,7 @@ static void handle_hss(struct fw_card *card, struct fw_request *request, } if (length >= 1) { - stream = ACCESS_ONCE(scs->input); + stream = READ_ONCE(scs->input); if (stream) midi_input_packet(scs, stream, data, length); } @@ -183,7 +183,7 @@ static void scs_output_work(struct work_struct *work) if (scs->transaction_running) return; - stream = ACCESS_ONCE(scs->output); + stream = READ_ONCE(scs->output); if (!stream || scs->error) { scs->output_idle = true; wake_up(&scs->idle_wait); @@ -291,9 +291,9 @@ static void midi_capture_trigger(struct snd_rawmidi_substream *stream, int up) if (up) { scs->input_escape_count = 0; - ACCESS_ONCE(scs->input) = stream; + WRITE_ONCE(scs->input, stream); } else { - ACCESS_ONCE(scs->input) = NULL; + WRITE_ONCE(scs->input, NULL); } } @@ -319,10 +319,10 @@ static void midi_playback_trigger(struct snd_rawmidi_substream *stream, int up) scs->transaction_bytes = 0; scs->error = false; - ACCESS_ONCE(scs->output) = stream; + WRITE_ONCE(scs->output, stream); schedule_work(&scs->work); } else { - ACCESS_ONCE(scs->output) = NULL; + WRITE_ONCE(scs->output, NULL); } } static void midi_playback_drain(struct snd_rawmidi_substream *stream) diff --git a/sound/firewire/tascam/amdtp-tascam.c b/sound/firewire/tascam/amdtp-tascam.c index 6aff1fc1c72d..ab482423c165 100644 --- a/sound/firewire/tascam/amdtp-tascam.c +++ b/sound/firewire/tascam/amdtp-tascam.c @@ -124,7 +124,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, { struct snd_pcm_substream *pcm; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (data_blocks > 0 && pcm) read_pcm_s32(s, pcm, buffer, data_blocks); @@ -143,7 +143,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, /* This field is not used. */ *syt = 0x0000; - pcm = ACCESS_ONCE(s->pcm); + pcm = READ_ONCE(s->pcm); if (pcm) write_pcm_s32(s, pcm, buffer, data_blocks); else diff --git a/sound/firewire/tascam/tascam-transaction.c b/sound/firewire/tascam/tascam-transaction.c index 8967c52f5032..2ad692dd4b13 100644 --- a/sound/firewire/tascam/tascam-transaction.c +++ b/sound/firewire/tascam/tascam-transaction.c @@ -148,7 +148,7 @@ static void async_midi_port_callback(struct fw_card *card, int rcode, void *callback_data) { struct snd_fw_async_midi_port *port = callback_data; - struct snd_rawmidi_substream *substream = ACCESS_ONCE(port->substream); + struct snd_rawmidi_substream *substream = READ_ONCE(port->substream); /* This port is closed. */ if (substream == NULL) @@ -173,7 +173,7 @@ static void midi_port_work(struct work_struct *work) { struct snd_fw_async_midi_port *port = container_of(work, struct snd_fw_async_midi_port, work); - struct snd_rawmidi_substream *substream = ACCESS_ONCE(port->substream); + struct snd_rawmidi_substream *substream = READ_ONCE(port->substream); int generation; /* Under transacting or error state. 
*/ @@ -282,7 +282,7 @@ static void handle_midi_tx(struct fw_card *card, struct fw_request *request, bytes = 3; } - substream = ACCESS_ONCE(tscm->tx_midi_substreams[port]); + substream = READ_ONCE(tscm->tx_midi_substreams[port]); if (substream != NULL) snd_rawmidi_receive(substream, b + 1, bytes); } diff --git a/sound/soc/xtensa/xtfpga-i2s.c b/sound/soc/xtensa/xtfpga-i2s.c index 8382ffa3bcaf..2472144b329e 100644 --- a/sound/soc/xtensa/xtfpga-i2s.c +++ b/sound/soc/xtensa/xtfpga-i2s.c @@ -165,7 +165,7 @@ static bool xtfpga_pcm_push_tx(struct xtfpga_i2s *i2s) tx_substream = rcu_dereference(i2s->tx_substream); tx_active = tx_substream && snd_pcm_running(tx_substream); if (tx_active) { - unsigned tx_ptr = ACCESS_ONCE(i2s->tx_ptr); + unsigned tx_ptr = READ_ONCE(i2s->tx_ptr); unsigned new_tx_ptr = i2s->tx_fn(i2s, tx_substream->runtime, tx_ptr); @@ -437,7 +437,7 @@ static int xtfpga_pcm_trigger(struct snd_pcm_substream *substream, int cmd) case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: - ACCESS_ONCE(i2s->tx_ptr) = 0; + WRITE_ONCE(i2s->tx_ptr, 0); rcu_assign_pointer(i2s->tx_substream, substream); xtfpga_pcm_refill_fifo(i2s); break; @@ -459,7 +459,7 @@ static snd_pcm_uframes_t xtfpga_pcm_pointer(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct xtfpga_i2s *i2s = runtime->private_data; - snd_pcm_uframes_t pos = ACCESS_ONCE(i2s->tx_ptr); + snd_pcm_uframes_t pos = READ_ONCE(i2s->tx_ptr); return pos < runtime->buffer_size ? pos : 0; } diff --git a/sound/usb/bcd2000/bcd2000.c b/sound/usb/bcd2000/bcd2000.c index 7371e5b06035..fc579f330601 100644 --- a/sound/usb/bcd2000/bcd2000.c +++ b/sound/usb/bcd2000/bcd2000.c @@ -108,7 +108,7 @@ static void bcd2000_midi_handle_input(struct bcd2000 *bcd2k, unsigned int payload_length, tocopy; struct snd_rawmidi_substream *midi_receive_substream; - midi_receive_substream = ACCESS_ONCE(bcd2k->midi_receive_substream); + midi_receive_substream = READ_ONCE(bcd2k->midi_receive_substream); if (!midi_receive_substream) return; @@ -139,7 +139,7 @@ static void bcd2000_midi_send(struct bcd2000 *bcd2k) BUILD_BUG_ON(sizeof(device_cmd_prefix) >= BUFSIZE); - midi_out_substream = ACCESS_ONCE(bcd2k->midi_out_substream); + midi_out_substream = READ_ONCE(bcd2k->midi_out_substream); if (!midi_out_substream) return; diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h index 328eeceec709..96e2d06cb031 100644 --- a/tools/arch/x86/include/asm/atomic.h +++ b/tools/arch/x86/include/asm/atomic.h @@ -24,7 +24,7 @@ */ static inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h index 5e9738f97bf3..97427e700e3b 100644 --- a/tools/include/asm-generic/atomic-gcc.h +++ b/tools/include/asm-generic/atomic-gcc.h @@ -21,7 +21,7 @@ */ static inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 33b5e6cdf38c..d19e11b68de7 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -378,7 +378,7 @@ struct addr_filters { static inline u64 auxtrace_mmap__read_snapshot_head(struct auxtrace_mmap *mm) { struct perf_event_mmap_page *pc = mm->userpg; - u64 head = ACCESS_ONCE(pc->aux_head); + u64 head = READ_ONCE(pc->aux_head); /* Ensure all reads are done after we read the 
head */ rmb(); @@ -389,7 +389,7 @@ static inline u64 auxtrace_mmap__read_head(struct auxtrace_mmap *mm) { struct perf_event_mmap_page *pc = mm->userpg; #if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT) - u64 head = ACCESS_ONCE(pc->aux_head); + u64 head = READ_ONCE(pc->aux_head); #else u64 head = __sync_val_compare_and_swap(&pc->aux_head, 0, 0); #endif diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 47b5e7dbcb18..aae9645c7122 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -113,7 +113,7 @@ int __perf_session__set_tracepoints_handlers(struct perf_session *session, extern volatile int session_done; -#define session_done() ACCESS_ONCE(session_done) +#define session_done() READ_ONCE(session_done) int perf_session__deliver_synth_event(struct perf_session *session, union perf_event *event, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9deb5a245b83..ce507ae1d4f5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2302,7 +2302,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; } else if (pass && i > last_boosted_vcpu) break; - if (!ACCESS_ONCE(vcpu->preempted)) + if (!READ_ONCE(vcpu->preempted)) continue; if (vcpu == me) continue; -- cgit v1.2.3 From 24efd94bc38290dc1d9775a1e767ed4685d8a79b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 20 Oct 2017 16:31:27 +0200 Subject: gpio: mmio: Make pin2mask() a private business The pin2mask() vtable entry introduced an indirect function call in every gpiochip callback for a generic MMIO GPIO chip, which was not exactly efficient. (Maybe link-time optimization could get rid of it, I don't know.) After removing all external calls into this API we can make it a boolean flag in struct gpio_chip and sink the function into the gpio-mmio driver, yielding encapsulation and potential speedups.
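For illustration, a minimal sketch of the pattern this change applies (simplified, made-up types — not the actual driver code): the per-chip function pointer becomes a flag tested in a file-local helper that the compiler is free to inline:

	#include <stdbool.h>

	/* Before (sketch): every accessor dispatched through a pointer. */
	struct chip_old {
		unsigned long (*pin2mask)(struct chip_old *gc, unsigned int pin);
		unsigned int bits;
	};

	/* After (sketch): a plain flag plus a static helper. */
	struct chip_new {
		bool be_bits;		/* big-endian bit order, set once at init */
		unsigned int bits;	/* register width; stands in for bgpio_bits */
	};

	static unsigned long line2mask(const struct chip_new *gc, unsigned int line)
	{
		if (gc->be_bits)	/* bit (bits - 1) represents line 0 */
			return 1UL << (gc->bits - 1 - line);
		return 1UL << line;	/* bit 0 represents line 0 */
	}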
Cc: Anton Vorontsov Signed-off-by: Linus Walleij --- drivers/gpio/gpio-mmio.c | 45 +++++++++++++++++++-------------------------- include/linux/gpio/driver.h | 8 ++++---- 2 files changed, 23 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index f7da40e46c55..63bb48878de1 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -126,20 +126,16 @@ static unsigned long bgpio_read32be(void __iomem *reg) return ioread32be(reg); } -static unsigned long bgpio_pin2mask(struct gpio_chip *gc, unsigned int pin) +static unsigned long bgpio_line2mask(struct gpio_chip *gc, unsigned int line) { - return BIT(pin); -} - -static unsigned long bgpio_pin2mask_be(struct gpio_chip *gc, - unsigned int pin) -{ - return BIT(gc->bgpio_bits - 1 - pin); + if (gc->be_bits) + return BIT(gc->bgpio_bits - 1 - line); + return BIT(line); } static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) { - unsigned long pinmask = gc->pin2mask(gc, gpio); + unsigned long pinmask = bgpio_line2mask(gc, gpio); if (gc->bgpio_dir & pinmask) return !!(gc->read_reg(gc->reg_set) & pinmask); @@ -149,7 +145,7 @@ static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) { - return !!(gc->read_reg(gc->reg_dat) & gc->pin2mask(gc, gpio)); + return !!(gc->read_reg(gc->reg_dat) & bgpio_line2mask(gc, gpio)); } static void bgpio_set_none(struct gpio_chip *gc, unsigned int gpio, int val) @@ -158,7 +154,7 @@ static void bgpio_set_none(struct gpio_chip *gc, unsigned int gpio, int val) static void bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - unsigned long mask = gc->pin2mask(gc, gpio); + unsigned long mask = bgpio_line2mask(gc, gpio); unsigned long flags; spin_lock_irqsave(&gc->bgpio_lock, flags); @@ -176,7 +172,7 @@ static void bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static void bgpio_set_with_clear(struct gpio_chip *gc, unsigned int gpio, int val) { - unsigned long mask = gc->pin2mask(gc, gpio); + unsigned long mask = bgpio_line2mask(gc, gpio); if (val) gc->write_reg(gc->reg_set, mask); @@ -186,7 +182,7 @@ static void bgpio_set_with_clear(struct gpio_chip *gc, unsigned int gpio, static void bgpio_set_set(struct gpio_chip *gc, unsigned int gpio, int val) { - unsigned long mask = gc->pin2mask(gc, gpio); + unsigned long mask = bgpio_line2mask(gc, gpio); unsigned long flags; spin_lock_irqsave(&gc->bgpio_lock, flags); @@ -216,9 +212,9 @@ static void bgpio_multiple_get_masks(struct gpio_chip *gc, break; if (__test_and_clear_bit(i, mask)) { if (test_bit(i, bits)) - *set_mask |= gc->pin2mask(gc, i); + *set_mask |= bgpio_line2mask(gc, i); else - *clear_mask |= gc->pin2mask(gc, i); + *clear_mask |= bgpio_line2mask(gc, i); } } } @@ -294,7 +290,7 @@ static int bgpio_dir_in(struct gpio_chip *gc, unsigned int gpio) spin_lock_irqsave(&gc->bgpio_lock, flags); - gc->bgpio_dir &= ~gc->pin2mask(gc, gpio); + gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio); gc->write_reg(gc->reg_dir, gc->bgpio_dir); spin_unlock_irqrestore(&gc->bgpio_lock, flags); @@ -305,7 +301,7 @@ static int bgpio_dir_in(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) { /* Return 0 if output, 1 of input */ - return !(gc->read_reg(gc->reg_dir) & gc->pin2mask(gc, gpio)); + return !(gc->read_reg(gc->reg_dir) & bgpio_line2mask(gc, gpio)); } static int bgpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) @@ -316,7 +312,7 @@ static int 
bgpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) spin_lock_irqsave(&gc->bgpio_lock, flags); - gc->bgpio_dir |= gc->pin2mask(gc, gpio); + gc->bgpio_dir |= bgpio_line2mask(gc, gpio); gc->write_reg(gc->reg_dir, gc->bgpio_dir); spin_unlock_irqrestore(&gc->bgpio_lock, flags); @@ -330,7 +326,7 @@ static int bgpio_dir_in_inv(struct gpio_chip *gc, unsigned int gpio) spin_lock_irqsave(&gc->bgpio_lock, flags); - gc->bgpio_dir |= gc->pin2mask(gc, gpio); + gc->bgpio_dir |= bgpio_line2mask(gc, gpio); gc->write_reg(gc->reg_dir, gc->bgpio_dir); spin_unlock_irqrestore(&gc->bgpio_lock, flags); @@ -346,7 +342,7 @@ static int bgpio_dir_out_inv(struct gpio_chip *gc, unsigned int gpio, int val) spin_lock_irqsave(&gc->bgpio_lock, flags); - gc->bgpio_dir &= ~gc->pin2mask(gc, gpio); + gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio); gc->write_reg(gc->reg_dir, gc->bgpio_dir); spin_unlock_irqrestore(&gc->bgpio_lock, flags); @@ -357,12 +353,11 @@ static int bgpio_dir_out_inv(struct gpio_chip *gc, unsigned int gpio, int val) static int bgpio_get_dir_inv(struct gpio_chip *gc, unsigned int gpio) { /* Return 0 if output, 1 if input */ - return !!(gc->read_reg(gc->reg_dir) & gc->pin2mask(gc, gpio)); + return !!(gc->read_reg(gc->reg_dir) & bgpio_line2mask(gc, gpio)); } static int bgpio_setup_accessors(struct device *dev, struct gpio_chip *gc, - bool bit_be, bool byte_be) { @@ -406,8 +401,6 @@ static int bgpio_setup_accessors(struct device *dev, return -EINVAL; } - gc->pin2mask = bit_be ? bgpio_pin2mask_be : bgpio_pin2mask; - return 0; } @@ -531,8 +524,8 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev, if (ret) return ret; - ret = bgpio_setup_accessors(dev, gc, flags & BGPIOF_BIG_ENDIAN, - flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); + gc->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); + ret = bgpio_setup_accessors(dev, gc, flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); if (ret) return ret; diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index bda95a9b7b8c..ed04fa2a00a8 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -67,9 +67,9 @@ struct module; * registers. * @read_reg: reader function for generic GPIO * @write_reg: writer function for generic GPIO - * @pin2mask: some generic GPIO controllers work with the big-endian bits - * notation, e.g. in a 8-bits register, GPIO7 is the least significant - * bit. This callback assigns the right bit mask. + * @be_bits: if the generic GPIO has big endian bit order (bit 31 is representing + * line 0, bit 30 is line 1 ... bit 0 is line 31) this is set to true by the + * generic GPIO core. It is for internal housekeeping only. * @reg_dat: data (in) register for generic GPIO * @reg_set: output set register (out=high) for generic GPIO * @reg_clr: output clear register (out=low) for generic GPIO @@ -151,7 +151,7 @@ struct gpio_chip { #if IS_ENABLED(CONFIG_GPIO_GENERIC) unsigned long (*read_reg)(void __iomem *reg); void (*write_reg)(void __iomem *reg, unsigned long data); - unsigned long (*pin2mask)(struct gpio_chip *gc, unsigned int pin); + bool be_bits; void __iomem *reg_dat; void __iomem *reg_set; void __iomem *reg_clr; -- cgit v1.2.3 From 6f0397d7e100f3b3978d6ebb6b2dea29ee7c4a95 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:55:58 +0900 Subject: locking/lockdep: Provide empty lockdep_map structure for !CONFIG_LOCKDEP After this patch the lockdep_map structure takes no space if lockdep is disabled, reducing the number of #ifdefs in unrelated kernel code. 
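A hedged sketch of why that helps (illustrative names only; relies on GCC's zero-size empty structs, as the kernel does): the map can now be embedded unconditionally, so only the type definition is conditional rather than every user:

	#ifdef CONFIG_LOCKDEP
	struct lockdep_map_sketch { const char *name; /* ...tracking state... */ };
	#else
	struct lockdep_map_sketch { };	/* zero bytes in GNU C when lockdep is off */
	#endif

	struct my_object {
		int payload;
		struct lockdep_map_sketch dep_map;	/* no #ifdef at the use site */
	};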
Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index bfa8e0b0d6f1..b6662d05bcda 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -527,6 +527,11 @@ static inline void lockdep_on(void) */ struct lock_class_key { }; +/* + * The lockdep_map takes no space if lockdep is disabled: + */ +struct lockdep_map { }; + #define lockdep_depth(tsk) (0) #define lockdep_is_held_type(l, r) (1) -- cgit v1.2.3 From 24208435e343679b21502fb90786084dfaf15369 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:55:59 +0900 Subject: locking/lockdep, sched/completions: Change the prefix of lock name for completion variables CONFIG_LOCKDEP_COMPLETIONS uses "(complete)" as the lock-name prefix for completion variables. However, the prefix should be a noun, so use "(completion)" instead. Suggested-by: Ingo Molnar Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-4-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/completion.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/completion.h b/include/linux/completion.h index cae5400022a3..91218036b1c8 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -53,7 +53,7 @@ static inline void complete_release_commit(struct completion *x) do { \ static struct lock_class_key __key; \ lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - "(complete)" #x, \ + "(completion)" #x, \ &__key, 0); \ __init_completion(x); \ } while (0) @@ -67,7 +67,7 @@ static inline void complete_release_commit(struct completion *x) {} #ifdef CONFIG_LOCKDEP_COMPLETIONS #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ - STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) } + STATIC_CROSS_LOCKDEP_MAP_INIT("(completion)" #work, &(work)) } #else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } -- cgit v1.2.3 From a7967bc31584bd282682981295861e7bcba19e65 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:03 +0900 Subject: sched/completions: Add support for initializing completions with lockdep_map Sometimes we want to initialize completions with separate lockdep maps to assign lock classes as desired.
For example, the workqueue code needs to directly manage lockdep maps, since only the code is aware of how to classify lockdep maps properly. Provide additional macros that initialize completions in that way. Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-8-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/completion.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/completion.h b/include/linux/completion.h index 91218036b1c8..4da49916ef3f 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -49,6 +49,13 @@ static inline void complete_release_commit(struct completion *x) lock_commit_crosslock((struct lockdep_map *)&x->map); } +#define init_completion_map(x, m) \ +do { \ + lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ + (m)->name, (m)->key, 0); \ + __init_completion(x); \ +} while (0) + #define init_completion(x) \ do { \ static struct lock_class_key __key; \ @@ -58,6 +65,7 @@ do { \ __init_completion(x); \ } while (0) #else +#define init_completion_map(x, m) __init_completion(x) #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} @@ -73,6 +81,9 @@ static inline void complete_release_commit(struct completion *x) {} { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } #endif +#define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \ + (*({ init_completion_map(&(work), &(map)); &(work); })) + #define COMPLETION_INITIALIZER_ONSTACK(work) \ (*({ init_completion(&work); &work; })) @@ -102,8 +113,11 @@ static inline void complete_release_commit(struct completion *x) {} #ifdef CONFIG_LOCKDEP # define DECLARE_COMPLETION_ONSTACK(work) \ struct completion work = COMPLETION_INITIALIZER_ONSTACK(work) +# define DECLARE_COMPLETION_ONSTACK_MAP(work, map) \ + struct completion work = COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) #else # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) +# define DECLARE_COMPLETION_ONSTACK_MAP(work, map) DECLARE_COMPLETION(work) #endif /** -- cgit v1.2.3 From fd1a5b04dfb899f84ddeb8acdaea6b98283df1e5 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:04 +0900 Subject: workqueue: Remove now redundant lock acquisitions wrt. workqueue flushes The workqueue code added manual lock acquisition annotations to catch deadlocks. After lockdep crossrelease was introduced, some of those became redundant, since wait_for_completion() already does the acquisition and tracking. Remove the duplicate annotations.
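Schematically (condensed from the kernel/workqueue.c hunks below, not a complete excerpt), the flush path goes from a hand-written annotation pair plus a plain completion to a completion that carries the work item's lockdep map, so the wait itself records the dependency:

	/* Before: teach lockdep about the dependency by hand, then wait. */
	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);
	...
	wait_for_completion(&barr.done);

	/* After: the barrier completion shares the work's lockdep map, so
	 * wait_for_completion() records the same dependency on its own. */
	init_completion_map(&barr.done, &target->lockdep_map);
	...
	wait_for_completion(&barr.done);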
Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-9-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 19 +++---------------- 2 files changed, 5 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1c49431f3121..c8a572cb49be 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -218,7 +218,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ - lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0); \ + lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) @@ -398,7 +398,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, static struct lock_class_key __key; \ const char *__lock_name; \ \ - __lock_name = #fmt#args; \ + __lock_name = "(wq_completion)"#fmt#args; \ \ __alloc_workqueue_key((fmt), (flags), (max_active), \ &__key, __lock_name, ##args); \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 39831b2f3c5f..160fdc6e839a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2497,15 +2497,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - /* - * Explicitly init the crosslock for wq_barrier::done, make its lock - * key a subkey of the corresponding work. As a result we won't - * build a dependency between wq_barrier::done and unrelated work. - */ - lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, - "(complete)wq_barr::done", - target->lockdep_map.key, 1); - __init_completion(&barr->done); + init_completion_map(&barr->done, &target->lockdep_map); + barr->task = current; /* @@ -2611,16 +2604,13 @@ void flush_workqueue(struct workqueue_struct *wq) struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), }; int next_color; if (WARN_ON(!wq_online)) return; - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->mutex); /* @@ -2883,9 +2873,6 @@ bool flush_work(struct work_struct *work) if (WARN_ON(!wq_online)) return false; - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); -- cgit v1.2.3 From ccc8708790273811db24676223b040710793cba7 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Tue, 17 Oct 2017 18:01:13 +0300 Subject: IB/mlx5: Allow creation of a multi-packet RQ Allow creation of a multi-packet receive queue. 
In order to create a multi-packet RQ, the following fields in the mlx5_ib_rwq should be set: - log_num_strides: Log of number of strides per WQE - single_stride_log_num_of_bytes: Log of a single stride size - two_byte_shift_en: When enabled, hardware pads 2 bytes of zeros before writing the message to memory (e.g. for the IP alignment). Signed-off-by: Noa Osherovich Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 +++ drivers/infiniband/hw/mlx5/qp.c | 52 ++++++++++++++++++++++++++++++------ include/linux/mlx5/mlx5_ifc.h | 1 + include/uapi/rdma/mlx5-abi.h | 8 +++++- 4 files changed, 56 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 8de40852818b..e7deaa08535b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -254,6 +254,7 @@ struct mlx5_ib_wq { enum mlx5_ib_wq_flags { MLX5_IB_WQ_FLAGS_DELAY_DROP = 0x1, + MLX5_IB_WQ_FLAGS_STRIDING_RQ = 0x2, }; #define MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES 9 @@ -269,6 +270,9 @@ struct mlx5_ib_rwq { u32 log_rq_size; u32 rq_page_offset; u32 log_page_size; + u32 log_num_strides; + u32 two_byte_shift_en; + u32 single_stride_log_num_of_bytes; struct ib_umem *umem; size_t buf_size; unsigned int page_shift; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 37a0976240fd..d209c684d729 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4706,9 +4706,19 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); MLX5_SET(rqc, rqc, flush_in_error_en, 1); wq = MLX5_ADDR_OF(rqc, rqc, wq); - MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, wq_type, + rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ ? 
+ MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ : MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride); + if (rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ) { + MLX5_SET(wq, wq, two_byte_shift_en, rwq->two_byte_shift_en); + MLX5_SET(wq, wq, log_wqe_stride_size, + rwq->single_stride_log_num_of_bytes - + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES); + MLX5_SET(wq, wq, log_wqe_num_of_strides, rwq->log_num_strides - + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES); + } MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size); MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn); MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset); @@ -4790,7 +4800,8 @@ static int prepare_user_rq(struct ib_pd *pd, int err; size_t required_cmd_sz; - required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); + required_cmd_sz = offsetof(typeof(ucmd), single_stride_log_num_of_bytes) + + sizeof(ucmd.single_stride_log_num_of_bytes); if (udata->inlen < required_cmd_sz) { mlx5_ib_dbg(dev, "invalid inlen\n"); return -EINVAL; @@ -4808,14 +4819,39 @@ static int prepare_user_rq(struct ib_pd *pd, return -EFAULT; } - if (ucmd.comp_mask) { + if (ucmd.comp_mask & (~MLX5_IB_CREATE_WQ_STRIDING_RQ)) { mlx5_ib_dbg(dev, "invalid comp mask\n"); return -EOPNOTSUPP; - } - - if (ucmd.reserved) { - mlx5_ib_dbg(dev, "invalid reserved\n"); - return -EOPNOTSUPP; + } else if (ucmd.comp_mask & MLX5_IB_CREATE_WQ_STRIDING_RQ) { + if (!MLX5_CAP_GEN(dev->mdev, striding_rq)) { + mlx5_ib_dbg(dev, "Striding RQ is not supported\n"); + return -EOPNOTSUPP; + } + if ((ucmd.single_stride_log_num_of_bytes < + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES) || + (ucmd.single_stride_log_num_of_bytes > + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES)) { + mlx5_ib_dbg(dev, "Invalid log stride size (%u. Range is %u - %u)\n", + ucmd.single_stride_log_num_of_bytes, + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES, + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES); + return -EINVAL; + } + if ((ucmd.single_wqe_log_num_of_strides > + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) || + (ucmd.single_wqe_log_num_of_strides < + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES)) { + mlx5_ib_dbg(dev, "Invalid log num strides (%u. 
Range is %u - %u)\n", + ucmd.single_wqe_log_num_of_strides, + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES, + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES); + return -EINVAL; + } + rwq->single_stride_log_num_of_bytes = + ucmd.single_stride_log_num_of_bytes; + rwq->log_num_strides = ucmd.single_wqe_log_num_of_strides; + rwq->two_byte_shift_en = !!ucmd.two_byte_shift_en; + rwq->create_flags |= MLX5_IB_WQ_FLAGS_STRIDING_RQ; } err = set_user_rq_size(dev, init_attr, &ucmd, rwq); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 69772347f866..db655db45b77 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -744,6 +744,7 @@ enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ = 0x2, + MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ = 0x3, }; enum { diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 0832d9502200..b1d5b87ba3fd 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -308,6 +308,10 @@ struct mlx5_ib_alloc_mw { __u16 reserved2; }; +enum mlx5_ib_create_wq_mask { + MLX5_IB_CREATE_WQ_STRIDING_RQ = (1 << 0), +}; + struct mlx5_ib_create_wq { __u64 buf_addr; __u64 db_addr; @@ -316,7 +320,9 @@ struct mlx5_ib_create_wq { __u32 user_index; __u32 flags; __u32 comp_mask; - __u32 reserved; + __u32 single_stride_log_num_of_bytes; + __u32 single_wqe_log_num_of_strides; + __u32 two_byte_shift_en; }; struct mlx5_ib_create_ah_resp { -- cgit v1.2.3 From 0ff8e79ca7c81fafa3f5c91a1b58efc85cbc2302 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 19 Oct 2017 08:25:51 +0300 Subject: IB/mlx5: Add 128B CQE compression and padding HW bits Adding new bits in mlx5_ifc_cmd_hca_cap to get the hardware capabilities for: - compression_128: Support 128B CQE compression - cqe_128_always: Support 128B CQE padding Signed-off-by: Guy Levi Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index db655db45b77..d49928c314ed 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1047,7 +1047,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 num_of_uars_per_page[0x20]; u8 reserved_at_540[0x40]; - u8 reserved_at_580[0x3f]; + u8 reserved_at_580[0x3d]; + u8 cqe_128_always[0x1]; + u8 cqe_compression_128[0x1]; u8 cqe_compression[0x1]; u8 cqe_compression_timeout[0x10]; -- cgit v1.2.3 From 7a0c8f4244e9ec7a630563d294b211342b46223d Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 19 Oct 2017 08:25:53 +0300 Subject: IB/mlx5: Support padded 128B CQE feature In some benchmarks and some CPU architectures, writing the CQE on a full cache line size improves performance by saving memory access operations (read-modify-write) relative to partial cache line change. This patch lets the user to configure the device to pad the CQE up to 128B in case its content is less than 128B. Currently the driver supports only padding for a CQE size of 128B. 
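To make the capability/flag interplay concrete, here is a minimal sketch (not from the patch; the helper name is invented) of the gating a provider applies before honouring a padded-CQE request, mirroring the create_cq_user() logic in the diff below:

#include <linux/errno.h>
#include <linux/mlx5/driver.h>
#include <uapi/rdma/mlx5-abi.h>

/* Returns 1 if padding should be enabled, 0 if not requested, <0 on error. */
static int cqe_pad_requested(struct mlx5_core_dev *mdev,
			     const struct mlx5_ib_create_cq *ucmd,
			     int cqe_size)
{
	if (!(ucmd->flags & MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD))
		return 0;

	/* Padding is only defined for 128B CQEs on HW with cqe_128_always. */
	if (cqe_size != 128 || !MLX5_CAP_GEN(mdev, cqe_128_always))
		return -EOPNOTSUPP;

	return 1;
}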
Signed-off-by: Guy Levi Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/cq.c | 27 +++++++++++++++++++++++---- drivers/infiniband/hw/mlx5/main.c | 4 ++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 5 +++++ include/linux/mlx5/cq.h | 6 ++++-- include/uapi/rdma/mlx5-abi.h | 7 ++++++- 5 files changed, 42 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 51871f049c57..01b218a3c277 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -754,13 +754,13 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, int err; ucmdlen = udata->inlen < sizeof(ucmd) ? - (sizeof(ucmd) - sizeof(ucmd.reserved)) : sizeof(ucmd); + (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) return -EFAULT; if (ucmdlen == sizeof(ucmd) && - ucmd.reserved != 0) + (ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD))) return -EINVAL; if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) @@ -830,6 +830,19 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, ilog2(ucmd.cqe_comp_res_format)); } + if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD) { + if (*cqe_size != 128 || + !MLX5_CAP_GEN(dev->mdev, cqe_128_always)) { + err = -EOPNOTSUPP; + mlx5_ib_warn(dev, + "CQE padding is not supported for CQE size of %dB!\n", + *cqe_size); + goto err_cqb; + } + + cq->private_flags |= MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD; + } + return 0; err_cqb: @@ -989,7 +1002,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->cqe_size = cqe_size; cqc = MLX5_ADDR_OF(create_cq_in, cqb, cq_context); - MLX5_SET(cqc, cqc, cqe_sz, cqe_sz_to_mlx_sz(cqe_size)); + MLX5_SET(cqc, cqc, cqe_sz, + cqe_sz_to_mlx_sz(cqe_size, + cq->private_flags & + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD)); MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); MLX5_SET(cqc, cqc, uar_page, index); MLX5_SET(cqc, cqc, c_eqn, eqn); @@ -1339,7 +1355,10 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) MLX5_SET(cqc, cqc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); - MLX5_SET(cqc, cqc, cqe_sz, cqe_sz_to_mlx_sz(cqe_size)); + MLX5_SET(cqc, cqc, cqe_sz, + cqe_sz_to_mlx_sz(cqe_size, + cq->private_flags & + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD)); MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); MLX5_SET(modify_cq_in, in, op_mod, MLX5_CQ_OPMOD_RESIZE); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b9337562aa90..1edd41e3be1b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -826,9 +826,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (field_avail(typeof(resp), flags, uhw->outlen)) { resp.response_length += sizeof(resp.flags); + if (MLX5_CAP_GEN(mdev, cqe_compression_128)) resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP; + + if (MLX5_CAP_GEN(mdev, cqe_128_always)) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD; } if (field_avail(typeof(resp), sw_parsing_caps, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index e7deaa08535b..137f2116911f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -444,6 +444,10 @@ struct mlx5_shared_mr_info { struct ib_umem *umem; }; +enum mlx5_ib_cq_pr_flags { + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD = 1 << 0, +}; + struct mlx5_ib_cq { struct ib_cq ibcq; struct 
mlx5_core_cq mcq; @@ -466,6 +470,7 @@ struct mlx5_ib_cq { struct list_head wc_list; enum ib_cq_notify_flags notify_flags; struct work_struct notify_work; + u16 private_flags; /* Use mlx5_ib_cq_pr_flags */ }; struct mlx5_ib_wc { diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 95898847c7d4..cc718e245b1e 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -125,11 +125,13 @@ struct mlx5_cq_modify_params { enum { CQE_SIZE_64 = 0, CQE_SIZE_128 = 1, + CQE_SIZE_128_PAD = 2, }; -static inline int cqe_sz_to_mlx_sz(u8 size) +static inline int cqe_sz_to_mlx_sz(u8 size, int padding_128_en) { - return size == 64 ? CQE_SIZE_64 : CQE_SIZE_128; + return padding_128_en ? CQE_SIZE_128_PAD : + size == 64 ? CQE_SIZE_64 : CQE_SIZE_128; } static inline void mlx5_cq_set_ci(struct mlx5_core_cq *cq) diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index a8fc1f0956d0..201a60f032dd 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -206,6 +206,7 @@ struct mlx5_ib_striding_rq_caps { enum mlx5_ib_query_dev_resp_flags { /* Support 128B CQE compression */ MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP = 1 << 0, + MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD = 1 << 1, }; struct mlx5_ib_query_device_resp { @@ -221,13 +222,17 @@ struct mlx5_ib_query_device_resp { struct mlx5_ib_striding_rq_caps striding_rq_caps; }; +enum mlx5_ib_create_cq_flags { + MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD = 1 << 0, +}; + struct mlx5_ib_create_cq { __u64 buf_addr; __u64 db_addr; __u32 cqe_size; __u8 cqe_comp_en; __u8 cqe_comp_res_format; - __u16 reserved; /* explicit padding (optional on i386) */ + __u16 flags; }; struct mlx5_ib_create_cq_resp { -- cgit v1.2.3 From 4d350f1f89ee8ea84be1ef09717bf392fa5a3b45 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 19 Oct 2017 08:25:54 +0300 Subject: IB/mlx5: Update tunnel offloads bits This patch updates the mlx5_ifc with the following: - Fix tunnel_stateless_gre typo. - max_geneve_opt_len - Maximum geneve options length. - tunnel_stateless_geneve_rx - If set, receive Stateless Offloads for Geneve tunneled (inner) packets are supported. Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d49928c314ed..d4c29c183d28 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -614,7 +614,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 swp[0x1]; u8 swp_csum[0x1]; u8 swp_lso[0x1]; - u8 reserved_at_23[0x1d]; + u8 reserved_at_23[0x1b]; + u8 max_geneve_opt_len[0x1]; + u8 tunnel_stateless_geneve_rx[0x1]; u8 reserved_at_40[0x10]; u8 lro_min_mss_size[0x10]; -- cgit v1.2.3 From 8ac0d9a81edf2ef4a2268b65b802a6b856dc77e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:35:02 -0600 Subject: elevator: allow name aliases Since we now lookup elevator types with the appropriate multiqueue capability, allow schedulers to register with an alias alongside the real name. This is in preparation for allowing 'mq-deadline' to register an alias of 'deadline' as well. 
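As an illustration of the new field (a sketch only; the actual mq-deadline change lands in a separate patch), a scheduler would register an alias like this:

#include <linux/module.h>
#include <linux/elevator.h>

/* Sketch: an mq scheduler that also answers to its legacy name. */
static struct elevator_type mq_deadline_sched = {
	/* .ops, .elevator_attrs and friends omitted for brevity */
	.elevator_name	= "mq-deadline",
	.elevator_alias	= "deadline",
	.uses_mq	= true,
	.elevator_owner	= THIS_MODULE,
};

static int __init mq_deadline_sched_init(void)
{
	return elv_register(&mq_deadline_sched);
}
module_init(mq_deadline_sched_init);

With elevator_match(), both "mq-deadline" and "deadline" then resolve to this type, still subject to the mq-capability check in elevator_find().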
Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 23 +++++++++++++++++------ include/linux/elevator.h | 1 + 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/elevator.c b/block/elevator.c index 77856bf29568..7bda083d5968 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,6 +83,16 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); +static bool elevator_match(const struct elevator_type *e, const char *name) +{ + if (!strcmp(e->elevator_name, name)) + return true; + if (e->elevator_alias && !strcmp(e->elevator_alias, name)) + return true; + + return false; +} + /* * Return scheduler with name 'name' and with matching 'mq capability */ @@ -91,7 +101,7 @@ static struct elevator_type *elevator_find(const char *name, bool mq) struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) + if (elevator_match(e, name) && (mq == e->uses_mq)) return e; } @@ -922,9 +932,9 @@ int elv_register(struct elevator_type *e) spin_unlock(&elv_list_lock); /* print pretty message */ - if (!strcmp(e->elevator_name, chosen_elevator) || + if (elevator_match(e, chosen_elevator) || (!*chosen_elevator && - !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) + elevator_match(e, CONFIG_DEFAULT_IOSCHED))) def = " (default)"; printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, @@ -1077,8 +1087,7 @@ static int __elevator_change(struct request_queue *q, const char *name) if (!e) return -EINVAL; - if (q->elevator && - !strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { elevator_put(e); return 0; } @@ -1114,6 +1123,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_queue *e = q->elevator; struct elevator_type *elv = NULL; struct elevator_type *__e; + bool uses_mq = q->mq_ops != NULL; int len = 0; if (!queue_is_rq_based(q)) @@ -1126,7 +1136,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name) && + (__e->uses_mq == uses_mq)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 5bc8f8682a3e..6df8b14f1f6a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,6 +144,7 @@ struct elevator_type size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; + const char *elevator_alias; struct module *elevator_owner; bool uses_mq; #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.3 From 8bb705e3e79d84e77edd4499e74483dd96a4626c Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 24 Oct 2017 14:40:26 -0500 Subject: PCI: Add pci_resize_resource() for resizing BARs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a pci_resize_resource() interface to allow device drivers to resize BARs of their devices. This is useful for devices with large local storage, e.g., graphics devices. These devices often only expose 256MB BARs initially to be compatible with 32-bit systems. This function only tries to reprogram the windows of the bridge directly above the requesting device and only the BAR of the same type (usually mem, 64bit, prefetchable). 
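A condensed driver-side sketch of the resize sequence spelled out below (illustrative only: BAR 0 is assumed to be the resizable BAR, the size argument is a resizable-BAR size encoding where 0 = 1 MB, 1 = 2 MB, and so on, and error unwinding is omitted):

#include <linux/pci.h>

static int my_resize_bar0(struct pci_dev *pdev, int rebar_size)
{
	u16 cmd;
	int ret;

	/* 1. Disable memory decoding while BARs move. */
	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(pdev, PCI_COMMAND, cmd & ~PCI_COMMAND_MEMORY);

	/* 2. Release every BAR that may move, including the one resized. */
	pci_release_resource(pdev, 0);

	/* 3. Resize BAR 0. */
	ret = pci_resize_resource(pdev, 0, rebar_size);

	/* 4. Reassign locations for the released BARs that were not resized. */
	pci_assign_unassigned_bus_resources(pdev->bus);

	/* 5. Re-enable memory decoding. */
	pci_write_config_word(pdev, PCI_COMMAND, cmd | PCI_COMMAND_MEMORY);

	return ret;
}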
This is done to avoid disturbing other drivers by changing the BARs of their devices. Drivers should use the following sequence to resize their BARs: 1. Disable memory decoding of the device using the PCI cfg dword. 2. Use pci_release_resource() to release all BARs which can move during the resize, including the one you want to resize. 3. Call pci_resize_resource() for each BAR you want to resize. 4. Call pci_assign_unassigned_bus_resources() to reassign new locations for all BARs which are not resized, but could move. 5. If everything worked as expected, enable memory decoding in the device again using the PCI cfg dword. Signed-off-by: Christian König Signed-off-by: Bjorn Helgaas --- drivers/pci/setup-bus.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/setup-res.c | 58 +++++++++++++++++++++++++++++ include/linux/pci.h | 3 ++ 3 files changed, 159 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 73dda0d59bbc..6826a893288a 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1913,6 +1913,104 @@ enable_all: } EXPORT_SYMBOL_GPL(pci_assign_unassigned_bridge_resources); +int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type) +{ + struct pci_dev_resource *dev_res; + struct pci_dev *next; + LIST_HEAD(saved); + LIST_HEAD(added); + LIST_HEAD(failed); + unsigned int i; + int ret; + + /* Walk to the root hub, releasing bridge BARs when possible */ + next = bridge; + do { + bridge = next; + for (i = PCI_BRIDGE_RESOURCES; i < PCI_BRIDGE_RESOURCE_END; + i++) { + struct resource *res = &bridge->resource[i]; + + if ((res->flags ^ type) & PCI_RES_TYPE_MASK) + continue; + + /* Ignore BARs which are still in use */ + if (res->child) + continue; + + ret = add_to_list(&saved, bridge, res, 0, 0); + if (ret) + goto cleanup; + + dev_info(&bridge->dev, "BAR %d: releasing %pR\n", + i, res); + + if (res->parent) + release_resource(res); + res->start = 0; + res->end = 0; + break; + } + if (i == PCI_BRIDGE_RESOURCE_END) + break; + + next = bridge->bus ? bridge->bus->self : NULL; + } while (next); + + if (list_empty(&saved)) + return -ENOENT; + + __pci_bus_size_bridges(bridge->subordinate, &added); + __pci_bridge_assign_resources(bridge, &added, &failed); + BUG_ON(!list_empty(&added)); + + if (!list_empty(&failed)) { + ret = -ENOSPC; + goto cleanup; + } + + list_for_each_entry(dev_res, &saved, list) { + /* Skip the bridge we just assigned resources for. 
*/ + if (bridge == dev_res->dev) + continue; + + bridge = dev_res->dev; + pci_setup_bridge(bridge->subordinate); + } + + free_list(&saved); + return 0; + +cleanup: + /* restore size and flags */ + list_for_each_entry(dev_res, &failed, list) { + struct resource *res = dev_res->res; + + res->start = dev_res->start; + res->end = dev_res->end; + res->flags = dev_res->flags; + } + free_list(&failed); + + /* Revert to the old configuration */ + list_for_each_entry(dev_res, &saved, list) { + struct resource *res = dev_res->res; + + bridge = dev_res->dev; + i = res - bridge->resource; + + res->start = dev_res->start; + res->end = dev_res->end; + res->flags = dev_res->flags; + + pci_claim_resource(bridge, i); + pci_setup_bridge(bridge->subordinate); + } + free_list(&saved); + + return ret; +} + void pci_assign_unassigned_bus_resources(struct pci_bus *bus) { struct pci_dev *dev; diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index e576e1a8d978..bf0089ef2177 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -396,6 +396,64 @@ int pci_reassign_resource(struct pci_dev *dev, int resno, resource_size_t addsiz return 0; } +void pci_release_resource(struct pci_dev *dev, int resno) +{ + struct resource *res = dev->resource + resno; + + dev_info(&dev->dev, "BAR %d: releasing %pR\n", resno, res); + release_resource(res); + res->end = resource_size(res) - 1; + res->start = 0; + res->flags |= IORESOURCE_UNSET; +} +EXPORT_SYMBOL(pci_release_resource); + +int pci_resize_resource(struct pci_dev *dev, int resno, int size) +{ + struct resource *res = dev->resource + resno; + int old, ret; + u32 sizes; + u16 cmd; + + /* Make sure the resource isn't assigned before resizing it. */ + if (!(res->flags & IORESOURCE_UNSET)) + return -EBUSY; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + if (cmd & PCI_COMMAND_MEMORY) + return -EBUSY; + + sizes = pci_rebar_get_possible_sizes(dev, resno); + if (!sizes) + return -ENOTSUPP; + + if (!(sizes & BIT(size))) + return -EINVAL; + + old = pci_rebar_get_current_size(dev, resno); + if (old < 0) + return old; + + ret = pci_rebar_set_size(dev, resno, size); + if (ret) + return ret; + + res->end = res->start + pci_rebar_size_to_bytes(size) - 1; + + /* Check if the new config works by trying to assign everything. 
*/ + ret = pci_reassign_bridge_resources(dev->bus->self, res->flags); + if (ret) + goto error_resize; + + return 0; + +error_resize: + pci_rebar_set_size(dev, resno, old); + res->end = res->start + pci_rebar_size_to_bytes(old) - 1; + return ret; +} +EXPORT_SYMBOL(pci_resize_resource); + int pci_enable_resources(struct pci_dev *dev, int mask) { u16 cmd, old_cmd; diff --git a/include/linux/pci.h b/include/linux/pci.h index f4f8ee5a7362..55e9fe6dd577 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1106,6 +1106,8 @@ void pci_reset_bridge_secondary_bus(struct pci_dev *dev); void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align); +void pci_release_resource(struct pci_dev *dev, int resno); +int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size); int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); @@ -1185,6 +1187,7 @@ void pci_assign_unassigned_resources(void); void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge); void pci_assign_unassigned_bus_resources(struct pci_bus *bus); void pci_assign_unassigned_root_bus_resources(struct pci_bus *bus); +int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type); void pdev_enable_device(struct pci_dev *); int pci_enable_resources(struct pci_dev *, int mask); void pci_assign_irq(struct pci_dev *dev); -- cgit v1.2.3 From 88ca59d1aaf28c25b47a9f933090e480ba6dc92a Mon Sep 17 00:00:00 2001 From: Girish Moodalbail Date: Wed, 25 Oct 2017 12:26:43 -0700 Subject: macvlan: remove unused fields in struct macvlan_dev commit 635b8c8ecdd2 ("tap: Renaming tap related APIs, data structures, macros") captured all the tap related fields into a new struct tap_dev. However, it failed to remove those fields from struct macvlan_dev. Those fields are currently unused and must be removed. While there I moved the comment for MAX_TAP_QUEUES to the right place. Fixes: 635b8c8ecdd27142 (tap: Renaming tap related APIs, data structures, macros) Signed-off-by: Girish Moodalbail Signed-off-by: David S. Miller --- include/linux/if_macvlan.h | 15 --------------- include/linux/if_tap.h | 4 ++++ 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 10e319f41fb1..e13b369df02b 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -10,13 +10,6 @@ #include struct macvlan_port; -struct macvtap_queue; - -/* - * Maximum times a macvtap device can be opened. This can be used to - * configure the number of receive queue, e.g. for multiqueue virtio. - */ -#define MAX_TAP_QUEUES 256 #define MACVLAN_MC_FILTER_BITS 8 #define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS) @@ -35,14 +28,6 @@ struct macvlan_dev { netdev_features_t set_features; enum macvlan_mode mode; u16 flags; - /* This array tracks active taps. 
*/ - struct tap_queue __rcu *taps[MAX_TAP_QUEUES]; - /* This list tracks all taps (both enabled and disabled) */ - struct list_head queue_list; - int numvtaps; - int numqueues; - netdev_features_t tap_features; - int minor; int nest_level; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 4837157da0dc..d1b5173ad8f0 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -22,6 +22,10 @@ static inline struct skb_array *tap_get_skb_array(struct file *f) #include #include +/* + * Maximum times a tap device can be opened. This can be used to + * configure the number of receive queue, e.g. for multiqueue virtio. + */ #define MAX_TAP_QUEUES 256 struct tap_queue; -- cgit v1.2.3 From e319e1fbd9d42420ab6eec0bfd75eb9ad7ca63b1 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:05 +0900 Subject: block, locking/lockdep: Assign a lock_class per gendisk used for wait_for_completion() Darrick posted the following warning and Dave Chinner analyzed it: > ====================================================== > WARNING: possible circular locking dependency detected > 4.14.0-rc1-fixes #1 Tainted: G W > ------------------------------------------------------ > loop0/31693 is trying to acquire lock: > (&(&ip->i_mmaplock)->mr_lock){++++}, at: [] xfs_ilock+0x23c/0x330 [xfs] > > but now in release context of a crosslock acquired at the following: > ((complete)&ret.event){+.+.}, at: [] submit_bio_wait+0x7f/0xb0 > > which lock already depends on the new lock. > > the existing dependency chain (in reverse order) is: > > -> #2 ((complete)&ret.event){+.+.}: > lock_acquire+0xab/0x200 > wait_for_completion_io+0x4e/0x1a0 > submit_bio_wait+0x7f/0xb0 > blkdev_issue_zeroout+0x71/0xa0 > xfs_bmapi_convert_unwritten+0x11f/0x1d0 [xfs] > xfs_bmapi_write+0x374/0x11f0 [xfs] > xfs_iomap_write_direct+0x2ac/0x430 [xfs] > xfs_file_iomap_begin+0x20d/0xd50 [xfs] > iomap_apply+0x43/0xe0 > dax_iomap_rw+0x89/0xf0 > xfs_file_dax_write+0xcc/0x220 [xfs] > xfs_file_write_iter+0xf0/0x130 [xfs] > __vfs_write+0xd9/0x150 > vfs_write+0xc8/0x1c0 > SyS_write+0x45/0xa0 > entry_SYSCALL_64_fastpath+0x1f/0xbe > > -> #1 (&xfs_nondir_ilock_class){++++}: > lock_acquire+0xab/0x200 > down_write_nested+0x4a/0xb0 > xfs_ilock+0x263/0x330 [xfs] > xfs_setattr_size+0x152/0x370 [xfs] > xfs_vn_setattr+0x6b/0x90 [xfs] > notify_change+0x27d/0x3f0 > do_truncate+0x5b/0x90 > path_openat+0x237/0xa90 > do_filp_open+0x8a/0xf0 > do_sys_open+0x11c/0x1f0 > entry_SYSCALL_64_fastpath+0x1f/0xbe > > -> #0 (&(&ip->i_mmaplock)->mr_lock){++++}: > up_write+0x1c/0x40 > xfs_iunlock+0x1d0/0x310 [xfs] > xfs_file_fallocate+0x8a/0x310 [xfs] > loop_queue_work+0xb7/0x8d0 > kthread_worker_fn+0xb9/0x1f0 > > Chain exists of: > &(&ip->i_mmaplock)->mr_lock --> &xfs_nondir_ilock_class --> (complete)&ret.event > > Possible unsafe locking scenario by crosslock: > > CPU0 CPU1 > ---- ---- > lock(&xfs_nondir_ilock_class); > lock((complete)&ret.event); > lock(&(&ip->i_mmaplock)->mr_lock); > unlock((complete)&ret.event); > > *** DEADLOCK *** The warning is a false positive, caused by the fact that all wait_for_completion()s in submit_bio_wait() are waiting with the same lock class. However, some bios have nothing to do with others, for example in the case of loop devices, there's no direct connection between the bios of an upper device and the bios of a lower device(=loop device). The safest way to assign different lock classes to different devices is to do it for each gendisk. 
In other words, this patch assigns a lockdep_map per gendisk and uses it when initializing completion in submit_bio_wait(). Analyzed-by: Dave Chinner Reported-by: Darrick J. Wong Signed-off-by: Byungchul Park Reviewed-by: Jens Axboe Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-10-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- block/bio.c | 2 +- block/genhd.c | 10 ++-------- include/linux/genhd.h | 22 ++++++++++++++++++++-- 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 5e901bfc0576..cc60213e56d8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -935,7 +935,7 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK(done); + DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; diff --git a/block/genhd.c b/block/genhd.c index dd305c65ffb0..630c0da6cfcf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1354,13 +1354,7 @@ dev_t blk_lookup_devt(const char *name, int partno) } EXPORT_SYMBOL(blk_lookup_devt); -struct gendisk *alloc_disk(int minors) -{ - return alloc_disk_node(minors, NUMA_NO_NODE); -} -EXPORT_SYMBOL(alloc_disk); - -struct gendisk *alloc_disk_node(int minors, int node_id) +struct gendisk *__alloc_disk_node(int minors, int node_id) { struct gendisk *disk; struct disk_part_tbl *ptbl; @@ -1411,7 +1405,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } return disk; } -EXPORT_SYMBOL(alloc_disk_node); +EXPORT_SYMBOL(__alloc_disk_node); struct kobject *get_disk(struct gendisk *disk) { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd675..19d18710546a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -206,6 +206,7 @@ struct gendisk { #endif /* CONFIG_BLK_DEV_INTEGRITY */ int node_id; struct badblocks *bb; + struct lockdep_map lockdep_map; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) @@ -590,8 +591,7 @@ extern void __delete_partition(struct percpu_ref *); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); -extern struct gendisk *alloc_disk_node(int minors, int node_id); -extern struct gendisk *alloc_disk(int minors); +extern struct gendisk *__alloc_disk_node(int minors, int node_id); extern struct kobject *get_disk(struct gendisk *disk); extern void put_disk(struct gendisk *disk); extern void blk_register_region(dev_t devt, unsigned long range, @@ -615,6 +615,24 @@ extern ssize_t part_fail_store(struct device *dev, const char *buf, size_t count); #endif /* CONFIG_FAIL_MAKE_REQUEST */ +#define alloc_disk_node(minors, node_id) \ +({ \ + static struct lock_class_key __key; \ + const char *__name; \ + struct gendisk *__disk; \ + \ + __name = "(gendisk_completion)"#minors"("#node_id")"; \ + \ + __disk = __alloc_disk_node(minors, node_id); \ + \ + if (__disk) \ + lockdep_init_map(&__disk->lockdep_map, __name, &__key, 0); \ + \ + __disk; \ +}) + +#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) + static inline int 
hd_ref_init(struct hd_struct *part) { if (percpu_ref_init(&part->ref, __delete_partition, 0, -- cgit v1.2.3 From f1d981eaecf8ace68ec1d15bf05f28a4887ea6fb Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Mon, 23 Oct 2017 10:32:08 +0900 Subject: PM / devfreq: Use the available min/max frequency The commit a76caf55e5b35 ("thermal: Add devfreq cooling") allows OPPs to be disabled as a cooling measure. As a result, both update_devfreq() and {min|max}_freq_show() have to consider the 'opp->available' status of each OPP. Therefore, this patch adds 'scaling_{min|max}_freq' to struct devfreq to indicate the minimum and maximum frequency still available after OPP adjustments such as dev_pm_opp_{disable|enable}(). The 'scaling_{min|max}_freq' values are used in both update_devfreq() and {min|max}_freq_show(). Signed-off-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/devfreq.c | 40 ++++++++++++++++++++++++++++++++-------- include/linux/devfreq.h | 4 ++++ 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index b6ba24e5db0d..ee3e7cee30b6 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -28,6 +28,9 @@ #include #include "governor.h" +#define MAX(a,b) ((a > b) ? a : b) +#define MIN(a,b) ((a < b) ? a : b) + static struct class *devfreq_class; /* @@ -255,7 +258,7 @@ static int devfreq_notify_transition(struct devfreq *devfreq, int update_devfreq(struct devfreq *devfreq) { struct devfreq_freqs freqs; - unsigned long freq, cur_freq; + unsigned long freq, cur_freq, min_freq, max_freq; int err = 0; u32 flags = 0; @@ -273,19 +276,21 @@ int update_devfreq(struct devfreq *devfreq) return err; /* - * Adjust the frequency with user freq and QoS. + * Adjust the frequency with user freq, QoS and available freq.
* * List from the highest priority * max_freq * min_freq */ + max_freq = MIN(devfreq->scaling_max_freq, devfreq->max_freq); + min_freq = MAX(devfreq->scaling_min_freq, devfreq->min_freq); - if (devfreq->min_freq && freq < devfreq->min_freq) { - freq = devfreq->min_freq; + if (min_freq && freq < min_freq) { + freq = min_freq; flags &= ~DEVFREQ_FLAG_LEAST_UPPER_BOUND; /* Use GLB */ } - if (devfreq->max_freq && freq > devfreq->max_freq) { - freq = devfreq->max_freq; + if (max_freq && freq > max_freq) { + freq = max_freq; flags |= DEVFREQ_FLAG_LEAST_UPPER_BOUND; /* Use LUB */ } @@ -494,6 +499,19 @@ static int devfreq_notifier_call(struct notifier_block *nb, unsigned long type, int ret; mutex_lock(&devfreq->lock); + + devfreq->scaling_min_freq = find_available_min_freq(devfreq); + if (!devfreq->scaling_min_freq) { + mutex_unlock(&devfreq->lock); + return -EINVAL; + } + + devfreq->scaling_max_freq = find_available_max_freq(devfreq); + if (!devfreq->scaling_max_freq) { + mutex_unlock(&devfreq->lock); + return -EINVAL; + } + ret = update_devfreq(devfreq); mutex_unlock(&devfreq->lock); @@ -593,6 +611,7 @@ struct devfreq *devfreq_add_device(struct device *dev, err = -EINVAL; goto err_dev; } + devfreq->scaling_min_freq = devfreq->min_freq; devfreq->max_freq = find_available_max_freq(devfreq); if (!devfreq->max_freq) { @@ -600,6 +619,7 @@ struct devfreq *devfreq_add_device(struct device *dev, err = -EINVAL; goto err_dev; } + devfreq->scaling_max_freq = devfreq->max_freq; dev_set_name(&devfreq->dev, "devfreq%d", atomic_inc_return(&devfreq_no)); @@ -1127,7 +1147,9 @@ unlock: static ssize_t min_freq_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", to_devfreq(dev)->min_freq); + struct devfreq *df = to_devfreq(dev); + + return sprintf(buf, "%lu\n", MAX(df->scaling_min_freq, df->min_freq)); } static ssize_t max_freq_store(struct device *dev, struct device_attribute *attr, @@ -1161,7 +1183,9 @@ static DEVICE_ATTR_RW(min_freq); static ssize_t max_freq_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", to_devfreq(dev)->max_freq); + struct devfreq *df = to_devfreq(dev); + + return sprintf(buf, "%lu\n", MIN(df->scaling_max_freq, df->max_freq)); } static DEVICE_ATTR_RW(max_freq); diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 597294e0cc40..997a9eb34191 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -120,6 +120,8 @@ struct devfreq_dev_profile { * touch this. * @min_freq: Limit minimum frequency requested by user (0: none) * @max_freq: Limit maximum frequency requested by user (0: none) + * @scaling_min_freq: Limit minimum frequency requested by OPP interface + * @scaling_max_freq: Limit maximum frequency requested by OPP interface * @stop_polling: devfreq polling status of a device. * @total_trans: Number of devfreq transitions * @trans_table: Statistics of devfreq transitions @@ -153,6 +155,8 @@ struct devfreq { unsigned long min_freq; unsigned long max_freq; + unsigned long scaling_min_freq; + unsigned long scaling_max_freq; bool stop_polling; /* information for device frequency transition */ -- cgit v1.2.3 From 416b46a2627ae8de1466f90787dede6f9c5a1bfa Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Mon, 23 Oct 2017 10:32:10 +0900 Subject: PM / devfreq: Show the all available frequencies The commit a76caf55e5b35 ("thermal: Add devfreq cooling") allows the devfreq device to use the cooling device. 
When cooling is required, devfreq_cooling.c disables OPP entries with dev_pm_opp_disable(). As a result, the 'available_frequencies'[1] sysfs node no longer shows all of the available frequencies. [1] /sys/class/devfreq/.../available_frequencies So, this patch uses the 'freq_table' in 'struct devfreq_dev_profile' to show all available frequencies. - If 'freq_table' is NULL, the devfreq core initializes it from the OPP values. - If 'freq_table' is initialized, the devfreq core just uses the 'freq_table'. This patch also documents the required sort order of 'freq_table'. Signed-off-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/devfreq.c | 16 +++++----------- include/linux/devfreq.h | 5 +++-- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index b2920cd2b78e..381f92e5e794 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -1194,22 +1194,16 @@ static ssize_t available_frequencies_show(struct device *d, char *buf) { struct devfreq *df = to_devfreq(d); - struct device *dev = df->dev.parent; - struct dev_pm_opp *opp; ssize_t count = 0; - unsigned long freq = 0; + int i; - do { - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - if (IS_ERR(opp)) - break; + mutex_lock(&df->lock); - dev_pm_opp_put(opp); + for (i = 0; i < df->profile->max_state; i++) count += scnprintf(&buf[count], (PAGE_SIZE - count - 2), - "%lu ", freq); - freq++; - } while (1); + "%lu ", df->profile->freq_table[i]); + mutex_unlock(&df->lock); /* Truncate the trailing space */ if (count) count--; diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 997a9eb34191..19520625ea94 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -84,8 +84,9 @@ struct devfreq_dev_status { * from devfreq_remove_device() call. If the user * has registered devfreq->nb at a notifier-head, * this is the time to unregister it. - * @freq_table: Optional list of frequencies to support statistics. - * @max_state: The size of freq_table. + * @freq_table: Optional list of frequencies to support statistics + * and freq_table must be generated in ascending order. + * @max_state: The size of freq_table. */ struct devfreq_dev_profile { unsigned long initial_freq; -- cgit v1.2.3 From aa7c352f9841ab3fee5bf1de127a45e6310124a6 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Mon, 23 Oct 2017 10:32:12 +0900 Subject: PM / devfreq: Define the constant governor name Previously, each devfreq device passed the governor name as a bare string when registering itself. To prevent mistakes caused by using a wrong governor name, this patch defines the governor names as constants and uses them instead of string literals.
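A registration sketch using the new constant (illustrative only; the profile and probe function here are placeholders, not from the patch):

#include <linux/devfreq.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static struct devfreq_dev_profile my_profile;	/* filled in by the driver */

static int my_probe(struct platform_device *pdev)
{
	struct devfreq *df;

	/* The constant replaces the error-prone "simple_ondemand" string. */
	df = devm_devfreq_add_device(&pdev->dev, &my_profile,
				     DEVFREQ_GOV_SIMPLE_ONDEMAND, NULL);

	return PTR_ERR_OR_ZERO(df);
}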
Signed-off-by: Chanwoo Choi Signed-off-by: MyungJoo Ham Cc: Kukjin Kim Cc: Krzysztof Kozlowski Cc: linux-samsung-soc@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org --- drivers/devfreq/exynos-bus.c | 5 +++-- drivers/devfreq/governor_passive.c | 2 +- drivers/devfreq/governor_performance.c | 2 +- drivers/devfreq/governor_powersave.c | 2 +- drivers/devfreq/governor_simpleondemand.c | 2 +- drivers/devfreq/governor_userspace.c | 2 +- drivers/devfreq/rk3399_dmc.c | 2 +- include/linux/devfreq.h | 7 +++++++ 8 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c index 49f68929e024..c25658b26598 100644 --- a/drivers/devfreq/exynos-bus.c +++ b/drivers/devfreq/exynos-bus.c @@ -436,7 +436,8 @@ static int exynos_bus_probe(struct platform_device *pdev) ondemand_data->downdifferential = 5; /* Add devfreq device to monitor and handle the exynos bus */ - bus->devfreq = devm_devfreq_add_device(dev, profile, "simple_ondemand", + bus->devfreq = devm_devfreq_add_device(dev, profile, + DEVFREQ_GOV_SIMPLE_ONDEMAND, ondemand_data); if (IS_ERR(bus->devfreq)) { dev_err(dev, "failed to add devfreq device\n"); @@ -488,7 +489,7 @@ passive: passive_data->parent = parent_devfreq; /* Add devfreq device for exynos bus with passive governor */ - bus->devfreq = devm_devfreq_add_device(dev, profile, "passive", + bus->devfreq = devm_devfreq_add_device(dev, profile, DEVFREQ_GOV_PASSIVE, passive_data); if (IS_ERR(bus->devfreq)) { dev_err(dev, diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c index 673ad8cc9a1d..3bc29acbd54e 100644 --- a/drivers/devfreq/governor_passive.c +++ b/drivers/devfreq/governor_passive.c @@ -183,7 +183,7 @@ static int devfreq_passive_event_handler(struct devfreq *devfreq, } static struct devfreq_governor devfreq_passive = { - .name = "passive", + .name = DEVFREQ_GOV_PASSIVE, .immutable = 1, .get_target_freq = devfreq_passive_get_target_freq, .event_handler = devfreq_passive_event_handler, diff --git a/drivers/devfreq/governor_performance.c b/drivers/devfreq/governor_performance.c index c72f942f30a8..4d23ecfbd948 100644 --- a/drivers/devfreq/governor_performance.c +++ b/drivers/devfreq/governor_performance.c @@ -42,7 +42,7 @@ static int devfreq_performance_handler(struct devfreq *devfreq, } static struct devfreq_governor devfreq_performance = { - .name = "performance", + .name = DEVFREQ_GOV_PERFORMANCE, .get_target_freq = devfreq_performance_func, .event_handler = devfreq_performance_handler, }; diff --git a/drivers/devfreq/governor_powersave.c b/drivers/devfreq/governor_powersave.c index 0c6bed567e6d..0c42f23249ef 100644 --- a/drivers/devfreq/governor_powersave.c +++ b/drivers/devfreq/governor_powersave.c @@ -39,7 +39,7 @@ static int devfreq_powersave_handler(struct devfreq *devfreq, } static struct devfreq_governor devfreq_powersave = { - .name = "powersave", + .name = DEVFREQ_GOV_POWERSAVE, .get_target_freq = devfreq_powersave_func, .event_handler = devfreq_powersave_handler, }; diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c index ae72ba5e78df..28e0f2de7100 100644 --- a/drivers/devfreq/governor_simpleondemand.c +++ b/drivers/devfreq/governor_simpleondemand.c @@ -125,7 +125,7 @@ static int devfreq_simple_ondemand_handler(struct devfreq *devfreq, } static struct devfreq_governor devfreq_simple_ondemand = { - .name = "simple_ondemand", + .name = DEVFREQ_GOV_SIMPLE_ONDEMAND, 
.get_target_freq = devfreq_simple_ondemand_func, .event_handler = devfreq_simple_ondemand_handler, }; diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c index 77028c27593c..080607c3f34d 100644 --- a/drivers/devfreq/governor_userspace.c +++ b/drivers/devfreq/governor_userspace.c @@ -87,7 +87,7 @@ static struct attribute *dev_entries[] = { NULL, }; static const struct attribute_group dev_attr_group = { - .name = "userspace", + .name = DEVFREQ_GOV_USERSPACE, .attrs = dev_entries, }; diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c index 1b89ebbad02c..5dfbfa3cc878 100644 --- a/drivers/devfreq/rk3399_dmc.c +++ b/drivers/devfreq/rk3399_dmc.c @@ -431,7 +431,7 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) data->devfreq = devm_devfreq_add_device(dev, &rk3399_devfreq_dmc_profile, - "simple_ondemand", + DEVFREQ_GOV_SIMPLE_ONDEMAND, &data->ondemand_data); if (IS_ERR(data->devfreq)) return PTR_ERR(data->devfreq); diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 19520625ea94..3aae5b3af87c 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -19,6 +19,13 @@ #define DEVFREQ_NAME_LEN 16 +/* DEVFREQ governor name */ +#define DEVFREQ_GOV_SIMPLE_ONDEMAND "simple_ondemand" +#define DEVFREQ_GOV_PERFORMANCE "performance" +#define DEVFREQ_GOV_POWERSAVE "powersave" +#define DEVFREQ_GOV_USERSPACE "userspace" +#define DEVFREQ_GOV_PASSIVE "passive" + /* DEVFREQ notifier interface */ #define DEVFREQ_TRANSITION_NOTIFIER (0) -- cgit v1.2.3 From 60e2a7780793bae0debc275a9ccd57f7da0cf195 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Oct 2017 11:01:45 +0200 Subject: tcp: TCP experimental option for SMC The SMC protocol [1] relies on the use of a new TCP experimental option [2, 3]. With this option, SMC capabilities are exchanged between peers during the TCP three way handshake. This patch adds support for this experimental option to TCP. References: [1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609 [2] Shared Use of TCP Experimental Options RFC 6994: https://tools.ietf.org/rfc/rfc6994.txt [3] IANA ExID SMCR: http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-exids Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 9 +++++-- include/net/inet_sock.h | 3 ++- include/net/tcp.h | 7 ++++++ net/ipv4/tcp.c | 6 +++++ net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 19 +++++++++++++++ net/ipv4/tcp_output.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--- 7 files changed, 136 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 173a7c2f9636..8c431385b272 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -98,7 +98,8 @@ struct tcp_options_received { tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ - sack_ok : 4, /* SACK seen on SYN packet */ + sack_ok : 3, /* SACK seen on SYN packet */ + smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 num_sacks; /* Number of SACK blocks */ @@ -110,6 +111,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; +#if IS_ENABLED(CONFIG_SMC) + rx_opt->smc_ok = 0; +#endif } /* This is the max number of SACKS that we'll generate and process. It's safe @@ -229,7 +233,8 @@ struct tcp_sock { syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ + syn_smc:1; /* SYN includes SMC */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 425752f768d2..c49938d1481a 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -92,7 +92,8 @@ struct inet_request_sock { wscale_ok : 1, ecn_ok : 1, acked : 1, - no_srccheck: 1; + no_srccheck: 1, + smc_ok : 1; kmemcheck_bitfield_end(flags); u32 ir_mark; union { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2392f74074e7..285bc82dea41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 +#define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths @@ -203,6 +204,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 +#define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. 
*/ #define TCPOLEN_TSTAMP_ALIGNED 12 @@ -213,6 +215,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 +#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -2108,4 +2111,8 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); } + +#if IS_ENABLED(CONFIG_SMC) +extern struct static_key_false tcp_have_smc; +#endif #endif /* _TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8f36277e82e9..f6e1c00e300e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include #include +#include #include #include @@ -302,6 +303,11 @@ EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +#if IS_ENABLED(CONFIG_SMC) +DEFINE_STATIC_KEY_FALSE(tcp_have_smc); +EXPORT_SYMBOL(tcp_have_smc); +#endif + /* * Current number of TCP sockets. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 893286db4623..337f6011528a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,6 +76,8 @@ #include #include #include +#include +#include int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; @@ -3737,6 +3739,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } +static void smc_parse_options(const struct tcphdr *th, + struct tcp_options_received *opt_rx, + const unsigned char *ptr, + int opsize) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (th->syn && !(opsize & 1) && + opsize >= TCPOLEN_EXP_SMC_BASE && + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + opt_rx->smc_ok = 1; + } +#endif +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3844,6 +3861,9 @@ void tcp_parse_options(const struct net *net, tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); + else + smc_parse_options(th, opt_rx, ptr, + opsize); break; } @@ -5598,6 +5618,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return false; } +static void smc_check_reset_syn(struct tcp_sock *tp) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && !tp->rx_opt.smc_ok) + tp->syn_smc = 0; + } +#endif +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5704,6 +5734,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. 
*/ tp->copied_seq = tp->rcv_nxt; + smc_check_reset_syn(tp); + smp_mb(); tcp_finish_connect(sk, skb); @@ -6157,6 +6189,9 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); +#if IS_ENABLED(CONFIG_SMC) + ireq->smc_ok = rx_opt->smc_ok; +#endif } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a952357054f4..056009f1c14f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -416,6 +417,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +static void smc_check_reset_syn_req(struct tcp_sock *oldtp, + struct request_sock *req, + struct tcp_sock *newtp) +{ +#if IS_ENABLED(CONFIG_SMC) + struct inet_request_sock *ireq; + + if (static_branch_unlikely(&tcp_have_smc)) { + ireq = inet_rsk(req); + if (oldtp->syn_smc && !ireq->smc_ok) + newtp->syn_smc = 0; + } +#endif +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -433,6 +449,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1f01f4c9c738..c8fc512e0bbb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -422,6 +423,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) +#define OPTION_SMC (1 << 9) + +static void smc_options_write(__be32 *ptr, u16 *options) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (unlikely(OPTION_SMC & *options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } + } +#endif +} struct tcp_out_options { u16 options; /* bit field of OPTION_* */ @@ -540,6 +557,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (len + 3) >> 2; } + + smc_options_write(ptr, &options); +} + +static void smc_set_option(const struct tcp_sock *tp, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif +} + +static void smc_set_option_cond(const struct tcp_sock *tp, + const struct inet_request_sock *ireq, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && ireq->smc_ok) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif } /* Compute TCP options for SYN packets. 
This is not the final @@ -607,11 +659,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + smc_set_option(tp, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct request_sock *req, +static unsigned int tcp_synack_options(const struct sock *sk, + struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, @@ -667,6 +722,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, } } + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -3195,8 +3252,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + - sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); -- cgit v1.2.3 From d41bf8c9deaed1a90b18d3ffc5639d4c19f0259a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Oct 2017 16:18:27 -0700 Subject: cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat The basic cpu stat is currently shown with "cpu." prefix in cgroup.stat, and the same information is duplicated in cpu.stat when cpu controller is enabled. This is ugly and not very scalable as we want to expand the coverage of stat information which is always available. This patch makes cgroup core always create "cpu.stat" file and show the basic cpu stat there and calls the cpu controller to show the extra stats when enabled. This ensures that the same information isn't presented in multiple places and makes future expansion of basic stats easier. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) --- Documentation/cgroup-v2.txt | 15 ++++------- include/linux/cgroup-defs.h | 2 ++ include/linux/cgroup.h | 2 -- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup.c | 60 +++++++++++++++++++++++++++++++++++++++-- kernel/cgroup/stat.c | 10 +++---- kernel/sched/core.c | 13 +++------ 7 files changed, 75 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 0bbdc720dd7c..779211fbb69f 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -886,15 +886,6 @@ All cgroup core files are prefixed with "cgroup." A dying cgroup can consume system resources not exceeding limits, which were active at the moment of cgroup deletion. - cpu.usage_usec - CPU time consumed in the subtree. - - cpu.user_usec - User CPU time consumed in the subtree. - - cpu.system_usec - System CPU time consumed in the subtree. - Controllers =========== @@ -915,12 +906,16 @@ All time durations are in microseconds. cpu.stat A read-only flat-keyed file which exists on non-root cgroups. + This file exists whether the controller is enabled or not. 
- It reports the following six stats: + It always reports the following three stats: - usage_usec - user_usec - system_usec + + and the following three when the controller is enabled: + - nr_periods - nr_throttled - throttled_usec diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3e55bbd31ad1..ada6df7b1f55 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -569,6 +569,8 @@ struct cgroup_subsys { void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); + int (*css_extra_stat_show)(struct seq_file *seq, + struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 328a70ce0e23..03cad08b09d1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -703,8 +703,6 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif -void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix); - void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index fa642c99586a..4dc317090920 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -205,6 +205,7 @@ int cgroup_task_count(const struct cgroup *cgrp); void cgroup_stat_flush(struct cgroup *cgrp); int cgroup_stat_init(struct cgroup *cgrp); void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_show_cputime(struct seq_file *seq); void cgroup_stat_boot(void); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7975b20f1fd1..d9773e49a1b4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -463,6 +463,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return &cgrp->self; } +/** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css associated with @ss. If the css doesn't exist + * or is offline, %NULL is returned. 
+ */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (!css || !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest @@ -3311,11 +3333,40 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); - cgroup_stat_show_cputime(seq, "cpu."); - return 0; } +static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) +{ + struct cgroup_subsys *ss = cgroup_subsys[ssid]; + struct cgroup_subsys_state *css; + int ret; + + if (!ss->css_extra_stat_show) + return 0; + + css = cgroup_tryget_css(cgrp, ss); + if (!css) + return 0; + + ret = ss->css_extra_stat_show(seq, css); + css_put(css); + return ret; +} + +static int cpu_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int ret = 0; + + cgroup_stat_show_cputime(seq); +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); +#endif + return ret; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4423,6 +4474,11 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cpu.stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_stat_show, + }, { } /* terminate */ }; diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 9cce79e89320..133b465691d6 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -256,7 +256,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_cpu_stat_account_end(cgrp, cstat); } -void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) +void cgroup_stat_show_cputime(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; @@ -278,10 +278,10 @@ void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); - seq_printf(seq, "%susage_usec %llu\n" - "%suser_usec %llu\n" - "%ssystem_usec %llu\n", - prefix, usage, prefix, utime, prefix, stime); + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); } int cgroup_stat_init(struct cgroup *cgrp) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad255162a830..0b3eec389552 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6678,13 +6678,12 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; -static int cpu_stat_show(struct seq_file *sf, void *v) +static int cpu_extra_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) { - cgroup_stat_show_cputime(sf, ""); - #ifdef CONFIG_CFS_BANDWIDTH { - struct task_group *tg = css_tg(seq_css(sf)); + struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; u64 throttled_usec; @@ -6817,11 +6816,6 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, #endif static struct cftype cpu_files[] = { - { - .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_stat_show, - }, #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "weight", @@ -6852,6 +6846,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = 
cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, -- cgit v1.2.3 From 032cfd66afcc2dd2c7be89c71b020fcb15bcc37d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Oct 2017 03:53:59 -0700 Subject: drivers/net: wan/sdla: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Allen Pais Cc: "David S. Miller" Cc: Tobias Klauser Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/wan/sdla.c | 12 +++++------- include/linux/if_frad.h | 1 + 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index 0cc48902dbb9..57ed259c8208 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -927,13 +927,10 @@ static irqreturn_t sdla_isr(int dummy, void *dev_id) return IRQ_HANDLED; } -static void sdla_poll(unsigned long device) +static void sdla_poll(struct timer_list *t) { - struct net_device *dev; - struct frad_local *flp; - - dev = (struct net_device *) device; - flp = netdev_priv(dev); + struct frad_local *flp = from_timer(flp, t, timer); + struct net_device *dev = flp->dev; if (sdla_byte(dev, SDLA_502_RCV_BUF)) sdla_receive(dev); @@ -1616,8 +1613,9 @@ static void setup_sdla(struct net_device *dev) flp->assoc = sdla_assoc; flp->deassoc = sdla_deassoc; flp->dlci_conf = sdla_dlci_conf; + flp->dev = dev; - setup_timer(&flp->timer, sdla_poll, (unsigned long)dev); + timer_setup(&flp->timer, sdla_poll, 0); flp->timer.expires = 1; } diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 46df7e565d6f..82a1b4e93570 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -83,6 +83,7 @@ struct frad_local /* fields that are used by the Sangoma SDLA cards */ struct timer_list timer; + struct net_device *dev; int type; /* adapter type */ int state; /* state of the S502/8 control latch */ int buffer; /* current buffer for S508 firmware */ -- cgit v1.2.3 From ecad0d2cb8a7997afdc95031ee3328b997aba5c4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 23 Oct 2017 15:11:36 -0700 Subject: nvme-fc: remove NVME_FC_MAX_SEGMENTS The define is an arbitrary limit on the I/O size on the initiator, capping I/O at 1MB-4KB. Remove the define from the transport. I/O size will be limited solely by the LLDD SG limits. 
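For reference, a quick sketch of the arithmetic behind the old cap, assuming 4KB pages (PAGE_SHIFT = 12); the numbers below are worked out for illustration and are not part of the patch:

	/* Old transport cap, with NVME_FC_MAX_SEGMENTS = 256:
	 *   segs = min(NVME_FC_MAX_SEGMENTS, max_sgl_segments) <= 256
	 *   max_hw_sectors = (256 - 1) << (PAGE_SHIFT - 9)
	 *                  = 255 << 3 = 2040 sectors of 512 bytes
	 *                  = 1044480 bytes = 1MB - 4KB
	 * After the patch, max_hw_sectors is derived only from the
	 * LLDD's max_sgl_segments, with no transport-imposed ceiling.
	 */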
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 6 ++---- include/linux/nvme-fc-driver.h | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d12775d12983..b600c07803cf 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2435,7 +2435,6 @@ static int nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - u32 segs; int ret; bool changed; @@ -2486,9 +2485,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue; - segs = min_t(u32, NVME_FC_MAX_SEGMENTS, - ctrl->lport->ops->max_sgl_segments); - ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9); + ctrl->ctrl.max_hw_sectors = + (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); ret = nvme_init_identify(&ctrl->ctrl); if (ret) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4ea03b9a5c8c..2be4db353937 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -102,8 +102,6 @@ enum nvmefc_fcp_datadir { }; -#define NVME_FC_MAX_SEGMENTS 256 - /** * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport * to LLDD in order to perform a NVME FCP IO operation. -- cgit v1.2.3 From 7863406143d8bbbbda07a61285c5f4c217908dfd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:28 +0200 Subject: sched/isolation: Move housekeeping related code to its own file The housekeeping code is currently tied to the NOHZ code. As we are planning to make housekeeping independent from it, start with moving the relevant code to its own file. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Paul E. 
McKenney Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- drivers/net/ethernet/tile/tilegx.c | 2 +- include/linux/sched/isolation.h | 56 ++++++++++++++++++++++++++++++++++++++ include/linux/tick.h | 37 ------------------------- init/main.c | 2 ++ kernel/rcu/tree_plugin.h | 1 + kernel/rcu/update.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/isolation.c | 33 ++++++++++++++++++++++ kernel/time/tick-sched.c | 18 ------------ kernel/watchdog.c | 1 + 12 files changed, 98 insertions(+), 56 deletions(-) create mode 100644 include/linux/sched/isolation.h create mode 100644 kernel/sched/isolation.c (limited to 'include/linux') diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index c00102b8145a..27a3272dcd48 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h new file mode 100644 index 000000000000..b7cfbc46286c --- /dev/null +++ b/include/linux/sched/isolation.h @@ -0,0 +1,56 @@ +#ifndef _LINUX_SCHED_ISOLATION_H +#define _LINUX_SCHED_ISOLATION_H + +#include +#include +#include + +#ifdef CONFIG_NO_HZ_FULL +extern cpumask_var_t housekeeping_mask; + +static inline int housekeeping_any_cpu(void) +{ + return cpumask_any_and(housekeeping_mask, cpu_online_mask); +} + +extern void __init housekeeping_init(void); + +#else + +static inline int housekeeping_any_cpu(void) +{ + return smp_processor_id(); +} + +static inline void housekeeping_init(void) { } +#endif /* CONFIG_NO_HZ_FULL */ + + +static inline const struct cpumask *housekeeping_cpumask(void) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return housekeeping_mask; +#endif + return cpu_possible_mask; +} + +static inline bool is_housekeeping_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return cpumask_test_cpu(cpu, housekeeping_mask); +#endif + return true; +} + +static inline void housekeeping_affine(struct task_struct *t) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + set_cpus_allowed_ptr(t, housekeeping_mask); + +#endif +} + +#endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/tick.h b/include/linux/tick.h index fe01e68bf520..68afc09aa8ac 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -137,7 +137,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; extern cpumask_var_t tick_nohz_full_mask; -extern cpumask_var_t housekeeping_mask; static inline bool tick_nohz_full_enabled(void) { @@ -161,11 +160,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) cpumask_or(mask, mask, tick_nohz_full_mask); } -static inline int housekeeping_any_cpu(void) -{ - return cpumask_any_and(housekeeping_mask, cpu_online_mask); -} - extern void tick_nohz_dep_set(enum tick_dep_bits bit); extern void tick_nohz_dep_clear(enum tick_dep_bits bit); extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); @@ -235,10 +229,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, extern void tick_nohz_full_kick_cpu(int cpu); extern void 
__tick_nohz_task_switch(void); #else -static inline int housekeeping_any_cpu(void) -{ - return smp_processor_id(); -} static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } @@ -260,33 +250,6 @@ static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void __tick_nohz_task_switch(void) { } #endif -static inline const struct cpumask *housekeeping_cpumask(void) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return housekeeping_mask; -#endif - return cpu_possible_mask; -} - -static inline bool is_housekeeping_cpu(int cpu) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return cpumask_test_cpu(cpu, housekeeping_mask); -#endif - return true; -} - -static inline void housekeeping_affine(struct task_struct *t) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - set_cpus_allowed_ptr(t, housekeeping_mask); - -#endif -} - static inline void tick_nohz_task_switch(void) { if (tick_nohz_full_enabled()) diff --git a/init/main.c b/init/main.c index 0ee9c6866ada..4610c99ae306 100644 --- a/init/main.c +++ b/init/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void) early_irq_init(); init_IRQ(); tick_init(); + housekeeping_init(); rcu_init_nohz(); init_timers(); hrtimers_init(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..c7632f51c5a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "../time/tick-internal.h" diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..79abeb0ca329 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,6 +51,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 78f54932ea1d..871d43d73375 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o +obj-$(CONFIG_NO_HZ_FULL) += isolation.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2288a145bf01..ad188acb7636 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56f343b8e749..591481db8c6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,6 +32,7 @@ #include #include #include +#include #include diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c new file mode 100644 index 000000000000..3589252ed476 --- /dev/null +++ b/kernel/sched/isolation.c @@ -0,0 +1,33 @@ +/* + * Housekeeping management. Manage the targets for routine code that can run on + * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 
+ * + * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * + */ + +#include +#include +#include +#include + +cpumask_var_t housekeeping_mask; + +void __init housekeeping_init(void) +{ + if (!tick_nohz_full_enabled()) + return; + + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); +} diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7b258c59d78a..27d7d522ac4e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -166,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; -cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static atomic_t tick_dep_mask; @@ -438,13 +437,6 @@ void __init tick_nohz_init(void) return; } - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); - cpumask_clear(tick_nohz_full_mask); - tick_nohz_full_running = false; - return; - } - /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own @@ -453,7 +445,6 @@ void __init tick_nohz_init(void) if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); - cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; return; } @@ -466,9 +457,6 @@ void __init tick_nohz_init(void) cpumask_clear_cpu(cpu, tick_nohz_full_mask); } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); - for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); @@ -478,12 +466,6 @@ void __init tick_nohz_init(void) WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); - - /* - * We need at least one CPU to handle housekeeping work such - * as timekeeping, unbound timers, workqueues, ... - */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6bcb854909c0..3c44dbaa4b9f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 9f0ca2d97ef0b5e966be2cfef26c7c094ec14e41 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:30 +0200 Subject: sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu() housekeeping_any_cpu() doesn't correctly handle the case where CONFIG_NO_HZ_FULL=y and no CPU is in nohz_full mode. So far no caller needs this, but let's prepare to avoid any future surprise. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-4-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index b7cfbc46286c..040df04fa78a 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -7,25 +7,20 @@ #ifdef CONFIG_NO_HZ_FULL extern cpumask_var_t housekeeping_mask; - -static inline int housekeeping_any_cpu(void) -{ - return cpumask_any_and(housekeeping_mask, cpu_online_mask); -} - extern void __init housekeeping_init(void); - #else +static inline void housekeeping_init(void) { } +#endif /* CONFIG_NO_HZ_FULL */ static inline int housekeeping_any_cpu(void) { +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); +#endif return smp_processor_id(); } -static inline void housekeeping_init(void) { } -#endif /* CONFIG_NO_HZ_FULL */ - - static inline const struct cpumask *housekeeping_cpumask(void) { #ifdef CONFIG_NO_HZ_FULL -- cgit v1.2.3 From 7e56a1cf4b28f5739526877b8dbad623fae2e4e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:31 +0200 Subject: sched/isolation: Make the housekeeping cpumask private Nobody needs to access this detail. housekeeping_cpumask() already takes care of it. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-5-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 31 ++++++++++--------------------- kernel/sched/isolation.c | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 040df04fa78a..ed935ffc6ffa 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -6,46 +6,35 @@ #include #ifdef CONFIG_NO_HZ_FULL -extern cpumask_var_t housekeeping_mask; +extern int housekeeping_any_cpu(void); +extern const struct cpumask *housekeeping_cpumask(void); +extern void housekeeping_affine(struct task_struct *t); +extern bool housekeeping_test_cpu(int cpu); extern void __init housekeeping_init(void); + #else -static inline void housekeeping_init(void) { } -#endif /* CONFIG_NO_HZ_FULL */ static inline int housekeeping_any_cpu(void) { -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return cpumask_any_and(housekeeping_mask, cpu_online_mask); -#endif return smp_processor_id(); } static inline const struct cpumask *housekeeping_cpumask(void) { -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return housekeeping_mask; -#endif return cpu_possible_mask; } +static inline void housekeeping_affine(struct task_struct *t) { } +static inline void housekeeping_init(void) { } +#endif /* CONFIG_NO_HZ_FULL */ + static inline bool is_housekeeping_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) - return cpumask_test_cpu(cpu, housekeeping_mask); + return housekeeping_test_cpu(cpu); #endif return true; } -static inline void housekeeping_affine(struct task_struct *t) -{ -#ifdef CONFIG_NO_HZ_FULL - if 
(tick_nohz_full_enabled()) - set_cpus_allowed_ptr(t, housekeeping_mask); - -#endif -} - #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 3589252ed476..16445097eb25 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -11,7 +11,41 @@ #include #include -cpumask_var_t housekeeping_mask; +static cpumask_var_t housekeeping_mask; + +int housekeeping_any_cpu(void) +{ + if (tick_nohz_full_enabled()) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); + + return smp_processor_id(); +} +EXPORT_SYMBOL_GPL(housekeeping_any_cpu); + +const struct cpumask *housekeeping_cpumask(void) +{ + if (tick_nohz_full_enabled()) + return housekeeping_mask; + + return cpu_possible_mask; +} +EXPORT_SYMBOL_GPL(housekeeping_cpumask); + +void housekeeping_affine(struct task_struct *t) +{ + if (tick_nohz_full_enabled()) + set_cpus_allowed_ptr(t, housekeeping_mask); +} +EXPORT_SYMBOL_GPL(housekeeping_affine); + +bool housekeeping_test_cpu(int cpu) +{ + if (tick_nohz_full_enabled()) + return cpumask_test_cpu(cpu, housekeeping_mask); + + return true; +} +EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { -- cgit v1.2.3 From e179f5a04ba46ee5c5439480c2bfd68c358168b7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:32 +0200 Subject: sched/isolation: Use its own static key Housekeeping code still depends on the nohz_full static key. Since we want to decouple housekeeping from NOHZ, let's create a housekeeping specific static key. It's mostly relevant for calls to is_housekeeping_cpu() from the scheduler. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-6-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 3 ++- kernel/sched/isolation.c | 13 +++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index ed935ffc6ffa..194c586fbb12 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -6,6 +6,7 @@ #include #ifdef CONFIG_NO_HZ_FULL +DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); extern int housekeeping_any_cpu(void); extern const struct cpumask *housekeeping_cpumask(void); extern void housekeeping_affine(struct task_struct *t); @@ -31,7 +32,7 @@ static inline void housekeeping_init(void) { } static inline bool is_housekeeping_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return housekeeping_test_cpu(cpu); #endif return true; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 16445097eb25..bb8ba19a0235 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -10,12 +10,15 @@ #include #include #include +#include +DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); +EXPORT_SYMBOL_GPL(housekeeping_overriden); static cpumask_var_t housekeeping_mask; int housekeeping_any_cpu(void) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); @@ -24,7 +27,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu); const struct cpumask *housekeeping_cpumask(void) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return housekeeping_mask; return cpu_possible_mask; @@ -33,14 +36,14 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask); void housekeeping_affine(struct task_struct *t) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) set_cpus_allowed_ptr(t, housekeeping_mask); } EXPORT_SYMBOL_GPL(housekeeping_affine); bool housekeeping_test_cpu(int cpu) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return cpumask_test_cpu(cpu, housekeeping_mask); return true; @@ -62,6 +65,8 @@ void __init housekeeping_init(void) cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask); + static_branch_enable(&housekeeping_overriden); + /* We need at least one CPU to handle housekeeping work */ WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } -- cgit v1.2.3 From 204c083a009378dfa751175b5fcddc75988bab6c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:33 +0200 Subject: sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu() Fit it into the housekeeping_*() namespace. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-7-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 2 +- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 194c586fbb12..ad0f5d986a2e 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -29,7 +29,7 @@ static inline void housekeeping_affine(struct task_struct *t) { } static inline void housekeeping_init(void) { } #endif /* CONFIG_NO_HZ_FULL */ -static inline bool is_housekeeping_cpu(int cpu) +static inline bool housekeeping_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL if (static_branch_unlikely(&housekeeping_overriden)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad188acb7636..d0fb448dd43a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -527,7 +527,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu)) return cpu; rcu_read_lock(); @@ -536,14 +536,14 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && is_housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i)) { cpu = i; goto unlock; } } } - if (!is_housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu)) cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); return cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 591481db8c6a..cdece8f967f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!is_housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) -- cgit v1.2.3 From 5c4991e24c69737bd41fc2737b1e3980abbf73f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:34 +0200 Subject: sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL Split the housekeeping config from CONFIG_NO_HZ_FULL. This way we finally separate the isolation code from NOHZ. A dependency on CONFIG_NO_HZ_FULL remains for now, however, while the housekeeping code still deals with NOHZ internals. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-8-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 6 +++--- init/Kconfig | 8 ++++++++ kernel/sched/Makefile | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index ad0f5d986a2e..93ac2367a520 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -5,7 +5,7 @@ #include #include -#ifdef CONFIG_NO_HZ_FULL +#ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); extern int housekeeping_any_cpu(void); extern const struct cpumask *housekeeping_cpumask(void); @@ -27,11 +27,11 @@ static inline const struct cpumask *housekeeping_cpumask(void) static inline void housekeeping_affine(struct task_struct *t) { } static inline void housekeeping_init(void) { } -#endif /* CONFIG_NO_HZ_FULL */ +#endif /* CONFIG_CPU_ISOLATION */ static inline bool housekeeping_cpu(int cpu) { -#ifdef CONFIG_NO_HZ_FULL +#ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overriden)) return housekeeping_test_cpu(cpu); #endif diff --git a/init/Kconfig b/init/Kconfig index 78cb2461012e..6f52e6f4bd9d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -472,6 +472,14 @@ config TASK_IO_ACCOUNTING endmenu # "CPU/Task time and stats accounting" +config CPU_ISOLATION + bool "CPU isolation" + depends on NO_HZ_FULL + help + Make sure that CPUs running critical tasks are not disturbed by + any source of "noise" such as unbound workqueues, timers, kthreads... + Unbound jobs get offloaded to housekeeping CPUs. + source "kernel/rcu/Kconfig" config BUILD_BIN2C diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 871d43d73375..dbe61302c352 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -26,4 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o -obj-$(CONFIG_NO_HZ_FULL) += isolation.o +obj-$(CONFIG_CPU_ISOLATION) += isolation.o -- cgit v1.2.3 From de201559df872f83d0c08fb4effe3efd28e6cbc8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:35 +0200 Subject: sched/isolation: Introduce housekeeping flags Before we implement isolcpus under housekeeping, we need the isolation features to be more fine-grained. For example, some people want NOHZ_FULL without the full scheduler isolation, while others want full scheduler isolation without NOHZ_FULL. So let's cut all these isolation features piecewise, at the risk of overcutting right now. We can still merge some flags later if they always make sense together. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-9-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- drivers/net/ethernet/tile/tilegx.c | 4 ++-- include/linux/sched/isolation.h | 26 +++++++++++++++++--------- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 2 +- kernel/sched/core.c | 8 ++++---- kernel/sched/fair.c | 2 +- kernel/sched/isolation.c | 26 +++++++++++++++----------- kernel/watchdog.c | 3 ++- 8 files changed, 43 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index 27a3272dcd48..b3e5816a4678 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void) tile_net_dev_init(name, mac); if (!network_cpus_init()) - cpumask_and(&network_cpus_map, housekeeping_cpumask(), - cpu_online_mask); + cpumask_and(&network_cpus_map, + housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask); return 0; } diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 93ac2367a520..9bb753eece3b 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -5,35 +5,43 @@ #include #include +enum hk_flags { + HK_FLAG_TIMER = 1, + HK_FLAG_RCU = (1 << 1), + HK_FLAG_MISC = (1 << 2), + HK_FLAG_SCHED = (1 << 3), +}; + #ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); -extern int housekeeping_any_cpu(void); -extern const struct cpumask *housekeeping_cpumask(void); -extern void housekeeping_affine(struct task_struct *t); -extern bool housekeeping_test_cpu(int cpu); +extern int housekeeping_any_cpu(enum hk_flags flags); +extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); +extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); +extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); extern void __init housekeeping_init(void); #else -static inline int housekeeping_any_cpu(void) +static inline int housekeeping_any_cpu(enum hk_flags flags) { return smp_processor_id(); } -static inline const struct cpumask *housekeeping_cpumask(void) +static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { return cpu_possible_mask; } -static inline void housekeeping_affine(struct task_struct *t) { } +static inline void housekeeping_affine(struct task_struct *t, + enum hk_flags flags) { } static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ -static inline bool housekeeping_cpu(int cpu) +static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) { #ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overriden)) - return housekeeping_test_cpu(cpu); + return housekeeping_test_cpu(cpu, flags); #endif return true; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c7632f51c5a0..34125d23e58a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2584,7 +2584,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); } /* Record the current task on dyntick-idle entry. 
*/ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 79abeb0ca329..e3e60efaafee 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -719,7 +719,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) LIST_HEAD(rcu_tasks_holdouts); /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); /* * Each pass through the following loop makes one check for diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d0fb448dd43a..2210c0203e51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -527,7 +527,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) return cpu; rcu_read_lock(); @@ -536,15 +536,15 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu)) - cpu = housekeeping_any_cpu(); + if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) + cpu = housekeeping_any_cpu(HK_FLAG_TIMER); unlock: rcu_read_unlock(); return cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cdece8f967f0..f755de86a813 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index bb8ba19a0235..37a138a780ff 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -15,37 +15,39 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); static cpumask_var_t housekeeping_mask; +static unsigned int housekeeping_flags; -int housekeeping_any_cpu(void) +int housekeeping_any_cpu(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return cpumask_any_and(housekeeping_mask, cpu_online_mask); - + if (housekeeping_flags & flags) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(void) +const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return housekeeping_mask; - + if (housekeeping_flags & flags) + return housekeeping_mask; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct task_struct *t) +void housekeeping_affine(struct task_struct *t, enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping_flags & flags) + set_cpus_allowed_ptr(t, housekeeping_mask); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu) +bool housekeeping_test_cpu(int cpu, enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return cpumask_test_cpu(cpu, housekeeping_mask); - + if (housekeeping_flags & flags) + return cpumask_test_cpu(cpu, housekeeping_mask); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); @@ -65,6 +67,8 @@ void __init housekeeping_init(void) cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask); + 
housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + static_branch_enable(&housekeeping_overriden); /* We need at least one CPU to handle housekeeping work */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 562652c9c815..e9e2ebb3db29 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -777,7 +777,8 @@ void __init lockup_detector_init(void) if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_copy(&watchdog_cpumask, housekeeping_cpumask()); + cpumask_copy(&watchdog_cpumask, + housekeeping_cpumask(HK_FLAG_TIMER)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; -- cgit v1.2.3 From 6f1982fedd59856bcc42a9b521be4c3ffd2f60a7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:36 +0200 Subject: sched/isolation: Handle the nohz_full= parameter We want to centralize the isolation management, done by the housekeeping subsystem. Therefore we need to handle the nohz_full= parameter from there. Since nohz_full= so far has involved unbound timers, watchdog, RCU and tilegx NAPI isolation, we keep that default behaviour. nohz_full= will be deprecated in the future. We want to control the isolation features from the isolcpus= parameter. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-10-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 1 + include/linux/tick.h | 2 ++ init/Kconfig | 1 - kernel/sched/isolation.c | 42 +++++++++++++++++++++++++++++------------ kernel/time/tick-sched.c | 13 +++---------- 5 files changed, 36 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 9bb753eece3b..e53cfa96e91e 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -10,6 +10,7 @@ enum hk_flags { HK_FLAG_RCU = (1 << 1), HK_FLAG_MISC = (1 << 2), HK_FLAG_SCHED = (1 << 3), + HK_FLAG_TICK = (1 << 4), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/include/linux/tick.h b/include/linux/tick.h index 68afc09aa8ac..e2a163a9f96c 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -228,6 +228,7 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, extern void tick_nohz_full_kick_cpu(int cpu); extern void __tick_nohz_task_switch(void); +extern void __init tick_nohz_full_setup(cpumask_var_t cpumask); #else static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } @@ -248,6 +249,7 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void __tick_nohz_task_switch(void) { } +static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { } #endif static inline void tick_nohz_task_switch(void) diff --git a/init/Kconfig b/init/Kconfig index 6f52e6f4bd9d..f8564df58d0a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -474,7 +474,6 @@ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION bool "CPU isolation" - depends on NO_HZ_FULL help Make sure that CPUs running critical tasks are not disturbed by any source of "noise" such as unbound workqueues, timers, kthreads... 
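For illustration, a minimal sketch of the mask computation done by the new setup path in kernel/sched/isolation.c (hypothetical values, assuming an 8-CPU system booted with nohz_full=1-7; not part of the patch):

	/* cpulist_parse("1-7") fills the non-housekeeping mask:
	 *   non_housekeeping_mask = 1-7
	 *   housekeeping_mask = cpu_possible_mask & ~non_housekeeping_mask
	 *                     = CPU 0 only
	 * Had the parameter covered every possible CPU, the resulting
	 * housekeeping mask would be empty and the boot CPU would be
	 * added back, so at least one housekeeping CPU always remains.
	 */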
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 37a138a780ff..1f61e440358d 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -54,23 +54,41 @@ EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { - if (!tick_nohz_full_enabled()) + if (!housekeeping_flags) return; - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); - cpumask_clear(tick_nohz_full_mask); - tick_nohz_full_running = false; - return; + static_branch_enable(&housekeeping_overriden); + + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); +} + +#ifdef CONFIG_NO_HZ_FULL +static int __init housekeeping_nohz_full_setup(char *str) +{ + cpumask_var_t non_housekeeping_mask; + + alloc_bootmem_cpumask_var(&non_housekeeping_mask); + if (cpulist_parse(str, non_housekeeping_mask) < 0) { + pr_warn("Housekeeping: Incorrect nohz_full cpumask\n"); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); + alloc_bootmem_cpumask_var(&housekeeping_mask); + cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + if (cpumask_empty(housekeeping_mask)) + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - static_branch_enable(&housekeeping_overriden); + housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER | + HK_FLAG_RCU | HK_FLAG_MISC; - /* We need at least one CPU to handle housekeeping work */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); + tick_nohz_full_setup(non_housekeeping_mask); + + free_bootmem_cpumask_var(non_housekeeping_mask); + + return 1; } +__setup("nohz_full=", housekeeping_nohz_full_setup); +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 27d7d522ac4e..69f3dbe38984 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -385,20 +385,13 @@ out: local_irq_restore(flags); } -/* Parse the boot-time nohz CPU list from the kernel parameters. */ -static int __init tick_nohz_full_setup(char *str) +/* Get the boot-time nohz CPU list from the kernel parameters. */ +void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); - if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); - free_bootmem_cpumask_var(tick_nohz_full_mask); - return 1; - } + cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; - - return 1; } -__setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { -- cgit v1.2.3 From edb9382175c3ebdced8ffdb3e0f20052ad9fdbe9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:37 +0200 Subject: sched/isolation: Move isolcpus= handling to the housekeeping code We want to centralize the isolation features in the housekeeping subsystem, and scheduler domain isolation is a significant part of it. There is no intended behaviour change; we just reuse the housekeeping cpumask and core code. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-11-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- drivers/base/cpu.c | 11 ++++++- include/linux/sched.h | 2 -- include/linux/sched/isolation.h | 1 + kernel/cgroup/cpuset.c | 15 ++++------ kernel/sched/core.c | 16 +---------- kernel/sched/isolation.c | 63 ++++++++++++++++++++++++++++++++--------- kernel/sched/topology.c | 24 ++++------------ 7 files changed, 73 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 321cd7b4d817..a73ab95558f5 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "base.h" @@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev, struct device_attribute *attr, char *buf) { int n = 0, len = PAGE_SIZE-2; + cpumask_var_t isolated; - n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map)); + if (!alloc_cpumask_var(&isolated, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(isolated, cpu_possible_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); + n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated)); + + free_cpumask_var(isolated); return n; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f897dfc195e..1b0cc0d6df8d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -165,8 +165,6 @@ struct task_group; /* Task command name length: */ #define TASK_COMM_LEN 16 -extern cpumask_var_t cpu_isolated_map; - extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index e53cfa96e91e..d849431c8060 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -11,6 +11,7 @@ enum hk_flags { HK_FLAG_MISC = (1 << 2), HK_FLAG_SCHED = (1 << 3), HK_FLAG_TICK = (1 << 4), + HK_FLAG_DOMAIN = (1 << 5), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4657e2924ecb..f7efa7b4d825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -57,7 +57,7 @@ #include #include #include - +#include #include #include #include @@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. 
sched domains */ - cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; - if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) - goto done; - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - non_isolated_cpus); + housekeeping_cpumask(HK_FLAG_DOMAIN)); goto done; } @@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains, */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_FLAG_DOMAIN)))) continue; if (is_sched_load_balance(cp)) @@ -789,7 +785,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, non_isolated_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -802,7 +798,6 @@ restart: BUG_ON(nslot != ndoms); done: - free_cpumask_var(non_isolated_cpus); kfree(csa); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2210c0203e51..1a55c842bfbc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -84,9 +84,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* CPUs with isolated domains */ -cpumask_var_t cpu_isolated_map; - /* * __task_rq_lock - lock the rq @p resides on. 
*/ @@ -5735,10 +5732,6 @@ static inline void sched_init_smt(void) { } void __init sched_init_smp(void) { - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - sched_init_numa(); /* @@ -5748,16 +5741,12 @@ void __init sched_init_smp(void) */ mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); sched_init_granularity(); - free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); init_sched_dl_class(); @@ -5961,9 +5950,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); set_cpu_rq_start_time(smp_processor_id()); #endif diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 1f61e440358d..8f666bc5abe8 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -63,32 +63,69 @@ void __init housekeeping_init(void) WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } -#ifdef CONFIG_NO_HZ_FULL -static int __init housekeeping_nohz_full_setup(char *str) +static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; + int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); - if (cpulist_parse(str, non_housekeeping_mask) < 0) { - pr_warn("Housekeeping: Incorrect nohz_full cpumask\n"); + err = cpulist_parse(str, non_housekeeping_mask); + if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { + pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - - if (cpumask_empty(housekeeping_mask)) - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + if (!housekeeping_flags) { + alloc_bootmem_cpumask_var(&housekeeping_mask); + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, non_housekeeping_mask); + if (cpumask_empty(housekeeping_mask)) + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + } else { + cpumask_var_t tmp; + + alloc_bootmem_cpumask_var(&tmp); + cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); + if (!cpumask_equal(tmp, housekeeping_mask)) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + free_bootmem_cpumask_var(tmp); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + free_bootmem_cpumask_var(tmp); + } - housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER | - HK_FLAG_RCU | HK_FLAG_MISC; + if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + tick_nohz_full_setup(non_housekeeping_mask); + } else { + pr_warn("Housekeeping: nohz unsupported." 
+ " Build with CONFIG_NO_HZ_FULL\n"); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + } - tick_nohz_full_setup(non_housekeeping_mask); + housekeeping_flags |= flags; free_bootmem_cpumask_var(non_housekeeping_mask); return 1; } + +static int __init housekeeping_nohz_full_setup(char *str) +{ + unsigned int flags; + + flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + + return housekeeping_setup(str, flags); +} __setup("nohz_full=", housekeeping_nohz_full_setup); -#endif + +static int __init housekeeping_isolcpus_setup(char *str) +{ + return housekeeping_setup(str, HK_FLAG_DOMAIN); +} +__setup("isolcpus=", housekeeping_isolcpus_setup); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e3d31b0880dc..2e6b9126ffdc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,6 +3,7 @@ */ #include #include +#include #include "sched.h" @@ -469,21 +470,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* Setup the mask of CPUs configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1); - return 0; - } - return 1; -} -__setup("isolcpus=", isolated_cpu_setup); - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -1792,7 +1778,7 @@ int sched_init_domains(const struct cpumask *cpu_map) doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); @@ -1875,7 +1861,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], doms_new = alloc_sched_domains(1); if (doms_new) { n = 1; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + cpumask_and(doms_new[0], cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); } } else { n = ndoms_new; @@ -1898,7 +1885,8 @@ match1: if (!doms_new) { n = 0; doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + cpumask_and(doms_new[0], cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); } /* Build new domains: */ -- cgit v1.2.3 From 7d9285e82db5defca4d9674ba089429eeca0c697 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf: perf event change needed for subsequent bpf helpers" eBPF programs would like access to the (perf) event enabled and running times along with the event value, such that they can deal with event multiplexing (among other things). This patch extends the interface; a future eBPF patch will utilize the new functionality. [ Note, there's a same-content commit with a poor changelog and a meaningless title in the networking tree as well - but we need this change for subsequent perf work, so apply it here as well, with a proper changelog. Hopefully Git will be able to sort out this somewhat messy workflow, if there are no other, conflicting changes to these files. ] Signed-off-by: Yonghong Song [ Rewrote the changelog. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David S. Miller Link: http://lkml.kernel.org/r/20171005161923.332790-2-yhs@fb.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 7 +++++-- kernel/bpf/arraymap.c | 2 +- kernel/events/core.c | 15 +++++++++++++-- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8e22f24ded6a..79b18a20cf5d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -806,6 +806,7 @@ struct perf_output_handle { struct bpf_perf_event_data_kern { struct pt_regs *regs; struct perf_sample_data *data; + struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF @@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value); +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline int perf_event_read_local(struct perf_event *event, u64 *value) +static inline int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { return -EINVAL; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e2636737b69b..c4b9ab01bba5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/events/core.c b/kernel/events/core.c index 04989fb769f0..74671e101b92 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1).
*/ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..95888ae6c263 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi -- cgit v1.2.3 From 8ca2bd41c7d1c135e9ac6f25970c2d491865088a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:12:35 +0200 Subject: perf/core: Rename 'enum perf_event_active_state' It's a weird name: 'active' is one of the states, so it should not be part of the name; also, it's too long. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 +++--- kernel/events/core.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 79b18a20cf5d..b7532650de47 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -485,9 +485,9 @@ struct perf_addr_filters_head { }; /** - * enum perf_event_active_state - the states of a event + * enum perf_event_state - the states of a event */ -enum perf_event_active_state { +enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, @@ -578,7 +578,7 @@ struct perf_event { struct pmu *pmu; void *pmu_private; - enum perf_event_active_state state; + enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; diff --git a/kernel/events/core.c b/kernel/events/core.c index b249e0f197a7..6322e245176c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10801,7 +10801,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { - enum perf_event_active_state parent_state = parent_event->state; + enum perf_event_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; -- cgit v1.2.3 From 0d3d73aac2ff05c78387aa9dcc2c8aa3804405e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:16:28 +0200 Subject: perf/core: Rewrite event timekeeping The current event timekeeping, which computes enabled and running times, uses 3 distinct timestamps to reflect the various event states: OFF (stopped), INACTIVE (enabled) and ACTIVE (running). Furthermore, the update rules are such that even INACTIVE events need their timestamps updated. This is undesirable because we'd like to not touch INACTIVE events if at all possible; this makes event scheduling (much) more expensive than needed. Rewrite the timekeeping to directly use event->state; this greatly simplifies the code and results in only having to update things when we change state, or when an up-to-date value is requested (read).
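The accounting rule this boils down to can be sketched outside the kernel as follows (a simplified model for illustration only, not the kernel implementation; the state ordering mirrors PERF_EVENT_STATE_*, where OFF < INACTIVE < ACTIVE):

	/* Simplified model: times advance only on state change or read. */
	enum state { STATE_OFF = 0, STATE_INACTIVE = 1, STATE_ACTIVE = 2 };

	struct evt {
		enum state state;
		unsigned long long tstamp;		/* time of last state change */
		unsigned long long total_enabled;	/* accumulated while >= INACTIVE */
		unsigned long long total_running;	/* accumulated while ACTIVE */
	};

	static void evt_set_state(struct evt *e, enum state s, unsigned long long now)
	{
		unsigned long long delta = now - e->tstamp;

		if (e->state >= STATE_INACTIVE)
			e->total_enabled += delta;
		if (e->state == STATE_ACTIVE)
			e->total_running += delta;

		e->tstamp = now;
		e->state = s;
	}

A read applies the same deltas without changing the state, which is what __perf_update_times() does in the patch below.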
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 19 +-- kernel/events/core.c | 385 +++++++++++++++------------------------------ 2 files changed, 130 insertions(+), 274 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b7532650de47..874b71a70058 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -588,26 +588,10 @@ struct perf_event { * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. - * - * They are computed from tstamp_enabled, tstamp_running and - * tstamp_stopped when the event is in INACTIVE or ACTIVE state. */ u64 total_time_enabled; u64 total_time_running; - - /* - * These are timestamps used for computing total_time_enabled - * and total_time_running when the event is in INACTIVE or - * ACTIVE state, measured in nanoseconds from an arbitrary point - * in time. - * tstamp_enabled: the notional time when the event was enabled - * tstamp_running: the notional time when the event was scheduled on - * tstamp_stopped: in INACTIVE state, the notional time when the - * event was scheduled off. - */ - u64 tstamp_enabled; - u64 tstamp_running; - u64 tstamp_stopped; + u64 tstamp; /* * timestamp shadows the actual context timing but it can @@ -699,7 +683,6 @@ struct perf_event { #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ - int cgrp_defer_enabled; #endif struct list_head sb_list; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6f74f9c35490..2551e8ce7224 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } +/* + * State based event timekeeping... + * + * The basic idea is to use event->state to determine which (if any) time + * fields to increment with the current delta. This means we only need to + * update timestamps when we change state or when they are explicitly requested + * (read). + * + * Event groups make things a little more complicated, but not terribly so. The + * rules for a group are that if the group leader is OFF the entire group is + * OFF, irrespecive of what the group member states are. This results in + * __perf_effective_state(). + * + * A futher ramification is that when a group leader flips between OFF and + * !OFF, we need to update all group member times. + * + * + * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we + * need to make sure the relevant context time is updated before we try and + * update our timestamps. 
+ */ + +static __always_inline enum perf_event_state +__perf_effective_state(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + + if (leader->state <= PERF_EVENT_STATE_OFF) + return leader->state; + + return event->state; +} + +static __always_inline void +__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) +{ + enum perf_event_state state = __perf_effective_state(event); + u64 delta = now - event->tstamp; + + *enabled = event->total_time_enabled; + if (state >= PERF_EVENT_STATE_INACTIVE) + *enabled += delta; + + *running = event->total_time_running; + if (state >= PERF_EVENT_STATE_ACTIVE) + *running += delta; +} + +static void perf_event_update_time(struct perf_event *event) +{ + u64 now = perf_event_time(event); + + __perf_update_times(event, now, &event->total_time_enabled, + &event->total_time_running); + event->tstamp = now; +} + +static void perf_event_update_sibling_time(struct perf_event *leader) +{ + struct perf_event *sibling; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) + perf_event_update_time(sibling); +} + +static void +perf_event_set_state(struct perf_event *event, enum perf_event_state state) +{ + if (event->state == state) + return; + + perf_event_update_time(event); + /* + * If a group leader gets enabled/disabled all its siblings + * are affected too. + */ + if ((event->state < 0) ^ (state < 0)) + perf_event_update_sibling_time(event); + + WRITE_ONCE(event->state, state); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) event->shadow_ctx_time = now - t->timestamp; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ - /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. - */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) - return; - - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } -} - /* * Update cpuctx->cgrp so that it is set when first cgroup event is added and * cleared when last cgroup event is removed. @@ -972,17 +1020,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return 0; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ -} - static inline void list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) @@ -1396,60 +1433,6 @@ static u64 perf_event_time(struct perf_event *event) return ctx ? ctx->time : 0; } -/* - * Update the total_time_enabled and total_time_running fields for a event. 
- */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - lockdep_assert_held(&ctx->lock); - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. - */ - if (is_cgroup_event(event)) - run_end = perf_cgroup_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; @@ -1492,6 +1475,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; + event->tstamp = perf_event_time(event); + /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that @@ -1699,8 +1684,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) list_del_init(&event->group_entry); - update_group_times(event); - /* * If event was in error state, then keep it * that way, otherwise bogus counts will be @@ -1709,7 +1692,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * of the event */ if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; + perf_event_set_state(event, PERF_EVENT_STATE_OFF); ctx->generation++; } @@ -1808,38 +1791,24 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - u64 delta; + enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE && - !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - if (event->state != PERF_EVENT_STATE_ACTIVE) return; perf_pmu_disable(event->pmu); - event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; - event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; + state = PERF_EVENT_STATE_OFF; } + perf_event_set_state(event, state); if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1859,7 
+1828,9 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - int state = group_event->state; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; perf_pmu_disable(ctx->pmu); @@ -1873,7 +1844,7 @@ group_sched_out(struct perf_event *group_event, perf_pmu_enable(ctx->pmu); - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + if (group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -1965,12 +1936,12 @@ static void __perf_event_disable(struct perf_event *event, update_cgrp_time_from_event(event); } - update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); else event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; + + perf_event_set_state(event, PERF_EVENT_STATE_OFF); } /* @@ -2027,8 +1998,7 @@ void perf_event_disable_inatomic(struct perf_event *event) } static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) + struct perf_event_context *ctx) { /* * use the correct time source for the time snapshot @@ -2056,9 +2026,9 @@ static void perf_set_shadow_time(struct perf_event *event, * is cleaner and simpler to understand. */ if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); + perf_cgroup_set_shadow_time(event, event->tstamp); else - event->shadow_ctx_time = tstamp - ctx->timestamp; + event->shadow_ctx_time = event->tstamp - ctx->timestamp; } #define MAX_INTERRUPTS (~0ULL) @@ -2071,7 +2041,6 @@ event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); int ret = 0; lockdep_assert_held(&ctx->lock); @@ -2086,7 +2055,7 @@ event_sched_in(struct perf_event *event, * ->oncpu if it sees ACTIVE. */ smp_wmb(); - WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); + perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2100,19 +2069,17 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx, tstamp); + perf_set_shadow_time(event, ctx); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -2136,8 +2103,6 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = ctx->pmu; - u64 now = ctx->time; - bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -2167,27 +2132,13 @@ group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. 
+ * The events up to the failed event are scheduled out normally. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - simulate = true; + break; - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } + event_sched_out(event, cpuctx, ctx); } event_sched_out(group_event, cpuctx, ctx); @@ -2229,46 +2180,11 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } -/* - * Complement to update_event_times(). This computes the tstamp_* values to - * continue 'enabled' state from @now, and effectively discards the time - * between the prior tstamp_stopped and now (as we were in the OFF state, or - * just switched (context) time base). - * - * This further assumes '@event->state == INACTIVE' (we just came from OFF) and - * cannot have been scheduled in yet. And going into INACTIVE state means - * '@event->tstamp_stopped = @now'. - * - * Thus given the rules of update_event_times(): - * - * total_time_enabled = tstamp_stopped - tstamp_enabled - * total_time_running = tstamp_stopped - tstamp_running - * - * We can insert 'tstamp_stopped == now' and reverse them to compute new - * tstamp_* values. - */ -static void __perf_event_enable_time(struct perf_event *event, u64 now) -{ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); - - event->tstamp_stopped = now; - event->tstamp_enabled = now - event->total_time_enabled; - event->tstamp_running = now - event->total_time_running; -} - static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - list_add_event(event, ctx); perf_group_attach(event); - /* - * We can be called with event->state == STATE_OFF when we create with - * .disabled = 1. In that case the IOC_ENABLE will call this function. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2499,28 +2415,6 @@ again: raw_spin_unlock_irq(&ctx->lock); } -/* - * Put a event into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. 
- */ -static void __perf_event_mark_enabled(struct perf_event *event) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - __perf_event_enable_time(event, tstamp); - list_for_each_entry(sub, &event->sibling_list, group_entry) { - /* XXX should not be > INACTIVE if event isn't */ - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(sub, tstamp); - } -} - /* * Cross CPU call to enable a performance event */ @@ -2539,14 +2433,12 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->is_active) ctx_sched_out(ctx, cpuctx, EVENT_TIME); - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2866,18 +2758,10 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: + if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); - /* fall-through */ - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - - default: - break; - } + perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event @@ -3114,10 +2998,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -3125,10 +3005,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } @@ -3150,10 +3028,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -3545,7 +3419,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } @@ -3644,10 +3518,9 @@ static void __perf_event_read(void *info) update_cgrp_time_from_event(event); } - if (!data->group) - update_event_times(event); - else - update_group_times(event); + perf_event_update_time(event); + if (data->group) + perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3696,7 +3569,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value, { unsigned long flags; int ret = 0; - u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3727,23 +3599,26 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } - now = event->shadow_ctx_time + perf_clock(); - if (enabled) - *enabled = now - event->tstamp_enabled; + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. 
Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) { + if (event->oncpu == smp_processor_id()) event->pmu->read(event); - if (running) - *running = now - event->tstamp_running; - } else if (running) { - *running = event->total_time_running; - } *value = local64_read(&event->count); + if (enabled || running) { + u64 now = event->shadow_ctx_time + perf_clock(); + u64 __enabled, __running; + + __perf_update_times(event, now, &__enabled, &__running); + if (enabled) + *enabled = __enabled; + if (running) + *running = __running; + } out: local_irq_restore(flags); @@ -3818,10 +3693,9 @@ again: update_cgrp_time_from_event(event); } + perf_event_update_time(event); if (group) - update_group_times(event); - else - update_event_times(event); + perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4945,8 +4819,7 @@ static void calc_timer_values(struct perf_event *event, *now = perf_clock(); ctx_time = event->shadow_ctx_time + *now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; + __perf_update_times(event, ctx_time, enabled, running); } static void perf_event_init_userpage(struct perf_event *event) @@ -10581,7 +10454,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* -- cgit v1.2.3 From 035226b964c820f65e201cdf123705a8f1d7c670 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Thu, 26 Oct 2017 01:47:42 +0000 Subject: bpf: remove tail_call and get_stackid helper declarations from bpf.h commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related syscall") included linux/bpf.h in linux/security.h. As a result, bpf programs including bpf_helpers.h and some other header that ends up pulling in also security.h, such as several examples under samples/bpf, fail to compile because bpf_tail_call and bpf_get_stackid are now "redefined as different kind of symbol". >From bpf.h: u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); Whereas in bpf_helpers.h they are: static void (*bpf_tail_call)(void *ctx, void *map, int index); static int (*bpf_get_stackid)(void *ctx, void *map, int flags); Fix this by removing the unused declaration of bpf_tail_call and moving the declaration of bpf_get_stackid in bpf_trace.c, which is the only place where it's needed. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 --- kernel/trace/bpf_trace.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 172be7faf7ba..520aeebe0d93 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -231,9 +231,6 @@ struct bpf_event_entry { struct rcu_head rcu; }; -u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); -u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); - bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b65011d320e3..136aa6bb0422 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -15,6 +15,8 @@ #include #include "trace.h" +u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event -- cgit v1.2.3 From 728fe6cef27444b6575c5e7ab5de13274610488b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 11 Oct 2017 11:41:19 +0200 Subject: i2c: Allow overriding dev_name through board_info For devices not instantiated through ACPI the i2c-client's device-name gets set to "<bus-nr>-<addr>" by default, e.g. "0-0022"; this means that the device-name is dependent on the order in which the i2c-busses are enumerated. In some cases having a predictable constant device-name is desirable, for example on non device-tree platforms the link between a regulator and its consumers is specified by the platform code by setting regulator_init_data.consumers. This array identifies the regulator's consumers by dev_name and supply(-name), which requires a constant dev_name. This commit adds a dev_name field to i2c_board_info allowing platform code to set a constant dev_name so that the device can be identified by its dev_name in other platform code.
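As an illustration, board code could then wire this up along the following lines (a sketch only; the chip name, address and supply name here are invented for the example):

	static struct i2c_board_info pmic_board_info = {
		I2C_BOARD_INFO("example-pmic", 0x48),
		.dev_name = "pmic",	/* client device becomes "i2c-pmic" */
	};

	/* The regulator consumer mapping can then use the fixed name: */
	static struct regulator_consumer_supply pmic_consumers[] = {
		REGULATOR_SUPPLY("vcc", "i2c-pmic"),
	};

Note that i2c_dev_set_name() prefixes the given dev_name with "i2c-", so the name referenced elsewhere is "i2c-<dev_name>".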
Signed-off-by: Hans de Goede Acked-by: Mark Brown (live at ELCE17) Acked-by: Andy Shevchenko (live at ELCE17) Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 10 ++++++++-- include/linux/i2c.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 56e46581b84b..875d6cacaa17 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -666,10 +666,16 @@ static void i2c_adapter_unlock_bus(struct i2c_adapter *adapter, } static void i2c_dev_set_name(struct i2c_adapter *adap, - struct i2c_client *client) + struct i2c_client *client, + struct i2c_board_info const *info) { struct acpi_device *adev = ACPI_COMPANION(&client->dev); + if (info && info->dev_name) { + dev_set_name(&client->dev, "i2c-%s", info->dev_name); + return; + } + if (adev) { dev_set_name(&client->dev, "i2c-%s", acpi_dev_name(adev)); return; @@ -766,7 +772,7 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info) client->dev.of_node = info->of_node; client->dev.fwnode = info->fwnode; - i2c_dev_set_name(adap, client); + i2c_dev_set_name(adap, client, info); if (info->properties) { status = device_add_properties(&client->dev, info->properties); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index d501d3956f13..0f774406fad0 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -304,6 +304,7 @@ static inline bool i2c_detect_slave_mode(struct device *dev) { return false; } * @type: chip type, to initialize i2c_client.name * @flags: to initialize i2c_client.flags * @addr: stored in i2c_client.addr + * @dev_name: Overrides the default <bus-nr>-<addr> dev_name if set * @platform_data: stored in i2c_client.dev.platform_data * @archdata: copied into i2c_client.dev.archdata * @of_node: pointer to OpenFirmware device node @@ -328,6 +329,7 @@ struct i2c_board_info { char type[I2C_NAME_SIZE]; unsigned short flags; unsigned short addr; + const char *dev_name; void *platform_data; struct dev_archdata *archdata; struct device_node *of_node; -- cgit v1.2.3 From 356c3e9afac0cc19c3d3b0cbc67106ce8efa0743 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Thu, 26 Oct 2017 11:00:48 +0200 Subject: net: dsa: lan9303: Move struct lan9303 to include/linux/dsa/lan9303.h The next patch requires net/dsa/tag_lan9303.c to access struct lan9303. Therefore move the struct lan9303 definitions from drivers/net/dsa/lan9303.h to the new file include/linux/dsa/lan9303.h. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- MAINTAINERS | 1 + drivers/net/dsa/lan9303.h | 34 +--------------------------------- include/linux/dsa/lan9303.h | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 33 deletions(-) create mode 100644 include/linux/dsa/lan9303.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index e3a7ca9d2783..c9ee7abf4627 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9415,6 +9415,7 @@ M: Florian Fainelli S: Maintained F: net/dsa/ F: include/net/dsa.h +F: include/linux/dsa/ F: drivers/net/dsa/ NETWORKING [GENERAL] diff --git a/drivers/net/dsa/lan9303.h b/drivers/net/dsa/lan9303.h index d807b1be35f2..b868e5040830 100644 --- a/drivers/net/dsa/lan9303.h +++ b/drivers/net/dsa/lan9303.h @@ -2,39 +2,7 @@ #include #include -struct lan9303; - -struct lan9303_phy_ops { - /* PHY 1 and 2 access*/ - int (*phy_read)(struct lan9303 *chip, int port, int regnum); - int (*phy_write)(struct lan9303 *chip, int port, - int regnum, u16 val); -}; - -#define LAN9303_NUM_ALR_RECORDS 512 -struct lan9303_alr_cache_entry { - u8 mac_addr[ETH_ALEN]; - u8 port_map; /* Bitmap of ports. Zero if unused entry */ - u8 stp_override; /* non zero if set ALR_DAT1_AGE_OVERRID */ -}; - -struct lan9303 { - struct device *dev; - struct regmap *regmap; - struct regmap_irq_chip_data *irq_data; - struct gpio_desc *reset_gpio; - u32 reset_duration; /* in [ms] */ - bool phy_addr_sel_strap; - struct dsa_switch *ds; - struct mutex indirect_mutex; /* protect indexed register access */ - const struct lan9303_phy_ops *ops; - bool is_bridged; /* true if port 1 and 2 are bridged */ - u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ - /* LAN9303 do not offer reading specific ALR entry. Cache all - * static entries in a flat table - **/ - struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; -}; +#include extern const struct regmap_access_table lan9303_register_set; extern const struct lan9303_phy_ops lan9303_indirect_phy_ops; diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h new file mode 100644 index 000000000000..05d8d136baab --- /dev/null +++ b/include/linux/dsa/lan9303.h @@ -0,0 +1,36 @@ +/* Included by drivers/net/dsa/lan9303.h and net/dsa/tag_lan9303.c */ +#include + +struct lan9303; + +struct lan9303_phy_ops { + /* PHY 1 and 2 access*/ + int (*phy_read)(struct lan9303 *chip, int port, int regnum); + int (*phy_write)(struct lan9303 *chip, int port, + int regnum, u16 val); +}; + +#define LAN9303_NUM_ALR_RECORDS 512 +struct lan9303_alr_cache_entry { + u8 mac_addr[ETH_ALEN]; + u8 port_map; /* Bitmap of ports. Zero if unused entry */ + u8 stp_override; /* non zero if set ALR_DAT1_AGE_OVERRID */ +}; + +struct lan9303 { + struct device *dev; + struct regmap *regmap; + struct regmap_irq_chip_data *irq_data; + struct gpio_desc *reset_gpio; + u32 reset_duration; /* in [ms] */ + bool phy_addr_sel_strap; + struct dsa_switch *ds; + struct mutex indirect_mutex; /* protect indexed register access */ + const struct lan9303_phy_ops *ops; + bool is_bridged; /* true if port 1 and 2 are bridged */ + u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ + /* LAN9303 do not offer reading specific ALR entry. 
Cache all + * static entries in a flat table + **/ + struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; +}; -- cgit v1.2.3 From 3d0bd028ffb4a4915cb64cfa0d2cee1578cc0321 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:27 -0700 Subject: net/sched: Add support for HW offloading for CBS This adds support for offloading the CBS algorithm to the controller, if supported. Drivers wanting to support CBS offload must implement the .ndo_setup_tc callback and handle the TC_SETUP_CBS (introduced here) type. Signed-off-by: Vinicius Costa Gomes Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 1 + include/net/pkt_sched.h | 9 ++++ net/sched/sch_cbs.c | 104 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 102 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6c7960c8338a..5e02f79b2110 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -776,6 +776,7 @@ enum tc_setup_type { TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, + TC_SETUP_CBS, }; /* These structures hold the attributes of xdp state that are being passed diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index b8ecafce4ba1..02f2db26e30c 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -140,4 +140,13 @@ static inline struct net *qdisc_net(struct Qdisc *q) return dev_net(q->dev_queue->dev); } +struct tc_cbs_qopt_offload { + u8 enable; + s32 queue; + s32 hicredit; + s32 locredit; + s32 idleslope; + s32 sendslope; +}; + #endif diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 0e85133c5653..bdb533b7fb8c 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -68,6 +68,8 @@ #define BYTES_PER_KBIT (1000LL / 8) struct cbs_sched_data { + bool offload; + int queue; s64 port_rate; /* in bytes/s */ s64 last; /* timestamp in ns */ s64 credits; /* in bytes */ @@ -80,6 +82,11 @@ struct cbs_sched_data { struct sk_buff *(*dequeue)(struct Qdisc *sch); }; +static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch) +{ + return qdisc_enqueue_tail(skb, sch); +} + static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); @@ -169,6 +176,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) return skb; } +static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) +{ + return qdisc_dequeue_head(sch); +} + static struct sk_buff *cbs_dequeue(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); @@ -180,14 +192,66 @@ static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, }; +static void cbs_disable_offload(struct net_device *dev, + struct cbs_sched_data *q) +{ + struct tc_cbs_qopt_offload cbs = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + cbs.queue = q->queue; + cbs.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + if (err < 0) + pr_warn("Couldn't disable CBS offload for queue %d\n", + cbs.queue); +} + +static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, + const struct tc_cbs_qopt *opt) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_cbs_qopt_offload cbs = { }; + int err; + + if (!ops->ndo_setup_tc) + return -EOPNOTSUPP; + + 
cbs.queue = q->queue; + + cbs.enable = 1; + cbs.hicredit = opt->hicredit; + cbs.locredit = opt->locredit; + cbs.idleslope = opt->idleslope; + cbs.sendslope = opt->sendslope; + + err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + if (err < 0) + return err; + + q->enqueue = cbs_enqueue_offload; + q->dequeue = cbs_dequeue_offload; + + return 0; +} + static int cbs_change(struct Qdisc *sch, struct nlattr *opt) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct nlattr *tb[TCA_CBS_MAX + 1]; - struct ethtool_link_ksettings ecmd; struct tc_cbs_qopt *qopt; - s64 link_speed; int err; err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL); @@ -199,23 +263,30 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) qopt = nla_data(tb[TCA_CBS_PARMS]); - if (qopt->offload) - return -EOPNOTSUPP; + if (!qopt->offload) { + struct ethtool_link_ksettings ecmd; + s64 link_speed; - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; - q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; + q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; - q->enqueue = cbs_enqueue_soft; - q->dequeue = cbs_dequeue_soft; + cbs_disable_offload(dev, q); + } else { + err = cbs_enable_offload(dev, q, qopt); + if (err < 0) + return err; + } + /* Everything went OK, save the parameters used. */ q->hicredit = qopt->hicredit; q->locredit = qopt->locredit; q->idleslope = qopt->idleslope * BYTES_PER_KBIT; q->sendslope = qopt->sendslope * BYTES_PER_KBIT; + q->offload = qopt->offload; return 0; } @@ -223,10 +294,16 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt) static int cbs_init(struct Qdisc *sch, struct nlattr *opt) { struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); if (!opt) return -EINVAL; + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + qdisc_watchdog_init(&q->watchdog, sch); return cbs_change(sch, opt); @@ -235,8 +312,11 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt) static void cbs_destroy(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); qdisc_watchdog_cancel(&q->watchdog); + + cbs_disable_offload(dev, q); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -253,7 +333,7 @@ static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) opt.locredit = q->locredit; opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); - opt.offload = 0; + opt.offload = q->offload; if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3 From 9b9f2b8bc2ac98d91da714660c53d1cdac999e09 Mon Sep 17 00:00:00 2001 From: Phil Reid Date: Thu, 24 Aug 2017 17:31:01 +0800 Subject: i2c: i2c-smbus: Use threaded irq for smbalert Prior to this commit the smbalert irq was handled in hard irq context. This change switches to using a threaded irq, which avoids the need for the work thread. Using a threaded irq also removes the need for the edge_triggered flag, as the enabling / disabling of the hard irq for level-triggered interrupts will be handled by the irq core. Without this change, having an irq connected to something like an i2c gpio expander resulted in a null pointer dereference; specifically, handle_nested_irq calls the threaded irq handler, which the hard-irq-only setup did not provide.
There are currently 3 in-tree drivers affected by this change. The i2c-parport driver calls i2c_handle_smbus_alert in hard irq context. This driver uses edge-triggered interrupts, which skip the enable / disable calls, but it still needs to handle the smbus transaction on a thread, so the work thread is kept for this driver. i2c-parport-light & i2c-thunderx-pcidrv provide the irq number in the setup, which results in the threaded irq being used. i2c-parport-light is edge-triggered, so the enable / disable calls were skipped as well. i2c-thunderx-pcidrv gets the edge / level trigger setting from OF data and was setting the flag as required; however, the irq core should handle this automatically. Signed-off-by: Phil Reid Reviewed-by: Benjamin Tissoires Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-parport-light.c | 1 - drivers/i2c/busses/i2c-parport.c | 1 - drivers/i2c/busses/i2c-thunderx-pcidrv.c | 6 ----- drivers/i2c/i2c-smbus.c | 41 +++++++++++++------------------- include/linux/i2c-smbus.h | 1 - 5 files changed, 17 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-parport-light.c b/drivers/i2c/busses/i2c-parport-light.c index faa8fb8f2b8f..fa41ff799533 100644 --- a/drivers/i2c/busses/i2c-parport-light.c +++ b/drivers/i2c/busses/i2c-parport-light.c @@ -123,7 +123,6 @@ static struct i2c_adapter parport_adapter = { /* SMBus alert support */ static struct i2c_smbus_alert_setup alert_data = { - .alert_edge_triggered = 1, }; static struct i2c_client *ara; static struct lineop parport_ctrl_irq = { diff --git a/drivers/i2c/busses/i2c-parport.c b/drivers/i2c/busses/i2c-parport.c index a8e54df4aed6..319209a07353 100644 --- a/drivers/i2c/busses/i2c-parport.c +++ b/drivers/i2c/busses/i2c-parport.c @@ -237,7 +237,6 @@ static void i2c_parport_attach(struct parport *port) /* Setup SMBus alert if supported */ if (adapter_parm[type].smbus_alert) { - adapter->alert_data.alert_edge_triggered = 1; adapter->ara = i2c_setup_smbus_alert(&adapter->adapter, &adapter->alert_data); if (adapter->ara) diff --git a/drivers/i2c/busses/i2c-thunderx-pcidrv.c b/drivers/i2c/busses/i2c-thunderx-pcidrv.c index df0976f4432a..1a7cad874756 100644 --- a/drivers/i2c/busses/i2c-thunderx-pcidrv.c +++ b/drivers/i2c/busses/i2c-thunderx-pcidrv.c @@ -118,8 +118,6 @@ static void thunder_i2c_clock_disable(struct device *dev, struct clk *clk) static int thunder_i2c_smbus_setup_of(struct octeon_i2c *i2c, struct device_node *node) { - u32 type; - if (!node) return -EINVAL; @@ -127,10 +125,6 @@ static int thunder_i2c_smbus_setup_of(struct octeon_i2c *i2c, if (!i2c->alert_data.irq) return -EINVAL; - type = irqd_get_trigger_type(irq_get_irq_data(i2c->alert_data.irq)); - i2c->alert_data.alert_edge_triggered = - (type & IRQ_TYPE_LEVEL_MASK) ? 1 : 0; - i2c->ara = i2c_setup_smbus_alert(&i2c->adap, &i2c->alert_data); if (!i2c->ara) return -ENODEV; diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c index f9271c713d20..d4af2701ac6e 100644 --- a/drivers/i2c/i2c-smbus.c +++ b/drivers/i2c/i2c-smbus.c @@ -25,8 +25,6 @@ #include struct i2c_smbus_alert { - unsigned int alert_edge_triggered:1; - int irq; struct work_struct alert; struct i2c_client *ara; /* Alert response address */ }; @@ -72,13 +70,12 @@ static int smbus_do_alert(struct device *dev, void *addrp) * The alert IRQ handler needs to hand work off to a task which can issue * SMBus calls, because those sleeping calls can't be made in IRQ context.
*/ -static void smbus_alert(struct work_struct *work) +static irqreturn_t smbus_alert(int irq, void *d) { - struct i2c_smbus_alert *alert; + struct i2c_smbus_alert *alert = d; struct i2c_client *ara; unsigned short prev_addr = 0; /* Not a valid address */ - alert = container_of(work, struct i2c_smbus_alert, alert); ara = alert->ara; for (;;) { @@ -115,21 +112,17 @@ static void smbus_alert(struct work_struct *work) prev_addr = data.addr; } - /* We handled all alerts; re-enable level-triggered IRQs */ - if (!alert->alert_edge_triggered) - enable_irq(alert->irq); + return IRQ_HANDLED; } -static irqreturn_t smbalert_irq(int irq, void *d) +static void smbalert_work(struct work_struct *work) { - struct i2c_smbus_alert *alert = d; + struct i2c_smbus_alert *alert; + + alert = container_of(work, struct i2c_smbus_alert, alert); - /* Disable level-triggered IRQs until we handle them */ - if (!alert->alert_edge_triggered) - disable_irq_nosync(irq); + smbus_alert(0, alert); - schedule_work(&alert->alert); - return IRQ_HANDLED; } /* Setup SMBALERT# infrastructure */ @@ -139,28 +132,28 @@ static int smbalert_probe(struct i2c_client *ara, struct i2c_smbus_alert_setup *setup = dev_get_platdata(&ara->dev); struct i2c_smbus_alert *alert; struct i2c_adapter *adapter = ara->adapter; - int res; + int res, irq; alert = devm_kzalloc(&ara->dev, sizeof(struct i2c_smbus_alert), GFP_KERNEL); if (!alert) return -ENOMEM; - alert->alert_edge_triggered = setup->alert_edge_triggered; - alert->irq = setup->irq; - INIT_WORK(&alert->alert, smbus_alert); + irq = setup->irq; + INIT_WORK(&alert->alert, smbalert_work); alert->ara = ara; - if (setup->irq > 0) { - res = devm_request_irq(&ara->dev, setup->irq, smbalert_irq, - 0, "smbus_alert", alert); + if (irq > 0) { + res = devm_request_threaded_irq(&ara->dev, irq, + NULL, smbus_alert, + IRQF_SHARED | IRQF_ONESHOT, + "smbus_alert", alert); if (res) return res; } i2c_set_clientdata(ara, alert); - dev_info(&adapter->dev, "supports SMBALERT#, %s trigger\n", - setup->alert_edge_triggered ? "edge" : "level"); + dev_info(&adapter->dev, "supports SMBALERT#\n"); return 0; } diff --git a/include/linux/i2c-smbus.h b/include/linux/i2c-smbus.h index a1385023a29b..19efbd14e812 100644 --- a/include/linux/i2c-smbus.h +++ b/include/linux/i2c-smbus.h @@ -42,7 +42,6 @@ * properly set. */ struct i2c_smbus_alert_setup { - unsigned int alert_edge_triggered:1; int irq; }; -- cgit v1.2.3 From 69d17246ab255dda8e71c8d65396b4aa6121b7ad Mon Sep 17 00:00:00 2001 From: Phil Reid Date: Thu, 24 Aug 2017 17:31:03 +0800 Subject: i2c: i2c-smbus: add of_i2c_setup_smbus_alert This commit adds of_i2c_setup_smbus_alert which allows the smbalert driver to be attached to an i2c adapter via the device tree. Signed-off-by: Phil Reid Acked-by: Rob Herring Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/i2c.txt | 4 ++-- drivers/i2c/i2c-core-smbus.c | 22 ++++++++++++++++++++++ drivers/i2c/i2c-smbus.c | 10 +++++++++- include/linux/i2c-smbus.h | 9 +++++++++ 4 files changed, 42 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/i2c/i2c.txt b/Documentation/devicetree/bindings/i2c/i2c.txt index cee9d5055fa2..11263982470e 100644 --- a/Documentation/devicetree/bindings/i2c/i2c.txt +++ b/Documentation/devicetree/bindings/i2c/i2c.txt @@ -59,8 +59,8 @@ wants to support one of the below features, it should adapt the bindings below. interrupts used by the device. 
- interrupt-names - "irq" and "wakeup" names are recognized by I2C core, other names are - left to individual drivers. + "irq", "wakeup" and "smbus_alert" names are recognized by I2C core, + other names are left to individual drivers. - host-notify device uses SMBus host notify protocol instead of interrupt line. diff --git a/drivers/i2c/i2c-core-smbus.c b/drivers/i2c/i2c-core-smbus.c index 7f3ec02f085a..4bb9927afd01 100644 --- a/drivers/i2c/i2c-core-smbus.c +++ b/drivers/i2c/i2c-core-smbus.c @@ -625,3 +625,25 @@ struct i2c_client *i2c_setup_smbus_alert(struct i2c_adapter *adapter, return i2c_new_device(adapter, &ara_board_info); } EXPORT_SYMBOL_GPL(i2c_setup_smbus_alert); + +#if IS_ENABLED(CONFIG_I2C_SMBUS) && IS_ENABLED(CONFIG_OF) +int of_i2c_setup_smbus_alert(struct i2c_adapter *adapter) +{ + struct i2c_client *client; + int irq; + + irq = of_property_match_string(adapter->dev.of_node, "interrupt-names", + "smbus_alert"); + if (irq == -EINVAL || irq == -ENODATA) + return 0; + else if (irq < 0) + return irq; + + client = i2c_setup_smbus_alert(adapter, NULL); + if (!client) + return -ENODEV; + + return 0; +} +EXPORT_SYMBOL_GPL(of_i2c_setup_smbus_alert); +#endif diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c index d0bb0358b578..5a1dd7f13bac 100644 --- a/drivers/i2c/i2c-smbus.c +++ b/drivers/i2c/i2c-smbus.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -139,7 +140,14 @@ static int smbalert_probe(struct i2c_client *ara, if (!alert) return -ENOMEM; - irq = setup->irq; + if (setup) { + irq = setup->irq; + } else { + irq = of_irq_get_byname(adapter->dev.of_node, "smbus_alert"); + if (irq <= 0) + return irq; + } + INIT_WORK(&alert->alert, smbalert_work); alert->ara = ara; diff --git a/include/linux/i2c-smbus.h b/include/linux/i2c-smbus.h index 19efbd14e812..fb0e040b1abb 100644 --- a/include/linux/i2c-smbus.h +++ b/include/linux/i2c-smbus.h @@ -49,4 +49,13 @@ struct i2c_client *i2c_setup_smbus_alert(struct i2c_adapter *adapter, struct i2c_smbus_alert_setup *setup); int i2c_handle_smbus_alert(struct i2c_client *ara); +#if IS_ENABLED(CONFIG_I2C_SMBUS) && IS_ENABLED(CONFIG_OF) +int of_i2c_setup_smbus_alert(struct i2c_adapter *adap); +#else +static inline int of_i2c_setup_smbus_alert(struct i2c_adapter *adap) +{ + return 0; +} +#endif + #endif /* _LINUX_I2C_SMBUS_H */ -- cgit v1.2.3 From 186731145f920fb1514200043bcaf9c689693857 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 10 Sep 2017 11:44:46 +0200 Subject: hwmon: (sht15) Root out platform data After finding out there are active users of this sensor I noticed: - It has a single PXA27x board file using the platform data - The platform data is only used to carry two GPIO pins, all other fields are unused - The driver does not use GPIO descriptors but the legacy GPIO API I saw we can swiftly fix this by: - Killing off the platform data entirely - Define a GPIO descriptor lookup table in the board file - Use the standard devm_gpiod_get() to grab the GPIO descriptors from either the device tree or the board file table. This compiles, but needs testing. 
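One subtlety in the board-file path is worth spelling out: gpiod_add_lookup_table() matches a lookup table against consumers by dev_name(), so the table's .dev_id must equal the final device name. Because the sht15 platform_device is registered with .id = -1, its device name is plain "sht15", which is what the lookup table in this patch uses. Had the device been registered with a numeric id, the table would need the "<name>.<id>" form instead, e.g. (a sketch, not part of this patch):

	static struct gpiod_lookup_table sht15_gpiod_table = {
		.dev_id = "sht15.0",	/* "<name>.<id>" when .id >= 0 */
		.table = {
			GPIO_LOOKUP("gpio-pxa", 100, "data", GPIO_ACTIVE_HIGH),
			GPIO_LOOKUP("gpio-pxa", 98, "clk", GPIO_ACTIVE_HIGH),
		},
	};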
Cc: arm@kernel.org Cc: Marco Franchi Cc: Davide Hug Cc: Jonathan Cameron Signed-off-by: Linus Walleij Acked-by: Arnd Bergmann Tested-by: Marco Franchi Acked-by: Arnd Bergmann Signed-off-by: Guenter Roeck --- Documentation/hwmon/sht15 | 3 +- arch/arm/mach-pxa/stargate2.c | 17 ++-- drivers/hwmon/sht15.c | 167 ++++++++++++------------------------ include/linux/platform_data/sht15.h | 38 -------- 4 files changed, 65 insertions(+), 160 deletions(-) delete mode 100644 include/linux/platform_data/sht15.h (limited to 'include/linux') diff --git a/Documentation/hwmon/sht15 b/Documentation/hwmon/sht15 index 778987d1856f..5e3207c3b177 100644 --- a/Documentation/hwmon/sht15 +++ b/Documentation/hwmon/sht15 @@ -42,8 +42,7 @@ chip. These coefficients are used to internally calibrate the signals from the sensors. Disabling the reload of those coefficients allows saving 10ms for each measurement and decrease power consumption, while losing on precision. -Some options may be set directly in the sht15_platform_data structure -or via sysfs attributes. +Some options may be set via sysfs attributes. Notes: * The regulator supply name is set to "vcc". diff --git a/arch/arm/mach-pxa/stargate2.c b/arch/arm/mach-pxa/stargate2.c index 2d45d18b1a5e..6b7df6fd2448 100644 --- a/arch/arm/mach-pxa/stargate2.c +++ b/arch/arm/mach-pxa/stargate2.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -52,7 +53,6 @@ #include #include #include -#include #include "devices.h" #include "generic.h" @@ -137,17 +137,18 @@ static unsigned long sg2_im2_unified_pin_config[] __initdata = { GPIO10_GPIO, /* large basic connector pin 23 */ }; -static struct sht15_platform_data platform_data_sht15 = { - .gpio_data = 100, - .gpio_sck = 98, +static struct gpiod_lookup_table sht15_gpiod_table = { + .dev_id = "sht15", + .table = { + /* FIXME: should this have |GPIO_OPEN_DRAIN set? */ + GPIO_LOOKUP("gpio-pxa", 100, "data", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-pxa", 98, "clk", GPIO_ACTIVE_HIGH), + }, }; static struct platform_device sht15 = { .name = "sht15", .id = -1, - .dev = { - .platform_data = &platform_data_sht15, - }, }; static struct regulator_consumer_supply stargate2_sensor_3_con[] = { @@ -608,6 +609,7 @@ static void __init imote2_init(void) imote2_stargate2_init(); + gpiod_add_lookup_table(&sht15_gpiod_table); platform_add_devices(imote2_devices, ARRAY_SIZE(imote2_devices)); i2c_register_board_info(0, imote2_i2c_board_info, @@ -988,6 +990,7 @@ static void __init stargate2_init(void) imote2_stargate2_init(); + gpiod_add_lookup_table(&sht15_gpiod_table); platform_add_devices(ARRAY_AND_SIZE(stargate2_devices)); i2c_register_board_info(0, ARRAY_AND_SIZE(stargate2_i2c_board_info)); diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c index e4d642b673c6..0e3e5f83f5cf 100644 --- a/drivers/hwmon/sht15.c +++ b/drivers/hwmon/sht15.c @@ -18,13 +18,11 @@ #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -34,7 +32,8 @@ #include #include #include -#include +#include +#include /* Commands */ #define SHT15_MEASURE_TEMP 0x03 @@ -122,7 +121,8 @@ static const u8 sht15_crc8_table[] = { /** * struct sht15_data - device instance specific data - * @pdata: platform data (gpio's etc). + * @sck: clock GPIO line + * @data: data GPIO line * @read_work: bh of interrupt handler. * @wait_queue: wait queue for getting values from device. * @val_temp: last temperature value read from device. 
@@ -150,7 +150,8 @@ static const u8 sht15_crc8_table[] = { * @interrupt_handled: flag used to indicate a handler has been scheduled. */ struct sht15_data { - struct sht15_platform_data *pdata; + struct gpio_desc *sck; + struct gpio_desc *data; struct work_struct read_work; wait_queue_head_t wait_queue; uint16_t val_temp; @@ -205,16 +206,16 @@ static int sht15_connection_reset(struct sht15_data *data) { int i, err; - err = gpio_direction_output(data->pdata->gpio_data, 1); + err = gpiod_direction_output(data->data, 1); if (err) return err; ndelay(SHT15_TSCKL); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); for (i = 0; i < 9; ++i) { - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); } return 0; @@ -227,11 +228,11 @@ static int sht15_connection_reset(struct sht15_data *data) */ static inline void sht15_send_bit(struct sht15_data *data, int val) { - gpio_set_value(data->pdata->gpio_data, val); + gpiod_set_value(data->data, val); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); /* clock low time */ } @@ -248,23 +249,23 @@ static int sht15_transmission_start(struct sht15_data *data) int err; /* ensure data is high and output */ - err = gpio_direction_output(data->pdata->gpio_data, 1); + err = gpiod_direction_output(data->data, 1); if (err) return err; ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_data, 0); + gpiod_set_value(data->data, 0); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_data, 1); + gpiod_set_value(data->data, 1); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); return 0; } @@ -292,20 +293,20 @@ static int sht15_wait_for_response(struct sht15_data *data) { int err; - err = gpio_direction_input(data->pdata->gpio_data); + err = gpiod_direction_input(data->data); if (err) return err; - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - if (gpio_get_value(data->pdata->gpio_data)) { - gpio_set_value(data->pdata->gpio_sck, 0); + if (gpiod_get_value(data->data)) { + gpiod_set_value(data->sck, 0); dev_err(data->dev, "Command not acknowledged\n"); err = sht15_connection_reset(data); if (err) return err; return -EIO; } - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); return 0; } @@ -360,17 +361,17 @@ static int sht15_ack(struct sht15_data *data) { int err; - err = gpio_direction_output(data->pdata->gpio_data, 0); + err = gpiod_direction_output(data->data, 0); if (err) return err; ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_data, 1); + gpiod_set_value(data->data, 1); - return 
gpio_direction_input(data->pdata->gpio_data); + return gpiod_direction_input(data->data); } /** @@ -383,13 +384,13 @@ static int sht15_end_transmission(struct sht15_data *data) { int err; - err = gpio_direction_output(data->pdata->gpio_data, 1); + err = gpiod_direction_output(data->data, 1); if (err) return err; ndelay(SHT15_TSU); - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - gpio_set_value(data->pdata->gpio_sck, 0); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); return 0; } @@ -405,10 +406,10 @@ static u8 sht15_read_byte(struct sht15_data *data) for (i = 0; i < 8; ++i) { byte <<= 1; - gpio_set_value(data->pdata->gpio_sck, 1); + gpiod_set_value(data->sck, 1); ndelay(SHT15_TSCKH); - byte |= !!gpio_get_value(data->pdata->gpio_data); - gpio_set_value(data->pdata->gpio_sck, 0); + byte |= !!gpiod_get_value(data->data); + gpiod_set_value(data->sck, 0); ndelay(SHT15_TSCKL); } return byte; @@ -428,7 +429,7 @@ static int sht15_send_status(struct sht15_data *data, u8 status) err = sht15_send_cmd(data, SHT15_WRITE_STATUS); if (err) return err; - err = gpio_direction_output(data->pdata->gpio_data, 1); + err = gpiod_direction_output(data->data, 1); if (err) return err; ndelay(SHT15_TSU); @@ -528,14 +529,14 @@ static int sht15_measurement(struct sht15_data *data, if (ret) return ret; - ret = gpio_direction_input(data->pdata->gpio_data); + ret = gpiod_direction_input(data->data); if (ret) return ret; atomic_set(&data->interrupt_handled, 0); - enable_irq(gpio_to_irq(data->pdata->gpio_data)); - if (gpio_get_value(data->pdata->gpio_data) == 0) { - disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data)); + enable_irq(gpiod_to_irq(data->data)); + if (gpiod_get_value(data->data) == 0) { + disable_irq_nosync(gpiod_to_irq(data->data)); /* Only relevant if the interrupt hasn't occurred. */ if (!atomic_read(&data->interrupt_handled)) schedule_work(&data->read_work); @@ -547,7 +548,7 @@ static int sht15_measurement(struct sht15_data *data, data->state = SHT15_READING_NOTHING; return -EIO; } else if (ret == 0) { /* timeout occurred */ - disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data)); + disable_irq_nosync(gpiod_to_irq(data->data)); ret = sht15_connection_reset(data); if (ret) return ret; @@ -826,15 +827,15 @@ static void sht15_bh_read_data(struct work_struct *work_s) read_work); /* Firstly, verify the line is low */ - if (gpio_get_value(data->pdata->gpio_data)) { + if (gpiod_get_value(data->data)) { /* * If not, then start the interrupt again - care here as could * have gone low in meantime so verify it hasn't! */ atomic_set(&data->interrupt_handled, 0); - enable_irq(gpio_to_irq(data->pdata->gpio_data)); + enable_irq(gpiod_to_irq(data->data)); /* If still not occurred or another handler was scheduled */ - if (gpio_get_value(data->pdata->gpio_data) + if (gpiod_get_value(data->data) || atomic_read(&data->interrupt_handled)) return; } @@ -918,46 +919,6 @@ static const struct of_device_id sht15_dt_match[] = { { }, }; MODULE_DEVICE_TABLE(of, sht15_dt_match); - -/* - * This function returns NULL if pdev isn't a device instatiated by dt, - * a pointer to pdata if it could successfully get all information - * from dt or a negative ERR_PTR() on error. 
- */ -static struct sht15_platform_data *sht15_probe_dt(struct device *dev) -{ - struct device_node *np = dev->of_node; - struct sht15_platform_data *pdata; - - /* no device tree device */ - if (!np) - return NULL; - - pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - return ERR_PTR(-ENOMEM); - - pdata->gpio_data = of_get_named_gpio(np, "data-gpios", 0); - if (pdata->gpio_data < 0) { - if (pdata->gpio_data != -EPROBE_DEFER) - dev_err(dev, "data-gpios not found\n"); - return ERR_PTR(pdata->gpio_data); - } - - pdata->gpio_sck = of_get_named_gpio(np, "clk-gpios", 0); - if (pdata->gpio_sck < 0) { - if (pdata->gpio_sck != -EPROBE_DEFER) - dev_err(dev, "clk-gpios not found\n"); - return ERR_PTR(pdata->gpio_sck); - } - - return pdata; -} -#else -static inline struct sht15_platform_data *sht15_probe_dt(struct device *dev) -{ - return NULL; -} #endif static int sht15_probe(struct platform_device *pdev) @@ -977,25 +938,6 @@ static int sht15_probe(struct platform_device *pdev) data->dev = &pdev->dev; init_waitqueue_head(&data->wait_queue); - data->pdata = sht15_probe_dt(&pdev->dev); - if (IS_ERR(data->pdata)) - return PTR_ERR(data->pdata); - if (data->pdata == NULL) { - data->pdata = dev_get_platdata(&pdev->dev); - if (data->pdata == NULL) { - dev_err(&pdev->dev, "no platform data supplied\n"); - return -EINVAL; - } - } - - data->supply_uv = data->pdata->supply_mv * 1000; - if (data->pdata->checksum) - data->checksumming = true; - if (data->pdata->no_otp_reload) - status |= SHT15_STATUS_NO_OTP_RELOAD; - if (data->pdata->low_resolution) - status |= SHT15_STATUS_LOW_RESOLUTION; - /* * If a regulator is available, * query what the supply voltage actually is! @@ -1030,21 +972,20 @@ static int sht15_probe(struct platform_device *pdev) } /* Try requesting the GPIOs */ - ret = devm_gpio_request_one(&pdev->dev, data->pdata->gpio_sck, - GPIOF_OUT_INIT_LOW, "SHT15 sck"); - if (ret) { + data->sck = devm_gpiod_get(&pdev->dev, "clk", GPIOD_OUT_LOW); + if (IS_ERR(data->sck)) { + ret = PTR_ERR(data->sck); dev_err(&pdev->dev, "clock line GPIO request failed\n"); goto err_release_reg; } - - ret = devm_gpio_request(&pdev->dev, data->pdata->gpio_data, - "SHT15 data"); - if (ret) { + data->data = devm_gpiod_get(&pdev->dev, "data", GPIOD_IN); + if (IS_ERR(data->data)) { + ret = PTR_ERR(data->data); dev_err(&pdev->dev, "data line GPIO request failed\n"); goto err_release_reg; } - ret = devm_request_irq(&pdev->dev, gpio_to_irq(data->pdata->gpio_data), + ret = devm_request_irq(&pdev->dev, gpiod_to_irq(data->data), sht15_interrupt_fired, IRQF_TRIGGER_FALLING, "sht15 data", @@ -1053,7 +994,7 @@ static int sht15_probe(struct platform_device *pdev) dev_err(&pdev->dev, "failed to get irq for data line\n"); goto err_release_reg; } - disable_irq_nosync(gpio_to_irq(data->pdata->gpio_data)); + disable_irq_nosync(gpiod_to_irq(data->data)); ret = sht15_connection_reset(data); if (ret) goto err_release_reg; diff --git a/include/linux/platform_data/sht15.h b/include/linux/platform_data/sht15.h deleted file mode 100644 index 12289c1e9413..000000000000 --- a/include/linux/platform_data/sht15.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * sht15.h - support for the SHT15 Temperature and Humidity Sensor - * - * Copyright (c) 2009 Jonathan Cameron - * - * Copyright (c) 2007 Wouter Horre - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * For further information, see the Documentation/hwmon/sht15 file. - */ - -#ifndef _PDATA_SHT15_H -#define _PDATA_SHT15_H - -/** - * struct sht15_platform_data - sht15 connectivity info - * @gpio_data: no. of gpio to which bidirectional data line is - * connected. - * @gpio_sck: no. of gpio to which the data clock is connected. - * @supply_mv: supply voltage in mv. Overridden by regulator if - * available. - * @checksum: flag to indicate the checksum should be validated. - * @no_otp_reload: flag to indicate no reload from OTP. - * @low_resolution: flag to indicate the temp/humidity resolution to use. - */ -struct sht15_platform_data { - int gpio_data; - int gpio_sck; - int supply_mv; - bool checksum; - bool no_otp_reload; - bool low_resolution; -}; - -#endif /* _PDATA_SHT15_H */ -- cgit v1.2.3 From ef7a612415958de1f9afd86235d38b14975d0b7c Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 26 Sep 2017 01:09:05 +0200 Subject: hwmon: (gpio-fan) Localize platform data There is not a single user of the platform data header in . We can conclude that all current users are probing from the device tree, so start simplifying the code by pulling the header into the driver. Convert "unsigned" to "unsigned int" in the process to make checkpatch happy. Signed-off-by: Linus Walleij Signed-off-by: Guenter Roeck --- drivers/hwmon/gpio-fan.c | 23 ++++++++++++++++++++++- include/linux/gpio-fan.h | 36 ------------------------------------ 2 files changed, 22 insertions(+), 37 deletions(-) delete mode 100644 include/linux/gpio-fan.h (limited to 'include/linux') diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index f29cee9398ef..cfa8d9b578dd 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -30,12 +30,33 @@ #include #include #include -#include #include #include #include #include +struct gpio_fan_alarm { + unsigned int gpio; + unsigned int active_low; +}; + +struct gpio_fan_speed { + int rpm; + int ctrl_val; +}; + +struct gpio_fan_platform_data { + int num_ctrl; + unsigned int *ctrl; /* fan control GPIOs. */ + struct gpio_fan_alarm *alarm; /* fan alarm GPIO. */ + /* + * Speed conversion array: rpm from/to GPIO bit field. + * This array _must_ be sorted in ascending rpm order. + */ + int num_speed; + struct gpio_fan_speed *speed; +}; + struct gpio_fan_data { struct platform_device *pdev; struct device *hwmon_dev; diff --git a/include/linux/gpio-fan.h b/include/linux/gpio-fan.h deleted file mode 100644 index 096659169215..000000000000 --- a/include/linux/gpio-fan.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * include/linux/gpio-fan.h - * - * Platform data structure for GPIO fan driver - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - */ - -#ifndef __LINUX_GPIO_FAN_H -#define __LINUX_GPIO_FAN_H - -struct gpio_fan_alarm { - unsigned gpio; - unsigned active_low; -}; - -struct gpio_fan_speed { - int rpm; - int ctrl_val; -}; - -struct gpio_fan_platform_data { - int num_ctrl; - unsigned *ctrl; /* fan control GPIOs. */ - struct gpio_fan_alarm *alarm; /* fan alarm GPIO. */ - /* - * Speed conversion array: rpm from/to GPIO bit field. - * This array _must_ be sorted in ascending rpm order. 
- */ - int num_speed; - struct gpio_fan_speed *speed; -}; - -#endif /* __LINUX_GPIO_FAN_H */ -- cgit v1.2.3 From b2e63555592f81331c8da3afaa607d8cf83e8138 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 10 Sep 2017 01:30:46 +0200 Subject: i2c: gpio: Convert to use descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This converts the GPIO-based I2C driver to use GPIO descriptors instead of the old global numberspace-based GPIO interface. We: - Convert the driver to unconditionally grab two GPIOs from the device by index 0 (SDA) and 1 (SCL), which will work fine with device tree and descriptor tables. The existing device trees will continue to work just like before, but without any roundtrip through the global numberspace. - Brutally convert all boardfiles still passing global GPIOs by registering descriptor tables associated with the devices instead, so this driver does not need to keep supporting passing any GPIO numbers as platform data. There is no stepwise approach as elegant as this; I strongly prefer this big hammer over any antsteps for this conversion. This way the old GPIO numbers go away and NEVER COME BACK. Special conversion for the different boards utilizing I2C-GPIO: - EP93xx (arch/arm/mach-ep93xx): pretty straightforward as all boards were using the same two GPIO lines, just define these two in a lookup table for "i2c-gpio" and register these along with the device. None of them define any other platform data, so just pass NULL as platform data. This platform selects GPIOLIB so all should be smooth. The pins appear on a gpiochip for bank "G" as pins 1 (SDA) and 0 (SCL). - IXP4xx (arch/arm/mach-ixp4xx): descriptor tables have to be registered for each board separately. They all use "IXP4XX_GPIO_CHIP" so it is pretty straightforward. Most boards define no other platform data than SCL/SDA so they can drop the #include of and assign NULL to platform data. The "goramo_mlr" (Goramo Multilink Router) board is a bit worrisome: it implements its own I2C bit-banging in the board file, and optionally registers an I2C serial port, but claims the same GPIO lines for itself in the board file. This is not going to work: there will be competition for the GPIO lines, so delete the optional extra I2C bus instead; no I2C devices are registered on it anyway, there are just hints that it may contain an EEPROM that may be accessed from userspace. This needs to be fixed up properly by the serial clock using I2C emulation, so drop a note in the code. - KS8695 board acs5k (arch/arm/mach-ks8695/board-acs5k.c) has some platform data in addition to the pins so it needs to be kept around sans GPIO lines. Its GPIO chip is named "KS8695" and the arch selects GPIOLIB. - PXA boards (arch/arm/mach-pxa/*) use some of the platform data so it needs to be preserved here. The viper board even registers two GPIO I2Cs. The gpiochip is named "gpio-pxa" and the arch selects GPIOLIB. - SA1100 Simpad (arch/arm/mach-sa1100/simpad.c) defines a GPIO I2C bus, and the arch selects GPIOLIB. - Blackfin boards (arch/blackfin/mach-bf533 etc): for these I assume their I2C GPIOs refer to the local gpiochip defined in arch/blackfin/kernel/bfin_gpio.c, named "BFIN-GPIO". The arch selects GPIOLIB. The boards get spiked with IS_ENABLED(CONFIG_I2C_GPIO) but that is a side effect of it being like that already (I would just have Kconfig select I2C_GPIO and get rid of them all.) I also delete any platform data set to 0 as it will get that value anyway from static declarations of platform data.
- The MIPS selects GPIOLIB and the Alchemy machine uses two local GPIO chips, one of which has a GPIO I2C. We need to adjust the local offset from the global number space here. The ATH79 has a proper GPIO driver in drivers/gpio/gpio-ath79.c; AFAICT the chip is named "ath79-gpio", and the PB44 PCF857x expander spawns from this on GPIO 1 and 0. The latter board only uses the platform data to specify pins, so it can be cut altogether after this. - The MFD Silicon Motion SM501 is a special case. It dynamically spawns an I2C bus off the MFD using sm501_create_subdev(). We dynamically create a machine descriptor table and attach this to the "SM501-LOW" or "SM501-HIGH" gpiochip. We use chip-local offsets to grab the right lines. We can get rid of two local static inline helpers as part of this refactoring. Cc: Steven Miao Cc: Ralf Baechle Cc: Guenter Roeck Cc: Ville Syrjälä Cc: Magnus Damm Cc: Ben Dooks Cc: Heiko Schocher Acked-by: Wu, Aaron Acked-by: Olof Johansson Acked-by: Lee Jones Acked-by: Ralf Baechle Tested-by: Geert Uytterhoeven Signed-off-by: Linus Walleij --- arch/arm/mach-ep93xx/core.c | 39 ++++---- arch/arm/mach-ep93xx/edb93xx.c | 15 +-- arch/arm/mach-ep93xx/include/mach/platform.h | 4 +- arch/arm/mach-ep93xx/simone.c | 12 +-- arch/arm/mach-ep93xx/snappercl15.c | 12 +-- arch/arm/mach-ep93xx/vision_ep9307.c | 7 +- arch/arm/mach-ixp4xx/avila-setup.c | 17 +++- arch/arm/mach-ixp4xx/dsmg600-setup.c | 16 +++- arch/arm/mach-ixp4xx/fsg-setup.c | 16 +++- arch/arm/mach-ixp4xx/goramo_mlr.c | 24 ++--- arch/arm/mach-ixp4xx/ixdp425-setup.c | 16 +++- arch/arm/mach-ixp4xx/nas100d-setup.c | 16 +++- arch/arm/mach-ixp4xx/nslu2-setup.c | 16 +++- arch/arm/mach-ks8695/board-acs5k.c | 13 ++- arch/arm/mach-pxa/palmz72.c | 12 ++- arch/arm/mach-pxa/viper.c | 27 +++++- arch/arm/mach-sa1100/simpad.c | 12 ++- arch/blackfin/mach-bf533/boards/blackstamp.c | 19 +++- arch/blackfin/mach-bf533/boards/ezkit.c | 18 +++- arch/blackfin/mach-bf533/boards/stamp.c | 18 +++- arch/blackfin/mach-bf561/boards/ezkit.c | 18 +++- arch/mips/alchemy/board-gpr.c | 19 +++- arch/mips/ath79/mach-pb44.c | 16 +++- drivers/i2c/busses/i2c-gpio.c | 134 +++++++++++++-------------- drivers/mfd/sm501.c | 49 +++++----- include/linux/i2c-gpio.h | 4 - 26 files changed, 327 insertions(+), 242 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c index f53c61813998..7e99fe829ad1 100644 --- a/arch/arm/mach-ep93xx/core.c +++ b/arch/arm/mach-ep93xx/core.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include @@ -320,42 +320,45 @@ void __init ep93xx_register_eth(struct ep93xx_eth_data *data, int copy_addr) /************************************************************************* * EP93xx i2c peripheral handling *************************************************************************/ -static struct i2c_gpio_platform_data ep93xx_i2c_data; + +/* All EP93xx devices use the same two GPIO pins for I2C bit-banging */ +static struct gpiod_lookup_table ep93xx_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + /* Use local offsets on gpiochip/port "G" */ + GPIO_LOOKUP_IDX("G", 1, NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("G", 0, NULL, 1, GPIO_ACTIVE_HIGH), + }, +}; static struct platform_device ep93xx_i2c_device = { .name = "i2c-gpio", .id = 0, .dev = { - .platform_data = &ep93xx_i2c_data, + .platform_data = NULL, }, }; /** * ep93xx_register_i2c - Register the i2c platform device.
- * @data: platform specific i2c-gpio configuration (__initdata) * @devices: platform specific i2c bus device information (__initdata) * @num: the number of devices on the i2c bus */ -void __init ep93xx_register_i2c(struct i2c_gpio_platform_data *data, - struct i2c_board_info *devices, int num) +void __init ep93xx_register_i2c(struct i2c_board_info *devices, int num) { /* - * Set the EEPROM interface pin drive type control. - * Defines the driver type for the EECLK and EEDAT pins as either - * open drain, which will require an external pull-up, or a normal - * CMOS driver. + * FIXME: this just sets the two pins as non-opendrain, as no + * platforms tries to do that anyway. Flag the applicable lines + * as open drain in the GPIO_LOOKUP above and the driver or + * gpiolib will handle open drain/open drain emulation as need + * be. Right now i2c-gpio emulates open drain which is not + * optimal. */ - if (data->sda_is_open_drain && data->sda_pin != EP93XX_GPIO_LINE_EEDAT) - pr_warning("sda != EEDAT, open drain has no effect\n"); - if (data->scl_is_open_drain && data->scl_pin != EP93XX_GPIO_LINE_EECLK) - pr_warning("scl != EECLK, open drain has no effect\n"); - - __raw_writel((data->sda_is_open_drain << 1) | - (data->scl_is_open_drain << 0), + __raw_writel((0 << 1) | (0 << 0), EP93XX_GPIO_EEDRIVE); - ep93xx_i2c_data = *data; i2c_register_board_info(0, devices, num); + gpiod_add_lookup_table(&ep93xx_i2c_gpiod_table); platform_device_register(&ep93xx_i2c_device); } diff --git a/arch/arm/mach-ep93xx/edb93xx.c b/arch/arm/mach-ep93xx/edb93xx.c index 7a7f280b07d7..8e89ec8b6f0f 100644 --- a/arch/arm/mach-ep93xx/edb93xx.c +++ b/arch/arm/mach-ep93xx/edb93xx.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -61,14 +60,6 @@ static struct ep93xx_eth_data __initdata edb93xx_eth_data = { /************************************************************************* * EDB93xx i2c peripheral handling *************************************************************************/ -static struct i2c_gpio_platform_data __initdata edb93xx_i2c_gpio_data = { - .sda_pin = EP93XX_GPIO_LINE_EEDAT, - .sda_is_open_drain = 0, - .scl_pin = EP93XX_GPIO_LINE_EECLK, - .scl_is_open_drain = 0, - .udelay = 0, /* default to 100 kHz */ - .timeout = 0, /* default to 100 ms */ -}; static struct i2c_board_info __initdata edb93xxa_i2c_board_info[] = { { @@ -86,13 +77,11 @@ static void __init edb93xx_register_i2c(void) { if (machine_is_edb9302a() || machine_is_edb9307a() || machine_is_edb9315a()) { - ep93xx_register_i2c(&edb93xx_i2c_gpio_data, - edb93xxa_i2c_board_info, + ep93xx_register_i2c(edb93xxa_i2c_board_info, ARRAY_SIZE(edb93xxa_i2c_board_info)); } else if (machine_is_edb9302() || machine_is_edb9307() || machine_is_edb9312() || machine_is_edb9315()) { - ep93xx_register_i2c(&edb93xx_i2c_gpio_data, - edb93xx_i2c_board_info, + ep93xx_register_i2c(edb93xx_i2c_board_info, ARRAY_SIZE(edb93xx_i2c_board_info)); } } diff --git a/arch/arm/mach-ep93xx/include/mach/platform.h b/arch/arm/mach-ep93xx/include/mach/platform.h index db0839691ef5..a686fd6caee1 100644 --- a/arch/arm/mach-ep93xx/include/mach/platform.h +++ b/arch/arm/mach-ep93xx/include/mach/platform.h @@ -7,7 +7,6 @@ #include struct device; -struct i2c_gpio_platform_data; struct i2c_board_info; struct spi_board_info; struct platform_device; @@ -36,8 +35,7 @@ void ep93xx_register_flash(unsigned int width, resource_size_t start, resource_size_t size); void ep93xx_register_eth(struct ep93xx_eth_data *data, int copy_addr); -void ep93xx_register_i2c(struct 
i2c_gpio_platform_data *data, - struct i2c_board_info *devices, int num); +void ep93xx_register_i2c(struct i2c_board_info *devices, int num); void ep93xx_register_spi(struct ep93xx_spi_info *info, struct spi_board_info *devices, int num); void ep93xx_register_fb(struct ep93xxfb_mach_info *data); diff --git a/arch/arm/mach-ep93xx/simone.c b/arch/arm/mach-ep93xx/simone.c index c7a40f245892..e61f3dee24c2 100644 --- a/arch/arm/mach-ep93xx/simone.c +++ b/arch/arm/mach-ep93xx/simone.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -129,15 +128,6 @@ static struct ep93xx_spi_info simone_spi_info __initdata = { .use_dma = 1, }; -static struct i2c_gpio_platform_data __initdata simone_i2c_gpio_data = { - .sda_pin = EP93XX_GPIO_LINE_EEDAT, - .sda_is_open_drain = 0, - .scl_pin = EP93XX_GPIO_LINE_EECLK, - .scl_is_open_drain = 0, - .udelay = 0, - .timeout = 0, -}; - static struct i2c_board_info __initdata simone_i2c_board_info[] = { { I2C_BOARD_INFO("ds1337", 0x68), @@ -161,7 +151,7 @@ static void __init simone_init_machine(void) ep93xx_register_flash(2, EP93XX_CS6_PHYS_BASE, SZ_8M); ep93xx_register_eth(&simone_eth_data, 1); ep93xx_register_fb(&simone_fb_info); - ep93xx_register_i2c(&simone_i2c_gpio_data, simone_i2c_board_info, + ep93xx_register_i2c(simone_i2c_board_info, ARRAY_SIZE(simone_i2c_board_info)); ep93xx_register_spi(&simone_spi_info, simone_spi_devices, ARRAY_SIZE(simone_spi_devices)); diff --git a/arch/arm/mach-ep93xx/snappercl15.c b/arch/arm/mach-ep93xx/snappercl15.c index 8b29398f4dc7..45940c1d7787 100644 --- a/arch/arm/mach-ep93xx/snappercl15.c +++ b/arch/arm/mach-ep93xx/snappercl15.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -127,15 +126,6 @@ static struct ep93xx_eth_data __initdata snappercl15_eth_data = { .phy_id = 1, }; -static struct i2c_gpio_platform_data __initdata snappercl15_i2c_gpio_data = { - .sda_pin = EP93XX_GPIO_LINE_EEDAT, - .sda_is_open_drain = 0, - .scl_pin = EP93XX_GPIO_LINE_EECLK, - .scl_is_open_drain = 0, - .udelay = 0, - .timeout = 0, -}; - static struct i2c_board_info __initdata snappercl15_i2c_data[] = { { /* Audio codec */ @@ -161,7 +151,7 @@ static void __init snappercl15_init_machine(void) { ep93xx_init_devices(); ep93xx_register_eth(&snappercl15_eth_data, 1); - ep93xx_register_i2c(&snappercl15_i2c_gpio_data, snappercl15_i2c_data, + ep93xx_register_i2c(snappercl15_i2c_data, ARRAY_SIZE(snappercl15_i2c_data)); ep93xx_register_fb(&snappercl15_fb_info); snappercl15_register_audio(); diff --git a/arch/arm/mach-ep93xx/vision_ep9307.c b/arch/arm/mach-ep93xx/vision_ep9307.c index 1daf9441058c..5a0b6187990a 100644 --- a/arch/arm/mach-ep93xx/vision_ep9307.c +++ b/arch/arm/mach-ep93xx/vision_ep9307.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -144,10 +143,6 @@ static struct pca953x_platform_data pca953x_77_gpio_data = { /************************************************************************* * I2C Bus *************************************************************************/ -static struct i2c_gpio_platform_data vision_i2c_gpio_data __initdata = { - .sda_pin = EP93XX_GPIO_LINE_EEDAT, - .scl_pin = EP93XX_GPIO_LINE_EECLK, -}; static struct i2c_board_info vision_i2c_info[] __initdata = { { @@ -289,7 +284,7 @@ static void __init vision_init_machine(void) vision_i2c_info[1].irq = gpio_to_irq(EP93XX_GPIO_LINE_F(7)); - ep93xx_register_i2c(&vision_i2c_gpio_data, vision_i2c_info, + ep93xx_register_i2c(vision_i2c_info, ARRAY_SIZE(vision_i2c_info)); 
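All the board tables in this series use GPIO_LOOKUP_IDX() with a NULL con_id, so only the index identifies a line; by the convention this patch establishes, index 0 is SDA and index 1 is SCL. A minimal sketch of such a table (the chip label "my-gpio" and the offsets are made up for illustration):

#include <linux/gpio/machine.h>

static struct gpiod_lookup_table my_i2c_gpiod_table = {
	.dev_id = "i2c-gpio",	/* matched against the consumer device */
	.table = {
		/* chip label, chip-local offset, con_id, index, flags */
		GPIO_LOOKUP_IDX("my-gpio", 11, NULL, 0, GPIO_ACTIVE_HIGH), /* SDA */
		GPIO_LOOKUP_IDX("my-gpio", 12, NULL, 1, GPIO_ACTIVE_HIGH), /* SCL */
		{ },	/* terminator */
	},
};

The i2c-gpio probe shown further down then picks these entries up with devm_gpiod_get_index(dev, NULL, 0, ...) and devm_gpiod_get_index(dev, NULL, 1, ...), matching purely on the index.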
ep93xx_register_spi(&vision_spi_master, vision_spi_board_info, ARRAY_SIZE(vision_spi_board_info)); diff --git a/arch/arm/mach-ixp4xx/avila-setup.c b/arch/arm/mach-ixp4xx/avila-setup.c index 6beec150c060..72122b5e7f28 100644 --- a/arch/arm/mach-ixp4xx/avila-setup.c +++ b/arch/arm/mach-ixp4xx/avila-setup.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -49,16 +49,21 @@ static struct platform_device avila_flash = { .resource = &avila_flash_resource, }; -static struct i2c_gpio_platform_data avila_i2c_gpio_data = { - .sda_pin = AVILA_SDA_PIN, - .scl_pin = AVILA_SCL_PIN, +static struct gpiod_lookup_table avila_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", AVILA_SDA_PIN, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", AVILA_SCL_PIN, + NULL, 1, GPIO_ACTIVE_HIGH), + }, }; static struct platform_device avila_i2c_gpio = { .name = "i2c-gpio", .id = 0, .dev = { - .platform_data = &avila_i2c_gpio_data, + .platform_data = NULL, }, }; @@ -147,6 +152,8 @@ static void __init avila_init(void) avila_flash_resource.end = IXP4XX_EXP_BUS_BASE(0) + ixp4xx_exp_bus_size - 1; + gpiod_add_lookup_table(&avila_i2c_gpiod_table); + platform_add_devices(avila_devices, ARRAY_SIZE(avila_devices)); avila_pata_resources[0].start = IXP4XX_EXP_BUS_BASE(1); diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c index b3bd0e137f6d..68ccd669051b 100644 --- a/arch/arm/mach-ixp4xx/dsmg600-setup.c +++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include @@ -68,16 +68,21 @@ static struct platform_device dsmg600_flash = { .resource = &dsmg600_flash_resource, }; -static struct i2c_gpio_platform_data dsmg600_i2c_gpio_data = { - .sda_pin = DSMG600_SDA_PIN, - .scl_pin = DSMG600_SCL_PIN, +static struct gpiod_lookup_table dsmg600_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", DSMG600_SDA_PIN, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", DSMG600_SCL_PIN, + NULL, 1, GPIO_ACTIVE_HIGH), + }, }; static struct platform_device dsmg600_i2c_gpio = { .name = "i2c-gpio", .id = 0, .dev = { - .platform_data = &dsmg600_i2c_gpio_data, + .platform_data = NULL, }, }; @@ -269,6 +274,7 @@ static void __init dsmg600_init(void) dsmg600_flash_resource.end = IXP4XX_EXP_BUS_BASE(0) + ixp4xx_exp_bus_size - 1; + gpiod_add_lookup_table(&dsmg600_i2c_gpiod_table); i2c_register_board_info(0, dsmg600_i2c_board_info, ARRAY_SIZE(dsmg600_i2c_board_info)); diff --git a/arch/arm/mach-ixp4xx/fsg-setup.c b/arch/arm/mach-ixp4xx/fsg-setup.c index 5c4b0c4a1b37..a0350ad15175 100644 --- a/arch/arm/mach-ixp4xx/fsg-setup.c +++ b/arch/arm/mach-ixp4xx/fsg-setup.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -54,16 +54,21 @@ static struct platform_device fsg_flash = { .resource = &fsg_flash_resource, }; -static struct i2c_gpio_platform_data fsg_i2c_gpio_data = { - .sda_pin = FSG_SDA_PIN, - .scl_pin = FSG_SCL_PIN, +static struct gpiod_lookup_table fsg_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", FSG_SDA_PIN, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", FSG_SCL_PIN, + NULL, 1, GPIO_ACTIVE_HIGH), + }, }; static struct platform_device fsg_i2c_gpio = { .name = "i2c-gpio", .id = 0, .dev = { - .platform_data = &fsg_i2c_gpio_data, + .platform_data = NULL, }, }; @@ -196,6 +201,7 @@ static void __init 
fsg_init(void) /* Configure CS2 for operation, 8bit and writable */ *IXP4XX_EXP_CS2 = 0xbfff0002; + gpiod_add_lookup_table(&fsg_i2c_gpiod_table); i2c_register_board_info(0, fsg_i2c_board_info, ARRAY_SIZE(fsg_i2c_board_info)); diff --git a/arch/arm/mach-ixp4xx/goramo_mlr.c b/arch/arm/mach-ixp4xx/goramo_mlr.c index 80bd9d6d04de..f1529aa3f8e2 100644 --- a/arch/arm/mach-ixp4xx/goramo_mlr.c +++ b/arch/arm/mach-ixp4xx/goramo_mlr.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -78,6 +77,12 @@ static u32 hw_bits = 0xFFFFFFFD; /* assume all hardware present */; static u8 control_value; +/* + * FIXME: this is reimplementing I2C bit-bangining. Move this + * over to using driver/i2c/busses/i2c-gpio.c like all other boards + * and register proper I2C device(s) on the bus for this. (See + * other IXP4xx boards for examples.) + */ static void set_scl(u8 value) { gpio_set_value(GPIO_SCL, !!value); @@ -216,20 +221,6 @@ static struct platform_device device_flash = { .resource = &flash_resource, }; - -/* I^2C interface */ -static struct i2c_gpio_platform_data i2c_data = { - .sda_pin = GPIO_SDA, - .scl_pin = GPIO_SCL, -}; - -static struct platform_device device_i2c = { - .name = "i2c-gpio", - .id = 0, - .dev = { .platform_data = &i2c_data }, -}; - - /* IXP425 2 UART ports */ static struct resource uart_resources[] = { { @@ -411,9 +402,6 @@ static void __init gmlr_init(void) if (hw_bits & CFG_HW_HAS_HSS1) device_tab[devices++] = &device_hss_tab[1]; /* max index 5 */ - if (hw_bits & CFG_HW_HAS_EEPROM) - device_tab[devices++] = &device_i2c; /* max index 6 */ - gpio_request(GPIO_SCL, "SCL/clock"); gpio_request(GPIO_SDA, "SDA/data"); gpio_request(GPIO_STR, "strobe"); diff --git a/arch/arm/mach-ixp4xx/ixdp425-setup.c b/arch/arm/mach-ixp4xx/ixdp425-setup.c index 93b89291c06b..8937263cec4a 100644 --- a/arch/arm/mach-ixp4xx/ixdp425-setup.c +++ b/arch/arm/mach-ixp4xx/ixdp425-setup.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -122,16 +122,21 @@ static struct platform_device ixdp425_flash_nand = { }; #endif /* CONFIG_MTD_NAND_PLATFORM */ -static struct i2c_gpio_platform_data ixdp425_i2c_gpio_data = { - .sda_pin = IXDP425_SDA_PIN, - .scl_pin = IXDP425_SCL_PIN, +static struct gpiod_lookup_table ixdp425_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", IXDP425_SDA_PIN, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", IXDP425_SCL_PIN, + NULL, 1, GPIO_ACTIVE_HIGH), + }, }; static struct platform_device ixdp425_i2c_gpio = { .name = "i2c-gpio", .id = 0, .dev = { - .platform_data = &ixdp425_i2c_gpio_data, + .platform_data = NULL, }, }; @@ -245,6 +250,7 @@ static void __init ixdp425_init(void) ixdp425_uart_data[1].flags = 0; } + gpiod_add_lookup_table(&ixdp425_i2c_gpiod_table); platform_add_devices(ixdp425_devices, ARRAY_SIZE(ixdp425_devices)); } diff --git a/arch/arm/mach-ixp4xx/nas100d-setup.c b/arch/arm/mach-ixp4xx/nas100d-setup.c index 4e0f762bc651..612ec8c63456 100644 --- a/arch/arm/mach-ixp4xx/nas100d-setup.c +++ b/arch/arm/mach-ixp4xx/nas100d-setup.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include @@ -100,16 +100,21 @@ static struct platform_device nas100d_leds = { .dev.platform_data = &nas100d_led_data, }; -static struct i2c_gpio_platform_data nas100d_i2c_gpio_data = { - .sda_pin = NAS100D_SDA_PIN, - .scl_pin = NAS100D_SCL_PIN, +static struct gpiod_lookup_table nas100d_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + 
GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", NAS100D_SDA_PIN, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", NAS100D_SCL_PIN, + NULL, 1, GPIO_ACTIVE_HIGH), + }, }; static struct platform_device nas100d_i2c_gpio = { .name = "i2c-gpio", .id = 0, .dev = { - .platform_data = &nas100d_i2c_gpio_data, + .platform_data = NULL, }, }; @@ -280,6 +285,7 @@ static void __init nas100d_init(void) nas100d_flash_resource.end = IXP4XX_EXP_BUS_BASE(0) + ixp4xx_exp_bus_size - 1; + gpiod_add_lookup_table(&nas100d_i2c_gpiod_table); i2c_register_board_info(0, nas100d_i2c_board_info, ARRAY_SIZE(nas100d_i2c_board_info)); diff --git a/arch/arm/mach-ixp4xx/nslu2-setup.c b/arch/arm/mach-ixp4xx/nslu2-setup.c index 88c025f52d8d..13afb03b50fa 100644 --- a/arch/arm/mach-ixp4xx/nslu2-setup.c +++ b/arch/arm/mach-ixp4xx/nslu2-setup.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -68,9 +68,14 @@ static struct platform_device nslu2_flash = { .resource = &nslu2_flash_resource, }; -static struct i2c_gpio_platform_data nslu2_i2c_gpio_data = { - .sda_pin = NSLU2_SDA_PIN, - .scl_pin = NSLU2_SCL_PIN, +static struct gpiod_lookup_table nslu2_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", NSLU2_SDA_PIN, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("IXP4XX_GPIO_CHIP", NSLU2_SCL_PIN, + NULL, 1, GPIO_ACTIVE_HIGH), + }, }; static struct i2c_board_info __initdata nslu2_i2c_board_info [] = { @@ -115,7 +120,7 @@ static struct platform_device nslu2_i2c_gpio = { .name = "i2c-gpio", .id = 0, .dev = { - .platform_data = &nslu2_i2c_gpio_data, + .platform_data = NULL, }, }; @@ -250,6 +255,7 @@ static void __init nslu2_init(void) nslu2_flash_resource.end = IXP4XX_EXP_BUS_BASE(0) + ixp4xx_exp_bus_size - 1; + gpiod_add_lookup_table(&nslu2_i2c_gpiod_table); i2c_register_board_info(0, nslu2_i2c_board_info, ARRAY_SIZE(nslu2_i2c_board_info)); diff --git a/arch/arm/mach-ks8695/board-acs5k.c b/arch/arm/mach-ks8695/board-acs5k.c index e4d709c8ed32..f034724e01e1 100644 --- a/arch/arm/mach-ks8695/board-acs5k.c +++ b/arch/arm/mach-ks8695/board-acs5k.c @@ -16,7 +16,7 @@ #include #include #include - +#include #include #include #include @@ -38,9 +38,15 @@ #include "generic.h" +static struct gpiod_lookup_table acs5k_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("KS8695", 4, NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("KS8695", 5, NULL, 1, GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data acs5k_i2c_device_platdata = { - .sda_pin = 4, - .scl_pin = 5, .udelay = 10, }; @@ -95,6 +101,7 @@ static struct i2c_board_info acs5k_i2c_devs[] __initdata = { static void acs5k_i2c_init(void) { /* The gpio interface */ + gpiod_add_lookup_table(&acs5k_i2c_gpiod_table); platform_device_register(&acs5k_i2c_device); /* I2C devices */ i2c_register_board_info(0, acs5k_i2c_devs, diff --git a/arch/arm/mach-pxa/palmz72.c b/arch/arm/mach-pxa/palmz72.c index 29630061e700..94f75632c007 100644 --- a/arch/arm/mach-pxa/palmz72.c +++ b/arch/arm/mach-pxa/palmz72.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -320,9 +321,15 @@ static struct soc_camera_link palmz72_iclink = { .flags = SOCAM_DATAWIDTH_8, }; +static struct gpiod_lookup_table palmz72_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("gpio-pxa", 118, NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-pxa", 117, NULL, 1, GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data palmz72_i2c_bus_data = { - .sda_pin = 
118, - .scl_pin = 117, .udelay = 10, .timeout = 100, }; @@ -369,6 +376,7 @@ static void __init palmz72_camera_init(void) { palmz72_cam_gpio_init(); pxa_set_camera_info(&palmz72_pxacamera_platform_data); + gpiod_add_lookup_table(&palmz72_i2c_gpiod_table); platform_device_register(&palmz72_i2c_bus_device); platform_device_register(&palmz72_camera); } diff --git a/arch/arm/mach-pxa/viper.c b/arch/arm/mach-pxa/viper.c index 8e89d91b206b..a680742bee2b 100644 --- a/arch/arm/mach-pxa/viper.c +++ b/arch/arm/mach-pxa/viper.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -458,9 +459,17 @@ static struct platform_device smc91x_device = { }; /* i2c */ +static struct gpiod_lookup_table viper_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("gpio-pxa", VIPER_RTC_I2C_SDA_GPIO, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-pxa", VIPER_RTC_I2C_SCL_GPIO, + NULL, 1, GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data i2c_bus_data = { - .sda_pin = VIPER_RTC_I2C_SDA_GPIO, - .scl_pin = VIPER_RTC_I2C_SCL_GPIO, .udelay = 10, .timeout = HZ, }; @@ -779,12 +788,20 @@ static int __init viper_tpm_setup(char *str) __setup("tpm=", viper_tpm_setup); +struct gpiod_lookup_table viper_tpm_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("gpio-pxa", VIPER_TPM_I2C_SDA_GPIO, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-pxa", VIPER_TPM_I2C_SCL_GPIO, + NULL, 1, GPIO_ACTIVE_HIGH), + }, +}; + static void __init viper_tpm_init(void) { struct platform_device *tpm_device; struct i2c_gpio_platform_data i2c_tpm_data = { - .sda_pin = VIPER_TPM_I2C_SDA_GPIO, - .scl_pin = VIPER_TPM_I2C_SCL_GPIO, .udelay = 10, .timeout = HZ, }; @@ -794,6 +811,7 @@ static void __init viper_tpm_init(void) if (!viper_tpm) return; + gpiod_add_lookup_table(&viper_tpm_i2c_gpiod_table); tpm_device = platform_device_alloc("i2c-gpio", 2); if (tpm_device) { if (!platform_device_add_data(tpm_device, @@ -943,6 +961,7 @@ static void __init viper_init(void) smc91x_device.num_resources--; pxa_set_i2c_info(NULL); + gpiod_add_lookup_table(&viper_i2c_gpiod_table); pwm_add_table(viper_pwm_lookup, ARRAY_SIZE(viper_pwm_lookup)); platform_add_devices(viper_devs, ARRAY_SIZE(viper_devs)); diff --git a/arch/arm/mach-sa1100/simpad.c b/arch/arm/mach-sa1100/simpad.c index bb3ca9c763de..c6e7e6d8733a 100644 --- a/arch/arm/mach-sa1100/simpad.c +++ b/arch/arm/mach-sa1100/simpad.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -323,9 +324,15 @@ static struct platform_device simpad_gpio_leds = { /* * i2c */ +static struct gpiod_lookup_table simpad_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("gpio", GPIO_GPIO21, NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio", GPIO_GPIO25, NULL, 1, GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data simpad_i2c_data = { - .sda_pin = GPIO_GPIO21, - .scl_pin = GPIO_GPIO25, .udelay = 10, .timeout = HZ, }; @@ -380,6 +387,7 @@ static int __init simpad_init(void) ARRAY_SIZE(simpad_flash_resources)); sa11x0_register_mcp(&simpad_mcp_data); + gpiod_add_lookup_table(&simpad_i2c_gpiod_table); ret = platform_add_devices(devices, ARRAY_SIZE(devices)); if(ret) printk(KERN_WARNING "simpad: Unable to register mq200 framebuffer device"); diff --git a/arch/blackfin/mach-bf533/boards/blackstamp.c b/arch/blackfin/mach-bf533/boards/blackstamp.c index 0ccf0cf4daaf..d801ca5ca6c4 100644 --- a/arch/blackfin/mach-bf533/boards/blackstamp.c +++ 
b/arch/blackfin/mach-bf533/boards/blackstamp.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -362,11 +363,17 @@ static struct platform_device bfin_device_gpiokeys = { #if IS_ENABLED(CONFIG_I2C_GPIO) #include +static struct gpiod_lookup_table bfin_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("BFIN-GPIO", GPIO_PF8, NULL, 0, + GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("BFIN-GPIO", GPIO_PF9, NULL, 1, + GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data i2c_gpio_data = { - .sda_pin = GPIO_PF8, - .scl_pin = GPIO_PF9, - .sda_is_open_drain = 0, - .scl_is_open_drain = 0, .udelay = 40, }; /* This hasn't actually been used these pins * are (currently) free pins on the expansion connector */ @@ -462,7 +469,9 @@ static int __init blackstamp_init(void) int ret; printk(KERN_INFO "%s(): registering device resources\n", __func__); - +#if IS_ENABLED(CONFIG_I2C_GPIO) + gpiod_add_lookup_table(&bfin_i2c_gpiod_table); +#endif i2c_register_board_info(0, bfin_i2c_board_info, ARRAY_SIZE(bfin_i2c_board_info)); diff --git a/arch/blackfin/mach-bf533/boards/ezkit.c b/arch/blackfin/mach-bf533/boards/ezkit.c index 3625e9eaa8a8..463a72358b0e 100644 --- a/arch/blackfin/mach-bf533/boards/ezkit.c +++ b/arch/blackfin/mach-bf533/boards/ezkit.c @@ -19,6 +19,7 @@ #endif #include #include +#include #include #include #include @@ -390,11 +391,17 @@ static struct platform_device bfin_device_gpiokeys = { #if IS_ENABLED(CONFIG_I2C_GPIO) #include +static struct gpiod_lookup_table bfin_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("BFIN-GPIO", GPIO_PF1, NULL, 0, + GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("BFIN-GPIO", GPIO_PF0, NULL, 1, + GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data i2c_gpio_data = { - .sda_pin = GPIO_PF1, - .scl_pin = GPIO_PF0, - .sda_is_open_drain = 0, - .scl_is_open_drain = 0, .udelay = 40, }; @@ -516,6 +523,9 @@ static struct platform_device *ezkit_devices[] __initdata = { static int __init ezkit_init(void) { printk(KERN_INFO "%s(): registering device resources\n", __func__); +#if IS_ENABLED(CONFIG_I2C_GPIO) + gpiod_add_lookup_table(&bfin_i2c_gpiod_table); +#endif platform_add_devices(ezkit_devices, ARRAY_SIZE(ezkit_devices)); spi_register_board_info(bfin_spi_board_info, ARRAY_SIZE(bfin_spi_board_info)); i2c_register_board_info(0, bfin_i2c_board_info, diff --git a/arch/blackfin/mach-bf533/boards/stamp.c b/arch/blackfin/mach-bf533/boards/stamp.c index 23eada79439c..d2479359adb7 100644 --- a/arch/blackfin/mach-bf533/boards/stamp.c +++ b/arch/blackfin/mach-bf533/boards/stamp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -512,11 +513,17 @@ static struct platform_device bfin_device_gpiokeys = { #if IS_ENABLED(CONFIG_I2C_GPIO) #include +static struct gpiod_lookup_table bfin_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("BFIN-GPIO", GPIO_PF2, NULL, 0, + GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("BFIN-GPIO", GPIO_PF3, NULL, 1, + GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data i2c_gpio_data = { - .sda_pin = GPIO_PF2, - .scl_pin = GPIO_PF3, - .sda_is_open_drain = 0, - .scl_is_open_drain = 0, .udelay = 10, }; @@ -848,6 +855,9 @@ static int __init stamp_init(void) printk(KERN_INFO "%s(): registering device resources\n", __func__); +#if IS_ENABLED(CONFIG_I2C_GPIO) + gpiod_add_lookup_table(&bfin_i2c_gpiod_table); +#endif i2c_register_board_info(0, bfin_i2c_board_info, ARRAY_SIZE(bfin_i2c_board_info)); diff --git 
a/arch/blackfin/mach-bf561/boards/ezkit.c b/arch/blackfin/mach-bf561/boards/ezkit.c index 57d1c43726d9..72f757ebaa84 100644 --- a/arch/blackfin/mach-bf561/boards/ezkit.c +++ b/arch/blackfin/mach-bf561/boards/ezkit.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -379,11 +380,17 @@ static struct platform_device bfin_device_gpiokeys = { #if IS_ENABLED(CONFIG_I2C_GPIO) #include +static struct gpiod_lookup_table bfin_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("BFIN-GPIO", GPIO_PF1, NULL, 0, + GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("BFIN-GPIO", GPIO_PF0, NULL, 1, + GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data i2c_gpio_data = { - .sda_pin = GPIO_PF1, - .scl_pin = GPIO_PF0, - .sda_is_open_drain = 0, - .scl_is_open_drain = 0, .udelay = 10, }; @@ -633,6 +640,9 @@ static int __init ezkit_init(void) printk(KERN_INFO "%s(): registering device resources\n", __func__); +#if IS_ENABLED(CONFIG_I2C_GPIO) + gpiod_add_lookup_table(&bfin_i2c_gpiod_table); +#endif ret = platform_add_devices(ezkit_devices, ARRAY_SIZE(ezkit_devices)); if (ret < 0) return ret; diff --git a/arch/mips/alchemy/board-gpr.c b/arch/mips/alchemy/board-gpr.c index 6fb6b3faa158..daebc36e5ecb 100644 --- a/arch/mips/alchemy/board-gpr.c +++ b/arch/mips/alchemy/board-gpr.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -218,10 +219,23 @@ static struct platform_device gpr_led_devices = { /* * I2C */ +static struct gpiod_lookup_table gpr_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + /* + * This should be on "GPIO2" which has base at 200 so + * the global numbers 209 and 210 should correspond to + * local offsets 9 and 10. + */ + GPIO_LOOKUP_IDX("alchemy-gpio2", 9, NULL, 0, + GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("alchemy-gpio2", 10, NULL, 1, + GPIO_ACTIVE_HIGH), + }, +}; + static struct i2c_gpio_platform_data gpr_i2c_data = { - .sda_pin = 209, .sda_is_open_drain = 1, - .scl_pin = 210, .scl_is_open_drain = 1, .udelay = 2, /* ~100 kHz */ .timeout = HZ, @@ -295,6 +309,7 @@ arch_initcall(gpr_pci_init); static int __init gpr_dev_init(void) { + gpiod_add_lookup_table(&gpr_i2c_gpiod_table); i2c_register_board_info(0, gpr_i2c_info, ARRAY_SIZE(gpr_i2c_info)); return platform_add_devices(gpr_devices, ARRAY_SIZE(gpr_devices)); diff --git a/arch/mips/ath79/mach-pb44.c b/arch/mips/ath79/mach-pb44.c index be78298dffb4..a95409063847 100644 --- a/arch/mips/ath79/mach-pb44.c +++ b/arch/mips/ath79/mach-pb44.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include "machtypes.h" @@ -33,16 +33,21 @@ #define PB44_KEYS_POLL_INTERVAL 20 /* msecs */ #define PB44_KEYS_DEBOUNCE_INTERVAL (3 * PB44_KEYS_POLL_INTERVAL) -static struct i2c_gpio_platform_data pb44_i2c_gpio_data = { - .sda_pin = PB44_GPIO_I2C_SDA, - .scl_pin = PB44_GPIO_I2C_SCL, +static struct gpiod_lookup_table pb44_i2c_gpiod_table = { + .dev_id = "i2c-gpio", + .table = { + GPIO_LOOKUP_IDX("ath79-gpio", PB44_GPIO_I2C_SDA, + NULL, 0, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("ath79-gpio", PB44_GPIO_I2C_SCL, + NULL, 1, GPIO_ACTIVE_HIGH), + }, }; static struct platform_device pb44_i2c_gpio_device = { .name = "i2c-gpio", .id = 0, .dev = { - .platform_data = &pb44_i2c_gpio_data, + .platform_data = NULL, } }; @@ -103,6 +108,7 @@ static struct ath79_spi_platform_data pb44_spi_data = { static void __init pb44_init(void) { + gpiod_add_lookup_table(&pb44_i2c_gpiod_table); i2c_register_board_info(0, pb44_i2c_board_info, ARRAY_SIZE(pb44_i2c_board_info)); 
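One ordering detail shared by all of these board conversions: gpiod_add_lookup_table() must run before the consumer device is registered, otherwise the probe cannot resolve its lines and will fail or defer. A minimal sketch of the canonical init order, reusing the pb44 names from just above (the wrapper name is illustrative):

static void __init board_i2c_init(void)
{
	/* 1. Publish the GPIO mappings first... */
	gpiod_add_lookup_table(&pb44_i2c_gpiod_table);
	/* 2. ...then register the device, whose probe can now find them. */
	platform_device_register(&pb44_i2c_gpio_device);
}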
platform_device_register(&pb44_i2c_gpio_device); diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c index 0ef8fcc6ac3a..b4664037eded 100644 --- a/drivers/i2c/busses/i2c-gpio.c +++ b/drivers/i2c/busses/i2c-gpio.c @@ -14,11 +14,12 @@ #include #include #include -#include +#include #include -#include struct i2c_gpio_private_data { + struct gpio_desc *sda; + struct gpio_desc *scl; struct i2c_adapter adap; struct i2c_algo_bit_data bit_data; struct i2c_gpio_platform_data pdata; @@ -27,12 +28,18 @@ struct i2c_gpio_private_data { /* Toggle SDA by changing the direction of the pin */ static void i2c_gpio_setsda_dir(void *data, int state) { - struct i2c_gpio_platform_data *pdata = data; - + struct i2c_gpio_private_data *priv = data; + + /* + * This is a way of saying "do not drive + * me actively high" which means emulating open drain. + * The right way to do this is for gpiolib to + * handle this, by the function below. + */ if (state) - gpio_direction_input(pdata->sda_pin); + gpiod_direction_input(priv->sda); else - gpio_direction_output(pdata->sda_pin, 0); + gpiod_direction_output(priv->sda, 0); } /* @@ -42,20 +49,20 @@ static void i2c_gpio_setsda_dir(void *data, int state) */ static void i2c_gpio_setsda_val(void *data, int state) { - struct i2c_gpio_platform_data *pdata = data; + struct i2c_gpio_private_data *priv = data; - gpio_set_value(pdata->sda_pin, state); + gpiod_set_value(priv->sda, state); } /* Toggle SCL by changing the direction of the pin. */ static void i2c_gpio_setscl_dir(void *data, int state) { - struct i2c_gpio_platform_data *pdata = data; + struct i2c_gpio_private_data *priv = data; if (state) - gpio_direction_input(pdata->scl_pin); + gpiod_direction_input(priv->scl); else - gpio_direction_output(pdata->scl_pin, 0); + gpiod_direction_output(priv->scl, 0); } /* @@ -66,44 +73,23 @@ static void i2c_gpio_setscl_dir(void *data, int state) */ static void i2c_gpio_setscl_val(void *data, int state) { - struct i2c_gpio_platform_data *pdata = data; + struct i2c_gpio_private_data *priv = data; - gpio_set_value(pdata->scl_pin, state); + gpiod_set_value(priv->scl, state); } static int i2c_gpio_getsda(void *data) { - struct i2c_gpio_platform_data *pdata = data; + struct i2c_gpio_private_data *priv = data; - return gpio_get_value(pdata->sda_pin); + return gpiod_get_value(priv->sda); } static int i2c_gpio_getscl(void *data) { - struct i2c_gpio_platform_data *pdata = data; + struct i2c_gpio_private_data *priv = data; - return gpio_get_value(pdata->scl_pin); -} - -static int of_i2c_gpio_get_pins(struct device_node *np, - unsigned int *sda_pin, unsigned int *scl_pin) -{ - if (of_gpio_count(np) < 2) - return -ENODEV; - - *sda_pin = of_get_gpio(np, 0); - *scl_pin = of_get_gpio(np, 1); - - if (*sda_pin == -EPROBE_DEFER || *scl_pin == -EPROBE_DEFER) - return -EPROBE_DEFER; - - if (!gpio_is_valid(*sda_pin) || !gpio_is_valid(*scl_pin)) { - pr_err("%pOF: invalid GPIO pins, sda=%d/scl=%d\n", - np, *sda_pin, *scl_pin); - return -ENODEV; - } - - return 0; + return gpiod_get_value(priv->scl); } static void of_i2c_gpio_get_props(struct device_node *np, @@ -130,64 +116,65 @@ static int i2c_gpio_probe(struct platform_device *pdev) struct i2c_gpio_platform_data *pdata; struct i2c_algo_bit_data *bit_data; struct i2c_adapter *adap; - unsigned int sda_pin, scl_pin; int ret; - /* First get the GPIO pins; if it fails, we'll defer the probe. 
*/ - if (pdev->dev.of_node) { - ret = of_i2c_gpio_get_pins(pdev->dev.of_node, - &sda_pin, &scl_pin); - if (ret) - return ret; - } else { - if (!dev_get_platdata(&pdev->dev)) - return -ENXIO; - pdata = dev_get_platdata(&pdev->dev); - sda_pin = pdata->sda_pin; - scl_pin = pdata->scl_pin; - } + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; - ret = devm_gpio_request(&pdev->dev, sda_pin, "sda"); - if (ret) { + /* First get the GPIO pins; if it fails, we'll defer the probe. */ + priv->sda = devm_gpiod_get_index(&pdev->dev, NULL, 0, GPIOD_OUT_HIGH); + if (IS_ERR(priv->sda)) { + ret = PTR_ERR(priv->sda); + /* FIXME: hack in the old code, is this really necessary? */ if (ret == -EINVAL) - ret = -EPROBE_DEFER; /* Try again later */ + ret = -EPROBE_DEFER; return ret; } - ret = devm_gpio_request(&pdev->dev, scl_pin, "scl"); - if (ret) { + priv->scl = devm_gpiod_get_index(&pdev->dev, NULL, 1, GPIOD_OUT_LOW); + if (IS_ERR(priv->scl)) { + ret = PTR_ERR(priv->scl); + /* FIXME: hack in the old code, is this really necessary? */ if (ret == -EINVAL) - ret = -EPROBE_DEFER; /* Try again later */ + ret = -EPROBE_DEFER; return ret; } - priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; adap = &priv->adap; bit_data = &priv->bit_data; pdata = &priv->pdata; if (pdev->dev.of_node) { - pdata->sda_pin = sda_pin; - pdata->scl_pin = scl_pin; of_i2c_gpio_get_props(pdev->dev.of_node, pdata); } else { - memcpy(pdata, dev_get_platdata(&pdev->dev), sizeof(*pdata)); + /* + * If all platform data settings are zero it is OK + * to not provide any platform data from the board. + */ + if (dev_get_platdata(&pdev->dev)) + memcpy(pdata, dev_get_platdata(&pdev->dev), + sizeof(*pdata)); } + /* + * FIXME: this is a hack emulating the open drain emulation + * that gpiolib can already do for us. Make all clients properly + * flag their lines as open drain and get rid of this property + * and the special callback. + */ if (pdata->sda_is_open_drain) { - gpio_direction_output(pdata->sda_pin, 1); + gpiod_direction_output(priv->sda, 1); bit_data->setsda = i2c_gpio_setsda_val; } else { - gpio_direction_input(pdata->sda_pin); + gpiod_direction_input(priv->sda); bit_data->setsda = i2c_gpio_setsda_dir; } if (pdata->scl_is_open_drain || pdata->scl_is_output_only) { - gpio_direction_output(pdata->scl_pin, 1); + gpiod_direction_output(priv->scl, 1); bit_data->setscl = i2c_gpio_setscl_val; } else { - gpio_direction_input(pdata->scl_pin); + gpiod_direction_input(priv->scl); bit_data->setscl = i2c_gpio_setscl_dir; } @@ -207,7 +194,7 @@ static int i2c_gpio_probe(struct platform_device *pdev) else bit_data->timeout = HZ / 10; /* 100 ms */ - bit_data->data = pdata; + bit_data->data = priv; adap->owner = THIS_MODULE; if (pdev->dev.of_node) @@ -227,8 +214,13 @@ static int i2c_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, priv); - dev_info(&pdev->dev, "using pins %u (SDA) and %u (SCL%s)\n", - pdata->sda_pin, pdata->scl_pin, + /* + * FIXME: using global GPIO numbers is not helpful. If/when we + * get accessors to get the actual name of the GPIO line, + * from the descriptor, then provide that instead. + */ + dev_info(&pdev->dev, "using lines %u (SDA) and %u (SCL%s)\n", + desc_to_gpio(priv->sda), desc_to_gpio(priv->scl), pdata->scl_is_output_only ? 
", no clock stretching" : ""); diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index 40534352e574..4d40d013a412 100644 --- a/drivers/mfd/sm501.c +++ b/drivers/mfd/sm501.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -1107,14 +1108,6 @@ static void sm501_gpio_remove(struct sm501_devdata *sm) kfree(gpio->regs_res); } -static inline int sm501_gpio_pin2nr(struct sm501_devdata *sm, unsigned int pin) -{ - struct sm501_gpio *gpio = &sm->gpio; - int base = (pin < 32) ? gpio->low.gpio.base : gpio->high.gpio.base; - - return (pin % 32) + base; -} - static inline int sm501_gpio_isregistered(struct sm501_devdata *sm) { return sm->gpio.registered; @@ -1129,11 +1122,6 @@ static inline void sm501_gpio_remove(struct sm501_devdata *sm) { } -static inline int sm501_gpio_pin2nr(struct sm501_devdata *sm, unsigned int pin) -{ - return -1; -} - static inline int sm501_gpio_isregistered(struct sm501_devdata *sm) { return 0; @@ -1145,20 +1133,37 @@ static int sm501_register_gpio_i2c_instance(struct sm501_devdata *sm, { struct i2c_gpio_platform_data *icd; struct platform_device *pdev; + struct gpiod_lookup_table *lookup; pdev = sm501_create_subdev(sm, "i2c-gpio", 0, sizeof(struct i2c_gpio_platform_data)); if (!pdev) return -ENOMEM; - icd = dev_get_platdata(&pdev->dev); - - /* We keep the pin_sda and pin_scl fields relative in case the - * same platform data is passed to >1 SM501. - */ + /* Create a gpiod lookup using gpiochip-local offsets */ + lookup = devm_kzalloc(&pdev->dev, + sizeof(*lookup) + 3 * sizeof(struct gpiod_lookup), + GFP_KERNEL); + lookup->dev_id = "i2c-gpio"; + if (iic->pin_sda < 32) + lookup->table[0].chip_label = "SM501-LOW"; + else + lookup->table[0].chip_label = "SM501-HIGH"; + lookup->table[0].chip_hwnum = iic->pin_sda % 32; + lookup->table[0].con_id = NULL; + lookup->table[0].idx = 0; + lookup->table[0].flags = GPIO_ACTIVE_HIGH; + if (iic->pin_scl < 32) + lookup->table[1].chip_label = "SM501-LOW"; + else + lookup->table[1].chip_label = "SM501-HIGH"; + lookup->table[1].chip_hwnum = iic->pin_scl % 32; + lookup->table[1].con_id = NULL; + lookup->table[1].idx = 1; + lookup->table[1].flags = GPIO_ACTIVE_HIGH; + gpiod_add_lookup_table(lookup); - icd->sda_pin = sm501_gpio_pin2nr(sm, iic->pin_sda); - icd->scl_pin = sm501_gpio_pin2nr(sm, iic->pin_scl); + icd = dev_get_platdata(&pdev->dev); icd->timeout = iic->timeout; icd->udelay = iic->udelay; @@ -1170,9 +1175,9 @@ static int sm501_register_gpio_i2c_instance(struct sm501_devdata *sm, pdev->id = iic->bus_num; - dev_info(sm->dev, "registering i2c-%d: sda=%d (%d), scl=%d (%d)\n", + dev_info(sm->dev, "registering i2c-%d: sda=%d, scl=%d\n", iic->bus_num, - icd->sda_pin, iic->pin_sda, icd->scl_pin, iic->pin_scl); + iic->pin_sda, iic->pin_scl); return sm501_register_device(sm, pdev); } diff --git a/include/linux/i2c-gpio.h b/include/linux/i2c-gpio.h index c1bcb1f1d73b..352c1426fd4d 100644 --- a/include/linux/i2c-gpio.h +++ b/include/linux/i2c-gpio.h @@ -12,8 +12,6 @@ /** * struct i2c_gpio_platform_data - Platform-dependent data for i2c-gpio - * @sda_pin: GPIO pin ID to use for SDA - * @scl_pin: GPIO pin ID to use for SCL * @udelay: signal toggle delay. SCL frequency is (500 / udelay) kHz * @timeout: clock stretching timeout in jiffies. If the slave keeps * SCL low for longer than this, the transfer will time out. @@ -26,8 +24,6 @@ * @scl_is_output_only: SCL output drivers cannot be turned off. 
*/ struct i2c_gpio_platform_data { - unsigned int sda_pin; - unsigned int scl_pin; int udelay; int timeout; unsigned int sda_is_open_drain:1; -- cgit v1.2.3 From f926dfc112bc6cf41d7068ee5e3f261e13a5bec8 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 10 Sep 2017 19:26:22 +0200 Subject: gpio: Make it possible for consumers to enforce open drain Some busses, like I2C, strictly need to have the line handled as open drain, i.e. not actively driven high. For this reason the i2c-gpio.c bit-banged I2C driver is reimplementing open drain handling outside of gpiolib. This is not very optimal. Instead make it possible for a consumer to explicitly express that the line must be handled as open drain instead of allowing local hacks papering over this issue. The descriptor tables, whether DT, ACPI or board files, should of course have flagged these lines as open drain. E.g.: enum gpio_lookup_flags GPIO_OPEN_DRAIN for a board file, or gpios = <&foo 42 GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN>; in a device tree using But more often than not, these descriptors are wrong. So we need to make it possible for consumers to enforce this open drain behaviour. We now have two new enumerated GPIO descriptor config flags: GPIOD_OUT_LOW_OPEN_DRAIN and GPIOD_OUT_HIGH_OPEN_DRAIN that will set up the line, enforced as open drain, as output low or high, using open drain (if the driver supports it) or using open drain emulation (setting the line as input to drive it high) from the gpiolib core. Cc: linux-gpio@vger.kernel.org Tested-by: Geert Uytterhoeven Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 13 +++++++++++++ include/linux/gpio/consumer.h | 6 ++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index eb80dac4e26a..c952ef1b2f46 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3264,8 +3264,21 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, if (lflags & GPIO_ACTIVE_LOW) set_bit(FLAG_ACTIVE_LOW, &desc->flags); + if (lflags & GPIO_OPEN_DRAIN) set_bit(FLAG_OPEN_DRAIN, &desc->flags); + else if (dflags & GPIOD_FLAGS_BIT_OPEN_DRAIN) { + /* + * This enforces open drain mode from the consumer side. + * This is necessary for some busses like I2C, but the lookup + * should *REALLY* have specified them as open drain in the + * first place, so print a little warning here.
+ */ + set_bit(FLAG_OPEN_DRAIN, &desc->flags); + gpiod_warn(desc, + "enforced open drain please flag it properly in DT/ACPI DSDT/board file\n"); + } + if (lflags & GPIO_OPEN_SOURCE) set_bit(FLAG_OPEN_SOURCE, &desc->flags); if (lflags & GPIO_SLEEP_MAY_LOOSE_VALUE) diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 8f702fcbe485..5f72a49d1aa3 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -28,6 +28,7 @@ struct gpio_descs { #define GPIOD_FLAGS_BIT_DIR_SET BIT(0) #define GPIOD_FLAGS_BIT_DIR_OUT BIT(1) #define GPIOD_FLAGS_BIT_DIR_VAL BIT(2) +#define GPIOD_FLAGS_BIT_OPEN_DRAIN BIT(3) /** * Optional flags that can be passed to one of gpiod_* to configure direction @@ -39,6 +40,11 @@ enum gpiod_flags { GPIOD_OUT_LOW = GPIOD_FLAGS_BIT_DIR_SET | GPIOD_FLAGS_BIT_DIR_OUT, GPIOD_OUT_HIGH = GPIOD_FLAGS_BIT_DIR_SET | GPIOD_FLAGS_BIT_DIR_OUT | GPIOD_FLAGS_BIT_DIR_VAL, + GPIOD_OUT_LOW_OPEN_DRAIN = GPIOD_FLAGS_BIT_DIR_SET | + GPIOD_FLAGS_BIT_DIR_OUT | GPIOD_FLAGS_BIT_OPEN_DRAIN, + GPIOD_OUT_HIGH_OPEN_DRAIN = GPIOD_FLAGS_BIT_DIR_SET | + GPIOD_FLAGS_BIT_DIR_OUT | GPIOD_FLAGS_BIT_DIR_VAL | + GPIOD_FLAGS_BIT_OPEN_DRAIN, }; #ifdef CONFIG_GPIOLIB -- cgit v1.2.3 From 79ea73b05aaa83b7451a3aee48a5e835fb0f3864 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 15 Sep 2017 20:13:28 +0200 Subject: mmc: sdhci-pci: remove outdated declaration The function was removed half a year ago, so this declaration can go, too. Fixes: 51ced59cc02e0d ("mmc: sdhci-pci: Use ACPI DSM to get driver strength for some Intel devices") Signed-off-by: Wolfram Sang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/sdhci-pci-data.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/sdhci-pci-data.h b/include/linux/mmc/sdhci-pci-data.h index fda15b6d4135..618f90d6e1ba 100644 --- a/include/linux/mmc/sdhci-pci-data.h +++ b/include/linux/mmc/sdhci-pci-data.h @@ -14,7 +14,4 @@ struct sdhci_pci_data { extern struct sdhci_pci_data *(*sdhci_pci_get_data)(struct pci_dev *pdev, int slotno); - -extern int sdhci_pci_spt_drive_strength; - #endif -- cgit v1.2.3 From 6c0cedd1ef9527ef13e66875746570e76a3188a7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:51 +0300 Subject: mmc: core: Introduce host claiming by context Currently the host can be claimed by a task. Change this so that the host can be claimed by a context that may or may not be a task. This provides for the host to be claimed by a block driver queue to support blk-mq, while maintaining compatibility with the existing use of mmc_claim_host(). 
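For example, a blk-mq queue could then hold the claim under its own context rather than under whichever task happens to be running (a minimal sketch against the API introduced below; the mmc_queue_ctx wrapper and function are illustrative, not part of this patch):

    struct mmc_queue_ctx {
            struct mmc_ctx ctx;             /* identifies the claiming context */
            struct mmc_card *card;
    };

    static void mmc_queue_issue(struct mmc_queue_ctx *mq)
    {
            /* Claim for the queue context, not for current */
            mmc_get_card(mq->card, &mq->ctx);

            /* ... issue one or more requests under the claim ... */

            mmc_put_card(mq->card, &mq->ctx);
    }

Any task entering with the same &mq->ctx nests the claim, which is what a blk-mq queue with multiple submission paths needs; passing NULL preserves the old per-task behaviour via the default context.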
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 4 ++-- drivers/mmc/core/core.c | 48 ++++++++++++++++++++++++++++++++++++++------- drivers/mmc/core/core.h | 9 +++++---- drivers/mmc/core/mmc.c | 4 ++-- drivers/mmc/core/sd.c | 4 ++-- drivers/mmc/core/sdio_irq.c | 3 ++- include/linux/mmc/host.h | 7 ++++++- 7 files changed, 60 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index f11537a66a60..9476312f081a 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1989,7 +1989,7 @@ void mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) if (req && !mq->qcnt) /* claim host only for the first request */ - mmc_get_card(card); + mmc_get_card(card, NULL); ret = mmc_blk_part_switch(card, md->part_type); if (ret) { @@ -2052,7 +2052,7 @@ void mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) out: if (!mq->qcnt) - mmc_put_card(card); + mmc_put_card(card, NULL); } static inline int mmc_blk_readonly(struct mmc_card *card) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 66c9cf49ad2f..b997cf92ce6c 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -832,9 +832,36 @@ unsigned int mmc_align_data_size(struct mmc_card *card, unsigned int sz) } EXPORT_SYMBOL(mmc_align_data_size); +/* + * Allow claiming an already claimed host if the context is the same or there is + * no context but the task is the same. + */ +static inline bool mmc_ctx_matches(struct mmc_host *host, struct mmc_ctx *ctx, + struct task_struct *task) +{ + return host->claimer == ctx || + (!ctx && task && host->claimer->task == task); +} + +static inline void mmc_ctx_set_claimer(struct mmc_host *host, + struct mmc_ctx *ctx, + struct task_struct *task) +{ + if (!host->claimer) { + if (ctx) + host->claimer = ctx; + else + host->claimer = &host->default_ctx; + } + if (task) + host->claimer->task = task; +} + /** * __mmc_claim_host - exclusively claim a host * @host: mmc host to claim + * @ctx: context that claims the host or NULL in which case the default + * context will be used * @abort: whether or not the operation should be aborted * * Claim a host for a set of operations. If @abort is non null and @@ -842,8 +869,10 @@ EXPORT_SYMBOL(mmc_align_data_size); * that non-zero value without acquiring the lock. Returns zero * with the lock held otherwise. */ -int __mmc_claim_host(struct mmc_host *host, atomic_t *abort) +int __mmc_claim_host(struct mmc_host *host, struct mmc_ctx *ctx, + atomic_t *abort) { + struct task_struct *task = ctx ? NULL : current; DECLARE_WAITQUEUE(wait, current); unsigned long flags; int stop; @@ -856,7 +885,7 @@ int __mmc_claim_host(struct mmc_host *host, atomic_t *abort) while (1) { set_current_state(TASK_UNINTERRUPTIBLE); stop = abort ? 
atomic_read(abort) : 0; - if (stop || !host->claimed || host->claimer == current) + if (stop || !host->claimed || mmc_ctx_matches(host, ctx, task)) break; spin_unlock_irqrestore(&host->lock, flags); schedule(); @@ -865,7 +894,7 @@ int __mmc_claim_host(struct mmc_host *host, atomic_t *abort) set_current_state(TASK_RUNNING); if (!stop) { host->claimed = 1; - host->claimer = current; + mmc_ctx_set_claimer(host, ctx, task); host->claim_cnt += 1; if (host->claim_cnt == 1) pm = true; @@ -900,6 +929,7 @@ void mmc_release_host(struct mmc_host *host) spin_unlock_irqrestore(&host->lock, flags); } else { host->claimed = 0; + host->claimer->task = NULL; host->claimer = NULL; spin_unlock_irqrestore(&host->lock, flags); wake_up(&host->wq); @@ -913,10 +943,10 @@ EXPORT_SYMBOL(mmc_release_host); * This is a helper function, which fetches a runtime pm reference for the * card device and also claims the host. */ -void mmc_get_card(struct mmc_card *card) +void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx) { pm_runtime_get_sync(&card->dev); - mmc_claim_host(card->host); + __mmc_claim_host(card->host, ctx, NULL); } EXPORT_SYMBOL(mmc_get_card); @@ -924,9 +954,13 @@ EXPORT_SYMBOL(mmc_get_card); * This is a helper function, which releases the host and drops the runtime * pm reference for the card device. */ -void mmc_put_card(struct mmc_card *card) +void mmc_put_card(struct mmc_card *card, struct mmc_ctx *ctx) { - mmc_release_host(card->host); + struct mmc_host *host = card->host; + + WARN_ON(ctx && host->claimer != ctx); + + mmc_release_host(host); pm_runtime_mark_last_busy(&card->dev); pm_runtime_put_autosuspend(&card->dev); } diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index ca861091a776..94675f88f704 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -128,10 +128,11 @@ int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen); int mmc_set_blockcount(struct mmc_card *card, unsigned int blockcount, bool is_rel_write); -int __mmc_claim_host(struct mmc_host *host, atomic_t *abort); +int __mmc_claim_host(struct mmc_host *host, struct mmc_ctx *ctx, + atomic_t *abort); void mmc_release_host(struct mmc_host *host); -void mmc_get_card(struct mmc_card *card); -void mmc_put_card(struct mmc_card *card); +void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx); +void mmc_put_card(struct mmc_card *card, struct mmc_ctx *ctx); /** * mmc_claim_host - exclusively claim a host @@ -141,7 +142,7 @@ void mmc_put_card(struct mmc_card *card); */ static inline void mmc_claim_host(struct mmc_host *host) { - __mmc_claim_host(host, NULL); + __mmc_claim_host(host, NULL, NULL); } #endif diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 36217ad5e9b1..8d77ddc85af1 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1911,14 +1911,14 @@ static void mmc_detect(struct mmc_host *host) { int err; - mmc_get_card(host->card); + mmc_get_card(host->card, NULL); /* * Just check if our card has been removed. */ err = _mmc_detect_card_removed(host); - mmc_put_card(host->card); + mmc_put_card(host->card, NULL); if (err) { mmc_remove(host); diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 4fd1620b732d..2036b2b4835c 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -1056,14 +1056,14 @@ static void mmc_sd_detect(struct mmc_host *host) { int err; - mmc_get_card(host->card); + mmc_get_card(host->card, NULL); /* * Just check if our card has been removed. 
*/ err = _mmc_detect_card_removed(host); - mmc_put_card(host->card); + mmc_put_card(host->card, NULL); if (err) { mmc_sd_remove(host); diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c index c771843e4c15..7a2eaf8410a3 100644 --- a/drivers/mmc/core/sdio_irq.c +++ b/drivers/mmc/core/sdio_irq.c @@ -155,7 +155,8 @@ static int sdio_irq_thread(void *_host) * holding of the host lock does not cover too much work * that doesn't require that lock to be held. */ - ret = __mmc_claim_host(host, &host->sdio_irq_thread_abort); + ret = __mmc_claim_host(host, NULL, + &host->sdio_irq_thread_abort); if (ret) break; ret = process_sdio_pending_irqs(host); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 9a43763a68ad..443f7a8cdfe5 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -255,6 +255,10 @@ struct mmc_supply { struct regulator *vqmmc; /* Optional Vccq supply */ }; +struct mmc_ctx { + struct task_struct *task; +}; + struct mmc_host { struct device *parent; struct device class_dev; @@ -388,8 +392,9 @@ struct mmc_host { struct mmc_card *card; /* device attached to this host */ wait_queue_head_t wq; - struct task_struct *claimer; /* task that has host claimed */ + struct mmc_ctx *claimer; /* context that has host claimed */ int claim_cnt; /* "claim" nesting count */ + struct mmc_ctx default_ctx; /* default context */ struct delayed_work detect; int detect_change; /* card detect flag */ -- cgit v1.2.3 From 72a5af554df837e373efb0d6c8fc68c568f9a7ac Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Sep 2017 15:36:52 +0300 Subject: mmc: core: Add support for handling CQE requests Add core support for handling CQE requests, including starting, completing and recovering. Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 163 +++++++++++++++++++++++++++++++++++++++++++++-- drivers/mmc/core/core.h | 4 ++ include/linux/mmc/host.h | 2 + 3 files changed, 164 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b997cf92ce6c..2ff614d4ffac 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -266,7 +266,8 @@ static void __mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) host->ops->request(host, mrq); } -static void mmc_mrq_pr_debug(struct mmc_host *host, struct mmc_request *mrq) +static void mmc_mrq_pr_debug(struct mmc_host *host, struct mmc_request *mrq, + bool cqe) { if (mrq->sbc) { pr_debug("<%s: starting CMD%u arg %08x flags %08x>\n", @@ -275,9 +276,12 @@ static void mmc_mrq_pr_debug(struct mmc_host *host, struct mmc_request *mrq) } if (mrq->cmd) { - pr_debug("%s: starting CMD%u arg %08x flags %08x\n", - mmc_hostname(host), mrq->cmd->opcode, mrq->cmd->arg, - mrq->cmd->flags); + pr_debug("%s: starting %sCMD%u arg %08x flags %08x\n", + mmc_hostname(host), cqe ? 
"CQE direct " : "", + mrq->cmd->opcode, mrq->cmd->arg, mrq->cmd->flags); + } else if (cqe) { + pr_debug("%s: starting CQE transfer for tag %d blkaddr %u\n", + mmc_hostname(host), mrq->tag, mrq->data->blk_addr); } if (mrq->data) { @@ -342,7 +346,7 @@ static int mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) if (mmc_card_removed(host->card)) return -ENOMEDIUM; - mmc_mrq_pr_debug(host, mrq); + mmc_mrq_pr_debug(host, mrq, false); WARN_ON(!host->claimed); @@ -482,6 +486,155 @@ void mmc_wait_for_req_done(struct mmc_host *host, struct mmc_request *mrq) } EXPORT_SYMBOL(mmc_wait_for_req_done); +/* + * mmc_cqe_start_req - Start a CQE request. + * @host: MMC host to start the request + * @mrq: request to start + * + * Start the request, re-tuning if needed and it is possible. Returns an error + * code if the request fails to start or -EBUSY if CQE is busy. + */ +int mmc_cqe_start_req(struct mmc_host *host, struct mmc_request *mrq) +{ + int err; + + /* + * CQE cannot process re-tuning commands. Caller must hold retuning + * while CQE is in use. Re-tuning can happen here only when CQE has no + * active requests i.e. this is the first. Note, re-tuning will call + * ->cqe_off(). + */ + err = mmc_retune(host); + if (err) + goto out_err; + + mrq->host = host; + + mmc_mrq_pr_debug(host, mrq, true); + + err = mmc_mrq_prep(host, mrq); + if (err) + goto out_err; + + err = host->cqe_ops->cqe_request(host, mrq); + if (err) + goto out_err; + + trace_mmc_request_start(host, mrq); + + return 0; + +out_err: + if (mrq->cmd) { + pr_debug("%s: failed to start CQE direct CMD%u, error %d\n", + mmc_hostname(host), mrq->cmd->opcode, err); + } else { + pr_debug("%s: failed to start CQE transfer for tag %d, error %d\n", + mmc_hostname(host), mrq->tag, err); + } + return err; +} +EXPORT_SYMBOL(mmc_cqe_start_req); + +/** + * mmc_cqe_request_done - CQE has finished processing an MMC request + * @host: MMC host which completed request + * @mrq: MMC request which completed + * + * CQE drivers should call this function when they have completed + * their processing of a request. + */ +void mmc_cqe_request_done(struct mmc_host *host, struct mmc_request *mrq) +{ + mmc_should_fail_request(host, mrq); + + /* Flag re-tuning needed on CRC errors */ + if ((mrq->cmd && mrq->cmd->error == -EILSEQ) || + (mrq->data && mrq->data->error == -EILSEQ)) + mmc_retune_needed(host); + + trace_mmc_request_done(host, mrq); + + if (mrq->cmd) { + pr_debug("%s: CQE req done (direct CMD%u): %d\n", + mmc_hostname(host), mrq->cmd->opcode, mrq->cmd->error); + } else { + pr_debug("%s: CQE transfer done tag %d\n", + mmc_hostname(host), mrq->tag); + } + + if (mrq->data) { + pr_debug("%s: %d bytes transferred: %d\n", + mmc_hostname(host), + mrq->data->bytes_xfered, mrq->data->error); + } + + mrq->done(mrq); +} +EXPORT_SYMBOL(mmc_cqe_request_done); + +/** + * mmc_cqe_post_req - CQE post process of a completed MMC request + * @host: MMC host + * @mrq: MMC request to be processed + */ +void mmc_cqe_post_req(struct mmc_host *host, struct mmc_request *mrq) +{ + if (host->cqe_ops->cqe_post_req) + host->cqe_ops->cqe_post_req(host, mrq); +} +EXPORT_SYMBOL(mmc_cqe_post_req); + +/* Arbitrary 1 second timeout */ +#define MMC_CQE_RECOVERY_TIMEOUT 1000 + +/* + * mmc_cqe_recovery - Recover from CQE errors. + * @host: MMC host to recover + * + * Recovery consists of stopping CQE, stopping eMMC, discarding the queue in + * in eMMC, and discarding the queue in CQE. CQE must call + * mmc_cqe_request_done() on all requests. 
An error is returned if the eMMC + * fails to discard its queue. + */ +int mmc_cqe_recovery(struct mmc_host *host) +{ + struct mmc_command cmd; + int err; + + mmc_retune_hold_now(host); + + /* + * Recovery is expected seldom, if at all, but it reduces performance, + * so make sure it is not completely silent. + */ + pr_warn("%s: running CQE recovery\n", mmc_hostname(host)); + + host->cqe_ops->cqe_recovery_start(host); + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = MMC_STOP_TRANSMISSION, + cmd.flags = MMC_RSP_R1B | MMC_CMD_AC, + cmd.flags &= ~MMC_RSP_CRC; /* Ignore CRC */ + cmd.busy_timeout = MMC_CQE_RECOVERY_TIMEOUT, + mmc_wait_for_cmd(host, &cmd, 0); + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = MMC_CMDQ_TASK_MGMT; + cmd.arg = 1; /* Discard entire queue */ + cmd.flags = MMC_RSP_R1B | MMC_CMD_AC; + cmd.flags &= ~MMC_RSP_CRC; /* Ignore CRC */ + cmd.busy_timeout = MMC_CQE_RECOVERY_TIMEOUT, + err = mmc_wait_for_cmd(host, &cmd, 0); + + host->cqe_ops->cqe_recovery_finish(host); + + mmc_retune_release(host); + + return err; +} +EXPORT_SYMBOL(mmc_cqe_recovery); + /** * mmc_is_req_done - Determine if a 'cap_cmd_during_tfr' request is done * @host: MMC host diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 94675f88f704..ba5a8fea0dc2 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -145,4 +145,8 @@ static inline void mmc_claim_host(struct mmc_host *host) __mmc_claim_host(host, NULL, NULL); } +int mmc_cqe_start_req(struct mmc_host *host, struct mmc_request *mrq); +void mmc_cqe_post_req(struct mmc_host *host, struct mmc_request *mrq); +int mmc_cqe_recovery(struct mmc_host *host); + #endif diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 443f7a8cdfe5..c296f4351c1d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -474,6 +474,8 @@ void mmc_detect_change(struct mmc_host *, unsigned long delay); void mmc_request_done(struct mmc_host *, struct mmc_request *); void mmc_command_done(struct mmc_host *host, struct mmc_request *mrq); +void mmc_cqe_request_done(struct mmc_host *host, struct mmc_request *mrq); + static inline void mmc_signal_sdio_irq(struct mmc_host *host) { host->ops->enable_sdio_irq(host, 0); -- cgit v1.2.3 From 563be8b60324f8947e9c71ec81a8cb67ea2f2127 Mon Sep 17 00:00:00 2001 From: rui_feng Date: Fri, 22 Sep 2017 16:07:35 +0800 Subject: mmc: rtsx: fix tuning fail on gen3 PCI-Express On gen3 PCI-Express we should send commands one by one; sending many commands in one packet leads to a failure. Signed-off-by: rui_feng Signed-off-by: Ulf Hansson --- drivers/mmc/host/rtsx_pci_sdmmc.c | 38 ++++++++++++++++++-------------------- include/linux/mfd/rtsx_pci.h | 1 + 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c index 41b57713b620..0848dc0f882e 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -618,29 +618,22 @@ static int sd_change_phase(struct realtek_pci_sdmmc *host, u8 sample_point, bool rx) { struct rtsx_pcr *pcr = host->pcr; - int err; dev_dbg(sdmmc_dev(host), "%s(%s): sample_point = %d\n", __func__, rx ? 
"RX" : "TX", sample_point); - rtsx_pci_init_cmd(pcr); - - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CLK_CTL, CHANGE_CLK, CHANGE_CLK); + rtsx_pci_write_register(pcr, CLK_CTL, CHANGE_CLK, CHANGE_CLK); if (rx) - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, - SD_VPRX_CTL, 0x1F, sample_point); + rtsx_pci_write_register(pcr, SD_VPRX_CTL, + PHASE_SELECT_MASK, sample_point); else - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, - SD_VPTX_CTL, 0x1F, sample_point); - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_VPCLK0_CTL, PHASE_NOT_RESET, 0); - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_VPCLK0_CTL, - PHASE_NOT_RESET, PHASE_NOT_RESET); - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CLK_CTL, CHANGE_CLK, 0); - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG1, SD_ASYNC_FIFO_NOT_RST, 0); - - err = rtsx_pci_send_cmd(pcr, 100); - if (err < 0) - return err; + rtsx_pci_write_register(pcr, SD_VPTX_CTL, + PHASE_SELECT_MASK, sample_point); + rtsx_pci_write_register(pcr, SD_VPCLK0_CTL, PHASE_NOT_RESET, 0); + rtsx_pci_write_register(pcr, SD_VPCLK0_CTL, PHASE_NOT_RESET, + PHASE_NOT_RESET); + rtsx_pci_write_register(pcr, CLK_CTL, CHANGE_CLK, 0); + rtsx_pci_write_register(pcr, SD_CFG1, SD_ASYNC_FIFO_NOT_RST, 0); return 0; } @@ -708,10 +701,12 @@ static int sd_tuning_rx_cmd(struct realtek_pci_sdmmc *host, { int err; struct mmc_command cmd = {}; + struct rtsx_pcr *pcr = host->pcr; - err = sd_change_phase(host, sample_point, true); - if (err < 0) - return err; + sd_change_phase(host, sample_point, true); + + rtsx_pci_write_register(pcr, SD_CFG3, SD_RSP_80CLK_TIMEOUT_EN, + SD_RSP_80CLK_TIMEOUT_EN); cmd.opcode = opcode; err = sd_read_data(host, &cmd, 0x40, NULL, 0, 100); @@ -719,9 +714,12 @@ static int sd_tuning_rx_cmd(struct realtek_pci_sdmmc *host, /* Wait till SD DATA IDLE */ sd_wait_data_idle(host); sd_clear_error(host); + rtsx_pci_write_register(pcr, SD_CFG3, + SD_RSP_80CLK_TIMEOUT_EN, 0); return err; } + rtsx_pci_write_register(pcr, SD_CFG3, SD_RSP_80CLK_TIMEOUT_EN, 0); return 0; } diff --git a/include/linux/mfd/rtsx_pci.h b/include/linux/mfd/rtsx_pci.h index 116816fb9110..7815d8db7eca 100644 --- a/include/linux/mfd/rtsx_pci.h +++ b/include/linux/mfd/rtsx_pci.h @@ -334,6 +334,7 @@ #define DCM_DRP_RD_DATA_H 0xFC29 #define SD_VPCLK0_CTL 0xFC2A #define SD_VPCLK1_CTL 0xFC2B +#define PHASE_SELECT_MASK 0x1F #define SD_DCMPS0_CTL 0xFC2C #define SD_DCMPS1_CTL 0xFC2D #define SD_VPTX_CTL SD_VPCLK0_CTL -- cgit v1.2.3 From 6186d06c519e217351fac95b545f334f8582af90 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 15 Oct 2017 14:46:14 +0200 Subject: mmc: parse new binding for eMMC fixed driver type Parse the new binding and store it in the host struct after doing some sanity checks. The code is designed to support fixed SD driver type if we ever need that. 
Signed-off-by: Wolfram Sang Reviewed-by: Simon Horman Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 13 ++++++++++++- drivers/mmc/core/mmc.c | 11 ++++++++--- include/linux/mmc/host.h | 2 ++ 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index e58be39b1568..35a9e4fd1a9f 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -179,7 +179,7 @@ static void mmc_retune_timer(unsigned long data) int mmc_of_parse(struct mmc_host *host) { struct device *dev = host->parent; - u32 bus_width; + u32 bus_width, drv_type; int ret; bool cd_cap_invert, cd_gpio_invert = false; bool ro_cap_invert, ro_gpio_invert = false; @@ -321,6 +321,15 @@ int mmc_of_parse(struct mmc_host *host) if (device_property_read_bool(dev, "no-mmc")) host->caps2 |= MMC_CAP2_NO_MMC; + /* Must be after "non-removable" check */ + if (device_property_read_u32(dev, "fixed-emmc-driver-type", &drv_type) == 0) { + if (host->caps & MMC_CAP_NONREMOVABLE) + host->fixed_drv_type = drv_type; + else + dev_err(host->parent, + "can't use fixed driver type, media is removable\n"); + } + host->dsr_req = !device_property_read_u32(dev, "dsr", &host->dsr); if (host->dsr_req && (host->dsr & ~0xffff)) { dev_err(host->parent, @@ -393,6 +402,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->max_blk_size = 512; host->max_blk_count = PAGE_SIZE / 512; + host->fixed_drv_type = -EINVAL; + return host; } diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 066efbb89483..a552f61060d2 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1291,13 +1291,18 @@ out_err: static void mmc_select_driver_type(struct mmc_card *card) { int card_drv_type, drive_strength, drv_type; + int fixed_drv_type = card->host->fixed_drv_type; card_drv_type = card->ext_csd.raw_driver_strength | mmc_driver_type_mask(0); - drive_strength = mmc_select_drive_strength(card, - card->ext_csd.hs200_max_dtr, - card_drv_type, &drv_type); + if (fixed_drv_type >= 0) + drive_strength = card_drv_type & mmc_driver_type_mask(fixed_drv_type) + ? fixed_drv_type : 0; + else + drive_strength = mmc_select_drive_strength(card, + card->ext_csd.hs200_max_dtr, + card_drv_type, &drv_type); card->drive_strength = drive_strength; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index c296f4351c1d..e7743eca1021 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -354,6 +354,8 @@ struct mmc_host { #define MMC_CAP2_CQE (1 << 23) /* Has eMMC command queue engine */ #define MMC_CAP2_CQE_DCMD (1 << 24) /* CQE can issue a direct command */ + int fixed_drv_type; /* fixed driver type for non-removable media */ + mmc_pm_flag_t pm_caps; /* supported pm features */ /* host specific block data */ -- cgit v1.2.3 From 0f295b0650c90362b4111f46d7f9149a0a4191be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 13 Oct 2017 11:54:33 -0600 Subject: rtc: Allow rtc drivers to specify the tv_nsec value for ntp ntp is currently hardwired to try and call the rtc set when wall clock tv_nsec is 0.5 seconds. This historical behaviour works well with certain PC RTCs, but is not universal to all rtc hardware. Change how this works by introducing the driver specific concept of set_offset_nsec, the delay between current wall clock time and the target time to set (with a 0 tv_nsecs). For x86-style CMOS set_offset_nsec should be -0.5 s which causes the last second to be written 0.5 s after it has started. 
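A driver whose hardware applies the written time immediately could therefore ask to be called right at the second boundary (a sketch of the intended driver-side use; it assumes the devm allocation helper from the new registration API, and the zero offset is an assumption about such hardware):

    rtc = devm_rtc_allocate_device(dev);
    if (IS_ERR(rtc))
            return PTR_ERR(rtc);

    /* Hardware latches the time at the write itself, so no lead time */
    rtc->set_offset_nsec = 0;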
For compat with the old rtc_set_ntp_time, the value is defaulted to +0.5 s, which causes the next second to be written 0.5 s before it starts, as things were before this patch. Testing shows many non-x86 RTCs would like set_offset_nsec ~= 0, so ultimately each RTC driver should set the set_offset_nsec according to its needs, and non x86 architectures should stop using update_persistent_clock64 in order to access this feature. Future patches will revise the drivers as needed. Since CMOS and RTC now have very different handling they are split into two dedicated code paths, sharing the support code, and ifdefs are replaced with IS_ENABLED. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Jason Gunthorpe Signed-off-by: John Stultz --- drivers/rtc/class.c | 3 + drivers/rtc/systohc.c | 53 +++++++++++----- include/linux/rtc.h | 43 ++++++++++++- kernel/time/ntp.c | 166 ++++++++++++++++++++++++++++++++++---------------- 4 files changed, 196 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 2ed970d61da1..722d683e0b0f 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -161,6 +161,9 @@ static struct rtc_device *rtc_allocate_device(void) device_initialize(&rtc->dev); + /* Drivers can revise this default after allocating the device. */ + rtc->set_offset_nsec = NSEC_PER_SEC / 2; + rtc->irq_freq = 1; rtc->max_user_freq = 64; rtc->dev.class = rtc_class; diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c index b4a68ffcd06b..0c177647ea6c 100644 --- a/drivers/rtc/systohc.c +++ b/drivers/rtc/systohc.c @@ -10,6 +10,7 @@ /** * rtc_set_ntp_time - Save NTP synchronized time to the RTC * @now: Current time of day + * @target_nsec: pointer for desired now->tv_nsec value * * Replacement for the NTP platform function update_persistent_clock64 * that stores time for later retrieval by rtc_hctosys. @@ -18,30 +19,52 @@ * possible at all, and various other -errno for specific temporary failure * cases. * + * -EPROTO is returned if now.tv_nsec is not close enough to *target_nsec. + * * If temporary failure is indicated the caller should try again 'soon' */ -int rtc_set_ntp_time(struct timespec64 now) +int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) { struct rtc_device *rtc; struct rtc_time tm; + struct timespec64 to_set; int err = -ENODEV; - - if (now.tv_nsec < (NSEC_PER_SEC >> 1)) - rtc_time64_to_tm(now.tv_sec, &tm); - else - rtc_time64_to_tm(now.tv_sec + 1, &tm); + bool ok; rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); - if (rtc) { - /* rtc_hctosys exclusively uses UTC, so we call set_time here, - * not set_mmss. */ - if (rtc->ops && - (rtc->ops->set_time || - rtc->ops->set_mmss64 || - rtc->ops->set_mmss)) - err = rtc_set_time(rtc, &tm); - rtc_class_close(rtc); + if (!rtc) + goto out_err; + + if (!rtc->ops || (!rtc->ops->set_time && !rtc->ops->set_mmss64 && + !rtc->ops->set_mmss)) + goto out_close; + + /* Compute the value of tv_nsec we require the caller to supply in + * now.tv_nsec. This is the value such that (now + + * set_offset_nsec).tv_nsec == 0. + */ + set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); + *target_nsec = to_set.tv_nsec; + + /* The ntp code must call this with the correct value in tv_nsec, if + * it does not we update target_nsec and return EPROTO to make the ntp + * code try again later. 
+ */ + ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); + if (!ok) { + err = -EPROTO; + goto out_close; } + rtc_time64_to_tm(to_set.tv_sec, &tm); + + /* rtc_hctosys exclusively uses UTC, so we call set_time here, not + * set_mmss. + */ + err = rtc_set_time(rtc, &tm); + +out_close: + rtc_class_close(rtc); +out_err: return err; } diff --git a/include/linux/rtc.h b/include/linux/rtc.h index e6d0f9c1cafd..5b13fa029fd6 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -135,6 +135,14 @@ struct rtc_device { /* Some hardware can't support UIE mode */ int uie_unsupported; + /* Number of nsec it takes to set the RTC clock. This influences when + * the set ops are called. An offset: + * - of 0.5 s will call RTC set for wall clock time 10.0 s at 9.5 s + * - of 1.5 s will call RTC set for wall clock time 10.0 s at 8.5 s + * - of -0.5 s will call RTC set for wall clock time 10.0 s at 10.5 s + */ + long set_offset_nsec; + bool registered; struct nvmem_config *nvmem_config; @@ -172,7 +180,7 @@ extern void devm_rtc_device_unregister(struct device *dev, extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); -extern int rtc_set_ntp_time(struct timespec64 now); +extern int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); @@ -221,6 +229,39 @@ static inline bool is_leap_year(unsigned int year) return (!(year % 4) && (year % 100)) || !(year % 400); } +/* Determine if we can call to driver to set the time. Drivers can only be + * called to set a second aligned time value, and the field set_offset_nsec + * specifies how far away from the second aligned time to call the driver. + * + * This also computes 'to_set' which is the time we are trying to set, and has + * a zero in tv_nsecs, such that: + * to_set - set_offset_nsec == now +/- FUZZ + * + */ +static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec, + struct timespec64 *to_set, + const struct timespec64 *now) +{ + /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */ + const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; + struct timespec64 delay = {.tv_sec = 0, + .tv_nsec = set_offset_nsec}; + + *to_set = timespec64_add(*now, delay); + + if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { + to_set->tv_nsec = 0; + return true; + } + + if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { + to_set->tv_sec++; + to_set->tv_nsec = 0; + return true; + } + return false; +} + #define rtc_register_device(device) \ __rtc_register_device(THIS_MODULE, device) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index edf19cc53140..bc19de1a0683 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -492,6 +492,67 @@ out: return leap; } +static void sync_hw_clock(struct work_struct *work); +static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); + +static void sched_sync_hw_clock(struct timespec64 now, + unsigned long target_nsec, bool fail) + +{ + struct timespec64 next; + + getnstimeofday64(&next); + if (!fail) + next.tv_sec = 659; + else { + /* + * Try again as soon as possible. Delaying long periods + * decreases the accuracy of the work queue timer. Due to this + * the algorithm is very likely to require a short-sleep retry + * after the above long sleep to synchronize ts_nsec. 
+ */ + next.tv_sec = 0; + } + + /* Compute the needed delay that will get to tv_nsec == target_nsec */ + next.tv_nsec = target_nsec - next.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + + queue_delayed_work(system_power_efficient_wq, &sync_work, + timespec64_to_jiffies(&next)); +} + +static void sync_rtc_clock(void) +{ + unsigned long target_nsec; + struct timespec64 adjust, now; + int rc; + + if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) + return; + + getnstimeofday64(&now); + + adjust = now; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* + * The current RTC in use will provide the target_nsec it wants to be + * called at, and does rtc_tv_nsec_ok internally. + */ + rc = rtc_set_ntp_time(adjust, &target_nsec); + if (rc == -ENODEV) + return; + + sched_sync_hw_clock(now, target_nsec, rc); +} + #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock(struct timespec now) { @@ -507,76 +568,75 @@ int __weak update_persistent_clock64(struct timespec64 now64) } #endif -#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) -static void sync_cmos_clock(struct work_struct *work); - -static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); - -static void sync_cmos_clock(struct work_struct *work) +static bool sync_cmos_clock(void) { + static bool no_cmos; struct timespec64 now; - struct timespec64 next; - int fail = 1; + struct timespec64 adjust; + int rc = -EPROTO; + long target_nsec = NSEC_PER_SEC / 2; + + if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE)) + return false; + + if (no_cmos) + return false; /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - * We want the clock to be within a couple of ticks from the target. + * Historically update_persistent_clock64() has followed x86 + * semantics, which match the MC146818A/etc RTC. This RTC will store + * 'adjust' and then in 0.5 s it will advance one second. + * + * Architectures are strongly encouraged to use rtclib and not + * implement this legacy API. */ - if (!ntp_synced()) { - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - } - getnstimeofday64(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec64 adjust = now; - - fail = -ENODEV; + if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); -#ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock64(adjust); -#endif - -#ifdef CONFIG_RTC_SYSTOHC - if (fail == -ENODEV) - fail = rtc_set_ntp_time(adjust); -#endif + rc = update_persistent_clock64(adjust); + /* + * The machine does not support update_persistent_clock64 even + * though it defines CONFIG_GENERIC_CMOS_UPDATE. 
+ */ + if (rc == -ENODEV) { + no_cmos = true; + return false; + } } - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; + sched_sync_hw_clock(now, target_nsec, rc); + return true; +} - if (!fail || fail == -ENODEV) - next.tv_sec = 659; - else - next.tv_sec = 0; +/* + * If we have an externally synchronized Linux clock, then update RTC clock + * accordingly every ~11 minutes. Generally RTCs can only store second + * precision, but many RTCs will adjust the phase of their second tick to + * match the moment of update. This infrastructure arranges to call to the RTC + * set at the correct moment to phase synchronize the RTC second tick over + * with the kernel clock. + */ +static void sync_hw_clock(struct work_struct *work) +{ + if (!ntp_synced()) + return; - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } - queue_delayed_work(system_power_efficient_wq, - &sync_cmos_work, timespec64_to_jiffies(&next)); + if (sync_cmos_clock()) + return; + + sync_rtc_clock(); } void ntp_notify_cmos_timer(void) { - queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); -} - -#else -void ntp_notify_cmos_timer(void) { } -#endif + if (!ntp_synced()) + return; + if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || + IS_ENABLED(CONFIG_RTC_SYSTOHC)) + queue_delayed_work(system_power_efficient_wq, &sync_work, 0); +} /* * Propagate a new txc->status value into the NTP state: -- cgit v1.2.3 From e0956dcc4ba74ec4b17e32fc9a156fcba1ef6610 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:44 +0200 Subject: timekeeping: Consolidate timekeeping_inject_offset code The code to check the adjtimex() or clock_adjtime() arguments is spread out across multiple files for presumably only historic reasons. As a preparation for a rework to get rid of the use of 'struct timeval' and 'struct timespec' in there, this moves all the portions into kernel/time/timekeeping.c and marks them as 'static'. The warp_clock() function here is not as closely related as the others, but I feel it still makes sense to move it here in order to consolidate all callers of timekeeping_inject_offset(). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann [jstultz: Whitespace fixup] Signed-off-by: John Stultz --- include/linux/time.h | 26 ---------- kernel/time/ntp.c | 61 ---------------------- kernel/time/ntp_internal.h | 1 - kernel/time/time.c | 36 +------------ kernel/time/timekeeping.c | 123 ++++++++++++++++++++++++++++++++++++++++++++- kernel/time/timekeeping.h | 2 +- 6 files changed, 123 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index 9bc1f945777c..c0fbad08448f 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -134,32 +134,6 @@ static inline bool timeval_valid(const struct timeval *tv) extern struct timespec timespec_trunc(struct timespec t, unsigned gran); -/* - * Validates if a timespec/timeval used to inject a time offset is valid. - * Offsets can be postive or negative. The value of the timeval/timespec - * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must - * always be non-negative. 
- */ -static inline bool timeval_inject_offset_valid(const struct timeval *tv) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_inject_offset_valid(const struct timespec *ts) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more nanoseconds then a second */ - if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - /* Some architectures do not supply their own clocksource. * This is mainly the case in architectures that get their * inter-tick times by reading the counter on their interval diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bc19de1a0683..90f84582a076 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -713,67 +713,6 @@ static inline void process_adjtimex_modes(struct timex *txc, } - -/** - * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex - */ -int ntp_validate_timex(struct timex *txc) -{ - if (txc->modes & ADJ_ADJTIME) { - /* singleshot must not be used with any other mode bits */ - if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) - return -EINVAL; - if (!(txc->modes & ADJ_OFFSET_READONLY) && - !capable(CAP_SYS_TIME)) - return -EPERM; - } else { - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - /* - * if the quartz is off by more than 10% then - * something is VERY wrong! - */ - if (txc->modes & ADJ_TICK && - (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ)) - return -EINVAL; - } - - if (txc->modes & ADJ_SETOFFSET) { - /* In order to inject time, you gotta be super-user! */ - if (!capable(CAP_SYS_TIME)) - return -EPERM; - - if (txc->modes & ADJ_NANO) { - struct timespec ts; - - ts.tv_sec = txc->time.tv_sec; - ts.tv_nsec = txc->time.tv_usec; - if (!timespec_inject_offset_valid(&ts)) - return -EINVAL; - - } else { - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; - } - } - - /* - * Check for potential multiplication overflows that can - * only happen on 64-bit systems: - */ - if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { - if (LLONG_MIN / PPM_SCALE > txc->freq) - return -EINVAL; - if (LLONG_MAX / PPM_SCALE < txc->freq) - return -EINVAL; - } - - return 0; -} - - /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index d8a7c11fa71a..74b52cd48209 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,7 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 44a8c1402133..04684e294f00 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -157,40 +157,6 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, return 0; } -/* - * Indicates if there is an offset between the system clock and the hardware - * clock/persistent clock/rtc. 
- */ -int persistent_clock_is_local; - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -static inline void warp_clock(void) -{ - if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; - - persistent_clock_is_local = 1; - adjust.tv_sec = sys_tz.tz_minuteswest * 60; - adjust.tv_nsec = 0; - timekeeping_inject_offset(&adjust); - } -} - /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, @@ -224,7 +190,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz if (firsttime) { firsttime = 0; if (!tv) - warp_clock(); + timekeeping_warp_clock(); } } if (tv) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2cafb49aa65e..7d8e0e842484 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1258,13 +1258,39 @@ out: } EXPORT_SYMBOL(do_settimeofday64); +/* + * Validates if a timespec/timeval used to inject a time offset is valid. + * Offsets can be postive or negative. The value of the timeval/timespec + * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must + * always be non-negative. + */ +static inline bool timeval_inject_offset_valid(const struct timeval *tv) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_inject_offset_valid(const struct timespec *ts) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more nanoseconds then a second */ + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -int timekeeping_inject_offset(struct timespec *ts) +static int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; @@ -1303,7 +1329,40 @@ error: /* even if we error out, we forwarded the time, so call update */ return ret; } -EXPORT_SYMBOL(timekeeping_inject_offset); + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... 
+ * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +void timekeeping_warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} /** * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic @@ -2247,6 +2306,66 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, return base; } +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +static int ntp_validate_timex(struct timex *txc) +{ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + } + + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } + } + + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + + return 0; +} + + /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index d0914676d4c5..44aec7893cdd 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -10,7 +10,7 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); -extern int timekeeping_inject_offset(struct timespec *ts); +extern void timekeeping_warp_clock(void); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); -- cgit v1.2.3 From 85bf19e7df2479140eff2348a4e6a9c19b5c3960 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:46 +0200 Subject: time: Remove unused functions The (slow but) ongoing work on conversion from timespec to timespec64 has led some timespec based helper functions to become unused. No new code should use them, so we can remove the functions entirely. I'm planning to obsolete additional interfaces next and remove more of these. 
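Callers that still need this arithmetic should already be on the timespec64 variants that remain in kernel/time/time.c, e.g. (a small usage sketch):

    struct timespec64 a = { .tv_sec = 1, .tv_nsec = 800 * NSEC_PER_MSEC };
    struct timespec64 b = { .tv_sec = 2, .tv_nsec = 600 * NSEC_PER_MSEC };
    struct timespec64 sum;

    /* timespec64_add_safe() normalizes and saturates instead of wrapping */
    sum = timespec64_add_safe(a, b);        /* 4.4 s */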
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/time.h | 18 ------------------ include/linux/time64.h | 28 ---------------------------- kernel/time/time.c | 18 ------------------ 3 files changed, 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index c0fbad08448f..0e8a80918484 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -39,15 +39,6 @@ static inline int timespec_compare(const struct timespec *lhs, const struct time return lhs->tv_nsec - rhs->tv_nsec; } -static inline int timeval_compare(const struct timeval *lhs, const struct timeval *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_usec - rhs->tv_usec; -} - extern time64_t mktime64(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); @@ -65,15 +56,6 @@ static inline unsigned long mktime(const unsigned int year, extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); -/* - * timespec_add_safe assumes both values are positive and checks - * for overflow. It will return TIME_T_MAX if the reutrn would be - * smaller then either of the arguments. - */ -extern struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs); - - static inline struct timespec timespec_add(struct timespec lhs, struct timespec rhs) { diff --git a/include/linux/time64.h b/include/linux/time64.h index 980c71b3001a..402b595c76d2 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -53,16 +53,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) return ts; } -static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64) -{ - return *its64; -} - -static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its) -{ - return *its; -} - # define timespec64_equal timespec_equal # define timespec64_compare timespec_compare # define set_normalized_timespec64 set_normalized_timespec @@ -94,24 +84,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) return ret; } -static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64) -{ - struct itimerspec ret; - - ret.it_interval = timespec64_to_timespec(its64->it_interval); - ret.it_value = timespec64_to_timespec(its64->it_value); - return ret; -} - -static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its) -{ - struct itimerspec64 ret; - - ret.it_interval = timespec_to_timespec64(its->it_interval); - ret.it_value = timespec_to_timespec64(its->it_value); - return ret; -} - static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 04684e294f00..947fb614c78f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -818,24 +818,6 @@ unsigned long nsecs_to_jiffies(u64 n) } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); -/* - * Add two timespec values and do a safety check for overflow. 
- * It's assumed that both values are valid (>= 0) - */ -struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs) -{ - struct timespec res; - - set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - - if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) - res.tv_sec = TIME_T_MAX; - - return res; -} - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). -- cgit v1.2.3 From 5dbf20127f8cca8588ad0b0e3e8ded587ac7afa0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:47 +0200 Subject: time: Move time_t based interfaces to time32.h Interfaces based on 'struct timespec' or 'struct timeval' should no longer be used for new code, which can use either ktime_t or 'struct timespec64' instead. To make this a little clearer, this moves the various helpers into a new time32.h header. For the moment, this gets included by the normal time.h, but we may be able to separate it entirely when most users of time32.h are gone. Individual helpers in the new file can get removed once they become unused in the future. Since the contents of time32.h look a lot like what's in time64.h, I'm reordering them during the move to make them more similar, and to allow a follow-up patch to redirect the 'timespec' based functions to their 'timespec64' based counterparts on 64-bit architectures later. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann [jstultz: Whitespace & checkpatch fixups] Signed-off-by: John Stultz --- include/linux/time.h | 163 +--------------------------------------------- include/linux/time32.h | 176 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 162 deletions(-) create mode 100644 include/linux/time32.h (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index 0e8a80918484..c375f54a678d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -17,105 +17,10 @@ int get_itimerspec64(struct itimerspec64 *it, int put_itimerspec64(const struct itimerspec64 *it, struct itimerspec __user *uit); -#define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) - -static inline int timespec_equal(const struct timespec *a, - const struct timespec *b) -{ - return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); -} - -/* - * lhs < rhs: return <0 - * lhs == rhs: return 0 - * lhs > rhs: return >0 - */ -static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_nsec - rhs->tv_nsec; -} - extern time64_t mktime64(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -/** - * Deprecated. Use mktime64(). 
- */ -static inline unsigned long mktime(const unsigned int year, - const unsigned int mon, const unsigned int day, - const unsigned int hour, const unsigned int min, - const unsigned int sec) -{ - return mktime64(year, mon, day, hour, min, sec); -} - -extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); - -static inline struct timespec timespec_add(struct timespec lhs, - struct timespec rhs) -{ - struct timespec ts_delta; - set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - return ts_delta; -} - -/* - * sub = lhs - rhs, in normalized form - */ -static inline struct timespec timespec_sub(struct timespec lhs, - struct timespec rhs) -{ - struct timespec ts_delta; - set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, - lhs.tv_nsec - rhs.tv_nsec); - return ts_delta; -} - -/* - * Returns true if the timespec is norm, false if denorm: - */ -static inline bool timespec_valid(const struct timespec *ts) -{ - /* Dates before 1970 are bogus */ - if (ts->tv_sec < 0) - return false; - /* Can't have more nanoseconds then a second */ - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_valid_strict(const struct timespec *ts) -{ - if (!timespec_valid(ts)) - return false; - /* Disallow values that could overflow ktime_t */ - if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) - return false; - return true; -} - -static inline bool timeval_valid(const struct timeval *tv) -{ - /* Dates before 1970 are bogus */ - if (tv->tv_sec < 0) - return false; - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - - return true; -} - -extern struct timespec timespec_trunc(struct timespec t, unsigned gran); - /* Some architectures do not supply their own clocksource. * This is mainly the case in architectures that get their * inter-tick times by reading the counter on their interval @@ -164,73 +69,7 @@ struct tm { void time64_to_tm(time64_t totalsecs, int offset, struct tm *result); -/** - * time_to_tm - converts the calendar time to local broken-down time - * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, - * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. - * @result pointer to struct tm variable to receive broken-down time - */ -static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) -{ - time64_to_tm(totalsecs, offset, result); -} - -/** - * timespec_to_ns - Convert timespec to nanoseconds - * @ts: pointer to the timespec variable to be converted - * - * Returns the scalar nanosecond representation of the timespec - * parameter. - */ -static inline s64 timespec_to_ns(const struct timespec *ts) -{ - return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; -} - -/** - * timeval_to_ns - Convert timeval to nanoseconds - * @ts: pointer to the timeval variable to be converted - * - * Returns the scalar nanosecond representation of the timeval - * parameter. - */ -static inline s64 timeval_to_ns(const struct timeval *tv) -{ - return ((s64) tv->tv_sec * NSEC_PER_SEC) + - tv->tv_usec * NSEC_PER_USEC; -} - -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. 
- */ -extern struct timespec ns_to_timespec(const s64 nsec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -extern struct timeval ns_to_timeval(const s64 nsec); - -/** - * timespec_add_ns - Adds nanoseconds to a timespec - * @a: pointer to timespec to be incremented - * @ns: unsigned nanoseconds value to be added - * - * This must always be inlined because its used from the x86-64 vdso, - * which cannot call other kernel functions. - */ -static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) -{ - a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); - a->tv_nsec = ns; -} +# include static inline bool itimerspec64_valid(const struct itimerspec64 *its) { diff --git a/include/linux/time32.h b/include/linux/time32.h new file mode 100644 index 000000000000..9b9c43f0d39b --- /dev/null +++ b/include/linux/time32.h @@ -0,0 +1,176 @@ +#ifndef _LINUX_TIME32_H +#define _LINUX_TIME32_H +/* + * These are all interfaces based on the old time_t definition + * that overflows in 2038 on 32-bit architectures. New code + * should use the replacements based on time64_t and timespec64. + * + * Any interfaces in here that become unused as we migrate + * code to time64_t should get removed. + */ + +#include + +#define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) + +static inline int timespec_equal(const struct timespec *a, + const struct timespec *b) +{ + return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); +} + +/* + * lhs < rhs: return <0 + * lhs == rhs: return 0 + * lhs > rhs: return >0 + */ +static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + +extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); + +static inline struct timespec timespec_add(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + + set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + return ts_delta; +} + +/* + * sub = lhs - rhs, in normalized form + */ +static inline struct timespec timespec_sub(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + + set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, + lhs.tv_nsec - rhs.tv_nsec); + return ts_delta; +} + +/* + * Returns true if the timespec is norm, false if denorm: + */ +static inline bool timespec_valid(const struct timespec *ts) +{ + /* Dates before 1970 are bogus */ + if (ts->tv_sec < 0) + return false; + /* Can't have more nanoseconds then a second */ + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_valid_strict(const struct timespec *ts) +{ + if (!timespec_valid(ts)) + return false; + /* Disallow values that could overflow ktime_t */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return false; + return true; +} + +/** + * timespec_to_ns - Convert timespec to nanoseconds + * @ts: pointer to the timespec variable to be converted + * + * Returns the scalar nanosecond representation of the timespec + * parameter. 
+ */ +static inline s64 timespec_to_ns(const struct timespec *ts) +{ + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; +} + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +extern struct timespec ns_to_timespec(const s64 nsec); + +/** + * timespec_add_ns - Adds nanoseconds to a timespec + * @a: pointer to timespec to be incremented + * @ns: unsigned nanoseconds value to be added + * + * This must always be inlined because its used from the x86-64 vdso, + * which cannot call other kernel functions. + */ +static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) +{ + a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); + a->tv_nsec = ns; +} + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + time64_to_tm(totalsecs, offset, result); +} + +static inline unsigned long mktime(const unsigned int year, + const unsigned int mon, const unsigned int day, + const unsigned int hour, const unsigned int min, + const unsigned int sec) +{ + return mktime64(year, mon, day, hour, min, sec); +} + +static inline bool timeval_valid(const struct timeval *tv) +{ + /* Dates before 1970 are bogus */ + if (tv->tv_sec < 0) + return false; + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + + return true; +} + +extern struct timespec timespec_trunc(struct timespec t, unsigned int gran); + +/** + * timeval_to_ns - Convert timeval to nanoseconds + * @ts: pointer to the timeval variable to be converted + * + * Returns the scalar nanosecond representation of the timeval + * parameter. + */ +static inline s64 timeval_to_ns(const struct timeval *tv) +{ + return ((s64) tv->tv_sec * NSEC_PER_SEC) + + tv->tv_usec * NSEC_PER_USEC; +} + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +extern struct timeval ns_to_timeval(const s64 nsec); + +#endif -- cgit v1.2.3 From abc8f96e3eb846fcf6333395ee1f6ed4a734576c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:48 +0200 Subject: time: Move time_t conversion helpers to time32.h On 64-bit architectures, the timespec64 based helpers in linux/time.h are defined as macros pointing to their timespec based counterparts. This made sense when they were first introduced, but as we are migrating away from timespec in general, it's much less intuitive now. This changes the macros to work in the exact opposite way: we always provide the timespec64 based helpers and define the old interfaces as macros for them. Now we can move those macros into linux/time32.h, which already contains the respective helpers for 32-bit architectures. 
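As a minimal sketch of what this redirection enables for callers (a
hypothetical fragment, not code from this series; compute_expiry is an
invented name), arithmetic can be done entirely in the timespec64
helpers, converting only at a remaining legacy boundary:

	/* hypothetical: do the math in y2038-safe types, convert at the edge */
	static struct timespec compute_expiry(struct timespec base)
	{
		struct timespec64 base64 = timespec_to_timespec64(base);
		struct timespec64 delta = { .tv_sec = 5, .tv_nsec = 0 };
		struct timespec64 sum = timespec64_add(base64, delta);

		/* convert back only where the old interface is still required */
		return timespec64_to_timespec(sum);
	}

On 64-bit architectures the two conversions compile away entirely, since
timespec64 is defined as timespec there.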
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/time32.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/time64.h | 50 +------------------------------------------------- kernel/time/time.c | 5 +++-- 3 files changed, 49 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time32.h b/include/linux/time32.h index 9b9c43f0d39b..65b1de25198d 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -13,6 +13,49 @@ #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) +#if __BITS_PER_LONG == 64 + +/* timespec64 is defined as timespec here */ +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + return ts64; +} + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + return ts; +} + +# define timespec_equal timespec64_equal +# define timespec_compare timespec64_compare +# define set_normalized_timespec set_normalized_timespec64 +# define timespec_add timespec64_add +# define timespec_sub timespec64_sub +# define timespec_valid timespec64_valid +# define timespec_valid_strict timespec64_valid_strict +# define timespec_to_ns timespec64_to_ns +# define ns_to_timespec ns_to_timespec64 +# define timespec_add_ns timespec64_add_ns + +#else +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + struct timespec ret; + + ret.tv_sec = (time_t)ts64.tv_sec; + ret.tv_nsec = ts64.tv_nsec; + return ret; +} + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + struct timespec64 ret; + + ret.tv_sec = ts.tv_sec; + ret.tv_nsec = ts.tv_nsec; + return ret; +} + static inline int timespec_equal(const struct timespec *a, const struct timespec *b) { @@ -116,6 +159,8 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) a->tv_nsec = ns; } +#endif + /** * time_to_tm - converts the calendar time to local broken-down time * diff --git a/include/linux/time64.h b/include/linux/time64.h index 402b595c76d2..ec1888cf5378 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -7,11 +7,8 @@ typedef __s64 time64_t; typedef __u64 timeu64_t; -/* - * This wants to go into uapi/linux/time.h once we agreed about the - * userspace interfaces. 
- */ #if __BITS_PER_LONG == 64 +/* this trick allows us to optimize out timespec64_to_timespec */ # define timespec64 timespec #define itimerspec64 itimerspec #else @@ -41,49 +38,6 @@ struct itimerspec64 { #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#if __BITS_PER_LONG == 64 - -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - return ts64; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - return ts; -} - -# define timespec64_equal timespec_equal -# define timespec64_compare timespec_compare -# define set_normalized_timespec64 set_normalized_timespec -# define timespec64_add timespec_add -# define timespec64_sub timespec_sub -# define timespec64_valid timespec_valid -# define timespec64_valid_strict timespec_valid_strict -# define timespec64_to_ns timespec_to_ns -# define ns_to_timespec64 ns_to_timespec -# define timespec64_add_ns timespec_add_ns - -#else - -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - struct timespec ret; - - ret.tv_sec = (time_t)ts64.tv_sec; - ret.tv_nsec = ts64.tv_nsec; - return ret; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - struct timespec64 ret; - - ret.tv_sec = ts.tv_sec; - ret.tv_nsec = ts.tv_nsec; - return ret; -} - static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { @@ -185,8 +139,6 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) a->tv_nsec = ns; } -#endif - /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. diff --git a/kernel/time/time.c b/kernel/time/time.c index 947fb614c78f..fe60ebd301cf 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -407,6 +407,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); +#if __BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -467,6 +468,7 @@ struct timespec ns_to_timespec(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec); +#endif /** * ns_to_timeval - Convert nanoseconds to timeval @@ -486,7 +488,6 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); -#if BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -547,7 +548,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec64); -#endif + /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds -- cgit v1.2.3 From 6546911ed369af8d747215ff8b6144618e91c6ab Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:49 +0200 Subject: time: Move old timekeeping interfaces to timekeeping32.h The interfaces based on 'struct timespec' and 'unsigned long' seconds are no longer recommended for new code, and we are trying to migrate to ktime_t based interfaces and other y2038-safe variants. This moves all the legacy interfaces from linux/timekeeping.h into a new timekeeping32.h to better document this. 
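As a before/after sketch (an illustrative fragment, not code from this
patch), a driver reading the wall clock would migrate from the legacy
helper that now lives in timekeeping32.h to its timespec64 counterpart:

	struct timespec ts;
	struct timespec64 ts64;

	getnstimeofday(&ts);		/* legacy, moved to timekeeping32.h */
	getnstimeofday64(&ts64);	/* y2038-safe, stays in timekeeping.h */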
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/ktime.h | 1 + include/linux/timekeeping.h | 137 +------------------------------------- include/linux/timekeeping32.h | 151 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 135 deletions(-) create mode 100644 include/linux/timekeeping32.h (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 0c8bd45c8206..5b9fddbaac41 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -270,5 +270,6 @@ static inline ktime_t ms_to_ktime(u64 ms) } # include +# include #endif diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ddc229ff6d1e..405beea4e71b 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -15,27 +15,16 @@ extern void xtime_update(unsigned long ticks); /* * Get and set timeofday */ -extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* * Kernel time accessors */ -unsigned long get_seconds(void); struct timespec64 current_kernel_time64(void); -/* does not take xtime_lock */ -struct timespec __current_kernel_time(void); - -static inline struct timespec current_kernel_time(void) -{ - struct timespec64 now = current_kernel_time64(); - - return timespec64_to_timespec(now); -} /* - * timespec based interfaces + * timespec64 based interfaces */ struct timespec64 get_monotonic_coarse64(void); extern void getrawmonotonic64(struct timespec64 *ts); @@ -47,116 +36,6 @@ extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); extern void getboottime64(struct timespec64 *ts); -#if BITS_PER_LONG == 64 -/** - * Deprecated. Use do_settimeofday64(). - */ -static inline int do_settimeofday(const struct timespec *ts) -{ - return do_settimeofday64(ts); -} - -static inline int __getnstimeofday(struct timespec *ts) -{ - return __getnstimeofday64(ts); -} - -static inline void getnstimeofday(struct timespec *ts) -{ - getnstimeofday64(ts); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - ktime_get_ts64(ts); -} - -static inline void ktime_get_real_ts(struct timespec *ts) -{ - getnstimeofday64(ts); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - getrawmonotonic64(ts); -} - -static inline struct timespec get_monotonic_coarse(void) -{ - return get_monotonic_coarse64(); -} - -static inline void getboottime(struct timespec *ts) -{ - return getboottime64(ts); -} -#else -/** - * Deprecated. Use do_settimeofday64(). 
- */ -static inline int do_settimeofday(const struct timespec *ts) -{ - struct timespec64 ts64; - - ts64 = timespec_to_timespec64(*ts); - return do_settimeofday64(&ts64); -} - -static inline int __getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - int ret = __getnstimeofday64(&ts64); - - *ts = timespec64_to_timespec(ts64); - return ret; -} - -static inline void getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - - getnstimeofday64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void ktime_get_real_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - getnstimeofday64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - struct timespec64 ts64; - - getrawmonotonic64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline struct timespec get_monotonic_coarse(void) -{ - return timespec64_to_timespec(get_monotonic_coarse64()); -} - -static inline void getboottime(struct timespec *ts) -{ - struct timespec64 ts64; - - getboottime64(&ts64); - *ts = timespec64_to_timespec(ts64); -} -#endif - #define ktime_get_real_ts64(ts) getnstimeofday64(ts) /* @@ -241,23 +120,13 @@ extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); /* - * Timespec interfaces utilizing the ktime based ones + * timespec64 interfaces utilizing the ktime based ones */ -static inline void get_monotonic_boottime(struct timespec *ts) -{ - *ts = ktime_to_timespec(ktime_get_boottime()); -} - static inline void get_monotonic_boottime64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } -static inline void timekeeping_clocktai(struct timespec *ts) -{ - *ts = ktime_to_timespec(ktime_get_clocktai()); -} - static inline void timekeeping_clocktai64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); @@ -340,10 +209,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); */ extern int persistent_clock_is_local; -extern void read_persistent_clock(struct timespec *ts); extern void read_persistent_clock64(struct timespec64 *ts); extern void read_boot_clock64(struct timespec64 *ts); -extern int update_persistent_clock(struct timespec now); extern int update_persistent_clock64(struct timespec64 now); diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h new file mode 100644 index 000000000000..af4114d5dc17 --- /dev/null +++ b/include/linux/timekeeping32.h @@ -0,0 +1,151 @@ +#ifndef _LINUX_TIMEKEEPING32_H +#define _LINUX_TIMEKEEPING32_H +/* + * These interfaces are all based on the old timespec type + * and should get replaced with the timespec64 based versions + * over time so we can remove the file here. + */ + +extern void do_gettimeofday(struct timeval *tv); +unsigned long get_seconds(void); + +/* does not take xtime_lock */ +struct timespec __current_kernel_time(void); + +static inline struct timespec current_kernel_time(void) +{ + struct timespec64 now = current_kernel_time64(); + + return timespec64_to_timespec(now); +} + +#if BITS_PER_LONG == 64 +/** + * Deprecated. Use do_settimeofday64(). 
+ */
+static inline int do_settimeofday(const struct timespec *ts)
+{
+	return do_settimeofday64(ts);
+}
+
+static inline int __getnstimeofday(struct timespec *ts)
+{
+	return __getnstimeofday64(ts);
+}
+
+static inline void getnstimeofday(struct timespec *ts)
+{
+	getnstimeofday64(ts);
+}
+
+static inline void ktime_get_ts(struct timespec *ts)
+{
+	ktime_get_ts64(ts);
+}
+
+static inline void ktime_get_real_ts(struct timespec *ts)
+{
+	getnstimeofday64(ts);
+}
+
+static inline void getrawmonotonic(struct timespec *ts)
+{
+	getrawmonotonic64(ts);
+}
+
+static inline struct timespec get_monotonic_coarse(void)
+{
+	return get_monotonic_coarse64();
+}
+
+static inline void getboottime(struct timespec *ts)
+{
+	return getboottime64(ts);
+}
+#else
+/**
+ * Deprecated. Use do_settimeofday64().
+ */
+static inline int do_settimeofday(const struct timespec *ts)
+{
+	struct timespec64 ts64;
+
+	ts64 = timespec_to_timespec64(*ts);
+	return do_settimeofday64(&ts64);
+}
+
+static inline int __getnstimeofday(struct timespec *ts)
+{
+	struct timespec64 ts64;
+	int ret = __getnstimeofday64(&ts64);
+
+	*ts = timespec64_to_timespec(ts64);
+	return ret;
+}
+
+static inline void getnstimeofday(struct timespec *ts)
+{
+	struct timespec64 ts64;
+
+	getnstimeofday64(&ts64);
+	*ts = timespec64_to_timespec(ts64);
+}
+
+static inline void ktime_get_ts(struct timespec *ts)
+{
+	struct timespec64 ts64;
+
+	ktime_get_ts64(&ts64);
+	*ts = timespec64_to_timespec(ts64);
+}
+
+static inline void ktime_get_real_ts(struct timespec *ts)
+{
+	struct timespec64 ts64;
+
+	getnstimeofday64(&ts64);
+	*ts = timespec64_to_timespec(ts64);
+}
+
+static inline void getrawmonotonic(struct timespec *ts)
+{
+	struct timespec64 ts64;
+
+	getrawmonotonic64(&ts64);
+	*ts = timespec64_to_timespec(ts64);
+}
+
+static inline struct timespec get_monotonic_coarse(void)
+{
+	return timespec64_to_timespec(get_monotonic_coarse64());
+}
+
+static inline void getboottime(struct timespec *ts)
+{
+	struct timespec64 ts64;
+
+	getboottime64(&ts64);
+	*ts = timespec64_to_timespec(ts64);
+}
+#endif
+
+/*
+ * Timespec interfaces utilizing the ktime based ones
+ */
+static inline void get_monotonic_boottime(struct timespec *ts)
+{
+	*ts = ktime_to_timespec(ktime_get_boottime());
+}
+
+static inline void timekeeping_clocktai(struct timespec *ts)
+{
+	*ts = ktime_to_timespec(ktime_get_clocktai());
+}
+
+/*
+ * Persistent clock related interfaces
+ */
+extern void read_persistent_clock(struct timespec *ts);
+extern int update_persistent_clock(struct timespec now);
+
+#endif
--
cgit v1.2.3


From d82bd359972a7fe71a778396cf287bc9f9f3b981 Mon Sep 17 00:00:00 2001
From: Avaneesh Kumar Dwivedi
Date: Tue, 24 Oct 2017 21:22:24 +0530
Subject: firmware: scm: Add new SCM call API for switching memory ownership

Two different processors on a SoC need to switch memory ownership
during load/unload. To enable this, the second-level memory map table
needs to be updated, which is done by the secure layer.

This patch adds the interface for making a secure monitor call for
such memory ownership switching requests.
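As a usage sketch (a hypothetical caller loosely modeled on a remoteproc
driver; mem_phys and mem_size are assumed to describe the firmware
region, everything else comes from this patch), handing a region from
Linux (HLOS) to the modem might look like:

	struct qcom_scm_vmperm newvm[] = {
		{ .vmid = QCOM_SCM_VMID_MSS_MSA, .perm = QCOM_SCM_PERM_RW },
	};
	unsigned int srcvm = BIT(QCOM_SCM_VMID_HLOS);	/* current owners */
	int ret;

	ret = qcom_scm_assign_mem(mem_phys, mem_size, &srcvm,
				  newvm, ARRAY_SIZE(newvm));
	if (ret < 0)
		return ret;
	/* on success, srcvm now holds the new owner bitmask, ready to be
	 * passed back in when the region is reassigned to HLOS on unload */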
Acked-by: Andy Gross Signed-off-by: Avaneesh Kumar Dwivedi [bjorn: Minor style and kerneldoc updates] Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom_scm-32.c | 7 ++++ drivers/firmware/qcom_scm-64.c | 27 ++++++++++++ drivers/firmware/qcom_scm.c | 95 ++++++++++++++++++++++++++++++++++++++++++ drivers/firmware/qcom_scm.h | 5 +++ include/linux/qcom_scm.h | 16 +++++++ 5 files changed, 150 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/qcom_scm-32.c b/drivers/firmware/qcom_scm-32.c index 93e3b96b6dfa..60d96bcf6694 100644 --- a/drivers/firmware/qcom_scm-32.c +++ b/drivers/firmware/qcom_scm-32.c @@ -579,6 +579,13 @@ int __qcom_scm_set_remote_state(struct device *dev, u32 state, u32 id) return ret ? : le32_to_cpu(scm_ret); } +int __qcom_scm_assign_mem(struct device *dev, phys_addr_t mem_region, + size_t mem_sz, phys_addr_t src, size_t src_sz, + phys_addr_t dest, size_t dest_sz) +{ + return -ENODEV; +} + int __qcom_scm_restore_sec_cfg(struct device *dev, u32 device_id, u32 spare) { diff --git a/drivers/firmware/qcom_scm-64.c b/drivers/firmware/qcom_scm-64.c index 6e6d561708e2..ea5b1e680f67 100644 --- a/drivers/firmware/qcom_scm-64.c +++ b/drivers/firmware/qcom_scm-64.c @@ -382,6 +382,33 @@ int __qcom_scm_set_remote_state(struct device *dev, u32 state, u32 id) return ret ? : res.a1; } +int __qcom_scm_assign_mem(struct device *dev, phys_addr_t mem_region, + size_t mem_sz, phys_addr_t src, size_t src_sz, + phys_addr_t dest, size_t dest_sz) +{ + int ret; + struct qcom_scm_desc desc = {0}; + struct arm_smccc_res res; + + desc.args[0] = mem_region; + desc.args[1] = mem_sz; + desc.args[2] = src; + desc.args[3] = src_sz; + desc.args[4] = dest; + desc.args[5] = dest_sz; + desc.args[6] = 0; + + desc.arginfo = QCOM_SCM_ARGS(7, QCOM_SCM_RO, QCOM_SCM_VAL, + QCOM_SCM_RO, QCOM_SCM_VAL, QCOM_SCM_RO, + QCOM_SCM_VAL, QCOM_SCM_VAL); + + ret = qcom_scm_call(dev, QCOM_SCM_SVC_MP, + QCOM_MEM_PROT_ASSIGN_ID, + &desc, &res); + + return ret ? : res.a1; +} + int __qcom_scm_restore_sec_cfg(struct device *dev, u32 device_id, u32 spare) { struct qcom_scm_desc desc = {0}; diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c index bb16510d75ba..334eaa341828 100644 --- a/drivers/firmware/qcom_scm.c +++ b/drivers/firmware/qcom_scm.c @@ -40,6 +40,19 @@ struct qcom_scm { struct reset_controller_dev reset; }; +struct qcom_scm_current_perm_info { + __le32 vmid; + __le32 perm; + __le64 ctx; + __le32 ctx_size; + __le32 unused; +}; + +struct qcom_scm_mem_map_info { + __le64 mem_addr; + __le64 mem_size; +}; + static struct qcom_scm *__scm; static int qcom_scm_clk_enable(void) @@ -348,6 +361,88 @@ int qcom_scm_set_remote_state(u32 state, u32 id) } EXPORT_SYMBOL(qcom_scm_set_remote_state); +/** + * qcom_scm_assign_mem() - Make a secure call to reassign memory ownership + * @mem_addr: mem region whose ownership need to be reassigned + * @mem_sz: size of the region. + * @srcvm: vmid for current set of owners, each set bit in + * flag indicate a unique owner + * @newvm: array having new owners and corrsponding permission + * flags + * @dest_cnt: number of owners in next set. + * + * Return negative errno on failure, 0 on success, with @srcvm updated. 
+ */ +int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, + unsigned int *srcvm, + struct qcom_scm_vmperm *newvm, int dest_cnt) +{ + struct qcom_scm_current_perm_info *destvm; + struct qcom_scm_mem_map_info *mem_to_map; + phys_addr_t mem_to_map_phys; + phys_addr_t dest_phys; + phys_addr_t ptr_phys; + size_t mem_to_map_sz; + size_t dest_sz; + size_t src_sz; + size_t ptr_sz; + int next_vm; + __le32 *src; + void *ptr; + int ret; + int len; + int i; + + src_sz = hweight_long(*srcvm) * sizeof(*src); + mem_to_map_sz = sizeof(*mem_to_map); + dest_sz = dest_cnt * sizeof(*destvm); + ptr_sz = ALIGN(src_sz, SZ_64) + ALIGN(mem_to_map_sz, SZ_64) + + ALIGN(dest_sz, SZ_64); + + ptr = dma_alloc_coherent(__scm->dev, ptr_sz, &ptr_phys, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + /* Fill source vmid detail */ + src = ptr; + len = hweight_long(*srcvm); + for (i = 0; i < len; i++) { + src[i] = cpu_to_le32(ffs(*srcvm) - 1); + *srcvm ^= 1 << (ffs(*srcvm) - 1); + } + + /* Fill details of mem buff to map */ + mem_to_map = ptr + ALIGN(src_sz, SZ_64); + mem_to_map_phys = ptr_phys + ALIGN(src_sz, SZ_64); + mem_to_map[0].mem_addr = cpu_to_le64(mem_addr); + mem_to_map[0].mem_size = cpu_to_le64(mem_sz); + + next_vm = 0; + /* Fill details of next vmid detail */ + destvm = ptr + ALIGN(mem_to_map_sz, SZ_64) + ALIGN(src_sz, SZ_64); + dest_phys = ptr_phys + ALIGN(mem_to_map_sz, SZ_64) + ALIGN(src_sz, SZ_64); + for (i = 0; i < dest_cnt; i++) { + destvm[i].vmid = cpu_to_le32(newvm[i].vmid); + destvm[i].perm = cpu_to_le32(newvm[i].perm); + destvm[i].ctx = 0; + destvm[i].ctx_size = 0; + next_vm |= BIT(newvm[i].vmid); + } + + ret = __qcom_scm_assign_mem(__scm->dev, mem_to_map_phys, mem_to_map_sz, + ptr_phys, src_sz, dest_phys, dest_sz); + dma_free_coherent(__scm->dev, ALIGN(ptr_sz, SZ_64), ptr, ptr_phys); + if (ret) { + dev_err(__scm->dev, + "Assign memory protection call failed %d.\n", ret); + return -EINVAL; + } + + *srcvm = next_vm; + return 0; +} +EXPORT_SYMBOL(qcom_scm_assign_mem); + static int qcom_scm_probe(struct platform_device *pdev) { struct qcom_scm *scm; diff --git a/drivers/firmware/qcom_scm.h b/drivers/firmware/qcom_scm.h index 9bea691f30fb..fe54b7ba4db4 100644 --- a/drivers/firmware/qcom_scm.h +++ b/drivers/firmware/qcom_scm.h @@ -95,5 +95,10 @@ extern int __qcom_scm_iommu_secure_ptbl_size(struct device *dev, u32 spare, size_t *size); extern int __qcom_scm_iommu_secure_ptbl_init(struct device *dev, u64 addr, u32 size, u32 spare); +#define QCOM_MEM_PROT_ASSIGN_ID 0x16 +extern int __qcom_scm_assign_mem(struct device *dev, + phys_addr_t mem_region, size_t mem_sz, + phys_addr_t src, size_t src_sz, + phys_addr_t dest, size_t dest_sz); #endif diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index e5380471c2cd..6f8da9e182f9 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -23,6 +23,19 @@ struct qcom_scm_hdcp_req { u32 val; }; +struct qcom_scm_vmperm { + int vmid; + int perm; +}; + +#define QCOM_SCM_VMID_HLOS 0x3 +#define QCOM_SCM_VMID_MSS_MSA 0xF +#define QCOM_SCM_PERM_READ 0x4 +#define QCOM_SCM_PERM_WRITE 0x2 +#define QCOM_SCM_PERM_EXEC 0x1 +#define QCOM_SCM_PERM_RW (QCOM_SCM_PERM_READ | QCOM_SCM_PERM_WRITE) +#define QCOM_SCM_PERM_RWX (QCOM_SCM_PERM_RW | QCOM_SCM_PERM_EXEC) + #if IS_ENABLED(CONFIG_QCOM_SCM) extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus); extern int qcom_scm_set_warm_boot_addr(void *entry, const cpumask_t *cpus); @@ -37,6 +50,9 @@ extern int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr, phys_addr_t size); extern int 
qcom_scm_pas_auth_and_reset(u32 peripheral);
 extern int qcom_scm_pas_shutdown(u32 peripheral);
+extern int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
+			       unsigned int *src, struct qcom_scm_vmperm *newvm,
+			       int dest_cnt);
 extern void qcom_scm_cpu_power_down(u32 flags);
 extern u32 qcom_scm_get_version(void);
 extern int qcom_scm_set_remote_state(u32 state, u32 id);
--
cgit v1.2.3


From 707ce9eac5fc3b68f98c887dddea3911a8fc4f9f Mon Sep 17 00:00:00 2001
From: James Ban
Date: Mon, 30 Oct 2017 11:32:38 +0900
Subject: regulator: da9211: update for supporting da9223/4/5

This is an update to support the additional devices DA9223/4/5.
Only the device strings are added, because only the package type
differs.

Signed-off-by: James Ban
Signed-off-by: Mark Brown
---
 .../devicetree/bindings/regulator/da9211.txt | 82 ++++++++++++++++++++--
 drivers/regulator/Kconfig                    |  2 +-
 drivers/regulator/da9211-regulator.c         | 14 ++--
 drivers/regulator/da9211-regulator.h         |  2 +-
 include/linux/regulator/da9211.h             |  5 +-
 5 files changed, 91 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/regulator/da9211.txt b/Documentation/devicetree/bindings/regulator/da9211.txt
index 0f2a6f8fcafd..27717e816e71 100644
--- a/Documentation/devicetree/bindings/regulator/da9211.txt
+++ b/Documentation/devicetree/bindings/regulator/da9211.txt
@@ -1,8 +1,9 @@
-* Dialog Semiconductor DA9211/DA9212/DA9213/DA9214/DA9215 Voltage Regulator
+* Dialog Semiconductor DA9211/DA9212/DA9213/DA9223/DA9214/DA9224/DA9215/DA9225
+  Voltage Regulator
 
 Required properties:
-- compatible: "dlg,da9211" or "dlg,da9212" or "dlg,da9213"
-  or "dlg,da9214" or "dlg,da9215"
+- compatible: "dlg,da9211" or "dlg,da9212" or "dlg,da9213" or "dlg,da9223"
+  or "dlg,da9214" or "dlg,da9224" or "dlg,da9215" or "dlg,da9225"
 - reg: I2C slave address, usually 0x68.
- interrupts: the interrupt outputs of the controller - regulators: A node that houses a sub-node for each regulator within the @@ -16,7 +17,6 @@ Optional properties: - Any optional property defined in regulator.txt Example 1) DA9211 - pmic: da9211@68 { compatible = "dlg,da9211"; reg = <0x68>; @@ -35,7 +35,6 @@ Example 1) DA9211 }; Example 2) DA9212 - pmic: da9212@68 { compatible = "dlg,da9212"; reg = <0x68>; @@ -79,7 +78,25 @@ Example 3) DA9213 }; }; -Example 4) DA9214 +Example 4) DA9223 + pmic: da9223@68 { + compatible = "dlg,da9223"; + reg = <0x68>; + interrupts = <3 27>; + + regulators { + BUCKA { + regulator-name = "VBUCKA"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <3000000>; + regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 27 0>; + }; + }; + }; + +Example 5) DA9214 pmic: da9214@68 { compatible = "dlg,da9214"; reg = <0x68>; @@ -105,7 +122,33 @@ Example 4) DA9214 }; }; -Example 5) DA9215 +Example 6) DA9224 + pmic: da9224@68 { + compatible = "dlg,da9224"; + reg = <0x68>; + interrupts = <3 27>; + + regulators { + BUCKA { + regulator-name = "VBUCKA"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <3000000>; + regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 27 0>; + }; + BUCKB { + regulator-name = "VBUCKB"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <3000000>; + regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 17 0>; + }; + }; + }; + +Example 7) DA9215 pmic: da9215@68 { compatible = "dlg,da9215"; reg = <0x68>; @@ -131,3 +174,28 @@ Example 5) DA9215 }; }; +Example 8) DA9225 + pmic: da9225@68 { + compatible = "dlg,da9225"; + reg = <0x68>; + interrupts = <3 27>; + + regulators { + BUCKA { + regulator-name = "VBUCKA"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <4000000>; + regulator-max-microamp = <7000000>; + enable-gpios = <&gpio 27 0>; + }; + BUCKB { + regulator-name = "VBUCKB"; + regulator-min-microvolt = < 300000>; + regulator-max-microvolt = <1570000>; + regulator-min-microamp = <4000000>; + regulator-max-microamp = <7000000>; + enable-gpios = <&gpio 17 0>; + }; + }; + }; diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 0fd6195601ba..96cd55f9e3c5 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -244,7 +244,7 @@ config REGULATOR_DA9210 interface. config REGULATOR_DA9211 - tristate "Dialog Semiconductor DA9211/DA9212/DA9213/DA9214/DA9215 regulator" + tristate "Dialog Semiconductor DA9211/DA9212/DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 regulator" depends on I2C select REGMAP_I2C help diff --git a/drivers/regulator/da9211-regulator.c b/drivers/regulator/da9211-regulator.c index aa47280efd32..9b8f47617724 100644 --- a/drivers/regulator/da9211-regulator.c +++ b/drivers/regulator/da9211-regulator.c @@ -1,6 +1,6 @@ /* * da9211-regulator.c - Regulator device driver for DA9211/DA9212 - * /DA9213/DA9214/DA9215 + * /DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 * Copyright (C) 2015 Dialog Semiconductor Ltd. 
* * This library is free software; you can redistribute it and/or @@ -496,8 +496,11 @@ static const struct i2c_device_id da9211_i2c_id[] = { {"da9211", DA9211}, {"da9212", DA9212}, {"da9213", DA9213}, + {"da9223", DA9223}, {"da9214", DA9214}, + {"da9224", DA9224}, {"da9215", DA9215}, + {"da9225", DA9225}, {}, }; MODULE_DEVICE_TABLE(i2c, da9211_i2c_id); @@ -507,8 +510,11 @@ static const struct of_device_id da9211_dt_ids[] = { { .compatible = "dlg,da9211", .data = &da9211_i2c_id[0] }, { .compatible = "dlg,da9212", .data = &da9211_i2c_id[1] }, { .compatible = "dlg,da9213", .data = &da9211_i2c_id[2] }, - { .compatible = "dlg,da9214", .data = &da9211_i2c_id[3] }, - { .compatible = "dlg,da9215", .data = &da9211_i2c_id[4] }, + { .compatible = "dlg,da9223", .data = &da9211_i2c_id[3] }, + { .compatible = "dlg,da9214", .data = &da9211_i2c_id[4] }, + { .compatible = "dlg,da9224", .data = &da9211_i2c_id[5] }, + { .compatible = "dlg,da9215", .data = &da9211_i2c_id[6] }, + { .compatible = "dlg,da9225", .data = &da9211_i2c_id[7] }, {}, }; MODULE_DEVICE_TABLE(of, da9211_dt_ids); @@ -526,5 +532,5 @@ static struct i2c_driver da9211_regulator_driver = { module_i2c_driver(da9211_regulator_driver); MODULE_AUTHOR("James Ban "); -MODULE_DESCRIPTION("DA9211/DA9212/DA9213/DA9214/DA9215 regulator driver"); +MODULE_DESCRIPTION("DA9211/DA9212/DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 regulator driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/regulator/da9211-regulator.h b/drivers/regulator/da9211-regulator.h index b841bbf330cc..2cb32aab4f82 100644 --- a/drivers/regulator/da9211-regulator.h +++ b/drivers/regulator/da9211-regulator.h @@ -1,6 +1,6 @@ /* * da9211-regulator.h - Regulator definitions for DA9211/DA9212 - * /DA9213/DA9214/DA9215 + * /DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 * Copyright (C) 2015 Dialog Semiconductor Ltd. * * This program is free software; you can redistribute it and/or diff --git a/include/linux/regulator/da9211.h b/include/linux/regulator/da9211.h index 80cb40b7c88d..f2fd2d3bf58f 100644 --- a/include/linux/regulator/da9211.h +++ b/include/linux/regulator/da9211.h @@ -1,6 +1,6 @@ /* * da9211.h - Regulator device driver for DA9211/DA9212 - * /DA9213/DA9214/DA9215 + * /DA9213/DA9223/DA9214/DA9224/DA9215/DA9225 * Copyright (C) 2015 Dialog Semiconductor Ltd. * * This program is free software; you can redistribute it and/or @@ -25,8 +25,11 @@ enum da9211_chip_id { DA9211, DA9212, DA9213, + DA9223, DA9214, + DA9224, DA9215, + DA9225, }; struct da9211_pdata { -- cgit v1.2.3 From 6981fbf3780366093858c5d2dcdaadcd1fbb04be Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 Oct 2017 11:33:40 -0700 Subject: Drivers: hv: vmbus: Expose per-channel interrupts and events counters When investigating performance, it is useful to be able to look at the number of host and guest events per-channel. This is equivalent to per-device interrupt statistics. Signed-off-by: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/stable/sysfs-bus-vmbus | 26 ++++++++++++++++++++------ drivers/hv/connection.c | 2 ++ drivers/hv/vmbus_drv.c | 16 ++++++++++++++++ include/linux/hyperv.h | 4 ++++ 4 files changed, 42 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/stable/sysfs-bus-vmbus b/Documentation/ABI/stable/sysfs-bus-vmbus index 0ebd8a1537a0..d4077cc60d55 100644 --- a/Documentation/ABI/stable/sysfs-bus-vmbus +++ b/Documentation/ABI/stable/sysfs-bus-vmbus @@ -61,39 +61,53 @@ Date: September. 
2017 KernelVersion: 4.14 Contact: Stephen Hemminger Description: Inbound channel signaling state -Users: Debuggig tools +Users: Debugging tools What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/latency Date: September. 2017 KernelVersion: 4.14 Contact: Stephen Hemminger Description: Channel signaling latency -Users: Debuggig tools +Users: Debugging tools What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/out_mask Date: September. 2017 KernelVersion: 4.14 Contact: Stephen Hemminger Description: Outbound channel signaling state -Users: Debuggig tools +Users: Debugging tools What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/pending Date: September. 2017 KernelVersion: 4.14 Contact: Stephen Hemminger Description: Channel interrupt pending state -Users: Debuggig tools +Users: Debugging tools What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/read_avail Date: September. 2017 KernelVersion: 4.14 Contact: Stephen Hemminger Description: Bytes availabble to read -Users: Debuggig tools +Users: Debugging tools What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/write_avail Date: September. 2017 KernelVersion: 4.14 Contact: Stephen Hemminger Description: Bytes availabble to write -Users: Debuggig tools +Users: Debugging tools + +What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/events +Date: September. 2017 +KernelVersion: 4.14 +Contact: Stephen Hemminger +Description: Number of times we have signaled the host +Users: Debugging tools + +What: /sys/bus/vmbus/devices/vmbus_*/channels/relid/interrupts +Date: September. 2017 +KernelVersion: 4.14 +Contact: Stephen Hemminger +Description: Number of times we have taken an interrupt (incoming) +Users: Debugging tools diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index f41901f80b64..b06a6b796819 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -409,6 +409,8 @@ void vmbus_set_event(struct vmbus_channel *channel) if (!channel->is_dedicated_interrupt) vmbus_send_interrupt(child_relid); + ++channel->sig_events; + hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event); } EXPORT_SYMBOL_GPL(vmbus_set_event); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 1da8e818f4de..bca8188f3c8c 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -944,6 +944,8 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (channel->rescind) continue; + ++channel->interrupts; + switch (channel->callback_mode) { case HV_CALL_ISR: vmbus_channel_isr(channel); @@ -1237,6 +1239,18 @@ static ssize_t channel_latency_show(const struct vmbus_channel *channel, } VMBUS_CHAN_ATTR(latency, S_IRUGO, channel_latency_show, NULL); +static ssize_t channel_interrupts_show(const struct vmbus_channel *channel, char *buf) +{ + return sprintf(buf, "%llu\n", channel->interrupts); +} +VMBUS_CHAN_ATTR(interrupts, S_IRUGO, channel_interrupts_show, NULL); + +static ssize_t channel_events_show(const struct vmbus_channel *channel, char *buf) +{ + return sprintf(buf, "%llu\n", channel->sig_events); +} +VMBUS_CHAN_ATTR(events, S_IRUGO, channel_events_show, NULL); + static struct attribute *vmbus_chan_attrs[] = { &chan_attr_out_mask.attr, &chan_attr_in_mask.attr, @@ -1245,6 +1259,8 @@ static struct attribute *vmbus_chan_attrs[] = { &chan_attr_cpu.attr, &chan_attr_pending.attr, &chan_attr_latency.attr, + &chan_attr_interrupts.attr, + &chan_attr_events.attr, NULL }; diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index ea6b5586ad77..f3e97c5f94c9 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h 
@@ -719,6 +719,10 @@ struct vmbus_channel { struct vmbus_close_msg close_msg; + /* Statistics */ + u64 interrupts; /* Host to Guest interrupts */ + u64 sig_events; /* Guest to Host events */ + /* Channel callback's invoked in softirq context */ struct tasklet_struct callback_event; void (*onchannel_callback)(void *context); -- cgit v1.2.3 From 3974320ca6aa68d479051f208d5c95afd1e47a4c Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 16 Feb 2017 10:27:16 -0500 Subject: GFS2: Implement iomap for block_map This patch implements iomap for block mapping, and switches the block_map function to use it under the covers. The additional IOMAP_F_BOUNDARY iomap flag indicates when iomap has reached a "metadata boundary" and fetching the next mapping is likely to incur an additional I/O. This flag is used for setting the bh buffer boundary flag. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 273 +++++++++++++++++++++++++++++++++++++------------- fs/gfs2/bmap.h | 4 + fs/gfs2/trace_gfs2.h | 65 ++++++++++++ include/linux/iomap.h | 3 +- 4 files changed, 276 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 03badc8417d7..d5f0d96169c5 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -505,10 +506,8 @@ static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) * Returns: errno on error */ -static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, - bool zero_new, struct metapath *mp, - const size_t maxlen, sector_t *dblock, - unsigned *dblks) +static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, + unsigned flags, struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -516,36 +515,37 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, struct buffer_head *dibh = mp->mp_bh[0]; u64 bn; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; + unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = mp->mp_fheight - 1; int ret; - int eob = 0; enum alloc_state state; __be64 *ptr; __be64 zero_bn = 0; + size_t maxlen = iomap->length >> inode->i_blkbits; BUG_ON(mp->mp_aheight < 1); BUG_ON(dibh == NULL); - *dblock = 0; - *dblks = 0; gfs2_trans_add_meta(ip->i_gl, dibh); if (mp->mp_fheight == mp->mp_aheight) { struct buffer_head *bh; + int eob; + /* Bottom indirect block exists, find unalloced extent size */ ptr = metapointer(end_of_metadata, mp); bh = mp->mp_bh[end_of_metadata]; - *dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, - maxlen, &eob); - BUG_ON(*dblks < 1); + dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, + maxlen, &eob); + BUG_ON(dblks < 1); state = ALLOC_DATA; } else { /* Need to allocate indirect blocks */ ptrs_per_blk = mp->mp_fheight > 1 ? 
sdp->sd_inptrs : sdp->sd_diptrs; - *dblks = min(maxlen, (size_t)(ptrs_per_blk - - mp->mp_list[end_of_metadata])); + dblks = min(maxlen, (size_t)(ptrs_per_blk - + mp->mp_list[end_of_metadata])); if (mp->mp_fheight == ip->i_height) { /* Writing into existing tree, extend tree down */ iblks = mp->mp_fheight - mp->mp_aheight; @@ -561,7 +561,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, /* start of the second part of the function (state machine) */ - blks = *dblks + iblks; + blks = dblks + iblks; i = mp->mp_aheight; do { int error; @@ -618,26 +618,29 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, break; /* Tree complete, adding data blocks */ case ALLOC_DATA: - BUG_ON(n > *dblks); + BUG_ON(n > dblks); BUG_ON(mp->mp_bh[end_of_metadata] == NULL); gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); - *dblks = n; + dblks = n; ptr = metapointer(end_of_metadata, mp); - *dblock = bn; + iomap->addr = bn << inode->i_blkbits; + iomap->flags |= IOMAP_F_NEW; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); - if (zero_new) { - ret = sb_issue_zeroout(sb, *dblock, *dblks, - GFP_NOFS); + if (flags & IOMAP_ZERO) { + ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits, + dblks, GFP_NOFS); if (ret) { fs_err(sdp, "Failed to zero data buffers\n"); + flags &= ~IOMAP_ZERO; } } break; } - } while ((state != ALLOC_DATA) || !(*dblock)); + } while (iomap->addr == IOMAP_NULL_ADDR); + iomap->length = (u64)dblks << inode->i_blkbits; ip->i_height = mp->mp_fheight; gfs2_add_inode_blocks(&ip->i_inode, alloced); gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); @@ -645,47 +648,123 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, } /** - * gfs2_block_map - Map a block from an inode to a disk block + * hole_size - figure out the size of a hole * @inode: The inode - * @lblock: The logical block number - * @bh_map: The bh to be mapped - * @create: True if its ok to alloc blocks to satify the request + * @lblock: The logical starting block number + * @mp: The metapath * - * Sets buffer_mapped() if successful, sets buffer_boundary() if a - * read of metadata will be required before the next block can be - * mapped. Sets buffer_new() if new blocks were allocated. 
+ * Returns: The hole size in bytes * - * Returns: errno */ +static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct metapath mp_eof; + u64 factor = 1; + int hgt; + u64 holesz = 0; + const __be64 *first, *end, *ptr; + const struct buffer_head *bh; + u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits; + int zeroptrs; + bool done = false; + + /* Get another metapath, to the very last byte */ + find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height); + for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) { + bh = mp->mp_bh[hgt]; + if (bh) { + zeroptrs = 0; + first = metapointer(hgt, mp); + end = (const __be64 *)(bh->b_data + bh->b_size); + + for (ptr = first; ptr < end; ptr++) { + if (*ptr) { + done = true; + break; + } else { + zeroptrs++; + } + } + } else { + zeroptrs = sdp->sd_inptrs; + } + if (factor * zeroptrs >= lblock_stop - lblock + 1) { + holesz = lblock_stop - lblock + 1; + break; + } + holesz += factor * zeroptrs; -int gfs2_block_map(struct inode *inode, sector_t lblock, - struct buffer_head *bh_map, int create) + factor *= sdp->sd_inptrs; + if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1])) + (mp->mp_list[hgt - 1])++; + } + return holesz << inode->i_blkbits; +} + +static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + iomap->addr = (ip->i_no_addr << inode->i_blkbits) + + sizeof(struct gfs2_dinode); + iomap->offset = 0; + iomap->length = i_size_read(inode); + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_DATA_INLINE; +} + +/** + * gfs2_iomap_begin - Map blocks from an inode to disk blocks + * @inode: The inode + * @pos: Starting position in bytes + * @length: Length to map, in bytes + * @flags: iomap flags + * @iomap: The iomap structure + * + * Returns: errno + */ +int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct metapath mp = { .mp_aheight = 1, }; unsigned int factor = sdp->sd_sb.sb_bsize; - const size_t maxlen = bh_map->b_size >> inode->i_blkbits; const u64 *arr = sdp->sd_heightsize; __be64 *ptr; - u64 size; - struct metapath mp; + sector_t lblock; + sector_t lend; int ret; int eob; unsigned int len; struct buffer_head *bh; u8 height; - bool zero_new = false; - sector_t dblock = 0; - unsigned dblks; - BUG_ON(maxlen == 0); + trace_gfs2_iomap_start(ip, pos, length, flags); + if (!length) { + ret = -EINVAL; + goto out; + } - memset(&mp, 0, sizeof(mp)); - bmap_lock(ip, create); - clear_buffer_mapped(bh_map); - clear_buffer_new(bh_map); - clear_buffer_boundary(bh_map); - trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) { + gfs2_stuffed_iomap(inode, iomap); + if (pos >= iomap->length) + return -ENOENT; + ret = 0; + goto out; + } + + lblock = pos >> inode->i_blkbits; + lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits; + + iomap->offset = lblock << inode->i_blkbits; + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + iomap->length = (u64)(lend - lblock) << inode->i_blkbits; + iomap->flags = IOMAP_F_MERGED; + bmap_lock(ip, 0); /* * Directory data blocks have a struct gfs2_meta_header header, so the @@ -699,56 +778,114 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); if (ret) - goto out; + goto 
out_release; height = ip->i_height; - size = (lblock + 1) * factor; - while (size > arr[height]) + while ((lblock + 1) * factor > arr[height]) height++; find_metapath(sdp, lblock, &mp, height); - mp.mp_aheight = 1; if (height > ip->i_height || gfs2_is_stuffed(ip)) goto do_alloc; + ret = lookup_metapath(ip, &mp); if (ret < 0) - goto out; + goto out_release; + if (mp.mp_aheight != ip->i_height) goto do_alloc; + ptr = metapointer(ip->i_height - 1, &mp); if (*ptr == 0) goto do_alloc; - map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); + + iomap->type = IOMAP_MAPPED; + iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; + bh = mp.mp_bh[ip->i_height - 1]; - len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); - bh_map->b_size = (len << inode->i_blkbits); + len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob); if (eob) - set_buffer_boundary(bh_map); + iomap->flags |= IOMAP_F_BOUNDARY; + iomap->length = (u64)len << inode->i_blkbits; + ret = 0; -out: + +out_release: release_metapath(&mp); - trace_gfs2_bmap(ip, bh_map, lblock, create, ret); - bmap_unlock(ip, create); + bmap_unlock(ip, 0); +out: + trace_gfs2_iomap_end(ip, iomap, ret); return ret; do_alloc: - /* All allocations are done here, firstly check create flag */ - if (!create) { - BUG_ON(gfs2_is_stuffed(ip)); + if (!(flags & IOMAP_WRITE)) { + if (pos >= i_size_read(inode)) { + ret = -ENOENT; + goto out_release; + } ret = 0; - goto out; + iomap->length = hole_size(inode, lblock, &mp); + goto out_release; } - /* At this point ret is the tree depth of already allocated blocks */ + ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + goto out_release; +} + +/** + * gfs2_block_map - Map a block from an inode to a disk block + * @inode: The inode + * @lblock: The logical block number + * @bh_map: The bh to be mapped + * @create: True if its ok to alloc blocks to satify the request + * + * Sets buffer_mapped() if successful, sets buffer_boundary() if a + * read of metadata will be required before the next block can be + * mapped. Sets buffer_new() if new blocks were allocated. + * + * Returns: errno + */ + +int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh_map, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct iomap iomap; + int ret, flags = 0; + + clear_buffer_mapped(bh_map); + clear_buffer_new(bh_map); + clear_buffer_boundary(bh_map); + trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + + if (create) + flags |= IOMAP_WRITE; if (buffer_zeronew(bh_map)) - zero_new = true; - ret = gfs2_bmap_alloc(inode, lblock, zero_new, &mp, maxlen, &dblock, - &dblks); - if (ret == 0) { - map_bh(bh_map, inode->i_sb, dblock); - bh_map->b_size = dblks << inode->i_blkbits; - set_buffer_new(bh_map); + flags |= IOMAP_ZERO; + ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits, + bh_map->b_size, flags, &iomap); + if (ret) { + if (!create && ret == -ENOENT) { + /* Return unmapped buffer beyond the end of file. 
*/ + ret = 0; + } + goto out; + } + + if (iomap.length > bh_map->b_size) { + iomap.length = bh_map->b_size; + iomap.flags &= ~IOMAP_F_BOUNDARY; } - goto out; + if (iomap.addr != IOMAP_NULL_ADDR) + map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); + bh_map->b_size = iomap.length; + if (iomap.flags & IOMAP_F_BOUNDARY) + set_buffer_boundary(bh_map); + if (iomap.flags & IOMAP_F_NEW) + set_buffer_new(bh_map); + +out: + trace_gfs2_bmap(ip, bh_map, lblock, create, ret); + return ret; } /* diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 81ded5e2aaa2..443cc182cf18 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -10,6 +10,8 @@ #ifndef __BMAP_DOT_H__ #define __BMAP_DOT_H__ +#include + #include "inode.h" struct inode; @@ -47,6 +49,8 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); +extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap); extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); extern int gfs2_setattr_size(struct inode *inode, u64 size); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 49ac55da4e33..3c91ae3cf0b2 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "incore.h" #include "glock.h" #include "rgrp.h" @@ -469,6 +470,70 @@ TRACE_EVENT(gfs2_bmap, __entry->errno) ); +TRACE_EVENT(gfs2_iomap_start, + + TP_PROTO(const struct gfs2_inode *ip, loff_t pos, ssize_t length, + u16 flags), + + TP_ARGS(ip, pos, length, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, inum ) + __field( loff_t, pos ) + __field( ssize_t, length ) + __field( u16, flags ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->inum = ip->i_no_addr; + __entry->pos = pos; + __entry->length = length; + __entry->flags = flags; + ), + + TP_printk("%u,%u bmap %llu iomap start %llu/%lu flags:%08x", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->pos, + (unsigned long)__entry->length, (u16)__entry->flags) +); + +TRACE_EVENT(gfs2_iomap_end, + + TP_PROTO(const struct gfs2_inode *ip, struct iomap *iomap, int ret), + + TP_ARGS(ip, iomap, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, inum ) + __field( loff_t, offset ) + __field( ssize_t, length ) + __field( u16, flags ) + __field( u16, type ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->inum = ip->i_no_addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->flags = iomap->flags; + __entry->type = iomap->type; + __entry->ret = ret; + ), + + TP_printk("%u,%u bmap %llu iomap end %llu/%lu ty:%d flags:%08x rc:%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->offset, + (unsigned long)__entry->length, (u16)__entry->type, + (u16)__entry->flags, __entry->ret) +); + /* Keep track of blocks as they are allocated/freed */ TRACE_EVENT(gfs2_block_alloc, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2b0790dbd6ea..a61be86710b5 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -21,7 +21,8 @@ struct vm_fault; /* * Flags for all iomap mappings: */ 
-#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ +#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ +#define IOMAP_F_BOUNDARY 0x02 /* mapping ends at metadata boundary */ /* * Flags that only need to be reported for IOMAP_REPORT requests: -- cgit v1.2.3 From b2f270e8747387335d80428c576118e7d87f69cc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 19:04:41 -0700 Subject: module: Prepare to convert all module_param_call() prototypes After actually converting all module_param_call() function prototypes, we no longer need to do a tricky sizeof(func(thing)) type-check. Remove it. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- include/linux/moduleparam.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 1ee7b30dafec..037a90a522a5 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -228,18 +228,10 @@ struct kparam_array /* Obsolete - use module_param_cb() */ #define module_param_call(name, set, get, arg, perm) \ - static const struct kernel_param_ops __param_ops_##name = \ + static const struct kernel_param_ops __param_ops_##name = \ { .flags = 0, (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ - name, &__param_ops_##name, arg, \ - (perm) + sizeof(__check_old_set_param(set))*0, -1, 0) - -/* We don't get oldget: it's often a new-style param_get_uint, etc. */ -static inline int -__check_old_set_param(int (*oldset)(const char *, struct kernel_param *)) -{ - return 0; -} + name, &__param_ops_##name, arg, perm, -1, 0) #ifdef CONFIG_SYSFS extern void kernel_param_lock(struct module *mod); -- cgit v1.2.3 From ece1996a21eeb344b49200e627c6660111009c10 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 19:04:43 -0700 Subject: module: Do not paper over type mismatches in module_param_call() The module_param_call() macro was explicitly casting the .set and .get function prototypes away. This can lead to hard-to-find type mismatches. Now that all the function prototypes have been fixed tree-wide, we can drop these casts, and use named initializers too. 
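As an illustrative sketch of the end state (hypothetical parameter and
callbacks, not taken from these patches), a module parameter defined via
module_param_call() now needs correctly typed handlers, and a stale
prototype becomes a compile error instead of being hidden by a cast:

	static int threshold;

	static int threshold_set(const char *val, const struct kernel_param *kp)
	{
		/* a driver could validate the value here before storing it */
		return param_set_int(val, kp);
	}

	static int threshold_get(char *buffer, const struct kernel_param *kp)
	{
		return param_get_int(buffer, kp);
	}

	module_param_call(threshold, threshold_set, threshold_get,
			  &threshold, 0644);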
Signed-off-by: Kees Cook
Signed-off-by: Jessica Yu
---
 include/linux/moduleparam.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 037a90a522a5..20386252fe3e 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -227,9 +227,9 @@ struct kparam_array
 	VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } }
 
 /* Obsolete - use module_param_cb() */
-#define module_param_call(name, set, get, arg, perm)			\
+#define module_param_call(name, _set, _get, arg, perm)			\
 	static const struct kernel_param_ops __param_ops_##name =	\
-		{ .flags = 0, (void *)set, (void *)get };		\
+		{ .flags = 0, .set = _set, .get = _get };		\
 	__module_param_call(MODULE_PARAM_PREFIX,			\
 			    name, &__param_ops_##name, arg, perm, -1, 0)
 
-- 
cgit v1.2.3

From 7761daa6a1599fa5479b8da367470f632a1927e0 Mon Sep 17 00:00:00 2001
From: Elena Reshetova
Date: Fri, 20 Oct 2017 13:26:01 +0300
Subject: fsnotify: convert fsnotify_group.refcnt from atomic_t to refcount_t

atomic_t variables are currently used to implement reference counters
with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further increments aren't allowed
 - counter schema uses basic atomic operations (set, inc, inc_not_zero,
   dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows and
underflows. This is important since overflows and underflows can lead
to use-after-free situation and be exploitable.

The variable fsnotify_group.refcnt is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

Suggested-by: Kees Cook
Reviewed-by: David Windsor
Reviewed-by: Hans Liljestrand
Signed-off-by: Elena Reshetova
Signed-off-by: Jan Kara
---
 fs/notify/group.c | 6 +++---
 include/linux/fsnotify_backend.h | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/group.c b/fs/notify/group.c
index 32357534de18..b7a4b6a69efa 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -107,7 +107,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
  */
 void fsnotify_get_group(struct fsnotify_group *group)
 {
-	atomic_inc(&group->refcnt);
+	refcount_inc(&group->refcnt);
 }
 
 /*
@@ -115,7 +115,7 @@ void fsnotify_get_group(struct fsnotify_group *group)
  */
 void fsnotify_put_group(struct fsnotify_group *group)
 {
-	if (atomic_dec_and_test(&group->refcnt))
+	if (refcount_dec_and_test(&group->refcnt))
 		fsnotify_final_destroy_group(group);
 }
 
@@ -131,7 +131,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 		return ERR_PTR(-ENOMEM);
 
 	/* set to 0 when there a no external references to this group */
-	atomic_set(&group->refcnt, 1);
+	refcount_set(&group->refcnt, 1);
 	atomic_set(&group->num_marks, 0);
 	atomic_set(&group->user_waits, 0);
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c6c69318752b..20a57bac38f2 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <linux/refcount.h>
 
 /*
  * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -135,7 +136,7 @@ struct fsnotify_group {
 	 * inotify_init() and the refcnt will hit 0 only when that fd has been
 	 * closed.
*/ - atomic_t refcnt; /* things with interest in this group */ + refcount_t refcnt; /* things with interest in this group */ const struct fsnotify_ops *ops; /* how this group handles things */ -- cgit v1.2.3 From 6685df31255493c3f0e9e0b8bf885e4c9762fc5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 30 Oct 2017 21:14:56 +0100 Subject: fanotify: clean up CONFIG_FANOTIFY_ACCESS_PERMISSIONS ifdefs The only negative from this patch should be an addition of 32bytes to 'struct fsnotify_group' if CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not defined. Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 30 +++++++------------------- fs/notify/fanotify/fanotify.h | 8 +++++-- fs/notify/fanotify/fanotify_user.c | 43 ++++++++++++++------------------------ include/linux/fsnotify_backend.h | 2 -- 4 files changed, 29 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index df3f484e458a..63f56b007280 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -35,15 +35,13 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) pr_debug("%s: list=%p event=%p\n", __func__, list, event); -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * Don't merge a permission event with any other event so that we know * the event structure we have created in fanotify_handle_event() is the * one we should check for permission response. */ - if (event->mask & FAN_ALL_PERM_EVENTS) + if (fanotify_is_perm_event(event->mask)) return 0; -#endif list_for_each_entry_reverse(test_event, list, list) { if (should_merge(test_event, event)) { @@ -55,7 +53,6 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) return 0; } -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response(struct fsnotify_group *group, struct fanotify_perm_event_info *event, struct fsnotify_iter_info *iter_info) @@ -82,7 +79,6 @@ static int fanotify_get_response(struct fsnotify_group *group, return ret; } -#endif static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, @@ -141,8 +137,7 @@ struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, { struct fanotify_event_info *event; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (mask & FAN_ALL_PERM_EVENTS) { + if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, @@ -153,7 +148,6 @@ struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, pevent->response = 0; goto init; } -#endif event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); if (!event) return NULL; @@ -200,8 +194,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, mask); -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (mask & FAN_ALL_PERM_EVENTS) { + if (fanotify_is_perm_event(mask)) { /* * fsnotify_prepare_user_wait() fails if we race with mark * deletion. Just let the operation pass in that case. 
@@ -209,7 +202,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, if (!fsnotify_prepare_user_wait(iter_info)) return 0; } -#endif event = fanotify_alloc_event(inode, mask, data); ret = -ENOMEM; @@ -225,21 +217,15 @@ static int fanotify_handle_event(struct fsnotify_group *group, fsnotify_destroy_event(group, fsn_event); ret = 0; - goto finish; - } - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (mask & FAN_ALL_PERM_EVENTS) { + } else if (fanotify_is_perm_event(mask)) { ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), iter_info); fsnotify_destroy_event(group, fsn_event); } finish: - if (mask & FAN_ALL_PERM_EVENTS) + if (fanotify_is_perm_event(mask)) fsnotify_finish_user_wait(iter_info); -#else -finish: -#endif + return ret; } @@ -259,13 +245,11 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) event = FANOTIFY_E(fsn_event); path_put(&event->path); put_pid(event->tgid); -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (fsn_event->mask & FAN_ALL_PERM_EVENTS) { + if (fanotify_is_perm_event(fsn_event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PE(fsn_event)); return; } -#endif kmem_cache_free(fanotify_event_cachep, event); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 4eb6f5efa282..dc219cf07a6a 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -21,7 +21,6 @@ struct fanotify_event_info { struct pid *tgid; }; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the @@ -40,7 +39,12 @@ FANOTIFY_PE(struct fsnotify_event *fse) { return container_of(fse, struct fanotify_perm_event_info, fae.fse); } -#endif + +static inline bool fanotify_is_perm_event(u32 mask) +{ + return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) && + mask & FAN_ALL_PERM_EVENTS; +} static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 907a481ac781..a434de023c49 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -142,7 +142,6 @@ static int fill_event_metadata(struct fsnotify_group *group, return ret; } -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static struct fanotify_perm_event_info *dequeue_event( struct fsnotify_group *group, int fd) { @@ -199,7 +198,6 @@ static int process_access_response(struct fsnotify_group *group, return 0; } -#endif static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event *event, @@ -221,10 +219,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fanotify_event_metadata.event_len)) goto out_close_fd; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) + if (fanotify_is_perm_event(event->mask)) FANOTIFY_PE(event)->fd = fd; -#endif if (fd != FAN_NOFD) fd_install(fd, f); @@ -309,10 +305,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, * Permission events get queued to wait for response. Other * events can be destroyed now. 
*/ - if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { + if (!fanotify_is_perm_event(kevent->mask)) { fsnotify_destroy_event(group, kevent); } else { -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (ret <= 0) { FANOTIFY_PE(kevent)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); @@ -322,7 +317,6 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, &group->fanotify_data.access_list); spin_unlock(&group->notification_lock); } -#endif } if (ret < 0) break; @@ -338,11 +332,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_response response = { .fd = -1, .response = -1 }; struct fsnotify_group *group; int ret; + if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) + return -EINVAL; + group = file->private_data; if (count > sizeof(response)) @@ -358,16 +354,11 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count = ret; return count; -#else - return -EINVAL; -#endif } static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; struct fsnotify_event *fsn_event; @@ -403,14 +394,14 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); spin_lock(&group->notification_lock); - } else + } else { FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } } spin_unlock(&group->notification_lock); /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); -#endif /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_destroy_group(group); @@ -768,10 +759,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (force_o_largefile()) event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); -#endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: group->priority = FS_PRIO_0; @@ -825,6 +814,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, struct fsnotify_group *group; struct fd f; struct path path; + u32 valid_mask = FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", @@ -855,11 +845,10 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, mask &= ~FAN_ONDIR; } -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) -#else - if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD)) -#endif + if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) + valid_mask |= FAN_ALL_PERM_EVENTS; + + if (mask & ~valid_mask) return -EINVAL; f = fdget(fanotify_fd); @@ -949,10 +938,10 @@ static int __init fanotify_user_setup(void) { fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info, - SLAB_PANIC); -#endif + if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { + fanotify_perm_event_cachep = + KMEM_CACHE(fanotify_perm_event_info, 
SLAB_PANIC); + } return 0; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 20a57bac38f2..744e2b9969fc 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -183,11 +183,9 @@ struct fsnotify_group { #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; -#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ int f_flags; unsigned int max_marks; struct user_struct *user; -- cgit v1.2.3 From ab97f87325e28b7ef7717e6cb62e8da14a7176e1 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 13:26:02 +0300 Subject: fsnotify: convert fsnotify_mark.refcnt from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable fsnotify_mark.refcnt is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Jan Kara --- fs/notify/inotify/inotify_user.c | 4 ++-- fs/notify/mark.c | 14 +++++++------- include/linux/fsnotify_backend.h | 2 +- kernel/audit_tree.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 7cc7d3fb1862..d3c20e0bb046 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -376,7 +376,7 @@ static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group fsnotify_get_mark(fsn_mark); /* One ref for being in the idr, one ref we just took */ - BUG_ON(atomic_read(&fsn_mark->refcnt) < 2); + BUG_ON(refcount_read(&fsn_mark->refcnt) < 2); } return i_mark; @@ -446,7 +446,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, * One ref for being in the idr * one ref grabbed by inotify_idr_find */ - if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) { + if (unlikely(refcount_read(&i_mark->fsn_mark.refcnt) < 2)) { printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); /* we can't really recover with bad ref cnting.. 
*/ diff --git a/fs/notify/mark.c b/fs/notify/mark.c index f3a32ea15b49..e9191b416434 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -105,8 +105,8 @@ static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { - WARN_ON_ONCE(!atomic_read(&mark->refcnt)); - atomic_inc(&mark->refcnt); + WARN_ON_ONCE(!refcount_read(&mark->refcnt)); + refcount_inc(&mark->refcnt); } static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) @@ -201,7 +201,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) /* Catch marks that were actually never attached to object */ if (!mark->connector) { - if (atomic_dec_and_test(&mark->refcnt)) + if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } @@ -210,7 +210,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ - if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock)) return; conn = mark->connector; @@ -258,7 +258,7 @@ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) if (!mark) return true; - if (atomic_inc_not_zero(&mark->refcnt)) { + if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { /* mark is attached, group is still alive then */ @@ -335,7 +335,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && - atomic_read(&mark->refcnt) < 1 + + refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); @@ -737,7 +737,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); - atomic_set(&mark->refcnt, 1); + refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 744e2b9969fc..9bcb43953f4e 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -242,7 +242,7 @@ struct fsnotify_mark { __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ - atomic_t refcnt; + refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 011d46e5f73f..45ec960ad536 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1007,7 +1007,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify * We are guaranteed to have at least one reference to the mark from * either the inode or the caller of fsnotify_destroy_mark(). */ - BUG_ON(atomic_read(&entry->refcnt) < 1); + BUG_ON(refcount_read(&entry->refcnt) < 1); } static const struct fsnotify_ops audit_tree_ops = { -- cgit v1.2.3 From aa4bf44dc851c6bdd4f7b61b5f2c56c84dfe2ff0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2017 00:04:40 +0200 Subject: userns: use union in {g,u}idmap struct - Add a struct containing two pointer to extents and wrap both the static extent array and the struct into a union. This is done in preparation for bumping the {g,u}idmap limits for user namespaces. 
- Add brackets around anonymous union when using designated initializers to initialize members in order to please gcc <= 4.4. Signed-off-by: Christian Brauner Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 18 +++++++++++++----- kernel/user.c | 30 ++++++++++++++++++------------ 2 files changed, 31 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index c18e01252346..7c83d7f6289b 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -12,13 +12,21 @@ #define UID_GID_MAP_MAX_EXTENTS 5 +struct uid_gid_extent { + u32 first; + u32 lower_first; + u32 count; +}; + struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; - struct uid_gid_extent { - u32 first; - u32 lower_first; - u32 count; - } extent[UID_GID_MAP_MAX_EXTENTS]; + union { + struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS]; + struct { + struct uid_gid_extent *forward; + struct uid_gid_extent *reverse; + }; + }; }; #define USERNS_SETGROUPS_ALLOWED 1UL diff --git a/kernel/user.c b/kernel/user.c index 00281add65b2..9a20acce460d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -26,26 +26,32 @@ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .gid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .projid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .count = ATOMIC_INIT(3), -- cgit v1.2.3 From 6397fac4915ab3002dc15aae751455da1a852f25 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2017 00:04:41 +0200 Subject: userns: bump idmap limits to 340 There are quite some use cases where users run into the current limit for {g,u}id mappings. Consider a user requesting us to map everything but 999, and 1001 for a given range of 1000000000 with a sub{g,u}id layout of: some-user:100000:1000000000 some-user:999:1 some-user:1000:1 some-user:1001:1 some-user:1002:1 This translates to: MAPPING-TYPE | CONTAINER | HOST | RANGE | -------------|-----------|---------|-----------| uid | 999 | 999 | 1 | uid | 1001 | 1001 | 1 | uid | 0 | 1000000 | 999 | uid | 1000 | 1001000 | 1 | uid | 1002 | 1001002 | 999998998 | ------------------------------------------------ gid | 999 | 999 | 1 | gid | 1001 | 1001 | 1 | gid | 0 | 1000000 | 999 | gid | 1000 | 1001000 | 1 | gid | 1002 | 1001002 | 999998998 | which is already the current limit. As discussed at LPC simply bumping the number of limits is not going to work since this would mean that struct uid_gid_map won't fit into a single cache-line anymore thereby regressing performance for the base-cases. The same problem seems to arise when using a single pointer. So the idea is to use struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; For the base cases we will only use the struct uid_gid_extent extent member. 
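(As an illustrative aside, not part of the patch: a layout like the
sub{g,u}id example above is installed from userspace by writing all
extents to /proc/<pid>/uid_map in one go, since map_write() below allows
only a single successful write per map. A hedged sketch, with the helper
name write_uid_map being my own and appropriate privileges assumed:)

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static int write_uid_map(pid_t pid)
{
	/* one "<id-in-ns> <id-in-parent> <count>" line per extent,
	 * mirroring the uid rows of the table above */
	const char map[] = "0 1000000 999\n999 999 1\n1000 1001000 1\n"
			   "1001 1001 1\n1002 1001002 999998998\n";
	char path[64];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/%d/uid_map", (int)pid);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	/* the whole map must go down in a single write(2) */
	ret = write(fd, map, sizeof(map) - 1) == (ssize_t)(sizeof(map) - 1)
		? 0 : -1;
	close(fd);
	return ret;
}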
If we go over UID_GID_MAP_MAX_BASE_EXTENTS mappings we perform a single
4k kmalloc() which means we can have a maximum of 340 mappings
(340 * sizeof(struct uid_gid_extent) = 4080). For the latter case we use
two pointers "forward" and "reverse". The forward pointer points to an
array sorted by "first" and the reverse pointer points to an array
sorted by "lower_first". We can then perform binary search on those
arrays.

Performance Testing:

When Eric introduced the extent-based struct uid_gid_map approach he
measured the performance impact of his idmap changes:

> My benchmark consisted of going to single user mode where nothing else was
> running. On an ext4 filesystem opening 1,000,000 files and looping through all
> of the files 1000 times and calling fstat on the individual files. This was
> to ensure I was benchmarking stat times where the inodes were in the kernels
> cache, but the inode values were not in the processors cache. My results:

> v3.4-rc1: ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled)
> v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled)
> v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled)

I used an identical approach on my laptop. Here's a thorough description
of what I did. I built a 4.14.0-rc4 mainline kernel with my new idmap
patches applied. I booted into single user mode and used an ext4
filesystem to open/create 1,000,000 files. Then I looped through all of
the files calling fstat() on each of them 1000 times and calculated the
mean fstat() time for a single file. (The test program can be found
below.) Here are the results. For fun, I compared the first version of
my patch which scaled linearly with the new version of the patch:

| # MAPPINGS   | PATCH-V1   | PATCH-NEW |
|--------------|------------|-----------|
| 0   mappings | 158 ns     | 158 ns    |
| 1   mappings | 164 ns     | 157 ns    |
| 2   mappings | 170 ns     | 158 ns    |
| 3   mappings | 175 ns     | 161 ns    |
| 5   mappings | 187 ns     | 165 ns    |
| 10  mappings | 218 ns     | 199 ns    |
| 50  mappings | 528 ns     | 218 ns    |
| 100 mappings | 980 ns     | 229 ns    |
| 200 mappings | 1880 ns    | 239 ns    |
| 300 mappings | 2760 ns    | 240 ns    |
| 340 mappings | not tested | 248 ns    |

Here's the test program I used. I asked Eric what he did and this is a
more "advanced" implementation of the idea.
It's pretty straight-forward:

#define __GNU_SOURCE
#define __STDC_FORMAT_MACROS

#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	int ret;
	size_t i, k;
	int fd[1000000];
	int times[1000];
	char pathname[4096];
	struct stat st;
	struct timeval t1, t2;
	uint64_t time_in_mcs;
	uint64_t sum = 0;

	if (argc != 2) {
		fprintf(stderr, "Please specify a directory where to create "
				"the test files\n");
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) {
		sprintf(pathname, "%s/idmap_test_%zu", argv[1], i);
		fd[i] = open(pathname, O_RDWR | O_CREAT,
			     S_IXUSR | S_IXGRP | S_IXOTH);
		if (fd[i] < 0) {
			ssize_t j;
			for (j = i; j >= 0; j--)
				close(fd[j]);
			exit(EXIT_FAILURE);
		}
	}

	for (k = 0; k < 1000; k++) {
		ret = gettimeofday(&t1, NULL);
		if (ret < 0)
			goto close_all;

		for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) {
			ret = fstat(fd[i], &st);
			if (ret < 0)
				goto close_all;
		}

		ret = gettimeofday(&t2, NULL);
		if (ret < 0)
			goto close_all;

		time_in_mcs = (1000000 * t2.tv_sec + t2.tv_usec) -
			      (1000000 * t1.tv_sec + t1.tv_usec);
		printf("Total time in micro seconds: %" PRIu64 "\n",
		       time_in_mcs);
		printf("Total time in nanoseconds: %" PRIu64 "\n",
		       time_in_mcs * 1000);
		printf("Time per file in nanoseconds: %" PRIu64 "\n",
		       (time_in_mcs * 1000) / 1000000);
		times[k] = (time_in_mcs * 1000) / 1000000;
	}

close_all:
	for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++)
		close(fd[i]);

	if (ret < 0)
		exit(EXIT_FAILURE);

	for (k = 0; k < 1000; k++) {
		sum += times[k];
	}

	printf("Mean time per file in nanoseconds: %" PRIu64 "\n",
	       sum / 1000);

	exit(EXIT_SUCCESS);
}

Signed-off-by: Christian Brauner
CC: Serge Hallyn
CC: Eric Biederman
Signed-off-by: Eric W. Biederman
---
 include/linux/user_namespace.h | 7 +-
 kernel/user_namespace.c | 350 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 324 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 7c83d7f6289b..1c1046a60fb4 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -10,7 +10,8 @@
 #include
 #include
 
-#define UID_GID_MAP_MAX_EXTENTS 5
+#define UID_GID_MAP_MAX_BASE_EXTENTS 5
+#define UID_GID_MAP_MAX_EXTENTS 340
 
 struct uid_gid_extent {
 	u32 first;
@@ -18,10 +19,10 @@ struct uid_gid_extent {
 	u32 count;
 };
 
-struct uid_gid_map { /* 64 bytes -- 1 cache line */
+struct uid_gid_map { /* 64 bytes -- 1 cache line */
 	u32 nr_extents;
 	union {
-		struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS];
+		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
 		struct {
 			struct uid_gid_extent *forward;
 			struct uid_gid_extent *reverse;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..5fd2d53dbc75 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -23,6 +23,8 @@
 #include
 #include
 #include
+#include <linux/bsearch.h>
+#include <linux/sort.h>
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
 static DEFINE_MUTEX(userns_state_mutex);
@@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work)
 	do {
 		struct ucounts *ucounts = ns->ucounts;
 		parent = ns->parent;
+		if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->gid_map.forward);
+			kfree(ns->gid_map.reverse);
+		}
+		if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->uid_map.forward);
+			kfree(ns->uid_map.reverse);
+		}
+		if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->projid_map.forward);
+			kfree(ns->projid_map.reverse);
+		}
 		retire_userns_sysctls(ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); @@ -198,7 +212,84 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +/** + * idmap_key struct holds the information necessary to find an idmapping in a + * sorted idmap array. It is passed to cmp_map_id() as first argument. + */ +struct idmap_key { + bool map_up; /* true -> id from kid; false -> kid from id */ + u32 id; /* id to find */ + u32 count; /* == 0 unless used with map_id_range_down() */ +}; + +/** + * cmp_map_id - Function to be passed to bsearch() to find the requested + * idmapping. Expects struct idmap_key to be passed via @k. + */ +static int cmp_map_id(const void *k, const void *e) +{ + u32 first, last, id2; + const struct idmap_key *key = k; + const struct uid_gid_extent *el = e; + + /* handle map_id_range_down() */ + if (key->count) + id2 = key->id + key->count - 1; + else + id2 = key->id; + + /* handle map_id_{down,up}() */ + if (key->map_up) + first = el->lower_first; + else + first = el->first; + + last = first + el->count - 1; + + if (key->id >= first && key->id <= last && + (id2 >= first && id2 <= last)) + return 0; + + if (key->id < first || id2 < first) + return -1; + + return 1; +} + +/** + * map_id_range_down_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = false; + key.count = count; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; +} + +/** + * map_id_range_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count) { unsigned idx, extents; u32 first, last, id2; @@ -224,7 +315,23 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_range_down_base(map, id, count); + + return map_id_range_down_max(map, id, count); +} + +/** + * map_id_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_down_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -247,7 +354,23 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +static u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_down_base(map, id); + + return map_id_range_down_max(map, id, 0); +} + +/** + * map_id_up_base - Find idmap via binary search in static extent array. 
+ * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -270,6 +393,45 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +/** + * map_id_up_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_max(struct uid_gid_map *map, u32 id) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = true; + key.count = 0; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->lower_first) + extent->first; + else + id = (u32) -1; + + return id; +} + +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_up_base(map, id); + + return map_id_up_max(map, id); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in @@ -540,13 +702,15 @@ static int projid_m_show(struct seq_file *seq, void *v) static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { - struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; - if (pos < map->nr_extents) - extent = &map->extent[pos]; + if (pos >= map->nr_extents) + return NULL; + + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return &map->extent[pos]; - return extent; + return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) @@ -618,7 +782,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map, u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; - prev = &new_map->extent[idx]; + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + prev = &new_map->extent[idx]; + else + prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; @@ -638,6 +805,102 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } +/** + * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. + * Takes care to allocate a 4K block of memory if the number of mappings exceeds + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) +{ + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) { + map->extent[map->nr_extents].first = extent->first; + map->extent[map->nr_extents].lower_first = extent->lower_first; + map->extent[map->nr_extents].count = extent->count; + return 0; + } + + if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { + struct uid_gid_extent *forward; + + /* Allocate memory for 340 mappings. */ + forward = kmalloc(sizeof(struct uid_gid_extent) * + UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL); + if (!forward) + return -ENOMEM; + + /* Copy over memory. Only set up memory for the forward pointer. + * Defer the memory setup for the reverse pointer. 
+ */ + memcpy(forward, map->extent, + map->nr_extents * sizeof(map->extent[0])); + + map->forward = forward; + map->reverse = NULL; + } + + map->forward[map->nr_extents].first = extent->first; + map->forward[map->nr_extents].lower_first = extent->lower_first; + map->forward[map->nr_extents].count = extent->count; + return 0; +} + +/* cmp function to sort() forward mappings */ +static int cmp_extents_forward(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->first < e2->first) + return -1; + + if (e1->first > e2->first) + return 1; + + return 0; +} + +/* cmp function to sort() reverse mappings */ +static int cmp_extents_reverse(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->lower_first < e2->lower_first) + return -1; + + if (e1->lower_first > e2->lower_first) + return 1; + + return 0; +} + +/** + * sort_idmaps - Sorts an array of idmap entries. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int sort_idmaps(struct uid_gid_map *map) +{ + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return 0; + + /* Sort forward array. */ + sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_forward, NULL); + + /* Only copy the memory from forward we actually need. */ + map->reverse = kmemdup(map->forward, + map->nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL); + if (!map->reverse) + return -ENOMEM; + + /* Sort reverse array. */ + sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_reverse, NULL); + + return 0; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -648,7 +911,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent = NULL; + struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; @@ -673,6 +936,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, */ mutex_lock(&userns_state_mutex); + memset(&new_map, 0, sizeof(struct uid_gid_map)); + ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) @@ -700,9 +965,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Parse the user data */ ret = -EINVAL; pos = kbuf; - new_map.nr_extents = 0; for (; pos; pos = next_line) { - extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); @@ -714,17 +977,17 @@ static ssize_t map_write(struct file *file, const char __user *buf, } pos = skip_spaces(pos); - extent->first = simple_strtoul(pos, &pos, 10); + extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->lower_first = simple_strtoul(pos, &pos, 10); + extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->count = simple_strtoul(pos, &pos, 10); + extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; @@ -734,29 +997,33 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; /* Verify we have been given valid starting values */ - if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1)) + if ((extent.first == (u32) -1) || + 
(extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ - if ((extent->first + extent->count) <= extent->first) + if ((extent.first + extent.count) <= extent.first) goto out; - if ((extent->lower_first + extent->count) <= - extent->lower_first) + if ((extent.lower_first + extent.count) <= + extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ - if (mappings_overlap(&new_map, extent)) + if (mappings_overlap(&new_map, &extent)) goto out; - new_map.nr_extents++; - - /* Fail if the file contains too many extents */ - if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && + if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; + + ret = insert_extent(&new_map, &extent); + if (ret < 0) + goto out; + ret = -EINVAL; + + new_map.nr_extents++; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) @@ -767,16 +1034,26 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + + ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { + struct uid_gid_extent *e; u32 lower_first; - extent = &new_map.extent[idx]; + + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + e = &new_map.extent[idx]; + else + e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, - extent->lower_first, - extent->count); + e->lower_first, + e->count); /* Fail if we can not map the specified extent to * the kernel global id space. @@ -784,18 +1061,31 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (lower_first == (u32) -1) goto out; - extent->lower_first = lower_first; + e->lower_first = lower_first; } /* Install the map */ - memcpy(map->extent, new_map.extent, - new_map.nr_extents*sizeof(new_map.extent[0])); + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + memcpy(map->extent, new_map.extent, + new_map.nr_extents * sizeof(new_map.extent[0])); + } else { + map->forward = new_map.forward; + map->reverse = new_map.reverse; + } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: + if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(new_map.forward); + kfree(new_map.reverse); + map->forward = NULL; + map->reverse = NULL; + map->nr_extents = 0; + } + mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; -- cgit v1.2.3 From 638f5b90d46016372a8e3e0a434f199cc5e12b8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 31 Oct 2017 18:16:05 -0700 Subject: bpf: reduce verifier memory consumption the verifier got progressively smarter over time and size of its internal state grew as well. Time to reduce the memory consumption. Before: sizeof(struct bpf_verifier_state) = 6520 After: sizeof(struct bpf_verifier_state) = 896 It's done by observing that majority of BPF programs use little to no stack whereas verifier kept all of 512 stack slots ready always. Instead dynamically reallocate struct verifier state when stack access is detected. Runtime difference before vs after is within a noise. The number of processed instructions stays the same. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 8 +- include/linux/bpf_verifier.h | 16 +- kernel/bpf/verifier.c | 437 ++++++++++++++-------- 3 files changed, 305 insertions(+), 156 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 3d3dcac1c942..a8c7615546a9 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -76,9 +76,9 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, static int nfp_bpf_check_exit(struct nfp_prog *nfp_prog, - const struct bpf_verifier_env *env) + struct bpf_verifier_env *env) { - const struct bpf_reg_state *reg0 = &env->cur_state.regs[0]; + const struct bpf_reg_state *reg0 = cur_regs(env) + BPF_REG_0; u64 imm; if (nfp_prog->act == NN_ACT_XDP) @@ -144,9 +144,9 @@ nfp_bpf_check_stack_access(struct nfp_prog *nfp_prog, static int nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, - const struct bpf_verifier_env *env, u8 reg_no) + struct bpf_verifier_env *env, u8 reg_no) { - const struct bpf_reg_state *reg = &env->cur_state.regs[reg_no]; + const struct bpf_reg_state *reg = cur_regs(env) + reg_no; int err; if (reg->type != PTR_TO_CTX && diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index feeaea93d959..3b0976aaac75 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -88,14 +88,19 @@ enum bpf_stack_slot_type { #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +struct bpf_stack_state { + struct bpf_reg_state spilled_ptr; + u8 slot_type[BPF_REG_SIZE]; +}; + /* state of the program: * type of all registers and stack info */ struct bpf_verifier_state { struct bpf_reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; struct bpf_verifier_state *parent; + int allocated_stack; + struct bpf_stack_state *stack; }; /* linked list of verifier states used to prune search */ @@ -145,7 +150,7 @@ struct bpf_verifier_env { struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ - struct bpf_verifier_state cur_state; /* current verifier state */ + struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ void *analyzer_priv; /* pointer to external analyzer's private data */ @@ -159,6 +164,11 @@ struct bpf_verifier_env { struct bpf_verifer_log log; }; +static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +{ + return env->cur_state->regs; +} + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d906775e12c1..5f26f7ad124f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -276,43 +276,132 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ")"); } } - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] == STACK_SPILL) - verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if 
(state->stack[i].slot_type[0] == STACK_SPILL) + verbose(env, " fp%d=%s", + -MAX_BPF_STACK + i * BPF_REG_SIZE, + reg_type_str[state->stack[i].spilled_ptr.type]); } verbose(env, "\n"); } -static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) +static int copy_stack_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) { - struct bpf_verifier_stack_elem *elem; - int insn_idx; + if (!src->stack) + return 0; + if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { + /* internal bug, make state invalid to reject the program */ + memset(dst, 0, sizeof(*dst)); + return -EFAULT; + } + memcpy(dst->stack, src->stack, + sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); + return 0; +} + +/* do_check() starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, + bool copy_old) +{ + u32 old_size = state->allocated_stack; + struct bpf_stack_state *new_stack; + int slot = size / BPF_REG_SIZE; + + if (size <= old_size || !size) { + if (copy_old) + return 0; + state->allocated_stack = slot * BPF_REG_SIZE; + if (!size && old_size) { + kfree(state->stack); + state->stack = NULL; + } + return 0; + } + new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!new_stack) + return -ENOMEM; + if (copy_old) { + if (state->stack) + memcpy(new_stack, state->stack, + sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); + memset(new_stack + old_size / BPF_REG_SIZE, 0, + sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); + } + state->allocated_stack = slot * BPF_REG_SIZE; + kfree(state->stack); + state->stack = new_stack; + return 0; +} + +static void free_verifier_state(struct bpf_verifier_state *state) +{ + kfree(state->stack); + kfree(state); +} + +/* copy verifier state from src to dst growing dst stack space + * when necessary to accommodate larger src stack + */ +static int copy_verifier_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) +{ + int err; + + err = realloc_verifier_state(dst, src->allocated_stack, false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + return copy_stack_state(dst, src); +} + +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, + int *insn_idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_stack_elem *elem, *head = env->head; + int err; if (env->head == NULL) - return -1; + return -ENOENT; - memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); - insn_idx = env->head->insn_idx; + if (cur) { + err = copy_verifier_state(cur, &head->st); + if (err) + return err; + } + if (insn_idx) + *insn_idx = head->insn_idx; if (prev_insn_idx) - *prev_insn_idx = env->head->prev_insn_idx; - elem = env->head->next; - kfree(env->head); + *prev_insn_idx = head->prev_insn_idx; + elem = head->next; + kfree(head); env->head = elem; env->stack_size--; - return insn_idx; + return 0; } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { + struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem 
*elem; + int err; - elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; - memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + err = copy_verifier_state(&elem->st, cur); + if (err) + return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; @@ -325,7 +414,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; err: /* pop all elements and return */ - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); return NULL; } @@ -550,7 +639,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -563,7 +652,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(&env->cur_state, regno); + mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -601,10 +690,21 @@ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno) { - int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + + err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); + if (err) + return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ + if (!env->allow_ptr_leaks && + state->stack[spi].slot_type[0] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose(env, "attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } if (value_regno >= 0 && is_spillable_regtype(state->regs[value_regno].type)) { @@ -616,17 +716,18 @@ static int check_stack_write(struct bpf_verifier_env *env, } /* save register state */ - state->spilled_regs[spi] = state->regs[value_regno]; - state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; + state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + state->stack[spi].slot_type[i] = STACK_SPILL; } else { /* regular write of data into stack */ - state->spilled_regs[spi] = (struct bpf_reg_state) {}; + state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = + STACK_MISC; } return 0; } @@ -637,10 +738,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... 
then we depend on parent's value */ - parent->spilled_regs[slot].live |= REG_LIVE_READ; + parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; } @@ -650,34 +751,37 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno) { - u8 *slot_type; - int i, spi; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + u8 *stype; - slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + if (state->allocated_stack <= slot) { + verbose(env, "invalid read from stack off %d+0 size %d\n", + off, size); + return -EACCES; + } + stype = state->stack[spi].slot_type; - if (slot_type[0] == STACK_SPILL) { + if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { - if (slot_type[i] != STACK_SPILL) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { verbose(env, "corrupted spill memory\n"); return -EACCES; } } - spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; - if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->spilled_regs[spi]; + state->regs[value_regno] = state->stack[spi].spilled_ptr; mark_stack_slot_read(state, spi); } return 0; } else { for (i = 0; i < size; i++) { - if (slot_type[i] != STACK_MISC) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -694,7 +798,8 @@ static int check_stack_read(struct bpf_verifier_env *env, static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", @@ -706,9 +811,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -783,7 +888,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { @@ -797,7 +902,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; int err; @@ -866,7 +971,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks, static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); + return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, @@ -968,8 +1073,9 @@ static int 
check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = &env->cur_state; - struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -993,7 +1099,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -1028,14 +1134,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); else - mark_reg_known_zero(env, state->regs, + mark_reg_known_zero(env, regs, value_regno); - state->regs[value_regno].id = 0; - state->regs[value_regno].off = 0; - state->regs[value_regno].range = 0; - state->regs[value_regno].type = reg_type; + regs[value_regno].id = 0; + regs[value_regno].off = 0; + regs[value_regno].range = 0; + regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -1061,19 +1167,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (env->prog->aux->stack_depth < -off) env->prog->aux->stack_depth = -off; - if (t == BPF_WRITE) { - if (!env->allow_ptr_leaks && - state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && - size != BPF_REG_SIZE) { - verbose(env, "attempt to corrupt spilled pointer on stack\n"); - return -EACCES; - } + if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, value_regno); - } else { + else err = check_stack_read(env, state, off, size, value_regno); - } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -1087,7 +1186,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1095,11 +1194,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && - state->regs[value_regno].type == SCALAR_VALUE) { + regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - state->regs[value_regno].var_off = tnum_cast( - state->regs[value_regno].var_off, size); - __update_reg_bounds(&state->regs[value_regno]); + regs[value_regno].var_off = + tnum_cast(regs[value_regno].var_off, size); + __update_reg_bounds(®s[value_regno]); } return err; } @@ -1156,9 +1255,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs; - int off, i; + int off, i, slot, 
spi; if (regs[regno].type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1198,7 +1297,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + slot = -(off + i) - 1; + spi = slot / BPF_REG_SIZE; + if (state->allocated_stack <= slot || + state->stack[spi].slot_type[slot % BPF_REG_SIZE] != + STACK_MISC) { verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; @@ -1211,7 +1314,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; switch (reg->type) { case PTR_TO_PACKET: @@ -1229,7 +1332,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1514,7 +1617,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn) */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1522,10 +1625,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(env, regs, i); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(reg); } @@ -1533,9 +1636,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { - struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1603,6 +1705,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return err; } + regs = cur_regs(env); /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -1691,7 +1794,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1703,13 +1806,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: known but bad 
sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: known but bad ubounds\n"); return -EINVAL; @@ -1890,7 +1993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; s64 smin_val, smax_val; @@ -2111,7 +2214,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2185,12 +2288,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2200,7 +2303,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; @@ -2421,10 +2524,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, new_range); } @@ -2674,17 +2777,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } } static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -2876,7 +2979,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct 
bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -2937,7 +3040,7 @@ static bool may_access_skb(enum bpf_prog_type type) */ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 mode = BPF_MODE(insn->code); int i, err; @@ -2999,7 +3102,7 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = &env->cur_state.regs[BPF_REG_0]; + reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); @@ -3363,6 +3466,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } +static bool stacksafe(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + struct idpair *idmap) +{ + int i, spi; + + /* if explored stack has more populated slots than current stack + * such stacks are not equivalent + */ + if (old->allocated_stack > cur->allocated_stack) + return false; + + /* walk slots of the explored stack and ignore any additional + * slots in the current stack, since explored(safe) state + * didn't use them + */ + for (i = 0; i < old->allocated_stack; i++) { + spi = i / BPF_REG_SIZE; + + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE) + continue; + if (old->stack[spi].slot_type[0] != STACK_SPILL) + continue; + if (!regsafe(&old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, + idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} + * but current path has stored: + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + } + return true; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -3407,37 +3561,8 @@ static bool states_equal(struct bpf_verifier_env *env, goto out_free; } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (old->stack_slot_type[i] == STACK_INVALID) - continue; - if (old->stack_slot_type[i] != cur->stack_slot_type[i]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - goto out_free; - if (i % BPF_REG_SIZE) - continue; - if (old->stack_slot_type[i] != STACK_SPILL) - continue; - if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - idmap)) - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. 
- * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. - * return false to continue verification of this path - */ - goto out_free; - else - continue; - } + if (!stacksafe(old, cur, idmap)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -3473,17 +3598,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, } } /* ... and stack slots */ - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { - if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) continue; - if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - if (parent->spilled_regs[i].live & REG_LIVE_READ) + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; - if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) + if (writes && + (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) continue; - if (state->spilled_regs[i].live & REG_LIVE_READ) { - parent->spilled_regs[i].live |= REG_LIVE_READ; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { + parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; touched = true; } } @@ -3513,6 +3640,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; + struct bpf_verifier_state *cur = env->cur_state; int i; sl = env->explored_states[insn_idx]; @@ -3523,7 +3651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) { + if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -3534,7 +3662,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, &env->cur_state); + propagate_liveness(&sl->state, cur); return 1; } sl = sl->next; @@ -3546,16 +3674,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; /* add new state to the head of linked list */ - memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + copy_verifier_state(&new_sl->state, cur); new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - env->cur_state.parent = &new_sl->state; + cur->parent = &new_sl->state; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -3563,10 +3691,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) 
*/ for (i = 0; i < BPF_REG_FP; i++) - env->cur_state.regs[i].live = REG_LIVE_NONE; - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) - if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) - env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; + cur->regs[i].live = REG_LIVE_NONE; + for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) + if (cur->stack[i].slot_type[0] == STACK_SPILL) + cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; return 0; } @@ -3581,15 +3709,19 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; - init_reg_state(env, regs); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + env->cur_state = state; + init_reg_state(env, state->regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3637,7 +3769,7 @@ static int do_check(struct bpf_verifier_env *env) else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, state); do_print_state = false; } @@ -3651,6 +3783,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + regs = cur_regs(env); if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -3818,8 +3951,10 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - insn_idx = pop_stack(env, &prev_insn_idx); - if (insn_idx < 0) { + err = pop_stack(env, &prev_insn_idx, &insn_idx); + if (err < 0) { + if (err != -ENOENT) + return err; break; } else { do_print_state = true; @@ -4359,9 +4494,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + free_verifier_state(env->cur_state); + env->cur_state = NULL; skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret == 0) @@ -4464,9 +4601,11 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + free_verifier_state(env->cur_state); + env->cur_state = NULL; skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); mutex_unlock(&bpf_verifier_lock); -- cgit v1.2.3 From a9903f04e0a4ea522d959c2f287cdf0ab029e324 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 30 Oct 2017 11:08:16 -0700 Subject: sched/sysctl: Fix attributes of some extern declarations The definition of sysctl_sched_migration_cost, sysctl_sched_nr_migrate and sysctl_sched_time_avg includes the attribute const_debug. 
This attribute is not part of the extern declaration of these variables in include/linux/sched/sysctl.h, while it is in kernel/sched/sched.h, and as a result Clang generates warnings like this: kernel/sched/sched.h:1618:33: warning: section attribute is specified on redeclared variable [-Wsection] extern const_debug unsigned int sysctl_sched_time_avg; ^ ./include/linux/sched/sysctl.h:42:21: note: previous declaration is here extern unsigned int sysctl_sched_time_avg; The header only declares the variables when CONFIG_SCHED_DEBUG is defined, therefore it is not necessary to duplicate the definition of const_debug. Instead we can use the attribute __read_mostly, which is the expansion of const_debug when CONFIG_SCHED_DEBUG=y is set. Signed-off-by: Matthias Kaehlcke Reviewed-by: Nick Desaulniers Cc: Douglas Anderson Cc: Guenter Roeck Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shile Zhang Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171030180816.170850-1-mka@chromium.org Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0f5ecd4d298e..d34c823f3d36 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -37,9 +37,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; #ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_migration_cost; -extern unsigned int sysctl_sched_nr_migrate; -extern unsigned int sysctl_sched_time_avg; +extern __read_mostly unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_time_avg; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, -- cgit v1.2.3 From 8698b9364710e7bac84b3af07dd410e39c8c2e08 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 1 Nov 2017 10:11:55 +0800 Subject: regmap: Add hardware spinlock support On some platforms, when reading or writing some special registers through regmap, we should acquire one hardware spinlock to synchronize between the multiple subsystems. Thus this patch adds the hardware spinlock support for regmap. 
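For illustration only, a driver on such a platform might opt in roughly as follows (a minimal sketch, assuming an MMIO regmap and a made-up hwlock id of 1; a hwlock_id of 0 keeps the existing behaviour of no hardware spinlock):

static const struct regmap_config foo_syscon_config = {
	.reg_bits = 32,
	.val_bits = 32,
	.reg_stride = 4,
	.hwlock_id = 1,			/* nonzero: request this hw spinlock */
	.hwlock_mode = HWLOCK_IRQSTATE,	/* lock with IRQs disabled, state saved */
};

map = devm_regmap_init_mmio(dev, base, &foo_syscon_config);

Every regmap access is then bracketed by hwspin_lock_timeout_irqsave()/hwspin_unlock_irqrestore() rather than the usual mutex or spinlock.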
Signed-off-by: Baolin Wang Signed-off-by: Mark Brown --- drivers/base/regmap/internal.h | 2 + drivers/base/regmap/regmap.c | 101 ++++++++++++++++++++++++++++++++++------- include/linux/regmap.h | 6 +++ 3 files changed, 93 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 2a4435d76028..8641183cac2f 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -157,6 +157,8 @@ struct regmap { struct rb_root range_tree; void *selector_work_buf; /* Scratch buffer used for selector */ + + struct hwspinlock *hwlock; }; struct regcache_ops { diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index b9a779a4a739..999e981a174a 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -413,6 +414,49 @@ static unsigned int regmap_parse_64_native(const void *buf) } #endif +static void regmap_lock_hwlock(void *__map) +{ + struct regmap *map = __map; + + hwspin_lock_timeout(map->hwlock, UINT_MAX); +} + +static void regmap_lock_hwlock_irq(void *__map) +{ + struct regmap *map = __map; + + hwspin_lock_timeout_irq(map->hwlock, UINT_MAX); +} + +static void regmap_lock_hwlock_irqsave(void *__map) +{ + struct regmap *map = __map; + + hwspin_lock_timeout_irqsave(map->hwlock, UINT_MAX, + &map->spinlock_flags); +} + +static void regmap_unlock_hwlock(void *__map) +{ + struct regmap *map = __map; + + hwspin_unlock(map->hwlock); +} + +static void regmap_unlock_hwlock_irq(void *__map) +{ + struct regmap *map = __map; + + hwspin_unlock_irq(map->hwlock); +} + +static void regmap_unlock_hwlock_irqrestore(void *__map) +{ + struct regmap *map = __map; + + hwspin_unlock_irqrestore(map->hwlock, &map->spinlock_flags); +} + static void regmap_lock_mutex(void *__map) { struct regmap *map = __map; @@ -627,6 +671,29 @@ struct regmap *__regmap_init(struct device *dev, map->lock = config->lock; map->unlock = config->unlock; map->lock_arg = config->lock_arg; + } else if (config->hwlock_id) { + map->hwlock = hwspin_lock_request_specific(config->hwlock_id); + if (!map->hwlock) { + ret = -ENXIO; + goto err_map; + } + + switch (config->hwlock_mode) { + case HWLOCK_IRQSTATE: + map->lock = regmap_lock_hwlock_irqsave; + map->unlock = regmap_unlock_hwlock_irqrestore; + break; + case HWLOCK_IRQ: + map->lock = regmap_lock_hwlock_irq; + map->unlock = regmap_unlock_hwlock_irq; + break; + default: + map->lock = regmap_lock_hwlock; + map->unlock = regmap_unlock_hwlock; + break; + } + + map->lock_arg = map; } else { if ((bus && bus->fast_io) || config->fast_io) { @@ -729,7 +796,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_write = regmap_format_2_6_write; break; default: - goto err_map; + goto err_hwlock; } break; @@ -739,7 +806,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_write = regmap_format_4_12_write; break; default: - goto err_map; + goto err_hwlock; } break; @@ -749,7 +816,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_write = regmap_format_7_9_write; break; default: - goto err_map; + goto err_hwlock; } break; @@ -759,7 +826,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_write = regmap_format_10_14_write; break; default: - goto err_map; + goto err_hwlock; } break; @@ -779,13 +846,13 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_reg = 
regmap_format_16_native; break; default: - goto err_map; + goto err_hwlock; } break; case 24: if (reg_endian != REGMAP_ENDIAN_BIG) - goto err_map; + goto err_hwlock; map->format.format_reg = regmap_format_24; break; @@ -801,7 +868,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_reg = regmap_format_32_native; break; default: - goto err_map; + goto err_hwlock; } break; @@ -818,13 +885,13 @@ struct regmap *__regmap_init(struct device *dev, map->format.format_reg = regmap_format_64_native; break; default: - goto err_map; + goto err_hwlock; } break; #endif default: - goto err_map; + goto err_hwlock; } if (val_endian == REGMAP_ENDIAN_NATIVE) @@ -853,12 +920,12 @@ struct regmap *__regmap_init(struct device *dev, map->format.parse_val = regmap_parse_16_native; break; default: - goto err_map; + goto err_hwlock; } break; case 24: if (val_endian != REGMAP_ENDIAN_BIG) - goto err_map; + goto err_hwlock; map->format.format_val = regmap_format_24; map->format.parse_val = regmap_parse_24; break; @@ -879,7 +946,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.parse_val = regmap_parse_32_native; break; default: - goto err_map; + goto err_hwlock; } break; #ifdef CONFIG_64BIT @@ -900,7 +967,7 @@ struct regmap *__regmap_init(struct device *dev, map->format.parse_val = regmap_parse_64_native; break; default: - goto err_map; + goto err_hwlock; } break; #endif @@ -909,18 +976,18 @@ struct regmap *__regmap_init(struct device *dev, if (map->format.format_write) { if ((reg_endian != REGMAP_ENDIAN_BIG) || (val_endian != REGMAP_ENDIAN_BIG)) - goto err_map; + goto err_hwlock; map->use_single_write = true; } if (!map->format.format_write && !(map->format.format_reg && map->format.format_val)) - goto err_map; + goto err_hwlock; map->work_buf = kzalloc(map->format.buf_size, GFP_KERNEL); if (map->work_buf == NULL) { ret = -ENOMEM; - goto err_map; + goto err_hwlock; } if (map->format.format_write) { @@ -1041,6 +1108,8 @@ err_regcache: err_range: regmap_range_exit(map); kfree(map->work_buf); +err_hwlock: + hwspin_lock_free(map->hwlock); err_map: kfree(map); err: diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 978abfbac617..edc32aac84d7 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -273,6 +273,9 @@ typedef void (*regmap_unlock)(void *); * * @ranges: Array of configuration entries for virtual address ranges. * @num_ranges: Number of range configuration entries. + * @hwlock_id: Specify the hardware spinlock id. + * @hwlock_mode: The hardware spinlock mode, should be HWLOCK_IRQSTATE, + * HWLOCK_IRQ or 0. */ struct regmap_config { const char *name; @@ -317,6 +320,9 @@ struct regmap_config { const struct regmap_range_cfg *ranges; unsigned int num_ranges; + + unsigned int hwlock_id; + unsigned int hwlock_mode; }; /** -- cgit v1.2.3 From e9292f2c03851ef81bef38579a0ee9c42140e586 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Tue, 31 Oct 2017 15:48:01 +0100 Subject: net: dsa: lan9303: Add STP ALR entry on port 0 STP BPDUs arriving on user ports must be sent to the CPU port only, for processing by the SW bridge. Add an ALR entry with STP state override to fix that. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- drivers/net/dsa/lan9303-core.c | 2 ++ include/linux/dsa/lan9303.h | 2 ++ net/dsa/tag_lan9303.c | 1 - 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index 4c412bd52319..c4afc8f1a66d 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -773,6 +773,7 @@ static int lan9303_separate_ports(struct lan9303 *chip) { int ret; + lan9303_alr_del_port(chip, eth_stp_addr, 0); ret = lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_MIRROR, LAN9303_SWE_PORT_MIRROR_SNIFFER_PORT0 | LAN9303_SWE_PORT_MIRROR_MIRRORED_PORT1 | @@ -797,6 +798,7 @@ static void lan9303_bridge_ports(struct lan9303 *chip) lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE, chip->swe_port_state); + lan9303_alr_add_port(chip, eth_stp_addr, 0, true); } static int lan9303_handle_reset(struct lan9303 *chip) diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index 05d8d136baab..b2110e69630f 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -34,3 +34,5 @@ struct lan9303 { **/ struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; }; + +#define eth_stp_addr eth_reserved_addr_base diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 537ca991fafe..18f45cd9f625 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -42,7 +42,6 @@ #define LAN9303_TAG_LEN 4 # define LAN9303_TAG_TX_USE_ALR BIT(3) # define LAN9303_TAG_TX_STP_OVERRIDE BIT(4) -#define eth_stp_addr eth_reserved_addr_base /* Decide whether to transmit using ALR lookup, or transmit directly to * port using tag. ALR learning is performed only when using ALR lookup. -- cgit v1.2.3 From 7930d0a00ff5dbcc80f793d1a7a6b8de4e591f1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:27 +0800 Subject: sbitmap: introduce __sbitmap_for_each_set() For blk-mq, we need to be able to iterate software queues starting from any queue in a round robin fashion, so introduce this helper. Reviewed-by: Omar Sandoval Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 64 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a1904aadbc45..0dcc60e820de 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb); */ bool sbitmap_any_bit_clear(const struct sbitmap *sb); +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** - * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. @@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. 
*/ -static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, - void *data) +static inline void __sbitmap_for_each_set(struct sbitmap *sb, + unsigned int start, + sb_for_each_fn fn, void *data) { - unsigned int i; + unsigned int index; + unsigned int nr; + unsigned int scanned = 0; - for (i = 0; i < sb->map_nr; i++) { - struct sbitmap_word *word = &sb->map[i]; - unsigned int off, nr; + if (start >= sb->depth) + start = 0; + index = SB_NR_TO_INDEX(sb, start); + nr = SB_NR_TO_BIT(sb, start); - if (!word->word) - continue; + while (scanned < sb->depth) { + struct sbitmap_word *word = &sb->map[index]; + unsigned int depth = min_t(unsigned int, word->depth - nr, + sb->depth - scanned); - nr = 0; - off = i << sb->shift; + scanned += depth; + if (!word->word) + goto next; + + /* + * On the first iteration of the outer loop, we need to add the + * bit offset back to the size of the word for find_next_bit(). + * On all other iterations, nr is zero, so this is a noop. + */ + depth += nr; while (1) { - nr = find_next_bit(&word->word, word->depth, nr); - if (nr >= word->depth) + nr = find_next_bit(&word->word, depth, nr); + if (nr >= depth) break; - - if (!fn(sb, off + nr, data)) + if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } +next: + nr = 0; + if (++index >= sb->map_nr) + index = 0; } } -#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) -#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) +/** + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + __sbitmap_for_each_set(sb, 0, fn, data); +} static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) -- cgit v1.2.3 From de1482974080ec9ef414bf048b2646b246b63f6e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:29 +0800 Subject: blk-mq: introduce .get_budget and .put_budget in blk_mq_ops For SCSI devices, there is often a per-request-queue depth, which needs to be respected before queuing one request. Currently blk-mq always dequeues the request first, then calls .queue_rq() to dispatch the request to lld. One obvious issue with this approach is that I/O merging may not be successful, because when the per-request-queue depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and then this request has to stay in hctx->dispatch list. This means it never gets a chance to be merged with other IO. This patch introduces .get_budget and .put_budget callback in blk_mq_ops, then we can try to get reserved budget first before dequeuing request. If the budget for queueing I/O can't be satisfied, we don't need to dequeue request at all. Hence the request can be left in the IO scheduler queue, for more merging opportunities. 
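As a driver-side illustration (a hedged sketch, not taken from this patch: the foo_device structure, its inflight counter, queue_depth limit and foo_queue_rq() are invented for the example):

static blk_status_t foo_get_budget(struct blk_mq_hw_ctx *hctx)
{
	struct foo_device *foo = hctx->queue->queuedata;

	/* reserve one slot, back off if the device is at its depth */
	if (atomic_inc_return(&foo->inflight) > foo->queue_depth) {
		atomic_dec(&foo->inflight);
		return BLK_STS_RESOURCE;
	}
	return BLK_STS_OK;
}

static void foo_put_budget(struct blk_mq_hw_ctx *hctx)
{
	struct foo_device *foo = hctx->queue->queuedata;

	atomic_dec(&foo->inflight);
}

static const struct blk_mq_ops foo_mq_ops = {
	.queue_rq	= foo_queue_rq,
	.get_budget	= foo_get_budget,
	.put_budget	= foo_put_budget,
};

Note that the callbacks come as a pair; blk_mq_alloc_tag_set() below rejects a tag set that provides only one of them.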
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 55 +++++++++++++++++++++++++++++++++++++++++--------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 43 ++++++++++++++++++++++++++++++++++----- block/blk-mq.h | 20 +++++++++++++++++- include/linux/blk-mq.h | 11 ++++++++++ 5 files changed, 114 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index be29ba849408..8e525e66a0d9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,31 +89,57 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); do { - struct request *rq = e->type->ops.mq.dispatch_request(hctx); + struct request *rq; + blk_status_t ret; - if (!rq) + if (e->type->ops.mq.has_work && + !e->type->ops.mq.has_work(hctx)) break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + return false; } -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +/* return true if hw queue need to be run again */ +bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); + bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return; + return false; hctx->run++; @@ -143,14 +169,23 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) - blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false) && + has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); } else if (has_sched_dispatch) { - blk_mq_do_dispatch_sched(hctx); + run_queue = blk_mq_do_dispatch_sched(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list); + blk_mq_dispatch_rq_list(q, &rq_list, false); } + + if (run_queue && !blk_mq_sched_needs_restart(hctx) && + !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { + blk_mq_sched_mark_restart_hctx(hctx); + return true; + } + + return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 9267d0b7c197..2434061cc5b7 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +bool blk_mq_sched_dispatch_requests(struct 
blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 40cba1b1978f..dcb467369999 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) return true; } -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) +bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq; @@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) if (list_empty(list)) return false; + WARN_ON(!list_is_singular(list) && got_budget); + /* * Now process all the entries, sending them to the driver. */ @@ -1074,16 +1077,30 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. */ - if (!blk_mq_dispatch_wait_add(hctx)) + if (!blk_mq_dispatch_wait_add(hctx)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); break; + } /* * It's possible that a tag was freed in the window * between the allocation failure and adding the * hardware queue to the wait queue. */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); + break; + } + } + + if (!got_budget) { + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) break; + if (ret != BLK_STS_OK) + goto fail_rq; } list_del_init(&rq->queuelist); @@ -1111,6 +1128,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) break; } + fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1169,6 +1187,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; + bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1185,15 +1204,18 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } + + if (run_queue) + blk_mq_run_hw_queue(hctx, true); } /* @@ -1582,6 +1604,13 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) { + blk_mq_put_driver_tag(rq); + goto insert; + } else if (ret != BLK_STS_OK) + goto fail_rq; + new_cookie = request_to_qc_t(hctx, rq); /* @@ -1598,6 +1627,7 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: + fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; @@ -2582,6 +2612,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->ops->queue_rq) return -EINVAL; + if (!set->ops->get_budget ^ !set->ops->put_budget) + return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth 
to %u\n", BLK_MQ_MAX_DEPTH); diff --git a/block/blk-mq.h b/block/blk-mq.h index ef15b3414da5..e413b732374e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); +bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, @@ -137,4 +137,22 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->put_budget) + q->mq_ops->put_budget(hctx); +} + +static inline blk_status_t blk_mq_get_dispatch_budget( + struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->get_budget) + return q->mq_ops->get_budget(hctx); + return BLK_STS_OK; +} + #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..901457df3d64 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -90,6 +90,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -111,6 +113,15 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; + /* + * Reserve budget before queue request, once .queue_rq is + * run, it is driver's responsibility to release the + * reserved budget. Also we have to handle failure case + * of .get_budget for avoiding I/O deadlock. + */ + get_budget_fn *get_budget; + put_budget_fn *put_budget; + /* * Called on request timeout */ -- cgit v1.2.3 From b347689ffbca745ac457ee27400ce1affd571c6f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:30 +0800 Subject: blk-mq-sched: improve dispatching from sw queue SCSI devices use a host-wide tagset, and the shared driver tag space is often quite big. However, there is also a queue depth for each LUN (.cmd_per_lun), which is often small; for example, on both lpfc and qla2xxx, .cmd_per_lun is just 3. So lots of requests may stay in the sw queue, and we always flush all of those belonging to the same hw queue and dispatch them all to the driver. Unfortunately this makes it easy to hit queue-busy conditions because of the small .cmd_per_lun. Once these requests are flushed out, they have to stay in hctx->dispatch, no bio merge can happen on them, and sequential IO performance is harmed. This patch introduces blk_mq_dequeue_from_ctx for dequeuing a request from a sw queue, so that we can dispatch requests in the scheduler's way. We can then avoid dequeuing too many requests from the sw queue, since we don't flush ->dispatch completely. This patch improves dispatching from sw queue by using the .get_budget and .put_budget callbacks.
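The fair restart builds on the __sbitmap_for_each_set() helper introduced earlier in this series; conceptually (a simplified sketch with a toy callback invented for the example):

static bool show_bit(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	pr_info("ctx %u has pending requests\n", bitnr);
	return true;	/* returning true keeps the iteration going */
}

/* visit every set bit, starting at 'off' and wrapping around */
__sbitmap_for_each_set(&hctx->ctx_map, off, show_bit, NULL);

blk_mq_dequeue_from_ctx() below uses this wrap-around walk with a callback that pops a single request and stops, and hctx->dispatch_from records where the next dispatch should resume.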
Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-- block/blk-mq.c | 39 ++++++++++++++++++++++++++ block/blk-mq.h | 2 ++ include/linux/blk-mq.h | 2 ++ 4 files changed, 114 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 8e525e66a0d9..df8581bb0a37 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -128,6 +128,61 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) return false; } +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned idx = ctx->index_hw; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + + do { + struct request *rq; + blk_status_t ret; + + if (!sbitmap_any_bit_set(&hctx->ctx_map)) + break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = blk_mq_dequeue_from_ctx(hctx, ctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add(&rq->queuelist, &rq_list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + WRITE_ONCE(hctx->dispatch_from, ctx); + + return false; +} + /* return true if hw queue need to be run again */ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { @@ -169,11 +224,24 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false) && - has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); + else + run_queue = blk_mq_do_dispatch_ctx(hctx); + } } else if (has_sched_dispatch) { run_queue = blk_mq_do_dispatch_sched(hctx); + } else if (q->mq_ops->get_budget) { + /* + * If we need to get budget before queuing request, we + * dequeue request one by one from sw queue for avoiding + * to mess up I/O merge when dispatch runs out of resource. + * + * TODO: get more budgets, and dequeue more requests in + * one time. 
+ */ + run_queue = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); diff --git a/block/blk-mq.c b/block/blk-mq.c index dcb467369999..097ca3ece716 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); +struct dispatch_rq_data { + struct blk_mq_hw_ctx *hctx; + struct request *rq; +}; + +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, + void *data) +{ + struct dispatch_rq_data *dispatch_data = data; + struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + spin_lock(&ctx->lock); + if (unlikely(!list_empty(&ctx->rq_list))) { + dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + list_del_init(&dispatch_data->rq->queuelist); + if (list_empty(&ctx->rq_list)) + sbitmap_clear_bit(sb, bitnr); + } + spin_unlock(&ctx->lock); + + return !dispatch_data->rq; +} + +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start) +{ + unsigned off = start ? start->index_hw : 0; + struct dispatch_rq_data data = { + .hctx = hctx, + .rq = NULL, + }; + + __sbitmap_for_each_set(&hctx->ctx_map, off, + dispatch_rq_from_ctx, &data); + + return data.rq; +} + static inline unsigned int queued_to_index(unsigned int queued) { if (!queued) diff --git a/block/blk-mq.h b/block/blk-mq.h index e413b732374e..522b420dedc0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start); /* * Internal helpers for allocating/freeing the request map diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 901457df3d64..e5e6becd57d3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned int nr_ctx; -- cgit v1.2.3 From ac7fe82b6fcf77e757e88005c33b8147c1b7b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:15 -0700 Subject: nvme-fc: add a dev_loss_tmo field to the remoteport Add a dev_loss_tmo value, paralleling the SCSI FC transport, for device connectivity loss. The transport initializes the value in the nvme_fc_register_remoteport() call. If the value is not set, a default of 60s is set. Add a new routine to the api, nvme_fc_set_remoteport_devloss() routine, which allows the lldd to dynamically update the value on an existing remoteport. 
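From the LLDD side, usage would look roughly like this (a sketch only; the wwnn, wwpn and port_id values are placeholders):

struct nvme_fc_port_info pinfo = {
	.node_name	= wwnn,
	.port_name	= wwpn,
	.port_role	= FC_PORT_ROLE_NVME_TARGET,
	.port_id	= port_id,
	.dev_loss_tmo	= 30,	/* 0 here selects the 60s default */
};

ret = nvme_fc_register_remoteport(localport, &pinfo, &remoteport);
...
/* later, e.g. on a sysfs write, adjust the live value */
ret = nvme_fc_set_remoteport_devloss(remoteport, 10);

The setter returns -EINVAL once the remoteport is no longer ONLINE, so callers should be prepared for the update to be refused.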
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 31 +++++++++++++++++++++++++++++++ include/linux/nvme-fc-driver.h | 11 +++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e37c69f7921d..25479d3031fa 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -45,6 +45,8 @@ enum nvme_fc_queue_flags { #define NVMEFC_QUEUE_DELAY 3 /* ms units */ +#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ + struct nvme_fc_queue { struct nvme_fc_ctrl *ctrl; struct device *dev; @@ -585,6 +587,11 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, newrec->remoteport.port_id = pinfo->port_id; newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; newrec->remoteport.port_num = idx; + /* a registration value of dev_loss_tmo=0 results in the default */ + if (pinfo->dev_loss_tmo) + newrec->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + newrec->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; spin_lock_irqsave(&nvme_fc_lock, flags); list_add_tail(&newrec->endp_list, &lport->endp_list); @@ -688,6 +695,30 @@ nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) } EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); +int +nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, + u32 dev_loss_tmo) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + return -EINVAL; + } + + /* a dev_loss_tmo of 0 (immediate) is allowed to be set */ + rport->remoteport.dev_loss_tmo = dev_loss_tmo; + + spin_unlock_irqrestore(&rport->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); + /* *********************** FC-NVME DMA Handling **************************** */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2be4db353937..496ff759f84c 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -40,6 +40,8 @@ * @node_name: FC WWNN for the port * @port_name: FC WWPN for the port * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. Used only on a remoteport. * * Initialization values for dynamic port fields: * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must @@ -50,6 +52,7 @@ struct nvme_fc_port_info { u64 port_name; u32 port_role; u32 port_id; + u32 dev_loss_tmo; }; @@ -200,6 +203,9 @@ enum nvme_fc_obj_state { * The length of the buffer corresponds to the local_priv_sz * value specified in the nvme_fc_port_template supplied by * the LLDD. + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. To modify, lldd must call + * nvme_fc_set_remoteport_devloss(). * * Fields with dynamic values. Values may change base on link state. LLDD * may reference fields directly to change them. 
Initialized by the @@ -257,10 +263,9 @@ struct nvme_fc_remote_port { u32 port_role; u64 node_name; u64 port_name; - struct nvme_fc_local_port *localport; - void *private; + u32 dev_loss_tmo; /* dynamic fields */ u32 port_id; @@ -446,6 +451,8 @@ int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); +int nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *remoteport, + u32 dev_loss_tmo); /* -- cgit v1.2.3 From 00ed87da35e88a7a4d7f41673beab52ef828f12b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 1 Nov 2017 07:32:50 -0700 Subject: timer: Add parentheses around timer_setup() macro arguments In the case where expressions are passed as macro arguments, the LOCKDEP versions of the timer macros need enclosing parentheses. Reported-by: Stephen Rothwell Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171101143250.GA65266@beast --- include/linux/timer.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 09950482309b..a1af92bac0d5 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -173,11 +173,12 @@ static inline void timer_setup_on_stack(struct timer_list *timer, * do want to keep the inline for argument type checking, though. */ # define timer_setup(timer, callback, flags) \ - __setup_timer(timer, (TIMER_FUNC_TYPE)callback, \ - (TIMER_DATA_TYPE)timer, flags) + __setup_timer((timer), (TIMER_FUNC_TYPE)(callback), \ + (TIMER_DATA_TYPE)(timer), (flags)) # define timer_setup_on_stack(timer, callback, flags) \ - __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback,\ - (TIMER_DATA_TYPE)timer, flags) + __setup_timer_on_stack((timer), \ + (TIMER_FUNC_TYPE)(callback), \ + (TIMER_DATA_TYPE)(timer), (flags)) #endif #define from_timer(var, callback_timer, timer_fieldname) \ -- cgit v1.2.3 From aa795c41d9cd41dc9c915dd1956ddd0e4ae44485 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 1 Sep 2017 16:16:40 -0700 Subject: clk: Add devm_of_clk_add_hw_provider()/del_provider() APIs Sometimes the only thing a driver's error and remove paths need to do is call of_clk_del_provider(), because we're missing a devm_of_clk_add_hw_provider() API. Introduce the API so we can convert drivers to use it and potentially reduce the amount of code needed to remove providers in drivers.
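A converted driver's probe then reduces to something like this (an illustrative sketch; foo_clk_probe() and the priv structure are invented):

static int foo_clk_probe(struct platform_device *pdev)
{
	struct foo_clk_priv *priv;
	int ret;

	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	priv->hw.init = &foo_clk_init_data;	/* assumed clk_init_data */
	ret = devm_clk_hw_register(&pdev->dev, &priv->hw);
	if (ret)
		return ret;

	/* provider is removed automatically on driver detach */
	return devm_of_clk_add_hw_provider(&pdev->dev, of_clk_hw_simple_get,
					   &priv->hw);
}

with no explicit of_clk_del_provider() in the error path or the remove callback; devm_of_clk_del_provider() remains available for drivers that must drop the provider before detach.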
Signed-off-by: Stephen Boyd --- Documentation/driver-model/devres.txt | 1 + drivers/clk/clk.c | 52 +++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 13 +++++++++ 3 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 69f08c0f23a8..c180045eb43b 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -237,6 +237,7 @@ CLOCK devm_clk_get() devm_clk_put() devm_clk_hw_register() + devm_of_clk_add_hw_provider() DMA dmam_alloc_coherent() diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index c8d83acda006..856d4858ae27 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3177,6 +3177,37 @@ int of_clk_add_hw_provider(struct device_node *np, } EXPORT_SYMBOL_GPL(of_clk_add_hw_provider); +static void devm_of_clk_release_provider(struct device *dev, void *res) +{ + of_clk_del_provider(*(struct device_node **)res); +} + +int devm_of_clk_add_hw_provider(struct device *dev, + struct clk_hw *(*get)(struct of_phandle_args *clkspec, + void *data), + void *data) +{ + struct device_node **ptr, *np; + int ret; + + ptr = devres_alloc(devm_of_clk_release_provider, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + np = dev->of_node; + ret = of_clk_add_hw_provider(np, get, data); + if (!ret) { + *ptr = np; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return ret; +} +EXPORT_SYMBOL_GPL(devm_of_clk_add_hw_provider); + /** * of_clk_del_provider() - Remove a previously registered clock provider * @np: Device node pointer associated with clock provider @@ -3198,6 +3229,27 @@ void of_clk_del_provider(struct device_node *np) } EXPORT_SYMBOL_GPL(of_clk_del_provider); +static int devm_clk_provider_match(struct device *dev, void *res, void *data) +{ + struct device_node **np = res; + + if (WARN_ON(!np || !*np)) + return 0; + + return *np == data; +} + +void devm_of_clk_del_provider(struct device *dev) +{ + int ret; + + ret = devres_release(dev, devm_of_clk_release_provider, + devm_clk_provider_match, dev->of_node); + + WARN_ON(ret); +} +EXPORT_SYMBOL(devm_of_clk_del_provider); + static struct clk_hw * __of_clk_get_hw_from_provider(struct of_clk_provider *provider, struct of_phandle_args *clkspec) diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 5100ec1b5d55..ba79d6186133 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -815,7 +815,12 @@ int of_clk_add_hw_provider(struct device_node *np, struct clk_hw *(*get)(struct of_phandle_args *clkspec, void *data), void *data); +int devm_of_clk_add_hw_provider(struct device *dev, + struct clk_hw *(*get)(struct of_phandle_args *clkspec, + void *data), + void *data); void of_clk_del_provider(struct device_node *np); +void devm_of_clk_del_provider(struct device *dev); struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec, void *data); struct clk_hw *of_clk_hw_simple_get(struct of_phandle_args *clkspec, @@ -847,7 +852,15 @@ static inline int of_clk_add_hw_provider(struct device_node *np, { return 0; } +static inline int devm_of_clk_add_hw_provider(struct device *dev, + struct clk_hw *(*get)(struct of_phandle_args *clkspec, + void *data), + void *data) +{ + return 0; +} static inline void of_clk_del_provider(struct device_node *np) {} +static inline void devm_of_clk_del_provider(struct device *dev) {} static inline struct clk *of_clk_src_simple_get( struct of_phandle_args *clkspec, void *data) { -- cgit v1.2.3 From 
7066fdd0d7427f1ba455d186e75213c442c9663b Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Tue, 10 Oct 2017 14:27:13 +0530 Subject: clk: qcom: clk-smd-rpm: add msm8996 rpmclks Add all RPM controlled clocks on msm8996 platform [srini: Fixed various issues with offsets and made names specific to msm8996] Signed-off-by: Srinivas Kandagatla Signed-off-by: Rajendra Nayak Acked-by: Rob Herring Acked-by: Bjorn Andersson Signed-off-by: Stephen Boyd --- .../devicetree/bindings/clock/qcom,rpmcc.txt | 1 + drivers/clk/qcom/clk-smd-rpm.c | 82 ++++++++++++++++++++++ include/dt-bindings/clock/qcom,rpmcc.h | 14 ++++ include/linux/soc/qcom/smd-rpm.h | 4 ++ 4 files changed, 101 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt b/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt index 833a89f06ae0..4491d1c104aa 100644 --- a/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt +++ b/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt @@ -15,6 +15,7 @@ Required properties : "qcom,rpmcc-msm8916", "qcom,rpmcc" "qcom,rpmcc-msm8974", "qcom,rpmcc" "qcom,rpmcc-apq8064", "qcom,rpmcc" + "qcom,rpmcc-msm8996", "qcom,rpmcc" - #clock-cells : shall contain 1 diff --git a/drivers/clk/qcom/clk-smd-rpm.c b/drivers/clk/qcom/clk-smd-rpm.c index cc03d5508627..c26d9007bfc4 100644 --- a/drivers/clk/qcom/clk-smd-rpm.c +++ b/drivers/clk/qcom/clk-smd-rpm.c @@ -530,9 +530,91 @@ static const struct rpm_smd_clk_desc rpm_clk_msm8974 = { .clks = msm8974_clks, .num_clks = ARRAY_SIZE(msm8974_clks), }; + +/* msm8996 */ +DEFINE_CLK_SMD_RPM(msm8996, pcnoc_clk, pcnoc_a_clk, QCOM_SMD_RPM_BUS_CLK, 0); +DEFINE_CLK_SMD_RPM(msm8996, snoc_clk, snoc_a_clk, QCOM_SMD_RPM_BUS_CLK, 1); +DEFINE_CLK_SMD_RPM(msm8996, cnoc_clk, cnoc_a_clk, QCOM_SMD_RPM_BUS_CLK, 2); +DEFINE_CLK_SMD_RPM(msm8996, bimc_clk, bimc_a_clk, QCOM_SMD_RPM_MEM_CLK, 0); +DEFINE_CLK_SMD_RPM(msm8996, mmssnoc_axi_rpm_clk, mmssnoc_axi_rpm_a_clk, + QCOM_SMD_RPM_MMAXI_CLK, 0); +DEFINE_CLK_SMD_RPM(msm8996, ipa_clk, ipa_a_clk, QCOM_SMD_RPM_IPA_CLK, 0); +DEFINE_CLK_SMD_RPM(msm8996, ce1_clk, ce1_a_clk, QCOM_SMD_RPM_CE_CLK, 0); +DEFINE_CLK_SMD_RPM_BRANCH(msm8996, aggre1_noc_clk, aggre1_noc_a_clk, + QCOM_SMD_RPM_AGGR_CLK, 1, 1000); +DEFINE_CLK_SMD_RPM_BRANCH(msm8996, aggre2_noc_clk, aggre2_noc_a_clk, + QCOM_SMD_RPM_AGGR_CLK, 2, 1000); +DEFINE_CLK_SMD_RPM_QDSS(msm8996, qdss_clk, qdss_a_clk, + QCOM_SMD_RPM_MISC_CLK, 1); +DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8996, bb_clk1, bb_clk1_a, 1); +DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8996, bb_clk2, bb_clk2_a, 2); +DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8996, rf_clk1, rf_clk1_a, 4); +DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8996, rf_clk2, rf_clk2_a, 5); +DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8996, ln_bb_clk, ln_bb_a_clk, 8); +DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8996, div_clk1, div_clk1_a, 0xb); +DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8996, div_clk2, div_clk2_a, 0xc); +DEFINE_CLK_SMD_RPM_XO_BUFFER(msm8996, div_clk3, div_clk3_a, 0xd); +DEFINE_CLK_SMD_RPM_XO_BUFFER_PINCTRL(msm8996, bb_clk1_pin, bb_clk1_a_pin, 1); +DEFINE_CLK_SMD_RPM_XO_BUFFER_PINCTRL(msm8996, bb_clk2_pin, bb_clk2_a_pin, 2); +DEFINE_CLK_SMD_RPM_XO_BUFFER_PINCTRL(msm8996, rf_clk1_pin, rf_clk1_a_pin, 4); +DEFINE_CLK_SMD_RPM_XO_BUFFER_PINCTRL(msm8996, rf_clk2_pin, rf_clk2_a_pin, 5); + +static struct clk_smd_rpm *msm8996_clks[] = { + [RPM_SMD_PCNOC_CLK] = &msm8996_pcnoc_clk, + [RPM_SMD_PCNOC_A_CLK] = &msm8996_pcnoc_a_clk, + [RPM_SMD_SNOC_CLK] = &msm8996_snoc_clk, + [RPM_SMD_SNOC_A_CLK] = &msm8996_snoc_a_clk, + [RPM_SMD_CNOC_CLK] = &msm8996_cnoc_clk, + [RPM_SMD_CNOC_A_CLK] = 
&msm8996_cnoc_a_clk, + [RPM_SMD_BIMC_CLK] = &msm8996_bimc_clk, + [RPM_SMD_BIMC_A_CLK] = &msm8996_bimc_a_clk, + [RPM_SMD_MMAXI_CLK] = &msm8996_mmssnoc_axi_rpm_clk, + [RPM_SMD_MMAXI_A_CLK] = &msm8996_mmssnoc_axi_rpm_a_clk, + [RPM_SMD_IPA_CLK] = &msm8996_ipa_clk, + [RPM_SMD_IPA_A_CLK] = &msm8996_ipa_a_clk, + [RPM_SMD_CE1_CLK] = &msm8996_ce1_clk, + [RPM_SMD_CE1_A_CLK] = &msm8996_ce1_a_clk, + [RPM_SMD_AGGR1_NOC_CLK] = &msm8996_aggre1_noc_clk, + [RPM_SMD_AGGR1_NOC_A_CLK] = &msm8996_aggre1_noc_a_clk, + [RPM_SMD_AGGR2_NOC_CLK] = &msm8996_aggre2_noc_clk, + [RPM_SMD_AGGR2_NOC_A_CLK] = &msm8996_aggre2_noc_a_clk, + [RPM_SMD_QDSS_CLK] = &msm8996_qdss_clk, + [RPM_SMD_QDSS_A_CLK] = &msm8996_qdss_a_clk, + [RPM_SMD_BB_CLK1] = &msm8996_bb_clk1, + [RPM_SMD_BB_CLK1_A] = &msm8996_bb_clk1_a, + [RPM_SMD_BB_CLK2] = &msm8996_bb_clk2, + [RPM_SMD_BB_CLK2_A] = &msm8996_bb_clk2_a, + [RPM_SMD_RF_CLK1] = &msm8996_rf_clk1, + [RPM_SMD_RF_CLK1_A] = &msm8996_rf_clk1_a, + [RPM_SMD_RF_CLK2] = &msm8996_rf_clk2, + [RPM_SMD_RF_CLK2_A] = &msm8996_rf_clk2_a, + [RPM_SMD_LN_BB_CLK] = &msm8996_ln_bb_clk, + [RPM_SMD_LN_BB_A_CLK] = &msm8996_ln_bb_a_clk, + [RPM_SMD_DIV_CLK1] = &msm8996_div_clk1, + [RPM_SMD_DIV_A_CLK1] = &msm8996_div_clk1_a, + [RPM_SMD_DIV_CLK2] = &msm8996_div_clk2, + [RPM_SMD_DIV_A_CLK2] = &msm8996_div_clk2_a, + [RPM_SMD_DIV_CLK3] = &msm8996_div_clk3, + [RPM_SMD_DIV_A_CLK3] = &msm8996_div_clk3_a, + [RPM_SMD_BB_CLK1_PIN] = &msm8996_bb_clk1_pin, + [RPM_SMD_BB_CLK1_A_PIN] = &msm8996_bb_clk1_a_pin, + [RPM_SMD_BB_CLK2_PIN] = &msm8996_bb_clk2_pin, + [RPM_SMD_BB_CLK2_A_PIN] = &msm8996_bb_clk2_a_pin, + [RPM_SMD_RF_CLK1_PIN] = &msm8996_rf_clk1_pin, + [RPM_SMD_RF_CLK1_A_PIN] = &msm8996_rf_clk1_a_pin, + [RPM_SMD_RF_CLK2_PIN] = &msm8996_rf_clk2_pin, + [RPM_SMD_RF_CLK2_A_PIN] = &msm8996_rf_clk2_a_pin, +}; + +static const struct rpm_smd_clk_desc rpm_clk_msm8996 = { + .clks = msm8996_clks, + .num_clks = ARRAY_SIZE(msm8996_clks), +}; + static const struct of_device_id rpm_smd_clk_match_table[] = { { .compatible = "qcom,rpmcc-msm8916", .data = &rpm_clk_msm8916 }, { .compatible = "qcom,rpmcc-msm8974", .data = &rpm_clk_msm8974 }, + { .compatible = "qcom,rpmcc-msm8996", .data = &rpm_clk_msm8996 }, { } }; MODULE_DEVICE_TABLE(of, rpm_smd_clk_match_table); diff --git a/include/dt-bindings/clock/qcom,rpmcc.h b/include/dt-bindings/clock/qcom,rpmcc.h index 29dad206a74b..b8337a5fa347 100644 --- a/include/dt-bindings/clock/qcom,rpmcc.h +++ b/include/dt-bindings/clock/qcom,rpmcc.h @@ -104,5 +104,19 @@ #define RPM_SMD_CXO_A1_A_PIN 59 #define RPM_SMD_CXO_A2_PIN 60 #define RPM_SMD_CXO_A2_A_PIN 61 +#define RPM_SMD_AGGR1_NOC_CLK 62 +#define RPM_SMD_AGGR1_NOC_A_CLK 63 +#define RPM_SMD_AGGR2_NOC_CLK 64 +#define RPM_SMD_AGGR2_NOC_A_CLK 65 +#define RPM_SMD_MMAXI_CLK 66 +#define RPM_SMD_MMAXI_A_CLK 67 +#define RPM_SMD_IPA_CLK 68 +#define RPM_SMD_IPA_A_CLK 69 +#define RPM_SMD_CE1_CLK 70 +#define RPM_SMD_CE1_A_CLK 71 +#define RPM_SMD_DIV_CLK3 72 +#define RPM_SMD_DIV_A_CLK3 73 +#define RPM_SMD_LN_BB_CLK 74 +#define RPM_SMD_LN_BB_A_CLK 75 #endif diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h index 2a53dcaeeeed..ebdabd669d93 100644 --- a/include/linux/soc/qcom/smd-rpm.h +++ b/include/linux/soc/qcom/smd-rpm.h @@ -26,6 +26,10 @@ struct qcom_smd_rpm; #define QCOM_SMD_RPM_SMPB 0x62706d73 #define QCOM_SMD_RPM_SPDM 0x63707362 #define QCOM_SMD_RPM_VSA 0x00617376 +#define QCOM_SMD_RPM_MMAXI_CLK 0x69786d6d +#define QCOM_SMD_RPM_IPA_CLK 0x617069 +#define QCOM_SMD_RPM_CE_CLK 0x6563 +#define QCOM_SMD_RPM_AGGR_CLK 0x72676761 int 
qcom_rpm_smd_write(struct qcom_smd_rpm *rpm, int state, -- cgit v1.2.3 From 1495dc9f0a711a54f8fec849ce7f3a8f585a11e5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 Nov 2017 11:48:00 -0700 Subject: security: bpf: replace include of linux/bpf.h with forward declarations Touching linux/bpf.h makes us rebuild a surprisingly large portion of the kernel. Remove the unnecessary dependency from security.h; it only needs forward declarations. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/security.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 18800b0911e5..73f1ef625d40 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -31,7 +31,6 @@ #include #include #include -#include <linux/bpf.h> struct linux_binprm; struct cred; @@ -1732,6 +1731,10 @@ static inline void securityfs_remove(struct dentry *dentry) #endif #ifdef CONFIG_BPF_SYSCALL +union bpf_attr; +struct bpf_map; +struct bpf_prog; +struct bpf_prog_aux; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); -- cgit v1.2.3 From 908a543ac7cdf3aa8a283ec42cab3c16e2fc45a2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 24 Sep 2017 18:19:18 +0200 Subject: clk: clk-gpio: Make GPIO clock provider use descriptors only After some grepping, it turns out nothing in the kernel really calls clk_[hw_]register_gpio_[gate|mux](). All existing instances are just created directly from the device tree probe functions at the bottom of the clk-gpio.c clock provider file. This means we can change the signature of the function without any consequences! Everyone should be using GPIO descriptors now, so let's just go in and enforce that. This saves a bit of code, since GPIO descriptors inherently know whether they are active low, so there is no need for code to keep track of that. We leave it to the caller to come up with the GPIO descriptor. It is nowadays possible to do that even without a corresponding device, so no excuse not to pass them around. The one in-kernel user manages the descriptor's lifecycle with devm_gpiod_get() in gpio_clk_driver_probe().
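To illustrate the new calling convention (a sketch only, not taken from the patch; the "foo" names and the "enable" GPIO function name are invented), a caller now obtains the descriptor itself, typically with GPIOD_ASIS as described above, and passes it in:

    #include <linux/clk-provider.h>
    #include <linux/err.h>
    #include <linux/gpio/consumer.h>
    #include <linux/platform_device.h>

    static int foo_probe(struct platform_device *pdev)
    {
    	struct gpio_desc *gpiod;
    	struct clk *clk;

    	/* The descriptor carries the active-low state, so no
    	 * active_low argument is passed any more. */
    	gpiod = devm_gpiod_get(&pdev->dev, "enable", GPIOD_ASIS);
    	if (IS_ERR(gpiod))
    		return PTR_ERR(gpiod);

    	clk = clk_register_gpio_gate(&pdev->dev, "foo-gate",
    				     "foo-parent", gpiod, 0);
    	return PTR_ERR_OR_ZERO(clk);
    }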
Cc: Sergej Sawazki Cc: Jyri Sarha Signed-off-by: Linus Walleij Signed-off-by: Stephen Boyd --- drivers/clk/clk-gpio.c | 90 +++++++++++++++++--------------------------- include/linux/clk-provider.h | 12 +++--- 2 files changed, 41 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-gpio.c b/drivers/clk/clk-gpio.c index 86b245746a6b..9d057073e110 100644 --- a/drivers/clk/clk-gpio.c +++ b/drivers/clk/clk-gpio.c @@ -15,9 +15,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -95,14 +93,12 @@ const struct clk_ops clk_gpio_mux_ops = { EXPORT_SYMBOL_GPL(clk_gpio_mux_ops); static struct clk_hw *clk_register_gpio(struct device *dev, const char *name, - const char * const *parent_names, u8 num_parents, unsigned gpio, - bool active_low, unsigned long flags, - const struct clk_ops *clk_gpio_ops) + const char * const *parent_names, u8 num_parents, struct gpio_desc *gpiod, + unsigned long flags, const struct clk_ops *clk_gpio_ops) { struct clk_gpio *clk_gpio; struct clk_hw *hw; struct clk_init_data init = {}; - unsigned long gpio_flags; int err; if (dev) @@ -113,24 +109,13 @@ static struct clk_hw *clk_register_gpio(struct device *dev, const char *name, if (!clk_gpio) return ERR_PTR(-ENOMEM); - if (active_low) - gpio_flags = GPIOF_ACTIVE_LOW | GPIOF_OUT_INIT_HIGH; - else - gpio_flags = GPIOF_OUT_INIT_LOW; - - if (dev) - err = devm_gpio_request_one(dev, gpio, gpio_flags, name); - else - err = gpio_request_one(gpio, gpio_flags, name); - if (err) { - if (err != -EPROBE_DEFER) - pr_err("%s: %s: Error requesting clock control gpio %u\n", - __func__, name, gpio); - if (!dev) - kfree(clk_gpio); - - return ERR_PTR(err); - } + /* + * Set to disabled no matter what: NOTE if the GPIO line is active low + * the GPIO descriptor knows this and will set it high to deassert the + * line. This assumes the GPIO descriptor has been requested using + * GPIOD_ASIS by the callers so we need to initialize it as disabled here. + */ + gpiod_set_value(gpiod, 0); init.name = name; init.ops = clk_gpio_ops; @@ -138,7 +123,7 @@ static struct clk_hw *clk_register_gpio(struct device *dev, const char *name, init.parent_names = parent_names; init.num_parents = num_parents; - clk_gpio->gpiod = gpio_to_desc(gpio); + clk_gpio->gpiod = gpiod; clk_gpio->hw.init = &init; hw = &clk_gpio->hw; @@ -151,7 +136,6 @@ static struct clk_hw *clk_register_gpio(struct device *dev, const char *name, return hw; if (!dev) { - gpiod_put(clk_gpio->gpiod); kfree(clk_gpio); } @@ -164,29 +148,27 @@ static struct clk_hw *clk_register_gpio(struct device *dev, const char *name, * @dev: device that is registering this clock * @name: name of this clock * @parent_name: name of this clock's parent - * @gpio: gpio number to gate this clock - * @active_low: true if gpio should be set to 0 to enable clock + * @gpiod: gpio descriptor to gate this clock * @flags: clock flags */ struct clk_hw *clk_hw_register_gpio_gate(struct device *dev, const char *name, - const char *parent_name, unsigned gpio, bool active_low, + const char *parent_name, struct gpio_desc *gpiod, unsigned long flags) { return clk_register_gpio(dev, name, (parent_name ? &parent_name : NULL), - (parent_name ? 1 : 0), gpio, active_low, flags, + (parent_name ? 
1 : 0), gpiod, flags, &clk_gpio_gate_ops); } EXPORT_SYMBOL_GPL(clk_hw_register_gpio_gate); struct clk *clk_register_gpio_gate(struct device *dev, const char *name, - const char *parent_name, unsigned gpio, bool active_low, + const char *parent_name, struct gpio_desc *gpiod, unsigned long flags) { struct clk_hw *hw; - hw = clk_hw_register_gpio_gate(dev, name, parent_name, gpio, active_low, - flags); + hw = clk_hw_register_gpio_gate(dev, name, parent_name, gpiod, flags); if (IS_ERR(hw)) return ERR_CAST(hw); return hw->clk; @@ -199,13 +181,12 @@ EXPORT_SYMBOL_GPL(clk_register_gpio_gate); * @name: name of this clock * @parent_names: names of this clock's parents * @num_parents: number of parents listed in @parent_names - * @gpio: gpio number to gate this clock - * @active_low: true if gpio should be set to 0 to enable clock + * @gpiod: gpio descriptor to gate this clock * @flags: clock flags */ struct clk_hw *clk_hw_register_gpio_mux(struct device *dev, const char *name, - const char * const *parent_names, u8 num_parents, unsigned gpio, - bool active_low, unsigned long flags) + const char * const *parent_names, u8 num_parents, struct gpio_desc *gpiod, + unsigned long flags) { if (num_parents != 2) { pr_err("mux-clock %s must have 2 parents\n", name); @@ -213,18 +194,18 @@ struct clk_hw *clk_hw_register_gpio_mux(struct device *dev, const char *name, } return clk_register_gpio(dev, name, parent_names, num_parents, - gpio, active_low, flags, &clk_gpio_mux_ops); + gpiod, flags, &clk_gpio_mux_ops); } EXPORT_SYMBOL_GPL(clk_hw_register_gpio_mux); struct clk *clk_register_gpio_mux(struct device *dev, const char *name, - const char * const *parent_names, u8 num_parents, unsigned gpio, - bool active_low, unsigned long flags) + const char * const *parent_names, u8 num_parents, struct gpio_desc *gpiod, + unsigned long flags) { struct clk_hw *hw; hw = clk_hw_register_gpio_mux(dev, name, parent_names, num_parents, - gpio, active_low, flags); + gpiod, flags); if (IS_ERR(hw)) return ERR_CAST(hw); return hw->clk; @@ -236,10 +217,10 @@ static int gpio_clk_driver_probe(struct platform_device *pdev) struct device_node *node = pdev->dev.of_node; const char **parent_names, *gpio_name; unsigned int num_parents; - int gpio; - enum of_gpio_flags of_flags; + struct gpio_desc *gpiod; struct clk *clk; - bool active_low, is_mux; + bool is_mux; + int ret; num_parents = of_clk_get_parent_count(node); if (num_parents) { @@ -255,28 +236,27 @@ static int gpio_clk_driver_probe(struct platform_device *pdev) is_mux = of_device_is_compatible(node, "gpio-mux-clock"); - gpio_name = is_mux ? "select-gpios" : "enable-gpios"; - gpio = of_get_named_gpio_flags(node, gpio_name, 0, &of_flags); - if (gpio < 0) { - if (gpio == -EPROBE_DEFER) + gpio_name = is_mux ? "select" : "enable"; + gpiod = devm_gpiod_get(&pdev->dev, gpio_name, GPIOD_ASIS); + if (IS_ERR(gpiod)) { + ret = PTR_ERR(gpiod); + if (ret == -EPROBE_DEFER) pr_debug("%s: %s: GPIOs not yet available, retry later\n", node->name, __func__); else - pr_err("%s: %s: Can't get '%s' DT property\n", + pr_err("%s: %s: Can't get '%s' named GPIO property\n", node->name, __func__, gpio_name); - return gpio; + return ret; } - active_low = of_flags & OF_GPIO_ACTIVE_LOW; - if (is_mux) clk = clk_register_gpio_mux(&pdev->dev, node->name, - parent_names, num_parents, gpio, active_low, 0); + parent_names, num_parents, gpiod, 0); else clk = clk_register_gpio_gate(&pdev->dev, node->name, - parent_names ? parent_names[0] : NULL, gpio, - active_low, 0); + parent_names ? 
parent_names[0] : NULL, gpiod, + 0); if (IS_ERR(clk)) return PTR_ERR(clk); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 5100ec1b5d55..063d8cb9926f 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -682,10 +682,10 @@ struct clk_gpio { extern const struct clk_ops clk_gpio_gate_ops; struct clk *clk_register_gpio_gate(struct device *dev, const char *name, - const char *parent_name, unsigned gpio, bool active_low, + const char *parent_name, struct gpio_desc *gpiod, unsigned long flags); struct clk_hw *clk_hw_register_gpio_gate(struct device *dev, const char *name, - const char *parent_name, unsigned gpio, bool active_low, + const char *parent_name, struct gpio_desc *gpiod, unsigned long flags); void clk_hw_unregister_gpio_gate(struct clk_hw *hw); @@ -701,11 +701,11 @@ void clk_hw_unregister_gpio_gate(struct clk_hw *hw); extern const struct clk_ops clk_gpio_mux_ops; struct clk *clk_register_gpio_mux(struct device *dev, const char *name, - const char * const *parent_names, u8 num_parents, unsigned gpio, - bool active_low, unsigned long flags); + const char * const *parent_names, u8 num_parents, struct gpio_desc *gpiod, + unsigned long flags); struct clk_hw *clk_hw_register_gpio_mux(struct device *dev, const char *name, - const char * const *parent_names, u8 num_parents, unsigned gpio, - bool active_low, unsigned long flags); + const char * const *parent_names, u8 num_parents, struct gpio_desc *gpiod, + unsigned long flags); void clk_hw_unregister_gpio_mux(struct clk_hw *hw); /** -- cgit v1.2.3 From d4d7b4ad2f05c03fb25252aea66f9f3cd7cfbe06 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 26 Oct 2017 10:44:07 +0100 Subject: irqchip/gic-v3-its: Setup VLPI properties at map time So far, we require the hypervisor to update the VLPI properties once the VLPI mapping has been established. While this makes it easy for the ITS driver, it creates a window where an incoming interrupt can be delivered with an unknown set of properties. Not very nice. Instead, let's add a "properties" field to the mapping structure, and use that to configure the VLPI before it actually gets mapped. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 18 ++++++++++++++++-- include/linux/irqchip/arm-gic-v4.h | 2 ++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 6a74f0497f82..29b2ff5c6841 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1008,9 +1008,15 @@ static void lpi_write_config(struct irq_data *d, u8 clr, u8 set) if (irqd_is_forwarded_to_vcpu(d)) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); + struct its_vlpi_map *map; prop_page = its_dev->event_map.vm->vprop_page; - hwirq = its_dev->event_map.vlpi_maps[event].vintid; + map = &its_dev->event_map.vlpi_maps[event]; + hwirq = map->vintid; + + /* Remember the updated property */ + map->properties &= ~clr; + map->properties |= set | LPI_PROP_GROUP1; } else { prop_page = gic_rdists->prop_page; hwirq = d->hwirq; @@ -1249,12 +1255,20 @@ static int its_vlpi_map(struct irq_data *d, struct its_cmd_info *info) /* Ensure all the VPEs are mapped on this ITS */ its_map_vm(its_dev->its, info->map->vm); + /* + * Flag the interrupt as forwarded so that we can + * start poking the virtual property table.
+ */ + irqd_set_forwarded_to_vcpu(d); + + /* Write out the property to the prop table */ + lpi_write_config(d, 0xff, info->map->properties); + /* Drop the physical mapping */ its_send_discard(its_dev, event); /* and install the virtual one */ its_send_vmapti(its_dev, event); - irqd_set_forwarded_to_vcpu(d); /* Increment the number of VLPIs */ its_dev->event_map.nr_vlpis++; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 43cde15f221b..447da8ca2156 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -71,12 +71,14 @@ struct its_vpe { * @vm: Pointer to the GICv4 notion of a VM * @vpe: Pointer to the GICv4 notion of a virtual CPU (VPE) * @vintid: Virtual LPI number + * @properties: Priority and enable bits (as written in the prop table) * @db_enabled: Is the VPE doorbell to be generated? */ struct its_vlpi_map { struct its_vm *vm; struct its_vpe *vpe; u32 vintid; + u8 properties; bool db_enabled; }; -- cgit v1.2.3 From 4b82130077d93539c9fbb0f5eee21965cea9cfe9 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Mon, 30 Oct 2017 10:15:00 +0800 Subject: irqdomain: Update the comments of fwnode field of irq_domain structure Commit: f110711a6053 ("irqdomain: Convert irqdomain->of_node to fwnode") converted of_node field to fwnode, but didn't update its comments. Update it. Fixes: f110711a6053 ("irqdomain: Convert irqdomain->of_node to fwnode") Signed-off-by: Dou Liyang Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index df162f7a4aad..ce48a23d621f 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -138,8 +138,8 @@ struct irq_domain_chip_generic; * @mapcount: The number of mapped interrupts * * Optional elements - * @of_node: Pointer to device tree nodes associated with the irq_domain. Used - * when decoding device tree interrupt specifiers. + * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy + * to swap it for the of_node via the irq_domain_get_of_node accessor * @gc: Pointer to a list of generic chips. There is a helper function for * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. -- cgit v1.2.3 From da61fcf9d62a05f3508f5646d353a9c2604bac76 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:45 -0700 Subject: irqchip: mips-gic: Use irq_cpu_online to (un)mask all-VP(E) IRQs The gic_all_vpes_local_irq_controller chip currently attempts to operate on all CPUs/VPs in the system when masking or unmasking an interrupt. This has a few drawbacks: - In multi-cluster systems we may not always have access to all CPUs in the system. When all CPUs in a cluster are powered down that cluster's GIC may also power down, in which case we cannot configure its state. - Relatedly, if we power down a cluster after having configured interrupts for CPUs within it then the cluster's GIC may lose state & we need to reconfigure it. The current approach doesn't take this into account. - It's wasteful if we run Linux on fewer VPs than are present in the system. For example if we run a uniprocessor kernel on CPU0 of a system with 16 CPUs then there's no point in us configuring CPUs 1-15.
- The implementation is also lacking in that it expects the range 0..gic_vpes-1 to represent valid Linux CPU numbers which may not always be the case - for example if we run on a system with more VPs than the kernel is configured to support. Fix all of these issues by only configuring the affected interrupts for CPUs which are online at the time, and recording the configuration in a new struct gic_all_vpes_chip_data for later use by CPUs being brought online. We register a CPU hotplug state (reusing CPUHP_AP_IRQ_GIC_STARTING which the ARM GIC driver uses, and which seems suitably generic for reuse with the MIPS GIC) and execute irq_cpu_online() in order to configure the interrupts on the newly onlined CPU. Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 72 ++++++++++++++++++++++++++++++++---------- include/linux/cpuhotplug.h | 1 + 2 files changed, 57 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 6fdcc1552fab..60f644279803 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include @@ -55,6 +56,11 @@ static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller; DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS); DECLARE_BITMAP(ipi_available, GIC_MAX_INTRS); +static struct gic_all_vpes_chip_data { + u32 map; + bool mask; +} gic_all_vpes_chip_data[GIC_NUM_LOCAL_INTRS]; + static void gic_clear_pcpu_masks(unsigned int intr) { unsigned int i; @@ -338,13 +344,17 @@ static struct irq_chip gic_local_irq_controller = { static void gic_mask_local_irq_all_vpes(struct irq_data *d) { - int intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); - int i; + struct gic_all_vpes_chip_data *cd; unsigned long flags; + int intr, cpu; + + intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); + cd = irq_data_get_irq_chip_data(d); + cd->mask = false; spin_lock_irqsave(&gic_lock, flags); - for (i = 0; i < gic_vpes; i++) { - write_gic_vl_other(mips_cm_vp_id(i)); + for_each_online_cpu(cpu) { + write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_rmask(BIT(intr)); } spin_unlock_irqrestore(&gic_lock, flags); @@ -352,22 +362,40 @@ static void gic_mask_local_irq_all_vpes(struct irq_data *d) static void gic_unmask_local_irq_all_vpes(struct irq_data *d) { - int intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); - int i; + struct gic_all_vpes_chip_data *cd; unsigned long flags; + int intr, cpu; + + intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); + cd = irq_data_get_irq_chip_data(d); + cd->mask = true; spin_lock_irqsave(&gic_lock, flags); - for (i = 0; i < gic_vpes; i++) { - write_gic_vl_other(mips_cm_vp_id(i)); + for_each_online_cpu(cpu) { + write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_smask(BIT(intr)); } spin_unlock_irqrestore(&gic_lock, flags); } +static void gic_all_vpes_irq_cpu_online(struct irq_data *d) +{ + struct gic_all_vpes_chip_data *cd; + unsigned int intr; + + intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); + cd = irq_data_get_irq_chip_data(d); + + write_gic_vl_map(intr, cd->map); + if (cd->mask) + write_gic_vl_smask(BIT(intr)); +} + static struct irq_chip gic_all_vpes_local_irq_controller = { - .name = "MIPS GIC Local", - .irq_mask = gic_mask_local_irq_all_vpes, - .irq_unmask = gic_unmask_local_irq_all_vpes, + .name = "MIPS GIC Local", + .irq_mask = gic_mask_local_irq_all_vpes, + .irq_unmask = gic_unmask_local_irq_all_vpes, + .irq_cpu_online = 
gic_all_vpes_irq_cpu_online, }; static void __gic_irq_dispatch(void) @@ -424,9 +452,10 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hwirq) { + struct gic_all_vpes_chip_data *cd; unsigned long flags; unsigned int intr; - int err, i; + int err, cpu; u32 map; if (hwirq >= GIC_SHARED_HWIRQ_BASE) { @@ -459,9 +488,11 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, * the rest of the MIPS kernel code does not use the * percpu IRQ API for them. */ + cd = &gic_all_vpes_chip_data[intr]; + cd->map = map; err = irq_domain_set_hwirq_and_chip(d, virq, hwirq, &gic_all_vpes_local_irq_controller, - NULL); + cd); if (err) return err; @@ -484,8 +515,8 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, return -EPERM; spin_lock_irqsave(&gic_lock, flags); - for (i = 0; i < gic_vpes; i++) { - write_gic_vl_other(mips_cm_vp_id(i)); + for_each_online_cpu(cpu) { + write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_map(intr, map); } spin_unlock_irqrestore(&gic_lock, flags); @@ -622,6 +653,13 @@ static const struct irq_domain_ops gic_ipi_domain_ops = { .match = gic_ipi_domain_match, }; +static int gic_cpu_startup(unsigned int cpu) +{ + /* Invoke irq_cpu_online callbacks to enable desired interrupts */ + irq_cpu_online(); + + return 0; +} static int __init gic_of_init(struct device_node *node, struct device_node *parent) @@ -768,6 +806,8 @@ static int __init gic_of_init(struct device_node *node, } } - return 0; + return cpuhp_setup_state(CPUHP_AP_IRQ_MIPS_GIC_STARTING, + "irqchip/mips/gic:starting", + gic_cpu_startup, NULL); } IRQCHIP_DECLARE(mips_gic, "mti,gic", gic_of_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6d508767e144..1966a45bc453 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -98,6 +98,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_HIP04_STARTING, CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, + CPUHP_AP_IRQ_MIPS_GIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, -- cgit v1.2.3 From aa9ad44a42b4cf4387f8ecddaf8e51707fdcda5a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Aug 2017 12:48:26 -0700 Subject: libnvdimm: move poison list functions to a new 'badrange' file nfit_test needs to use the poison list manipulation code as well. Make it more generic and in the process rename poison to badrange, and move all the related helpers to a new file. 
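As a hedged sketch of the generalized interface this produces (illustration only; the embedding structure and the addresses below are invented, see the badrange_*() definitions in the diff that follows), a user embeds a struct badrange, initializes it once, and then records or clears ranges:

    #include <linux/libnvdimm.h>

    static struct badrange foo_badrange; /* hypothetical user */

    static int foo_track_poison(void)
    {
    	int rc;

    	badrange_init(&foo_badrange);

    	/* Record a 512-byte bad range starting at address 0x1000... */
    	rc = badrange_add(&foo_badrange, 0x1000, 512);
    	if (rc)
    		return rc;

    	/* ...and later drop the first half once it has been repaired. */
    	badrange_forget(&foo_badrange, 0x1000, 256);
    	return 0;
    }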
Signed-off-by: Dave Jiang [vishal: Add badrange.o to nfit_test's Kbuild] [vishal: add a missed include in bus.c for the new badrange functions] [vishal: rename all instances of 'be' to 'bre'] Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 2 +- drivers/acpi/nfit/mce.c | 2 +- drivers/nvdimm/Makefile | 1 + drivers/nvdimm/badrange.c | 294 ++++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/bus.c | 24 ++-- drivers/nvdimm/core.c | 260 +-------------------------------------- drivers/nvdimm/nd-core.h | 3 +- drivers/nvdimm/nd.h | 6 - include/linux/libnvdimm.h | 21 +++- tools/testing/nvdimm/Kbuild | 1 + 10 files changed, 332 insertions(+), 282 deletions(-) create mode 100644 drivers/nvdimm/badrange.c (limited to 'include/linux') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index aa75d67ceee9..8043bfde7c63 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2486,7 +2486,7 @@ static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc, if (ars_status->out_length < 44 + sizeof(struct nd_ars_record) * (i + 1)) break; - rc = nvdimm_bus_add_poison(nvdimm_bus, + rc = nvdimm_bus_add_badrange(nvdimm_bus, ars_status->records[i].err_address, ars_status->records[i].length); if (rc) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index feeb95d574fa..b92921439657 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -67,7 +67,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, continue; /* If this fails due to an -ENOMEM, there is little we can do */ - nvdimm_bus_add_poison(acpi_desc->nvdimm_bus, + nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus, ALIGN(mce->addr, L1_CACHE_BYTES), L1_CACHE_BYTES); nvdimm_region_notify(nfit_spa->nd_region, diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 909554c3f955..ca6d325438a3 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -20,6 +20,7 @@ libnvdimm-y += region_devs.o libnvdimm-y += region.o libnvdimm-y += namespace_devs.o libnvdimm-y += label.o +libnvdimm-y += badrange.o libnvdimm-$(CONFIG_ND_CLAIM) += claim.o libnvdimm-$(CONFIG_BTT) += btt_devs.o libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c new file mode 100644 index 000000000000..0b67dcf43234 --- /dev/null +++ b/drivers/nvdimm/badrange.c @@ -0,0 +1,294 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nd-core.h" +#include "nd.h" + +void badrange_init(struct badrange *badrange) +{ + INIT_LIST_HEAD(&badrange->list); + spin_lock_init(&badrange->lock); +} +EXPORT_SYMBOL_GPL(badrange_init); + +static void append_badrange_entry(struct badrange *badrange, + struct badrange_entry *bre, u64 addr, u64 length) +{ + lockdep_assert_held(&badrange->lock); + bre->start = addr; + bre->length = length; + list_add_tail(&bre->list, &badrange->list); +} + +static int alloc_and_append_badrange_entry(struct badrange *badrange, + u64 addr, u64 length, gfp_t flags) +{ + struct badrange_entry *bre; + + bre = kzalloc(sizeof(*bre), flags); + if (!bre) + return -ENOMEM; + + append_badrange_entry(badrange, bre, addr, length); + return 0; +} + +static int add_badrange(struct badrange *badrange, u64 addr, u64 length) +{ + struct badrange_entry *bre, *bre_new; + + spin_unlock(&badrange->lock); + bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL); + spin_lock(&badrange->lock); + + if (list_empty(&badrange->list)) { + if (!bre_new) + return -ENOMEM; + append_badrange_entry(badrange, bre_new, addr, length); + return 0; + } + + /* + * There is a chance this is a duplicate, check for those first. + * This will be the common case as ARS_STATUS returns all known + * errors in the SPA space, and we can't query it per region + */ + list_for_each_entry(bre, &badrange->list, list) + if (bre->start == addr) { + /* If length has changed, update this list entry */ + if (bre->length != length) + bre->length = length; + kfree(bre_new); + return 0; + } + + /* + * If not a duplicate or a simple length update, add the entry as is, + * as any overlapping ranges will get resolved when the list is consumed + * and converted to badblocks + */ + if (!bre_new) + return -ENOMEM; + append_badrange_entry(badrange, bre_new, addr, length); + + return 0; +} + +int badrange_add(struct badrange *badrange, u64 addr, u64 length) +{ + int rc; + + spin_lock(&badrange->lock); + rc = add_badrange(badrange, addr, length); + spin_unlock(&badrange->lock); + + return rc; +} +EXPORT_SYMBOL_GPL(badrange_add); + +void badrange_forget(struct badrange *badrange, phys_addr_t start, + unsigned int len) +{ + struct list_head *badrange_list = &badrange->list; + u64 clr_end = start + len - 1; + struct badrange_entry *bre, *next; + + spin_lock(&badrange->lock); + WARN_ON_ONCE(list_empty(badrange_list)); + + /* + * [start, clr_end] is the badrange interval being cleared. + * [bre->start, bre_end] is the badrange_list entry we're comparing + * the above interval against. 
The badrange list entry may need + * to be modified (update either start or length), deleted, or + * split into two based on the overlap characteristics + */ + + list_for_each_entry_safe(bre, next, badrange_list, list) { + u64 bre_end = bre->start + bre->length - 1; + + /* Skip intervals with no intersection */ + if (bre_end < start) + continue; + if (bre->start > clr_end) + continue; + /* Delete completely overlapped badrange entries */ + if ((bre->start >= start) && (bre_end <= clr_end)) { + list_del(&bre->list); + kfree(bre); + continue; + } + /* Adjust start point of partially cleared entries */ + if ((start <= bre->start) && (clr_end > bre->start)) { + bre->length -= clr_end - bre->start + 1; + bre->start = clr_end + 1; + continue; + } + /* Adjust bre->length for partial clearing at the tail end */ + if ((bre->start < start) && (bre_end <= clr_end)) { + /* bre->start remains the same */ + bre->length = start - bre->start; + continue; + } + /* + * If clearing in the middle of an entry, we split it into + * two by modifying the current entry to represent one half of + * the split, and adding a new entry for the second half. + */ + if ((bre->start < start) && (bre_end > clr_end)) { + u64 new_start = clr_end + 1; + u64 new_len = bre_end - new_start + 1; + + /* Add new entry covering the right half */ + alloc_and_append_badrange_entry(badrange, new_start, + new_len, GFP_NOWAIT); + /* Adjust this entry to cover the left half */ + bre->length = start - bre->start; + continue; + } + } + spin_unlock(&badrange->lock); +} +EXPORT_SYMBOL_GPL(badrange_forget); + +static void set_badblock(struct badblocks *bb, sector_t s, int num) +{ + dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n", + (u64) s * 512, (u64) num * 512); + /* this isn't an error as the hardware will still throw an exception */ + if (badblocks_set(bb, s, num, 1)) + dev_info_once(bb->dev, "%s: failed for sector %llx\n", + __func__, (u64) s); +} + +/** + * __add_badblock_range() - Convert a physical address range to bad sectors + * @bb: badblocks instance to populate + * @ns_offset: namespace offset where the error range begins (in bytes) + * @len: number of bytes of badrange to be added + * + * This assumes that the range provided with (ns_offset, len) is within + * the bounds of physical addresses for this namespace, i.e. 
lies in the + * interval [ns_start, ns_start + ns_size) + */ +static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) +{ + const unsigned int sector_size = 512; + sector_t start_sector, end_sector; + u64 num_sectors; + u32 rem; + + start_sector = div_u64(ns_offset, sector_size); + end_sector = div_u64_rem(ns_offset + len, sector_size, &rem); + if (rem) + end_sector++; + num_sectors = end_sector - start_sector; + + if (unlikely(num_sectors > (u64)INT_MAX)) { + u64 remaining = num_sectors; + sector_t s = start_sector; + + while (remaining) { + int done = min_t(u64, remaining, INT_MAX); + + set_badblock(bb, s, done); + remaining -= done; + s += done; + } + } else + set_badblock(bb, start_sector, num_sectors); +} + +static void badblocks_populate(struct badrange *badrange, + struct badblocks *bb, const struct resource *res) +{ + struct badrange_entry *bre; + + if (list_empty(&badrange->list)) + return; + + list_for_each_entry(bre, &badrange->list, list) { + u64 bre_end = bre->start + bre->length - 1; + + /* Discard intervals with no intersection */ + if (bre_end < res->start) + continue; + if (bre->start > res->end) + continue; + /* Deal with any overlap after start of the namespace */ + if (bre->start >= res->start) { + u64 start = bre->start; + u64 len; + + if (bre_end <= res->end) + len = bre->length; + else + len = res->start + resource_size(res) + - bre->start; + __add_badblock_range(bb, start - res->start, len); + continue; + } + /* + * Deal with overlap for badrange starting before + * the namespace. + */ + if (bre->start < res->start) { + u64 len; + + if (bre_end < res->end) + len = bre->start + bre->length - res->start; + else + len = resource_size(res); + __add_badblock_range(bb, 0, len); + } + } +} + +/** + * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks + * @region: parent region of the range to interrogate + * @bb: badblocks instance to populate + * @res: resource range to consider + * + * The badrange list generated during bus initialization may contain + * multiple, possibly overlapping physical address ranges. Compare each + * of these ranges to the resource range currently being initialized, + * and add badblocks entries for all matching sub-ranges + */ +void nvdimm_badblocks_populate(struct nd_region *nd_region, + struct badblocks *bb, const struct resource *res) +{ + struct nvdimm_bus *nvdimm_bus; + + if (!is_memory(&nd_region->dev)) { + dev_WARN_ONCE(&nd_region->dev, 1, + "%s only valid for pmem regions\n", __func__); + return; + } + nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + + nvdimm_bus_lock(&nvdimm_bus->dev); + badblocks_populate(&nvdimm_bus->badrange, bb, res); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index baf283986a7e..0a5e6cd758fe 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -11,6 +11,7 @@ * General Public License for more details. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t phys, u64 cleared) { if (cleared > 0) - nvdimm_forget_poison(nvdimm_bus, phys, cleared); + badrange_forget(&nvdimm_bus->badrange, phys, cleared); if (cleared > 0 && cleared / 512) nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); @@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); INIT_LIST_HEAD(&nvdimm_bus->mapping_list); - INIT_LIST_HEAD(&nvdimm_bus->poison_list); init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); - spin_lock_init(&nvdimm_bus->poison_lock); + badrange_init(&nvdimm_bus->badrange); if (nvdimm_bus->id < 0) { kfree(nvdimm_bus); return NULL; @@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data) return 0; } -static void free_poison_list(struct list_head *poison_list) +static void free_badrange_list(struct list_head *badrange_list) { - struct nd_poison *pl, *next; + struct badrange_entry *bre, *next; - list_for_each_entry_safe(pl, next, poison_list, list) { - list_del(&pl->list); - kfree(pl); + list_for_each_entry_safe(bre, next, badrange_list, list) { + list_del(&bre->list); + kfree(bre); } - list_del_init(poison_list); + list_del_init(badrange_list); } static int nd_bus_remove(struct device *dev) @@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); - spin_lock(&nvdimm_bus->poison_lock); - free_poison_list(&nvdimm_bus->poison_list); - spin_unlock(&nvdimm_bus->poison_lock); + spin_lock(&nvdimm_bus->badrange.lock); + free_badrange_list(&nvdimm_bus->badrange.list); + spin_unlock(&nvdimm_bus->badrange.lock); nvdimm_bus_destroy_ndctl(nvdimm_bus); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index bb71f0cf8f5d..1dc527660637 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = { }; EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); -static void set_badblock(struct badblocks *bb, sector_t s, int num) +int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) { - dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", - (u64) s * 512, (u64) num * 512); - /* this isn't an error as the hardware will still throw an exception */ - if (badblocks_set(bb, s, num, 1)) - dev_info_once(bb->dev, "%s: failed for sector %llx\n", - __func__, (u64) s); + return badrange_add(&nvdimm_bus->badrange, addr, length); } - -/** - * __add_badblock_range() - Convert a physical address range to bad sectors - * @bb: badblocks instance to populate - * @ns_offset: namespace offset where the error range begins (in bytes) - * @len: number of bytes of poison to be added - * - * This assumes that the range provided with (ns_offset, len) is within - * the bounds of physical addresses for this namespace, i.e. 
lies in the - * interval [ns_start, ns_start + ns_size) - */ -static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) -{ - const unsigned int sector_size = 512; - sector_t start_sector, end_sector; - u64 num_sectors; - u32 rem; - - start_sector = div_u64(ns_offset, sector_size); - end_sector = div_u64_rem(ns_offset + len, sector_size, &rem); - if (rem) - end_sector++; - num_sectors = end_sector - start_sector; - - if (unlikely(num_sectors > (u64)INT_MAX)) { - u64 remaining = num_sectors; - sector_t s = start_sector; - - while (remaining) { - int done = min_t(u64, remaining, INT_MAX); - - set_badblock(bb, s, done); - remaining -= done; - s += done; - } - } else - set_badblock(bb, start_sector, num_sectors); -} - -static void badblocks_populate(struct list_head *poison_list, - struct badblocks *bb, const struct resource *res) -{ - struct nd_poison *pl; - - if (list_empty(poison_list)) - return; - - list_for_each_entry(pl, poison_list, list) { - u64 pl_end = pl->start + pl->length - 1; - - /* Discard intervals with no intersection */ - if (pl_end < res->start) - continue; - if (pl->start > res->end) - continue; - /* Deal with any overlap after start of the namespace */ - if (pl->start >= res->start) { - u64 start = pl->start; - u64 len; - - if (pl_end <= res->end) - len = pl->length; - else - len = res->start + resource_size(res) - - pl->start; - __add_badblock_range(bb, start - res->start, len); - continue; - } - /* Deal with overlap for poison starting before the namespace */ - if (pl->start < res->start) { - u64 len; - - if (pl_end < res->end) - len = pl->start + pl->length - res->start; - else - len = resource_size(res); - __add_badblock_range(bb, 0, len); - } - } -} - -/** - * nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks - * @region: parent region of the range to interrogate - * @bb: badblocks instance to populate - * @res: resource range to consider - * - * The poison list generated during bus initialization may contain - * multiple, possibly overlapping physical address ranges. 
Compare each - * of these ranges to the resource range currently being initialized, - * and add badblocks entries for all matching sub-ranges - */ -void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res) -{ - struct nvdimm_bus *nvdimm_bus; - struct list_head *poison_list; - - if (!is_memory(&nd_region->dev)) { - dev_WARN_ONCE(&nd_region->dev, 1, - "%s only valid for pmem regions\n", __func__); - return; - } - nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); - poison_list = &nvdimm_bus->poison_list; - - nvdimm_bus_lock(&nvdimm_bus->dev); - badblocks_populate(poison_list, bb, res); - nvdimm_bus_unlock(&nvdimm_bus->dev); -} -EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); - -static void append_poison_entry(struct nvdimm_bus *nvdimm_bus, - struct nd_poison *pl, u64 addr, u64 length) -{ - lockdep_assert_held(&nvdimm_bus->poison_lock); - pl->start = addr; - pl->length = length; - list_add_tail(&pl->list, &nvdimm_bus->poison_list); -} - -static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, - gfp_t flags) -{ - struct nd_poison *pl; - - pl = kzalloc(sizeof(*pl), flags); - if (!pl) - return -ENOMEM; - - append_poison_entry(nvdimm_bus, pl, addr, length); - return 0; -} - -static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) -{ - struct nd_poison *pl, *pl_new; - - spin_unlock(&nvdimm_bus->poison_lock); - pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL); - spin_lock(&nvdimm_bus->poison_lock); - - if (list_empty(&nvdimm_bus->poison_list)) { - if (!pl_new) - return -ENOMEM; - append_poison_entry(nvdimm_bus, pl_new, addr, length); - return 0; - } - - /* - * There is a chance this is a duplicate, check for those first. - * This will be the common case as ARS_STATUS returns all known - * errors in the SPA space, and we can't query it per region - */ - list_for_each_entry(pl, &nvdimm_bus->poison_list, list) - if (pl->start == addr) { - /* If length has changed, update this list entry */ - if (pl->length != length) - pl->length = length; - kfree(pl_new); - return 0; - } - - /* - * If not a duplicate or a simple length update, add the entry as is, - * as any overlapping ranges will get resolved when the list is consumed - * and converted to badblocks - */ - if (!pl_new) - return -ENOMEM; - append_poison_entry(nvdimm_bus, pl_new, addr, length); - - return 0; -} - -int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) -{ - int rc; - - spin_lock(&nvdimm_bus->poison_lock); - rc = bus_add_poison(nvdimm_bus, addr, length); - spin_unlock(&nvdimm_bus->poison_lock); - - return rc; -} -EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); - -void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, - unsigned int len) -{ - struct list_head *poison_list = &nvdimm_bus->poison_list; - u64 clr_end = start + len - 1; - struct nd_poison *pl, *next; - - spin_lock(&nvdimm_bus->poison_lock); - WARN_ON_ONCE(list_empty(poison_list)); - - /* - * [start, clr_end] is the poison interval being cleared. - * [pl->start, pl_end] is the poison_list entry we're comparing - * the above interval against. 
The poison list entry may need - * to be modified (update either start or length), deleted, or - * split into two based on the overlap characteristics - */ - - list_for_each_entry_safe(pl, next, poison_list, list) { - u64 pl_end = pl->start + pl->length - 1; - - /* Skip intervals with no intersection */ - if (pl_end < start) - continue; - if (pl->start > clr_end) - continue; - /* Delete completely overlapped poison entries */ - if ((pl->start >= start) && (pl_end <= clr_end)) { - list_del(&pl->list); - kfree(pl); - continue; - } - /* Adjust start point of partially cleared entries */ - if ((start <= pl->start) && (clr_end > pl->start)) { - pl->length -= clr_end - pl->start + 1; - pl->start = clr_end + 1; - continue; - } - /* Adjust pl->length for partial clearing at the tail end */ - if ((pl->start < start) && (pl_end <= clr_end)) { - /* pl->start remains the same */ - pl->length = start - pl->start; - continue; - } - /* - * If clearing in the middle of an entry, we split it into - * two by modifying the current entry to represent one half of - * the split, and adding a new entry for the second half. - */ - if ((pl->start < start) && (pl_end > clr_end)) { - u64 new_start = clr_end + 1; - u64 new_len = pl_end - new_start + 1; - - /* Add new entry covering the right half */ - add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT); - /* Adjust this entry to cover the left half */ - pl->length = start - pl->start; - continue; - } - } - spin_unlock(&nvdimm_bus->poison_lock); -} -EXPORT_SYMBOL_GPL(nvdimm_forget_poison); +EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange); #ifdef CONFIG_BLK_DEV_INTEGRITY int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 86bc19ae30da..79274ead54fb 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -29,10 +29,9 @@ struct nvdimm_bus { struct list_head list; struct device dev; int id, probe_active; - struct list_head poison_list; struct list_head mapping_list; struct mutex reconfig_mutex; - spinlock_t poison_lock; + struct badrange badrange; }; struct nvdimm { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 156be00e1f76..e958f3724c41 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -34,12 +34,6 @@ enum { NVDIMM_IO_ATOMIC = 1, }; -struct nd_poison { - u64 start; - u64 length; - struct list_head list; -}; - struct nvdimm_drvdata { struct device *dev; int nslabel_size; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 3eaad2fbf284..f8109ddb5ef1 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -18,6 +18,18 @@ #include #include #include +#include + +struct badrange_entry { + u64 start; + u64 length; + struct list_head list; +}; + +struct badrange { + struct list_head list; + spinlock_t lock; +}; enum { /* when a dimm supports both PMEM and BLK access a label is required */ @@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } -int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); -void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, - phys_addr_t start, unsigned int len); +void badrange_init(struct badrange *badrange); +int badrange_add(struct badrange *badrange, u64 addr, u64 length); +void badrange_forget(struct badrange *badrange, phys_addr_t start, + unsigned int len); +int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, + u64 length); struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct 
nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index d870520da68b..3272ab5a5cb9 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -69,6 +69,7 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o libnvdimm-y += $(NVDIMM_SRC)/region.o libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o libnvdimm-y += $(NVDIMM_SRC)/label.o +libnvdimm-y += $(NVDIMM_SRC)/badrange.o libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o -- cgit v1.2.3 From 054287295b1132c8742ea55f8e3af9cbd630c932 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Thu, 2 Nov 2017 10:36:48 +0100 Subject: net: Define eth_stp_addr in linux/etherdevice.h The lan9303 driver defines eth_stp_addr as a synonym to eth_reserved_addr_base to get the STP ethernet address 01:80:c2:00:00:00. eth_reserved_addr_base is also used to define the start of the Bridge Reserved ethernet address range, which happens to be the STP address. br_dev_setup refers to eth_reserved_addr_base as the definition of the STP address. Clean up by: - Move the eth_stp_addr definition to linux/etherdevice.h - Use eth_stp_addr instead of eth_reserved_addr_base in br_dev_setup. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/dsa/lan9303.h | 2 -- include/linux/etherdevice.h | 1 + net/bridge/br_device.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index b2110e69630f..05d8d136baab 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -34,5 +34,3 @@ struct lan9303 { **/ struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; }; - -#define eth_stp_addr eth_reserved_addr_base diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2d9f80848d4b..263dbcad22fc 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -66,6 +66,7 @@ int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; +#define eth_stp_addr eth_reserved_addr_base /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 28bb22186fa0..af5b8c87f590 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -421,7 +421,7 @@ void br_dev_setup(struct net_device *dev) br->bridge_id.prio[0] = 0x80; br->bridge_id.prio[1] = 0x00; - ether_addr_copy(br->group_addr, eth_reserved_addr_base); + ether_addr_copy(br->group_addr, eth_stp_addr); br->stp_enabled = BR_NO_STP; br->group_fwd_mask = BR_GROUPFWD_DEFAULT; -- cgit v1.2.3 From 17c0899682a55666e58f2200d5599f3a7b22867f Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Thu, 2 Nov 2017 17:21:41 -0700 Subject: usb: remove msm_hsusb_hw.h The last two remaining drivers (ehci-msm.c and phy-msm-usb.c) that needed this header were recently removed, so delete this now-unused file.
Signed-off-by: Jack Pham Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/msm_hsusb_hw.h | 77 ---------------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 include/linux/usb/msm_hsusb_hw.h (limited to 'include/linux') diff --git a/include/linux/usb/msm_hsusb_hw.h b/include/linux/usb/msm_hsusb_hw.h deleted file mode 100644 index 974c3796a23f..000000000000 --- a/include/linux/usb/msm_hsusb_hw.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (C) 2007 Google, Inc. - * Author: Brian Swetland - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef __LINUX_USB_GADGET_MSM72K_UDC_H__ -#define __LINUX_USB_GADGET_MSM72K_UDC_H__ - -/* USB phy selector - in TCSR address range */ -#define USB2_PHY_SEL 0xfd4ab000 - -#define USB_AHBBURST (MSM_USB_BASE + 0x0090) -#define USB_AHBMODE (MSM_USB_BASE + 0x0098) -#define USB_GENCONFIG_2 (MSM_USB_BASE + 0x00a0) -#define ULPI_TX_PKT_EN_CLR_FIX BIT(19) - -#define USB_CAPLENGTH (MSM_USB_BASE + 0x0100) /* 8 bit */ - -#define USB_USBCMD (MSM_USB_BASE + 0x0140) -#define USB_PORTSC (MSM_USB_BASE + 0x0184) -#define USB_OTGSC (MSM_USB_BASE + 0x01A4) -#define USB_USBMODE (MSM_USB_BASE + 0x01A8) -#define USB_PHY_CTRL (MSM_USB_BASE + 0x0240) -#define USB_PHY_CTRL2 (MSM_USB_BASE + 0x0278) - -#define GENCONFIG_2_SESS_VLD_CTRL_EN BIT(7) -#define USBCMD_SESS_VLD_CTRL BIT(25) - -#define USBCMD_RESET 2 -#define USB_USBINTR (MSM_USB_BASE + 0x0148) - -#define PORTSC_PHCD (1 << 23) /* phy suspend mode */ -#define PORTSC_PTS_MASK (3 << 30) -#define PORTSC_PTS_ULPI (2 << 30) -#define PORTSC_PTS_SERIAL (3 << 30) - -#define USB_ULPI_VIEWPORT (MSM_USB_BASE + 0x0170) -#define ULPI_RUN (1 << 30) -#define ULPI_WRITE (1 << 29) -#define ULPI_READ (0 << 29) -#define ULPI_ADDR(n) (((n) & 255) << 16) -#define ULPI_DATA(n) ((n) & 255) -#define ULPI_DATA_READ(n) (((n) >> 8) & 255) - -/* synopsys 28nm phy registers */ -#define ULPI_PWR_CLK_MNG_REG 0x88 -#define OTG_COMP_DISABLE BIT(0) - -#define ULPI_MISC_A 0x96 -#define ULPI_MISC_A_VBUSVLDEXTSEL BIT(1) -#define ULPI_MISC_A_VBUSVLDEXT BIT(0) - -#define ASYNC_INTR_CTRL (1 << 29) /* Enable async interrupt */ -#define ULPI_STP_CTRL (1 << 30) /* Block communication with PHY */ -#define PHY_RETEN (1 << 1) /* PHY retention enable/disable */ -#define PHY_POR_ASSERT (1 << 0) /* USB2 28nm PHY POR ASSERT */ - -/* OTG definitions */ -#define OTGSC_INTSTS_MASK (0x7f << 16) -#define OTGSC_ID (1 << 8) -#define OTGSC_BSV (1 << 11) -#define OTGSC_IDIS (1 << 16) -#define OTGSC_BSVIS (1 << 19) -#define OTGSC_IDIE (1 << 24) -#define OTGSC_BSVIE (1 << 27) - -#endif /* __LINUX_USB_GADGET_MSM72K_UDC_H__ */ -- cgit v1.2.3 From ec7ed7708e009e046d1e16ed53ba4d6048748d07 Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Sat, 28 Oct 2017 00:23:01 +0200 Subject: spi: spi-fsl-dspi: enabling Coldfire mcf5441x dspi Signed-off-by: Angelo Dureghello Signed-off-by: Mark Brown --- drivers/spi/Kconfig | 2 +- drivers/spi/spi-fsl-dspi.c | 66 +++++++++++++++++++++++++++------------- include/linux/spi/spi-fsl-dspi.h | 31 +++++++++++++++++++ 3 files changed, 77 insertions(+), 22 deletions(-) create mode 100644 
include/linux/spi/spi-fsl-dspi.h (limited to 'include/linux') diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index a75f2a2cf780..a8b761979673 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -379,7 +379,7 @@ config SPI_FSL_DSPI tristate "Freescale DSPI controller" select REGMAP_MMIO depends on HAS_DMA - depends on SOC_VF610 || SOC_LS1021A || ARCH_LAYERSCAPE || COMPILE_TEST + depends on SOC_VF610 || SOC_LS1021A || ARCH_LAYERSCAPE || M5441x || COMPILE_TEST help This enables support for the Freescale DSPI controller in master mode. VF610 platform uses the controller. diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index d89127f4a46d..f652f70cb8db 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -151,6 +152,11 @@ static const struct fsl_dspi_devtype_data ls2085a_data = { .max_clock_factor = 8, }; +static const struct fsl_dspi_devtype_data coldfire_data = { + .trans_mode = DSPI_EOQ_MODE, + .max_clock_factor = 8, +}; + struct fsl_dspi_dma { /* Length of transfer in words of DSPI_FIFO_SIZE */ u32 curr_xfer_len; @@ -741,6 +747,7 @@ static int dspi_setup(struct spi_device *spi) { struct chip_data *chip; struct fsl_dspi *dspi = spi_master_get_devdata(spi->master); + struct fsl_dspi_platform_data *pdata; u32 cs_sck_delay = 0, sck_cs_delay = 0; unsigned char br = 0, pbr = 0, pcssck = 0, cssck = 0; unsigned char pasc = 0, asc = 0, fmsz = 0; @@ -761,11 +768,18 @@ static int dspi_setup(struct spi_device *spi) return -ENOMEM; } - of_property_read_u32(spi->dev.of_node, "fsl,spi-cs-sck-delay", - &cs_sck_delay); + pdata = dev_get_platdata(&dspi->pdev->dev); - of_property_read_u32(spi->dev.of_node, "fsl,spi-sck-cs-delay", - &sck_cs_delay); + if (!pdata) { + of_property_read_u32(spi->dev.of_node, "fsl,spi-cs-sck-delay", + &cs_sck_delay); + + of_property_read_u32(spi->dev.of_node, "fsl,spi-sck-cs-delay", + &sck_cs_delay); + } else { + cs_sck_delay = pdata->cs_sck_delay; + sck_cs_delay = pdata->sck_cs_delay; + } chip->mcr_val = SPI_MCR_MASTER | SPI_MCR_PCSIS | SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF; @@ -949,6 +963,7 @@ static int dspi_probe(struct platform_device *pdev) struct fsl_dspi *dspi; struct resource *res; void __iomem *base; + struct fsl_dspi_platform_data *pdata; int ret = 0, cs_num, bus_num; master = spi_alloc_master(&pdev->dev, sizeof(struct fsl_dspi)); @@ -969,25 +984,34 @@ static int dspi_probe(struct platform_device *pdev) master->bits_per_word_mask = SPI_BPW_MASK(4) | SPI_BPW_MASK(8) | SPI_BPW_MASK(16); - ret = of_property_read_u32(np, "spi-num-chipselects", &cs_num); - if (ret < 0) { - dev_err(&pdev->dev, "can't get spi-num-chipselects\n"); - goto out_master_put; - } - master->num_chipselect = cs_num; + pdata = dev_get_platdata(&pdev->dev); + if (pdata) { + master->num_chipselect = pdata->cs_num; + master->bus_num = pdata->bus_num; - ret = of_property_read_u32(np, "bus-num", &bus_num); - if (ret < 0) { - dev_err(&pdev->dev, "can't get bus-num\n"); - goto out_master_put; - } - master->bus_num = bus_num; + dspi->devtype_data = &coldfire_data; + } else { - dspi->devtype_data = of_device_get_match_data(&pdev->dev); - if (!dspi->devtype_data) { - dev_err(&pdev->dev, "can't get devtype_data\n"); - ret = -EFAULT; - goto out_master_put; + ret = of_property_read_u32(np, "spi-num-chipselects", &cs_num); + if (ret < 0) { + dev_err(&pdev->dev, "can't get spi-num-chipselects\n"); + goto out_master_put; + } + master->num_chipselect = cs_num; + + ret = 
of_property_read_u32(np, "bus-num", &bus_num); + if (ret < 0) { + dev_err(&pdev->dev, "can't get bus-num\n"); + goto out_master_put; + } + master->bus_num = bus_num; + + dspi->devtype_data = of_device_get_match_data(&pdev->dev); + if (!dspi->devtype_data) { + dev_err(&pdev->dev, "can't get devtype_data\n"); + ret = -EFAULT; + goto out_master_put; + } } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); diff --git a/include/linux/spi/spi-fsl-dspi.h b/include/linux/spi/spi-fsl-dspi.h new file mode 100644 index 000000000000..74c9bae20bf2 --- /dev/null +++ b/include/linux/spi/spi-fsl-dspi.h @@ -0,0 +1,31 @@ +/* + * Freescale DSPI controller driver + * + * Copyright (c) 2017 Angelo Dureghello + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef SPI_FSL_DSPI_HEADER_H +#define SPI_FSL_DSPI_HEADER_H + +/** + * struct fsl_dspi_platform_data - platform data for the Freescale DSPI driver + * @bus_num: board specific identifier for this DSPI driver. + * @cs_num: number of chip selects supported by this DSPI driver. + */ +struct fsl_dspi_platform_data { + u32 cs_num; + u32 bus_num; + u32 sck_cs_delay; + u32 cs_sck_delay; +}; + +#endif /* SPI_FSL_DSPI_HEADER_H */ -- cgit v1.2.3 From 46209401f8f6116bd0b2c2d14a63958e83ffca0b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:25 +0100 Subject: net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath In sch_handle_egress and sch_handle_ingress tp->q is used only in order to update stats. So stats and filter list are the only things that are needed in clsact qdisc fastpath processing. Introduce new mini_Qdisc struct to hold those items. Also, introduce a helper to swap the mini_Qdisc structures in case filter list head changes. This removes need for tp->q usage without added overhead. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 9 ++++++--- include/net/sch_generic.h | 32 ++++++++++++++++++++++++++++++++ net/core/dev.c | 21 +++++++++++---------- net/sched/sch_generic.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_ingress.c | 19 ++++++++++++++----- 5 files changed, 109 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5e02f79b2110..7de7656550c2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1559,6 +1559,8 @@ enum netdev_priv_flags { * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one + * @miniq_ingress: ingress/clsact qdisc specific data for + * ingress processing * @ingress_queue: XXX: need comments on this one * @broadcast: hw bcast address * @@ -1576,7 +1578,8 @@ enum netdev_priv_flags { * @tx_global_lock: XXX: need comments on this one * * @xps_maps: XXX: need comments on this one - * + * @miniq_egress: clsact qdisc specific data for + * egress processing * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers @@ -1795,7 +1798,7 @@ struct net_device { void __rcu *rx_handler_data; #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto __rcu *ingress_cl_list; + struct mini_Qdisc __rcu *miniq_ingress; #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS @@ -1826,7 +1829,7 @@ struct net_device { struct xps_dev_maps __rcu *xps_maps; #endif #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto __rcu *egress_cl_list; + struct mini_Qdisc __rcu *miniq_egress; #endif /* These may be needed for future network-power-down code. */ diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f230269e0bfb..c64e62c9450a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -904,4 +904,36 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } +/* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. + * The fast path only needs to access filter list and to update stats + */ +struct mini_Qdisc { + struct tcf_proto *filter_list; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + struct rcu_head rcu; +}; + +static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, + const struct sk_buff *skb) +{ + bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); +} + +static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) +{ + this_cpu_inc(miniq->cpu_qstats->drops); +} + +struct mini_Qdisc_pair { + struct mini_Qdisc miniq1; + struct mini_Qdisc miniq2; + struct mini_Qdisc __rcu **p_miniq; +}; + +void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, + struct tcf_proto *tp_head); +void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, + struct mini_Qdisc __rcu **p_miniq); + #endif diff --git a/net/core/dev.c b/net/core/dev.c index 24ac9083bc13..1423cf4d695c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3274,22 +3274,22 @@ EXPORT_SYMBOL(dev_loopback_xmit); static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); + struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; - if (!cl) + if (!miniq) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. 
*/ - qdisc_bstats_cpu_update(cl->q, skb); + mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_cpu_drop(cl->q); + mini_qdisc_qstats_cpu_drop(miniq); *ret = NET_XMIT_DROP; kfree_skb(skb); return NULL; @@ -4189,7 +4189,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); struct tcf_result cl_res; /* If there's at least one ingress present somewhere (so @@ -4197,8 +4197,9 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * that are not configured with an ingress qdisc will bail * out here. */ - if (!cl) + if (!miniq) return skb; + if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; @@ -4206,15 +4207,15 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_at_ingress = 1; - qdisc_bstats_cpu_update(cl->q, skb); + mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_cpu_drop(cl->q); + mini_qdisc_qstats_cpu_drop(miniq); kfree_skb(skb); return NULL; case TC_ACT_STOLEN: diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index aa74aa42b5d7..3839cbbdc32b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1024,3 +1024,49 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, } } EXPORT_SYMBOL(psched_ratecfg_precompute); + +static void mini_qdisc_rcu_func(struct rcu_head *head) +{ +} + +void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, + struct tcf_proto *tp_head) +{ + struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq); + struct mini_Qdisc *miniq; + + if (!tp_head) { + RCU_INIT_POINTER(*miniqp->p_miniq, NULL); + return; + } + + miniq = !miniq_old || miniq_old == &miniqp->miniq2 ? + &miniqp->miniq1 : &miniqp->miniq2; + + /* We need to make sure that readers won't see the miniq + * we are about to modify. So wait until previous call_rcu_bh callback + * is done. + */ + rcu_barrier_bh(); + miniq->filter_list = tp_head; + rcu_assign_pointer(*miniqp->p_miniq, miniq); + + if (miniq_old) + /* This is counterpart of the rcu barrier above. We need to + * block potential new user of miniq_old until all readers + * are not seeing it. 
+ */ + call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func); +} +EXPORT_SYMBOL(mini_qdisc_pair_swap); + +void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, + struct mini_Qdisc __rcu **p_miniq) +{ + miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; + miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; + miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; + miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; + miniqp->p_miniq = p_miniq; +} +EXPORT_SYMBOL(mini_qdisc_pair_init); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 811845815b8c..5ecc38f35d47 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -21,6 +21,7 @@ struct ingress_sched_data { struct tcf_block *block; struct tcf_block_ext_info block_info; + struct mini_Qdisc_pair miniqp; }; static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) @@ -56,9 +57,9 @@ static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv) { - struct tcf_proto __rcu **p_filter_chain = priv; + struct mini_Qdisc_pair *miniqp = priv; - rcu_assign_pointer(*p_filter_chain, tp_head); + mini_qdisc_pair_swap(miniqp, tp_head); } static int ingress_init(struct Qdisc *sch, struct nlattr *opt) @@ -67,9 +68,11 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); + q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; - q->block_info.chain_head_change_priv = &dev->ingress_cl_list; + q->block_info.chain_head_change_priv = &q->miniqp; err = tcf_block_get_ext(&q->block, sch, &q->block_info); if (err) @@ -128,6 +131,8 @@ struct clsact_sched_data { struct tcf_block *egress_block; struct tcf_block_ext_info ingress_block_info; struct tcf_block_ext_info egress_block_info; + struct mini_Qdisc_pair miniqp_ingress; + struct mini_Qdisc_pair miniqp_egress; }; static unsigned long clsact_find(struct Qdisc *sch, u32 classid) @@ -167,17 +172,21 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); + q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; - q->ingress_block_info.chain_head_change_priv = &dev->ingress_cl_list; + q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress; err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info); if (err) return err; + mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); + q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; - q->egress_block_info.chain_head_change_priv = &dev->egress_cl_list; + q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) -- cgit v1.2.3 From a622c641665c32a879348efb2abc389527479ee4 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Wed, 25 Oct 2017 20:42:57 +0200 Subject: memory: omap-gpmc: Remove deprecated gpmc_update_nand_reg() Deprecated gpmc_update_nand_reg() is no longer used, remove. 
Signed-off-by: Ladislav Michl Signed-off-by: Roger Quadros --- drivers/memory/omap-gpmc.c | 53 +++++++++++++++++++++------------------------- include/linux/omap-gpmc.h | 12 ----------- 2 files changed, 24 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c index f2aef0b87bc6..a385a35c7de9 100644 --- a/drivers/memory/omap-gpmc.c +++ b/drivers/memory/omap-gpmc.c @@ -1075,10 +1075,33 @@ int gpmc_configure(int cmd, int wval) } EXPORT_SYMBOL(gpmc_configure); -void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs) +static bool gpmc_nand_writebuffer_empty(void) +{ + if (gpmc_read_reg(GPMC_STATUS) & GPMC_STATUS_EMPTYWRITEBUFFERSTATUS) + return true; + + return false; +} + +static struct gpmc_nand_ops nand_ops = { + .nand_writebuffer_empty = gpmc_nand_writebuffer_empty, +}; + +/** + * gpmc_omap_get_nand_ops - Get the GPMC NAND interface + * @regs: the GPMC NAND register map exclusive for NAND use. + * @cs: GPMC chip select number on which the NAND sits. The + * register map returned will be specific to this chip select. + * + * Returns NULL on error e.g. invalid cs. + */ +struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *reg, int cs) { int i; + if (cs >= gpmc_cs_num) + return NULL; + reg->gpmc_nand_command = gpmc_base + GPMC_CS0_OFFSET + GPMC_CS_NAND_COMMAND + GPMC_CS_SIZE * cs; reg->gpmc_nand_address = gpmc_base + GPMC_CS0_OFFSET + @@ -1110,34 +1133,6 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs) reg->gpmc_bch_result6[i] = gpmc_base + GPMC_ECC_BCH_RESULT_6 + i * GPMC_BCH_SIZE; } -} - -static bool gpmc_nand_writebuffer_empty(void) -{ - if (gpmc_read_reg(GPMC_STATUS) & GPMC_STATUS_EMPTYWRITEBUFFERSTATUS) - return true; - - return false; -} - -static struct gpmc_nand_ops nand_ops = { - .nand_writebuffer_empty = gpmc_nand_writebuffer_empty, -}; - -/** - * gpmc_omap_get_nand_ops - Get the GPMC NAND interface - * @regs: the GPMC NAND register map exclusive for NAND use. - * @cs: GPMC chip select number on which the NAND sits. The - * register map returned will be specific to this chip select. - * - * Returns NULL on error e.g. invalid cs. - */ -struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *reg, int cs) -{ - if (cs >= gpmc_cs_num) - return NULL; - - gpmc_update_nand_reg(reg, cs); return &nand_ops; } diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h index fd0de00c0d77..edfa280c3d56 100644 --- a/include/linux/omap-gpmc.h +++ b/include/linux/omap-gpmc.h @@ -36,18 +36,6 @@ static inline struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs } #endif /* CONFIG_OMAP_GPMC */ -/*--------------------------------*/ - -/* deprecated APIs */ -#if IS_ENABLED(CONFIG_OMAP_GPMC) -void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs); -#else -static inline void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs) -{ -} -#endif /* CONFIG_OMAP_GPMC */ -/*--------------------------------*/ - extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t, struct gpmc_settings *gpmc_s, struct gpmc_device_timings *dev_t); -- cgit v1.2.3 From 592e254502041f953e84d091eae2c68cba04c10b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 3 Nov 2017 12:21:21 +0100 Subject: mm: Handle 0 flags in _calc_vm_trans() macro _calc_vm_trans() does not handle the situation when some of the passed flags are 0 (which can happen if these VM flags do not make sense for the architecture). Improve the _calc_vm_trans() macro to return 0 in such situation. 
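As an illustrative aside (not part of the patch), the effect of the fixed macro can be reproduced in a standalone userspace sketch; the EX_* constants below are invented for the example and are not any architecture's real MAP_ or VM_ bits:

#include <stdio.h>

/* Copy of the fixed kernel macro, for illustration only. */
#define _calc_vm_trans(x, bit1, bit2) \
	((!(bit1) || !(bit2)) ? 0 : \
	((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
	: ((x) & (bit1)) / ((bit1) / (bit2))))

#define EX_MAP_LOCKED	0x2000UL	/* example mmap-side bit */
#define EX_VM_LOCKED	0x0020UL	/* example vma-side bit */
#define EX_MAP_NOSUPP	0UL		/* flag this "architecture" lacks */

int main(void)
{
	unsigned long flags = EX_MAP_LOCKED;

	/* Bit moves from position 0x2000 to position 0x0020: prints 0x20. */
	printf("%#lx\n", _calc_vm_trans(flags, EX_MAP_LOCKED, EX_VM_LOCKED));

	/* A zero bit is now short-circuited to 0 instead of expanding to a
	 * constant division by zero: prints 0.
	 */
	printf("%#lx\n", _calc_vm_trans(flags, EX_MAP_NOSUPP, EX_VM_LOCKED));
	return 0;
}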
Since all passed flags are constant, this does not add any runtime overhead. Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/mman.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mman.h b/include/linux/mman.h index c8367041fafd..edb6cf6a81ed 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -63,8 +63,9 @@ static inline bool arch_validate_prot(unsigned long prot) * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ + ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ - : ((x) & (bit1)) / ((bit1) / (bit2))) + : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. -- cgit v1.2.3 From 1c9725974074a047f6080eecc62c50a8e840d050 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 1 Nov 2017 16:36:30 +0100 Subject: mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap flags The mmap(2) syscall suffers from the ABI anti-pattern of not validating unknown flags. However, proposals like MAP_SYNC need a mechanism to define new behavior that is known to fail on older kernels without the support. Define a new MAP_SHARED_VALIDATE flag pattern that is guaranteed to fail on all legacy mmap implementations. It is worth noting that the original proposal was for a standalone MAP_VALIDATE flag. However, when that could not be supported by all archs Linus observed: I see why you *think* you want a bitmap. You think you want a bitmap because you want to make MAP_VALIDATE be part of MAP_SYNC etc, so that people can do ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_SYNC, fd, 0); and "know" that MAP_SYNC actually takes. And I'm saying that whole wish is bogus. You're fundamentally depending on special semantics, just make it explicit. It's already not portable, so don't try to make it so. Rename that MAP_VALIDATE as MAP_SHARED_VALIDATE, make it have a value of 0x3, and make people do ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0); and then the kernel side is easier too (none of that random garbage playing games with looking at the "MAP_VALIDATE bit", but just another case statement in that map type thing. Boom. Done. Similar to ->fallocate() we also want the ability to validate the support for new flags on a per ->mmap() 'struct file_operations' instance basis. Towards that end arrange for flags to be generically validated against a mmap_supported_flags exported by 'struct file_operations'. By default all existing flags are implicitly supported, but new flags require MAP_SHARED_VALIDATE and per-instance-opt-in. 
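As a hedged illustration of the intended userspace calling convention (not part of the patch; map_with_sync() is an invented name, and the sketch assumes a libc that exposes MAP_SHARED_VALIDATE and MAP_SYNC, otherwise they come from <linux/mman.h>):

#include <errno.h>
#include <stddef.h>
#include <sys/mman.h>

/* Try to establish a mapping with synchronous-fault semantics and
 * fall back to a plain shared mapping if the kernel or filesystem
 * rejects the new flags with EOPNOTSUPP.
 */
static void *map_with_sync(int fd, size_t len)
{
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);

	if (addr == MAP_FAILED && errno == EOPNOTSUPP)
		/* No MAP_SYNC support: the caller must keep using
		 * fsync()/msync() for persistence.
		 */
		addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
	return addr;
}

A caller that cannot tolerate the fallback would instead treat MAP_FAILED as a hard error, which is exactly the "know that MAP_SYNC actually takes" property described above.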
Cc: Jan Kara Cc: Arnd Bergmann Cc: Andy Lutomirski Cc: Andrew Morton Suggested-by: Christoph Hellwig Suggested-by: Linus Torvalds Reviewed-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h | 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 1 + include/linux/fs.h | 1 + include/linux/mman.h | 39 ++++++++++++++++++++++++++++ include/uapi/asm-generic/mman-common.h | 1 + mm/mmap.c | 15 +++++++++++ tools/include/uapi/asm-generic/mman-common.h | 1 + 9 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 3b26cc62dadb..f6d118aaedb9 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -11,6 +11,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping (OSF/1 is _wrong_) */ #define MAP_FIXED 0x100 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index da3216007fe0..93268e4cd3c7 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -28,6 +28,7 @@ */ #define MAP_SHARED 0x001 /* Share changes */ #define MAP_PRIVATE 0x002 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 775b5d5e41a1..bca652aa1677 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -11,6 +11,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x03 /* Mask for type of mapping */ #define MAP_FIXED 0x04 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index b15b278aa314..9ab426374714 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -35,6 +35,7 @@ */ #define MAP_SHARED 0x001 /* Share changes */ #define MAP_PRIVATE 0x002 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 13dab191a23e..57added3201d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1701,6 +1701,7 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); + unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); diff --git a/include/linux/mman.h b/include/linux/mman.h index edb6cf6a81ed..74452e3f2536 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -7,6 +7,45 @@ #include #include +/* + * 
Arrange for legacy / undefined architecture specific flags to be + * ignored by default in LEGACY_MAP_MASK. + */ +#ifndef MAP_32BIT +#define MAP_32BIT 0 +#endif +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB 0 +#endif +#ifndef MAP_HUGE_1GB +#define MAP_HUGE_1GB 0 +#endif +#ifndef MAP_UNINITIALIZED +#define MAP_UNINITIALIZED 0 +#endif + +/* + * The historical set of flags that all mmap implementations implicitly + * support when a ->mmap_validate() op is not provided in file_operations. + */ +#define LEGACY_MAP_MASK (MAP_SHARED \ + | MAP_PRIVATE \ + | MAP_FIXED \ + | MAP_ANONYMOUS \ + | MAP_DENYWRITE \ + | MAP_EXECUTABLE \ + | MAP_UNINITIALIZED \ + | MAP_GROWSDOWN \ + | MAP_LOCKED \ + | MAP_NORESERVE \ + | MAP_POPULATE \ + | MAP_NONBLOCK \ + | MAP_STACK \ + | MAP_HUGETLB \ + | MAP_32BIT \ + | MAP_HUGE_2MB \ + | MAP_HUGE_1GB) + extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 203268f9231e..8ce7f5a0800f 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -16,6 +16,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ diff --git a/mm/mmap.c b/mm/mmap.c index 680506faceae..924839fac0e6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned long flags_mask; + + flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Force use of MAP_SHARED_VALIDATE with non-legacy + * flags. E.g. MAP_SYNC is dangerous to use with + * MAP_SHARED as you don't know which consistency model + * you will get. We silently ignore unsupported flags + * with MAP_SHARED to preserve backward compatibility. + */ + flags &= LEGACY_MAP_MASK; + /* fall through */ + case MAP_SHARED_VALIDATE: + if (flags & ~flags_mask) + return -EOPNOTSUPP; if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) return -EACCES; diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 203268f9231e..8ce7f5a0800f 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -16,6 +16,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ -- cgit v1.2.3 From d81b8a722f691a3907eb4a6df410ab517ba5bfcc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:31 +0100 Subject: mm: Remove VM_FAULT_HWPOISON_LARGE_MASK It is unused. 
Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 065d99deb847..ca72b67153d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1182,8 +1182,6 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ -#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ - #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ VM_FAULT_FALLBACK) -- cgit v1.2.3 From 9a0dd42251439de635088b97533109a935864a84 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:39 +0100 Subject: dax: Allow dax_iomap_fault() to return pfn For synchronous page fault dax_iomap_fault() will need to return PFN which will then need to be inserted into page tables after fsync() completes. Add necessary parameter to dax_iomap_fault(). Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 13 +++++++------ fs/ext2/file.c | 2 +- fs/ext4/file.c | 2 +- fs/xfs/xfs_file.c | 4 ++-- include/linux/dax.h | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 5214ed9ba508..5ddf15161390 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1079,7 +1079,7 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -static int dax_iomap_pte_fault(struct vm_fault *vmf, +static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1280,7 +1280,7 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1425,7 +1425,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1436,6 +1436,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault * @pe_size: Size of the page to fault in + * @pfnp: PFN to insert for synchronous faults if fsync is required * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in @@ -1444,13 +1445,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * successfully. 
*/ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops) + pfn_t *pfnp, const struct iomap_ops *ops) { switch (pe_size) { case PE_SIZE_PTE: - return dax_iomap_pte_fault(vmf, ops); + return dax_iomap_pte_fault(vmf, pfnp, ops); case PE_SIZE_PMD: - return dax_iomap_pmd_fault(vmf, ops); + return dax_iomap_pmd_fault(vmf, pfnp, ops); default: return VM_FAULT_FALLBACK; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index ff3a3636a5ca..d2bb7c96307d 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1da660ac3bc..3cec0b95672f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -306,7 +306,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); } if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); else result = VM_FAULT_SIGBUS; if (write) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 309e26c9dddb..7c6b8def6eed 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1040,7 +1040,7 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -1111,7 +1111,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 122197124b9d..e7fa4b8f45bc 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -95,7 +95,7 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops); + pfn_t *pfnp, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); -- cgit v1.2.3 From b6fb293f2497a9841d94f6b57bd2bb2cd222da43 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:41 +0100 Subject: mm: Define MAP_SYNC and VM_SYNC flags Define new MAP_SYNC flag and corresponding VMA VM_SYNC flag. As the MAP_SYNC flag is not part of LEGACY_MAP_MASK, currently it will be refused by all MAP_SHARED_VALIDATE map attempts and silently ignored for everything else. 
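The diff that follows reports the new flag as an "sf" code in the VmFlags line of /proc/<pid>/smaps; a hedged userspace sketch (not part of the patch; have_vm_sync_mapping() is an invented name) shows how that can be checked after a successful MAP_SYNC mapping:

#include <stdio.h>
#include <string.h>

/* Return 1 if any VMA of the current process carries VM_SYNC
 * (reported as "sf" in VmFlags), 0 if none, -1 on error.
 */
static int have_vm_sync_mapping(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/smaps", "r");
	int found = 0;

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmFlags:", 8) && strstr(line, " sf"))
			found = 1;
	fclose(f);
	return found;
}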
Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/proc/task_mmu.c | 1 + include/linux/mm.h | 1 + include/linux/mman.h | 8 ++++++-- include/uapi/asm-generic/mman.h | 1 + 4 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5589b4bd4b85..ea78b37deeaa 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -664,6 +664,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", diff --git a/include/linux/mm.h b/include/linux/mm.h index ca72b67153d5..5411cb7442de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -189,6 +189,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 74452e3f2536..3427bf3daef5 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -9,7 +9,7 @@ /* * Arrange for legacy / undefined architecture specific flags to be - * ignored by default in LEGACY_MAP_MASK. + * ignored by mmap handling code. */ #ifndef MAP_32BIT #define MAP_32BIT 0 @@ -23,6 +23,9 @@ #ifndef MAP_UNINITIALIZED #define MAP_UNINITIALIZED 0 #endif +#ifndef MAP_SYNC +#define MAP_SYNC 0 +#endif /* * The historical set of flags that all mmap implementations implicitly @@ -126,7 +129,8 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | + _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ); } unsigned long vm_commit_limit(void); diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 7162cd4cca73..00e55627d2df 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -12,6 +12,7 @@ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ +#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */ /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ -- cgit v1.2.3 From caa51d26f85c248f1c4f43a870ad3ef84bf9eb8f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:42 +0100 Subject: dax, iomap: Add support for synchronous faults Add a flag to the iomap interface informing the caller that the inode needs fdatasync(2) for the returned extent to become persistent and use it in DAX fault code so that we don't map such extents into page tables immediately. Instead we propagate the information that fdatasync(2) is necessary from dax_iomap_fault() with a new VM_FAULT_NEEDDSYNC flag. The filesystem fault handler is then responsible for calling fdatasync(2) and inserting the pfn into page tables.
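To make that contract concrete, here is a hedged sketch (not part of the patch) of the shape a filesystem fault handler is expected to take; example_fault() and example_iomap_ops are invented names, and dax_finish_sync_fault() is the helper introduced later in this series:

/* Minimal sketch of a DAX fault handler consuming VM_FAULT_NEEDDSYNC.
 * Filesystem locking, journalling and sb_start_pagefault() handling
 * are elided.
 */
static int example_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
{
	pfn_t pfn;
	int result;

	result = dax_iomap_fault(vmf, pe_size, &pfn, &example_iomap_ops);
	if (result & VM_FAULT_NEEDDSYNC)
		/* Page tables were left untouched: flush metadata with
		 * fdatasync semantics, then insert the returned PFN.
		 */
		result = dax_finish_sync_fault(vmf, pe_size, pfn);
	return result;
}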
Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/iomap.h | 5 +++++ include/linux/mm.h | 6 +++++- 3 files changed, 47 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index efc210ff6665..bb9ff907738c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1091,6 +1091,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, unsigned flags = IOMAP_FAULT; int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; int vmf_ret = 0; void *entry; pfn_t pfn; @@ -1169,6 +1170,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto finish_iomap; } + sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY); + switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { @@ -1182,12 +1185,27 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), - 0, write); + 0, write && !sync); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto error_finish_iomap; } + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PTE into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) { + error = -EIO; + goto error_finish_iomap; + } + *pfnp = pfn; + vmf_ret = VM_FAULT_NEEDDSYNC | major; + goto finish_iomap; + } trace_dax_insert_mapping(inode, vmf, entry); if (write) error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); @@ -1287,6 +1305,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; @@ -1371,6 +1390,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; + sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY); + switch (iomap.type) { case IOMAP_MAPPED: error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); @@ -1379,10 +1400,24 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), - RADIX_DAX_PMD, write); + RADIX_DAX_PMD, write && !sync); if (IS_ERR(entry)) goto finish_iomap; + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PMD into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. 
+ */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) + goto finish_iomap; + *pfnp = pfn; + result = VM_FAULT_NEEDDSYNC; + goto finish_iomap; + } + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, write); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f64dc6ce5161..73e3b7085dbe 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -22,6 +22,11 @@ struct vm_fault; * Flags for all iomap mappings: */ #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ +/* + * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access + * written data and requires fdatasync to commit them to persistent storage. + */ +#define IOMAP_F_DIRTY 0x02 /* * Flags that only need to be reported for IOMAP_REPORT requests: diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411cb7442de..f57e55782d7d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1182,6 +1182,9 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ +#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables + * and needs fsync() to complete (for + * synchronous page faults in DAX) */ #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ @@ -1199,7 +1202,8 @@ static inline void clear_page_pfmemalloc(struct page *page) { VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ - { VM_FAULT_DONE_COW, "DONE_COW" } + { VM_FAULT_DONE_COW, "DONE_COW" }, \ + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) -- cgit v1.2.3 From 71eab6dfd91eabf06fe782a590c07e8a0795f5ea Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:43 +0100 Subject: dax: Implement dax_finish_sync_fault() Implement a function that filesystems can call to finish handling of synchronous page faults. It takes care of syncing appropriare file range and insertion of page table entry. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 83 +++++++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 2 ++ include/trace/events/fs_dax.h | 2 ++ 3 files changed, 87 insertions(+) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index bb9ff907738c..78233c716757 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1492,3 +1492,86 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, } } EXPORT_SYMBOL_GPL(dax_iomap_fault); + +/** + * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function inserts writeable PTE or PMD entry into page tables for mmaped + * DAX file. It takes care of marking corresponding radix tree entry as dirty + * as well. 
+ */ +static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, + enum page_entry_size pe_size, + pfn_t pfn) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + void *entry, **slot; + pgoff_t index = vmf->pgoff; + int vmf_ret, error; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + /* Did we race with someone splitting entry or so? */ + if (!entry || + (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || + (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, + VM_FAULT_NOPAGE); + return VM_FAULT_NOPAGE; + } + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + entry = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + switch (pe_size) { + case PE_SIZE_PTE: + error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + vmf_ret = dax_fault_return(error); + break; +#ifdef CONFIG_FS_DAX_PMD + case PE_SIZE_PMD: + vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + pfn, true); + break; +#endif + default: + vmf_ret = VM_FAULT_FALLBACK; + } + put_locked_mapping_entry(mapping, index); + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); + return vmf_ret; +} + +/** + * dax_finish_sync_fault - finish synchronous page fault + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function ensures that the file range touched by the page fault is + * stored persistently on the media and handles inserting of appropriate page + * table entry. + */ +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn) +{ + int err; + loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; + size_t len = 0; + + if (pe_size == PE_SIZE_PTE) + len = PAGE_SIZE; + else if (pe_size == PE_SIZE_PMD) + len = PMD_SIZE; + else + WARN_ON_ONCE(1); + err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); + if (err) + return VM_FAULT_SIGBUS; + return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); +} +EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/include/linux/dax.h b/include/linux/dax.h index e7fa4b8f45bc..d403f78b706c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -96,6 +96,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, const struct iomap_ops *ops); +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 88a9d19b8ff8..7725459fafef 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -190,6 +190,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); DEFINE_PTE_FAULT_EVENT(dax_load_hole); +DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry); +DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite); TRACE_EVENT(dax_insert_mapping, TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), -- cgit v1.2.3 From b8a6176c214cf9aa2679131ed7e4515cddaadc33 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 
16:36:45 +0100 Subject: ext4: Support for synchronous DAX faults We return IOMAP_F_DIRTY flag from ext4_iomap_begin() when asked to prepare blocks for writing and the inode has some uncommitted metadata changes. In the fault handler ext4_dax_fault() we then detect this case (through VM_FAULT_NEEDDSYNC return value) and call helper dax_finish_sync_fault() to flush metadata changes and insert page table entry. Note that this will also dirty corresponding radix tree entry which is what we want - fsync(2) will still provide data integrity guarantees for applications not using userspace flushing. And applications using userspace flushing can avoid calling fsync(2) and thus avoid the performance overhead. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/ext4/file.c | 15 ++++++++++++++- fs/ext4/inode.c | 15 +++++++++++++++ fs/jbd2/journal.c | 17 +++++++++++++++++ include/linux/jbd2.h | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 208adfc3e673..08a1d1a33a90 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -295,6 +296,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + pfn_t pfn; if (write) { sb_start_pagefault(sb); @@ -310,9 +312,12 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); + /* Handling synchronous page fault? */ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, pe_size, pfn); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { @@ -350,6 +355,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -719,6 +731,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 31db875bc7a1..13a198924a0f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3394,6 +3394,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX +static bool ext4_inode_datasync_dirty(struct inode *inode) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal) + return !jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid); + /* Any metadata buffers to write? 
*/ + if (!list_empty(&inode->i_mapping->private_list)) + return true; + return inode->i_state & I_DIRTY_DATASYNC; +} + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3466,6 +3479,8 @@ retry: } iomap->flags = 0; + if ((flags & IOMAP_WRITE) && ext4_inode_datasync_dirty(inode)) + iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7d5ef3bf3f3e..fa8cde498b4b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -738,6 +738,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* Return 1 when transaction with given tid has already committed. */ +int jbd2_transaction_committed(journal_t *journal, tid_t tid) +{ + int ret = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) + ret = 0; + if (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid) + ret = 0; + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_transaction_committed); + /* * When this function returns the transaction corresponding to tid * will be completed. If the transaction has currently running, start diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 606b6bce3a5b..296d1e0ea87b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); -- cgit v1.2.3 From ada69a1639eca54ff74d839a6513c43db8d57d70 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 18 Oct 2017 08:00:38 +0100 Subject: crypto: introduce crypto wait for async op Invoking a possibly async. crypto op and waiting for completion while correctly handling backlog processing is a common task in the crypto API implementation and outside users of it. This patch adds a generic implementation for doing so in preparation for using it across the board instead of hand rolled versions. 
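As a hedged sketch (not part of the patch) of the calling pattern these helpers are meant to replace the hand-rolled completions with, here wrapped around an asynchronous skcipher operation; example_encrypt_and_wait() is an invented name and request setup is elided:

#include <crypto/skcipher.h>
#include <linux/crypto.h>

/* Submit an skcipher request and wait for its result synchronously. */
static int example_encrypt_and_wait(struct skcipher_request *req)
{
	DECLARE_CRYPTO_WAIT(wait);

	skcipher_request_set_callback(req,
				      CRYPTO_TFM_REQ_MAY_BACKLOG |
				      CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	/* crypto_wait_req() turns -EINPROGRESS/-EBUSY into a sleep on the
	 * completion that crypto_req_done() fires, then returns the
	 * operation's real status.
	 */
	return crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
}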
Signed-off-by: Gilad Ben-Yossef CC: Eric Biggers CC: Jonathan Cameron Signed-off-by: Herbert Xu --- crypto/api.c | 13 +++++++++++++ include/linux/crypto.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/crypto/api.c b/crypto/api.c index 941cd4c6c7ec..2a2479d168aa 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" LIST_HEAD(crypto_alg_list); @@ -595,5 +596,17 @@ int crypto_has_alg(const char *name, u32 type, u32 mask) } EXPORT_SYMBOL_GPL(crypto_has_alg); +void crypto_req_done(struct crypto_async_request *req, int err) +{ + struct crypto_wait *wait = req->data; + + if (err == -EINPROGRESS) + return; + + wait->err = err; + complete(&wait->completion); +} +EXPORT_SYMBOL_GPL(crypto_req_done); + MODULE_DESCRIPTION("Cryptographic core API"); MODULE_LICENSE("GPL"); diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 84da9978e951..78508ca4b108 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -24,6 +24,7 @@ #include #include #include +#include /* * Autoloaded crypto modules should only use a prefixed name to avoid allowing @@ -467,6 +468,45 @@ struct crypto_alg { struct module *cra_module; } CRYPTO_MINALIGN_ATTR; +/* + * A helper struct for waiting for completion of async crypto ops + */ +struct crypto_wait { + struct completion completion; + int err; +}; + +/* + * Macro for declaring a crypto op async wait object on stack + */ +#define DECLARE_CRYPTO_WAIT(_wait) \ + struct crypto_wait _wait = { \ + COMPLETION_INITIALIZER_ONSTACK((_wait).completion), 0 } + +/* + * Async ops completion helper functions + */ +void crypto_req_done(struct crypto_async_request *req, int err); + +static inline int crypto_wait_req(int err, struct crypto_wait *wait) +{ + switch (err) { + case -EINPROGRESS: + case -EBUSY: + wait_for_completion(&wait->completion); + reinit_completion(&wait->completion); + err = wait->err; + break; + } + + return err; +} + +static inline void crypto_init_wait(struct crypto_wait *wait) +{ + init_completion(&wait->completion); +} + /* * Algorithm registration interface. */ -- cgit v1.2.3 From 27e64b4be4b863d884f3ec1686a2f744ae93a1b9 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 31 Oct 2017 15:50:53 +0000 Subject: regset: Add support for dynamically sized regsets Currently the regset API doesn't allow for the possibility that regsets (or at least, the amount of meaningful data in a regset) may change in size. In particular, this results in useless padding being added to coredumps if a regset's current size is smaller than its theoretical maximum size. This patch adds a get_size() function to struct user_regset. Individual regset implementations can implement this function to return the current size of the regset data. A regset_size() function is added to provide callers with an abstract interface for determining the size of a regset without needing to know whether the regset is dynamically sized or not. The only affected user of this interface is the ELF coredump code: This patch ports ELF coredump to dump regsets with their actual size in the coredump. This has no effect except for new regsets that are dynamically sized and provide a get_size() implementation. Signed-off-by: Dave Martin Reviewed-by: Catalin Marinas Cc: Oleg Nesterov Cc: Alexander Viro Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Dmitry Safonov Cc: H. J.
Lu Signed-off-by: Will Deacon --- fs/binfmt_elf.c | 10 ++++---- include/linux/regset.h | 67 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 65 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 73b01e474fdc..c697882b3aba 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1699,7 +1699,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; - unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; + unsigned int regset0_size = regset_size(t->task, &view->regsets[0]); /* * NT_PRSTATUS is the one special case, because the regset data @@ -1708,11 +1708,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size, &t->prstatus.pr_reg, NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); + PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1728,7 +1728,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (regset->core_note_type && regset->get && (!regset->active || regset->active(t->task, regset))) { int ret; - size_t size = regset->n * regset->size; + size_t size = regset_size(t->task, regset); void *data = kmalloc(size, GFP_KERNEL); if (unlikely(!data)) return 0; @@ -1743,7 +1743,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, size, data); else { SET_PR_FPVALID(&t->prstatus, - 1, regset_size); + 1, regset0_size); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } diff --git a/include/linux/regset.h b/include/linux/regset.h index 8e0c9febf495..494cedaafdf2 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -106,6 +106,28 @@ typedef int user_regset_writeback_fn(struct task_struct *target, const struct user_regset *regset, int immediate); +/** + * user_regset_get_size_fn - type of @get_size function in &struct user_regset + * @target: thread being examined + * @regset: regset being examined + * + * This call is optional; usually the pointer is %NULL. + * + * When provided, this function must return the current size of regset + * data, as observed by the @get function in &struct user_regset. The + * value returned must be a multiple of @size. The returned size is + * required to be valid only until the next time (if any) @regset is + * modified for @target. + * + * This function is intended for dynamically sized regsets. A regset + * that is statically sized does not need to implement it. + * + * This function should not be called directly: instead, callers should + * call regset_size() to determine the current size of a regset. + */ +typedef unsigned int user_regset_get_size_fn(struct task_struct *target, + const struct user_regset *regset); + /** * struct user_regset - accessible thread CPU state * @n: Number of slots (registers). @@ -117,19 +139,33 @@ typedef int user_regset_writeback_fn(struct task_struct *target, * @set: Function to store values. * @active: Function to report if regset is active, or %NULL. * @writeback: Function to write data back to user memory, or %NULL. + * @get_size: Function to return the regset's size, or %NULL. 
* * This data structure describes a machine resource we call a register set. * This is part of the state of an individual thread, not necessarily * actual CPU registers per se. A register set consists of a number of * similar slots, given by @n. Each slot is @size bytes, and aligned to - * @align bytes (which is at least @size). + * @align bytes (which is at least @size). For dynamically-sized + * regsets, @n must contain the maximum possible number of slots for the + * regset, and @get_size must point to a function that returns the + * current regset size. * - * These functions must be called only on the current thread or on a - * thread that is in %TASK_STOPPED or %TASK_TRACED state, that we are - * guaranteed will not be woken up and return to user mode, and that we - * have called wait_task_inactive() on. (The target thread always might - * wake up for SIGKILL while these functions are working, in which case - * that thread's user_regset state might be scrambled.) + * Callers that need to know only the current size of the regset and do + * not care about its internal structure should call regset_size() + * instead of inspecting @n or calling @get_size. + * + * For backward compatibility, the @get and @set methods must pad to, or + * accept, @n * @size bytes, even if the current regset size is smaller. + * The precise semantics of these operations depend on the regset being + * accessed. + * + * The functions to which &struct user_regset members point must be + * called only on the current thread or on a thread that is in + * %TASK_STOPPED or %TASK_TRACED state, that we are guaranteed will not + * be woken up and return to user mode, and that we have called + * wait_task_inactive() on. (The target thread always might wake up for + * SIGKILL while these functions are working, in which case that + * thread's user_regset state might be scrambled.) * * The @pos argument must be aligned according to @align; the @count * argument must be a multiple of @size. These functions are not @@ -156,6 +192,7 @@ struct user_regset { user_regset_set_fn *set; user_regset_active_fn *active; user_regset_writeback_fn *writeback; + user_regset_get_size_fn *get_size; unsigned int n; unsigned int size; unsigned int align; @@ -371,5 +408,21 @@ static inline int copy_regset_from_user(struct task_struct *target, return regset->set(target, regset, offset, size, NULL, data); } +/** + * regset_size - determine the current size of a regset + * @target: thread to be examined + * @regset: regset to be examined + * + * Note that the returned size is valid only until the next time + * (if any) @regset is modified for @target. + */ +static inline unsigned int regset_size(struct task_struct *target, + const struct user_regset *regset) +{ + if (!regset->get_size) + return regset->n * regset->size; + else + return regset->get_size(target, regset); +} #endif /* */ -- cgit v1.2.3 From 8977f563845bdd61fc61c4dfb399270b7d5667c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:48 +0300 Subject: block: move REQ_NOWAIT This flag should be before the operation-specific REQ_NOUNMAP bit. 
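For context, REQ_NOWAIT asks the block layer to fail fast instead of sleeping when a queue is busy; a submission that cannot proceed completes the bio with BLK_STS_AGAIN (see the direct_make_request error path below). A minimal sketch of a submitter opting in; the function names here are made up for illustration and this is a fragment, not a complete driver:

/* Illustrative only: mark a bio REQ_NOWAIT so submission fails fast
 * with BLK_STS_AGAIN instead of blocking when the queue is busy.
 */
static void my_endio(struct bio *bio)	/* hypothetical completion handler */
{
	if (bio->bi_status == BLK_STS_AGAIN)
		pr_debug("queue was busy, resubmit later from blocking context\n");
	bio_put(bio);
}

static void submit_nowait(struct bio *bio)
{
	bio->bi_opf |= REQ_NOWAIT;	/* don't wait if request will block */
	bio->bi_end_io = my_endio;
	generic_make_request(bio);	/* era-appropriate submission API */
}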
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3385c89f402e..c4c6c8bced2e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -224,11 +224,11 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + __REQ_NOWAIT, /* Don't wait if request will block */ /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_NR_BITS, /* stops here */ }; @@ -245,9 +245,9 @@ enum req_flag_bits { #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From 96222bcc732d0504363dc772637c50e53b4bd41e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:49 +0300 Subject: block: add REQ_DRV bit Set aside a bit in the request/bio flags for driver use. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c4c6c8bced2e..1b04085255fb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -229,6 +229,9 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + /* for driver use */ + __REQ_DRV, + __REQ_NR_BITS, /* stops here */ }; @@ -249,6 +252,8 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_DRV (1ULL << __REQ_DRV) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From f421e1d9ade4e1b88183e54425cf50e390d16a7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:50 +0300 Subject: block: provide a direct_make_request helper This helper allows reinserting a bio into a new queue without much overhead, but requires all queue limits to be the same for the upper and lower queues, and it does not provide any recursion prevention. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index bb4fce694a60..96aaa3a419a3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2243,6 +2243,40 @@ out: } EXPORT_SYMBOL(generic_make_request); +/** + * direct_make_request - hand a buffer directly to its device driver for I/O + * @bio: The bio describing the location in memory and on the device.
+ * + * This function behaves like generic_make_request(), but does not protect + * against recursion. Must only be used if the called driver is known + * to not call generic_make_request (or direct_make_request) again from + * its make_request function. (Calling direct_make_request again from + * a workqueue is perfectly fine as that doesn't recurse). + */ +blk_qc_t direct_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + bool nowait = bio->bi_opf & REQ_NOWAIT; + blk_qc_t ret; + + if (!generic_make_request_checks(bio)) + return BLK_QC_T_NONE; + + if (unlikely(blk_queue_enter(q, nowait))) { + if (nowait && !blk_queue_dying(q)) + bio->bi_status = BLK_STS_AGAIN; + else + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + + ret = q->make_request_fn(q, bio); + blk_queue_exit(q); + return ret; +} +EXPORT_SYMBOL_GPL(direct_make_request); + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72637028f3c9..eda3d25c0f68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -920,6 +920,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); +extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); -- cgit v1.2.3 From ef71de8b15d891b27b8c983a9a8972b11cb4576a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:51 +0300 Subject: block: add a blk_steal_bios helper This helper allows stealing the uncompleted bios from a request so that they can be reissued on another path. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 21 +++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 96aaa3a419a3..68cfe6780a9b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2769,6 +2769,27 @@ struct request *blk_fetch_request(struct request_queue *q) } EXPORT_SYMBOL(blk_fetch_request); +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eda3d25c0f68..fddda6a1f9b5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1094,6 +1094,8 @@ extern struct request *blk_peek_request(struct request_queue *q); extern void blk_start_request(struct request *rq); extern struct request *blk_fetch_request(struct request_queue *q); +void blk_steal_bios(struct bio_list *list, struct request *rq); + /* * Request completion related functions.
* -- cgit v1.2.3 From 517bf3c306bad4fe0da631f90ae2ec40924dee2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:52 +0300 Subject: block: don't look at the struct device dev_t in disk_devt The hidden gendisks introduced in the next patch need to keep the dev field in their struct device empty so that udev won't try to create block device nodes for them. To support that, rewrite disk_devt to look at the major and first_minor fields in the gendisk itself instead of looking into the struct device. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 4 ---- include/linux/genhd.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index dd305c65ffb0..1174d24e405e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -649,10 +649,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) return; } disk_to_dev(disk)->devt = devt; - - /* ->major and ->first_minor aren't supposed to be - * dereferenced from here on, but set them just in case. - */ disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd675..5c0ed5db33c2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -234,7 +234,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) static inline dev_t disk_devt(struct gendisk *disk) { - return disk_to_dev(disk)->devt; + return MKDEV(disk->major, disk->first_minor); } static inline dev_t part_devt(struct hd_struct *part) -- cgit v1.2.3 From 8ddcd653257c18a669fcb75ee42c37054908e0d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:53 +0300 Subject: block: introduce GENHD_FL_HIDDEN With this flag a driver can create a gendisk that can be used for I/O submission inside the kernel, but which is not registered as a user-facing block device. This will be useful for the NVMe multipath implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 68 +++++++++++++++++++++++++++++++++++++-------------- include/linux/genhd.h | 1 + 2 files changed, 51 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 1174d24e405e..835e907e6e01 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,11 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + if (disk->flags & GENHD_FL_HIDDEN) { + dev_set_uevent_suppress(ddev, 0); + return; + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -616,6 +621,11 @@ exit: while ((part = disk_part_iter_next(&piter))) kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); disk_part_iter_exit(&piter); + + err = sysfs_create_link(&ddev->kobj, + &disk->queue->backing_dev_info->dev->kobj, + "bdi"); + WARN_ON(err); } /** @@ -630,7 +640,6 @@ exit: */ void device_add_disk(struct device *parent, struct gendisk *disk) { - struct backing_dev_info *bdi; dev_t devt; int retval; @@ -639,7 +648,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) * parameters make sense.
*/ WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + WARN_ON(!disk->minors && + !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); disk->flags |= GENHD_FL_UP; @@ -648,18 +658,26 @@ void device_add_disk(struct device *parent, struct gendisk *disk) WARN_ON(1); return; } - disk_to_dev(disk)->devt = devt; disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); disk_alloc_events(disk); - /* Register BDI before referencing it from bdev */ - bdi = disk->queue->backing_dev_info; - bdi_register_owner(bdi, disk_to_dev(disk)); - - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + /* Register BDI before referencing it from bdev */ + disk_to_dev(disk)->devt = devt; + bdi_register_owner(disk->queue->backing_dev_info, + disk_to_dev(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); + } register_disk(parent, disk); blk_register_queue(disk); @@ -669,10 +687,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) */ WARN_ON_ONCE(!blk_get_queue(disk->queue)); - retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, - "bdi"); - WARN_ON(retval); - disk_add_events(disk); blk_integrity_add(disk); } @@ -701,7 +715,8 @@ void del_gendisk(struct gendisk *disk) set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + if (!(disk->flags & GENHD_FL_HIDDEN)) + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); if (disk->queue) { /* * Unregister bdi before releasing device numbers (as they can @@ -712,13 +727,15 @@ void del_gendisk(struct gendisk *disk) } else { WARN_ON(1); } - blk_unregister_region(disk_devt(disk), disk->minors); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + blk_unregister_region(disk_devt(disk), disk->minors); + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + } part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -781,6 +798,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } + if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + put_disk(disk); + disk = NULL; + } return disk; } EXPORT_SYMBOL(get_gendisk); @@ -1024,6 +1045,15 @@ static ssize_t disk_removable_show(struct device *dev, (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } +static ssize_t disk_hidden_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_HIDDEN ? 
1 : 0)); +} + static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1061,6 +1091,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); @@ -1085,6 +1116,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, + &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c0ed5db33c2..93aae3476f58 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct hd_struct { #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 #define GENHD_FL_NO_PART_SCAN 512 +#define GENHD_FL_HIDDEN 1024 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.3 From ea435e1b9392a33deceaea2a16ebaa3397bead93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:54 +0300 Subject: block: add a poll_fn callback to struct request_queue So that we can also poll non-blk-mq queues. Mostly needed for the NVMe multipath code, but could also be useful elsewhere. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 11 +++++++++++ block/blk-mq.c | 14 +++++--------- drivers/nvme/target/io-cmd.c | 2 +- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap.c | 2 +- include/linux/blkdev.h | 4 +++- mm/page_io.c | 2 +- 8 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 68cfe6780a9b..395bfb10d658 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2321,6 +2321,17 @@ blk_qc_t submit_bio(struct bio *bio) } EXPORT_SYMBOL(submit_bio); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + if (!q->poll_fn || !blk_qc_t_valid(cookie)) + return false; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + return q->poll_fn(q, cookie); +} +EXPORT_SYMBOL_GPL(blk_poll); + /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for the new queue limits diff --git a/block/blk-mq.c b/block/blk-mq.c index e4d2490f4e7e..95ea5889b825 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -37,6 +37,7 @@ #include "blk-wbt.h" #include "blk-mq-sched.h" +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -2499,6 +2500,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); + if (q->mq_ops->poll) + q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it...
@@ -2961,20 +2964,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) return false; } -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_mq_hw_ctx *hctx; - struct blk_plug *plug; struct request *rq; - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; if (!blk_qc_t_is_internal(cookie)) rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); @@ -2992,7 +2989,6 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) return __blk_mq_poll(hctx, rq); } -EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 0d4c23dc4532..db632818777d 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -94,7 +94,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) cookie = submit_bio(bio); - blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); + blk_poll(bdev_get_queue(req->ns->bdev), cookie); } static void nvmet_execute_flush(struct nvmet_req *req) diff --git a/fs/block_dev.c b/fs/block_dev.c index 07ddccd17801..4afa4d5ff969 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -237,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -402,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index 62cf812ed0e5..d2bc339cb1e9 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -486,7 +486,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/iomap.c b/fs/iomap.c index 8194d30bdca0..4241bac905b1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1049,7 +1049,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || - !blk_mq_poll(dio->submit.last_queue, + !blk_poll(dio->submit.last_queue, dio->submit.cookie)) io_schedule(); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fddda6a1f9b5..225617dd0a3f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -266,6 +266,7 @@ struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -408,6 +409,7 @@ struct request_queue { request_fn_proc *request_fn; make_request_fn 
*make_request_fn; + poll_q_fn *poll_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; softirq_done_fn *softirq_done_fn; @@ -975,7 +977,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/mm/page_io.c b/mm/page_io.c index 21502d341a67..ff04de630c46 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -407,7 +407,7 @@ int swap_readpage(struct page *page, bool do_poll) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From 973b546451fdf11e518cc96d1b137af893a38db5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 3 Nov 2017 10:51:33 -0600 Subject: iommu/vt-d: Clear Page Request Overflow fault bit Currently the Page Request Overflow bit in the IOMMU Fault Status register is not cleared. Not clearing this bit would mean that any future page request is going to be automatically dropped by the IOMMU. Suggested-by: Ashok Raj Signed-off-by: Lu Baolu Signed-off-by: Alex Williamson --- drivers/iommu/dmar.c | 3 ++- include/linux/intel-iommu.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 1ea7cd537873..9a7ffd13c7f0 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1679,7 +1679,8 @@ irqreturn_t dmar_fault(int irq, void *dev_id) raw_spin_lock_irqsave(&iommu->register_lock, flag); } - writel(DMA_FSTS_PFO | DMA_FSTS_PPF, iommu->reg + DMAR_FSTS_REG); + writel(DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO, + iommu->reg + DMAR_FSTS_REG); unlock_exit: raw_spin_unlock_irqrestore(&iommu->register_lock, flag); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 485a5b48f038..f3274d9f46a2 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -212,6 +212,7 @@ #define DMA_FSTS_IQE (1 << 4) #define DMA_FSTS_ICE (1 << 5) #define DMA_FSTS_ITE (1 << 6) +#define DMA_FSTS_PRO (1 << 7) #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) /* FRCD_REG, 32 bits access */ -- cgit v1.2.3 From 722c856d46c6ca74a246b54a72f14751fec01aae Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 1 Nov 2017 14:25:23 -0500 Subject: platform/x86: wmi: Add new method wmidev_evaluate_method Drivers properly using the WMI bus can pass their wmi_device pointer rather than the GUID back to the WMI bus to evaluate the proper methods. Any "new" drivers added that use the WMI bus should use this rather than the old wmi_evaluate_method, which takes the GUID.
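A minimal sketch of the new call pattern from a driver's probe, using the signature this patch adds; the driver name, method ID, and input value are made up for illustration:

/* Hypothetical driver: evaluate method ID 1, instance 0, of its WMI device. */
static int foo_wmi_probe(struct wmi_device *wdev)
{
	u32 arg = 0;					/* made-up method input */
	struct acpi_buffer in = { sizeof(arg), &arg };
	struct acpi_buffer out = { ACPI_ALLOCATE_BUFFER, NULL };
	acpi_status status;

	/* no GUID lookup needed; the device pointer identifies the block */
	status = wmidev_evaluate_method(wdev, 0, 1, &in, &out);
	if (ACPI_FAILURE(status))
		return -EIO;

	kfree(out.pointer);	/* ACPICA allocated the result buffer */
	return 0;
}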
Signed-off-by: Mario Limonciello Reviewed-by: Edward O'Callaghan Reviewed-by: Pali Rohár Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/wmi.c | 28 ++++++++++++++++++++++++---- include/linux/wmi.h | 6 ++++++ 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 7a05843aff19..4d73a87c2ddf 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -200,6 +200,28 @@ static acpi_status wmi_method_enable(struct wmi_block *wblock, int enable) */ acpi_status wmi_evaluate_method(const char *guid_string, u8 instance, u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out) +{ + struct wmi_block *wblock = NULL; + + if (!find_guid(guid_string, &wblock)) + return AE_ERROR; + return wmidev_evaluate_method(&wblock->dev, instance, method_id, + in, out); +} +EXPORT_SYMBOL_GPL(wmi_evaluate_method); + +/** + * wmidev_evaluate_method - Evaluate a WMI method + * @wdev: A wmi bus device from a driver + * @instance: Instance index + * @method_id: Method ID to call + * @in: Buffer containing input for the method call + * @out: Empty buffer to return the method results + * + * Call an ACPI-WMI method + */ +acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, + u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out) { struct guid_block *block = NULL; struct wmi_block *wblock = NULL; @@ -209,9 +231,7 @@ u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out) union acpi_object params[3]; char method[5] = "WM"; - if (!find_guid(guid_string, &wblock)) - return AE_ERROR; - + wblock = container_of(wdev, struct wmi_block, dev); block = &wblock->gblock; handle = wblock->acpi_device->handle; @@ -246,7 +266,7 @@ u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out) return status; } -EXPORT_SYMBOL_GPL(wmi_evaluate_method); +EXPORT_SYMBOL_GPL(wmidev_evaluate_method); static acpi_status __query_block(struct wmi_block *wblock, u8 instance, struct acpi_buffer *out) diff --git a/include/linux/wmi.h b/include/linux/wmi.h index cd0d7734dc49..2cd10c3b89e9 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -26,6 +26,12 @@ struct wmi_device { bool setable; }; +/* evaluate the ACPI method associated with this device */ +extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, + u8 instance, u32 method_id, + const struct acpi_buffer *in, + struct acpi_buffer *out); + /* Caller must kfree the result. */ extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); -- cgit v1.2.3 From f97e058cfe8032504e310bd5c20e35d640ef2858 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 1 Nov 2017 14:25:28 -0500 Subject: platform/x86: wmi: Don't allow drivers to get each other's GUIDs The only driver using this was dell-wmi, and it really was a hack. The driver was getting a data attribute from another driver, and this type of action should not be encouraged. Rather, drivers that need to interact with one another should pass data back and forth via exported functions.
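A sketch of the pattern this removal points drivers toward, with entirely made-up names: the owning driver exports an accessor, and the consumer takes an ordinary symbol dependency instead of dereferencing another driver's wmi_device:

/* In the driver that owns the data (illustrative only) */
static u32 foo_wmi_token;

u32 foo_wmi_get_token(void)
{
	return foo_wmi_token;
}
EXPORT_SYMBOL_GPL(foo_wmi_get_token);

/* In the consuming driver: a normal module dependency, no GUID peeking */
static void bar_configure(void)
{
	u32 token = foo_wmi_get_token();

	pr_info("using token %u\n", token);
}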
Signed-off-by: Mario Limonciello Reviewed-by: Edward O'Callaghan Reviewed-by: Pali Rohár Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/wmi.c | 17 ----------------- include/linux/wmi.h | 4 ---- 2 files changed, 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 4d73a87c2ddf..bcb41c1c7f52 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -368,23 +368,6 @@ union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance) } EXPORT_SYMBOL_GPL(wmidev_block_query); -struct wmi_device *wmidev_get_other_guid(struct wmi_device *wdev, - const char *guid_string) -{ - struct wmi_block *this_wb = container_of(wdev, struct wmi_block, dev); - struct wmi_block *other_wb; - - if (!find_guid(guid_string, &other_wb)) - return NULL; - - if (other_wb->acpi_device != this_wb->acpi_device) - return NULL; - - get_device(&other_wb->dev.dev); - return &other_wb->dev; -} -EXPORT_SYMBOL_GPL(wmidev_get_other_guid); - /** * wmi_set_block - Write to a WMI block * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 2cd10c3b89e9..ddee427e0721 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -36,10 +36,6 @@ extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); -/* Gets another device on the same bus. Caller must put_device the result. */ -extern struct wmi_device *wmidev_get_other_guid(struct wmi_device *wdev, - const char *guid_string); - struct wmi_device_id { const char *guid_string; }; -- cgit v1.2.3 From 44b6b7661132b1b0e5fd3147ded66f1e4a817ca9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 1 Nov 2017 14:25:35 -0500 Subject: platform/x86: wmi: create userspace interface for drivers For WMI operations that are only Set or Query, readable and writable sysfs attributes created by WMI vendor drivers or the bus driver make sense. For other WMI operations that are run on Method, there needs to be a way to guarantee to userspace that the results of a method call belong to the data that was submitted to that call. Sysfs attributes don't work well in this scenario because two userspace processes may be competing to read or write an attribute and step on each other's data. When a WMI vendor driver declares a callback method in the wmi_driver, the WMI bus driver will create a character device that maps to that function. This callback method will be responsible for filtering invalid requests and performing the actual call. That character device will correspond to this path: /dev/wmi/$driver Performing read() on this character device will provide the size of the buffer that the character device needs to perform calls. This buffer size can be set by vendor drivers through a new symbol or, when MOF parsing is available, by the MOF. Performing ioctl() on this character device will be interpreted by the WMI bus driver. It will perform sanity tests for size of data, test them for a valid instance, copy the data from userspace and pass it on to the vendor driver to further process and run. This creates an implicit policy that each driver will only be allowed a single character device. If a module matches multiple GUIDs, the wmi_devices will need to be all handled by the same wmi_driver.
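A minimal sketch of the resulting userspace flow, assuming a hypothetical driver named "foo-wmi" and instance 0; the ioctl's type is WMI_IOC and its number selects the WMI instance, matching the _IOC_TYPE()/_IOC_NR() checks in the patch below:

#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/wmi.h>

#define WMI_IOCTL(instance) _IOWR(WMI_IOC, (instance), struct wmi_ioctl_buffer)

int main(void)
{
	__u64 size;
	struct wmi_ioctl_buffer *buf;
	int fd = open("/dev/wmi/foo-wmi", O_RDWR);	/* hypothetical driver */

	if (fd < 0 || read(fd, &size, sizeof(size)) != sizeof(size))
		return 1;
	buf = calloc(1, size);		/* required size learned from read() */
	if (!buf)
		return 1;
	buf->length = size;		/* must be at least the required size */
	/* fill buf->data[] as the vendor driver expects before the call */
	if (ioctl(fd, WMI_IOCTL(0), buf) < 0)	/* instance 0 */
		return 1;
	/* buf now holds the driver's response, up to 'size' bytes */
	free(buf);
	close(fd);
	return 0;
}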
The WMI vendor drivers will be responsible for managing inappropriate access to this character device and proper locking on data used by it. When a WMI vendor driver is unloaded, the WMI bus driver will clean up the character device and any memory allocated for the call. Signed-off-by: Mario Limonciello Reviewed-by: Edward O'Callaghan Signed-off-by: Darren Hart (VMware) --- MAINTAINERS | 1 + drivers/platform/x86/wmi.c | 189 ++++++++++++++++++++++++++++++++++++++++- include/linux/wmi.h | 5 ++ include/uapi/linux/wmi.h | 26 +++++++ 4 files changed, 219 insertions(+), 2 deletions(-) create mode 100644 include/uapi/linux/wmi.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index aede236d10f1..3af07502220a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -384,6 +384,7 @@ ACPI WMI DRIVER L: platform-driver-x86@vger.kernel.org S: Orphan F: drivers/platform/x86/wmi.c +F: include/uapi/linux/wmi.h AD1889 ALSA SOUND DRIVER M: Thibaut Varene diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index bcb41c1c7f52..8c31ed4f0e1b 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -38,12 +38,15 @@ #include #include #include +#include #include #include #include #include +#include #include #include +#include ACPI_MODULE_NAME("wmi"); MODULE_AUTHOR("Carlos Corbacho"); @@ -69,9 +72,12 @@ struct wmi_block { struct wmi_device dev; struct list_head list; struct guid_block gblock; + struct miscdevice char_dev; + struct mutex char_mutex; struct acpi_device *acpi_device; wmi_notify_handler handler; void *handler_data; + u64 req_buf_size; bool read_takes_no_args; }; @@ -188,6 +194,25 @@ static acpi_status wmi_method_enable(struct wmi_block *wblock, int enable) /* * Exported WMI functions */ + +/** + * set_required_buffer_size - Sets the buffer size needed for performing IOCTL + * @wdev: A wmi bus device from a driver + * @length: Required buffer size, in bytes + * + * Stores the size that ioctl transfer buffers for this device must have; + * the WMI bus driver allocates its internal buffer to this size at probe + */ +int set_required_buffer_size(struct wmi_device *wdev, u64 length) +{ + struct wmi_block *wblock; + + wblock = container_of(wdev, struct wmi_block, dev); + wblock->req_buf_size = length; + + return 0; +} +EXPORT_SYMBOL_GPL(set_required_buffer_size); + /** * wmi_evaluate_method - Evaluate a WMI method * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba @@ -764,6 +789,111 @@ static int wmi_dev_match(struct device *dev, struct device_driver *driver) return 0; } +static int wmi_char_open(struct inode *inode, struct file *filp) +{ + const char *driver_name = filp->f_path.dentry->d_iname; + struct wmi_block *wblock = NULL; + struct wmi_block *next = NULL; + + list_for_each_entry_safe(wblock, next, &wmi_block_list, list) { + if (!wblock->dev.dev.driver) + continue; + if (strcmp(driver_name, wblock->dev.dev.driver->name) == 0) { + filp->private_data = wblock; + break; + } + } + + if (!filp->private_data) + return -ENODEV; + + return nonseekable_open(inode, filp); +} + +static ssize_t wmi_char_read(struct file *filp, char __user *buffer, + size_t length, loff_t *offset) +{ + struct wmi_block *wblock = filp->private_data; + + return simple_read_from_buffer(buffer, length, offset, + &wblock->req_buf_size, + sizeof(wblock->req_buf_size)); +} + +static long wmi_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct wmi_ioctl_buffer __user *input = + (struct wmi_ioctl_buffer __user *) arg; + struct wmi_block *wblock = filp->private_data; + struct wmi_ioctl_buffer *buf = NULL; + struct wmi_driver
*wdriver = NULL; + int ret; + + if (_IOC_TYPE(cmd) != WMI_IOC) + return -ENOTTY; + + /* make sure we're not calling a higher instance than exists */ + if (_IOC_NR(cmd) >= wblock->gblock.instance_count) + return -EINVAL; + + mutex_lock(&wblock->char_mutex); + buf = wblock->handler_data; + if (get_user(buf->length, &input->length)) { + dev_dbg(&wblock->dev.dev, "Read length from user failed\n"); + ret = -EFAULT; + goto out_ioctl; + } + /* if it's too small, abort */ + if (buf->length < wblock->req_buf_size) { + dev_err(&wblock->dev.dev, + "Buffer %lld too small, need at least %lld\n", + buf->length, wblock->req_buf_size); + ret = -EINVAL; + goto out_ioctl; + } + /* if it's too big, warn, driver will only use what is needed */ + if (buf->length > wblock->req_buf_size) + dev_warn(&wblock->dev.dev, + "Buffer %lld is bigger than required %lld\n", + buf->length, wblock->req_buf_size); + + /* copy the structure from userspace */ + if (copy_from_user(buf, input, wblock->req_buf_size)) { + dev_dbg(&wblock->dev.dev, "Copy %llu from user failed\n", + wblock->req_buf_size); + ret = -EFAULT; + goto out_ioctl; + } + + /* let the driver do any filtering and do the call */ + wdriver = container_of(wblock->dev.dev.driver, + struct wmi_driver, driver); + if (!try_module_get(wdriver->driver.owner)) { + ret = -EBUSY; + goto out_ioctl; + } + ret = wdriver->filter_callback(&wblock->dev, cmd, buf); + module_put(wdriver->driver.owner); + if (ret) + goto out_ioctl; + + /* return the result (only up to our internal buffer size) */ + if (copy_to_user(input, buf, wblock->req_buf_size)) { + dev_dbg(&wblock->dev.dev, "Copy %llu to user failed\n", + wblock->req_buf_size); + ret = -EFAULT; + } + +out_ioctl: + mutex_unlock(&wblock->char_mutex); + return ret; +} + +static const struct file_operations wmi_fops = { + .owner = THIS_MODULE, + .read = wmi_char_read, + .open = wmi_char_open, + .unlocked_ioctl = wmi_ioctl, + .compat_ioctl = wmi_ioctl, +}; static int wmi_dev_probe(struct device *dev) { @@ -771,16 +901,63 @@ static int wmi_dev_probe(struct device *dev) struct wmi_driver *wdriver = container_of(dev->driver, struct wmi_driver, driver); int ret = 0; + int count; + char *buf; if (ACPI_FAILURE(wmi_method_enable(wblock, 1))) dev_warn(dev, "failed to enable device -- probing anyway\n"); if (wdriver->probe) { ret = wdriver->probe(dev_to_wdev(dev)); - if (ret != 0 && ACPI_FAILURE(wmi_method_enable(wblock, 0))) - dev_warn(dev, "failed to disable device\n"); + if (ret != 0) + goto probe_failure; } + /* driver wants a character device made */ + if (wdriver->filter_callback) { + /* check that required buffer size declared by driver or MOF */ + if (!wblock->req_buf_size) { + dev_err(&wblock->dev.dev, + "Required buffer size not set\n"); + ret = -EINVAL; + goto probe_failure; + } + + count = get_order(wblock->req_buf_size); + wblock->handler_data = (void *)__get_free_pages(GFP_KERNEL, + count); + if (!wblock->handler_data) { + ret = -ENOMEM; + goto probe_failure; + } + + buf = kmalloc(strlen(wdriver->driver.name) + 5, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto probe_string_failure; + } + sprintf(buf, "wmi/%s", wdriver->driver.name); + wblock->char_dev.minor = MISC_DYNAMIC_MINOR; + wblock->char_dev.name = buf; + wblock->char_dev.fops = &wmi_fops; + wblock->char_dev.mode = 0444; + ret = misc_register(&wblock->char_dev); + if (ret) { + dev_warn(dev, "failed to register char dev: %d", ret); + ret = -ENOMEM; + goto probe_misc_failure; + } + } + + return 0; + +probe_misc_failure: + kfree(buf); +probe_string_failure: + free_pages((unsigned long)wblock->handler_data, count);
+probe_failure: + if (ACPI_FAILURE(wmi_method_enable(wblock, 0))) + dev_warn(dev, "failed to disable device\n"); return ret; } @@ -791,6 +968,13 @@ static int wmi_dev_remove(struct device *dev) container_of(dev->driver, struct wmi_driver, driver); int ret = 0; + if (wdriver->filter_callback) { + misc_deregister(&wblock->char_dev); + kfree(wblock->char_dev.name); + free_pages((unsigned long)wblock->handler_data, + get_order(wblock->req_buf_size)); + } + if (wdriver->remove) ret = wdriver->remove(dev_to_wdev(dev)); @@ -847,6 +1031,7 @@ static int wmi_create_device(struct device *wmi_bus_dev, if (gblock->flags & ACPI_WMI_METHOD) { wblock->dev.dev.type = &wmi_type_method; + mutex_init(&wblock->char_mutex); goto out_init; } diff --git a/include/linux/wmi.h b/include/linux/wmi.h index ddee427e0721..4757cb5077e5 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -18,6 +18,7 @@ #include #include +#include struct wmi_device { struct device dev; @@ -36,6 +37,8 @@ extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); +extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); + struct wmi_device_id { const char *guid_string; }; @@ -47,6 +50,8 @@ struct wmi_driver { int (*probe)(struct wmi_device *wdev); int (*remove)(struct wmi_device *wdev); void (*notify)(struct wmi_device *device, union acpi_object *data); + long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd, + struct wmi_ioctl_buffer *arg); }; extern int __must_check __wmi_driver_register(struct wmi_driver *driver, diff --git a/include/uapi/linux/wmi.h b/include/uapi/linux/wmi.h new file mode 100644 index 000000000000..7e52350ac9b3 --- /dev/null +++ b/include/uapi/linux/wmi.h @@ -0,0 +1,26 @@ +/* + * User API methods for ACPI-WMI mapping driver + * + * Copyright (C) 2017 Dell, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _UAPI_LINUX_WMI_H +#define _UAPI_LINUX_WMI_H + +#include + +/* WMI bus will filter all WMI vendor driver requests through this IOC */ +#define WMI_IOC 'W' + +/* All ioctl requests through WMI should declare their size followed by + * relevant data objects + */ +struct wmi_ioctl_buffer { + __u64 length; + __u8 data[]; +}; + +#endif -- cgit v1.2.3 From 5fd54ace4721fc5ce2bb5aef6318fcf17f421460 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 3 Nov 2017 11:28:30 +0100 Subject: USB: add SPDX identifiers to all remaining files in drivers/usb/ It's good to have SPDX identifiers in all files to make it easier to audit the kernel tree for correct licenses. Update the drivers/usb/ and include/linux/usb* files with the correct SPDX license identifier based on the license text in the file itself. The SPDX identifier is a legally binding shorthand, which can be used instead of the full boilerplate text. This work is based on a script and data from Thomas Gleixner, Philippe Ombredanne, and Kate Stewart.
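For reference, each converted file gains a one-line identifier of this form at its top; the exact license expression varies per file according to its existing license text, so GPL-2.0+ below is only one example:

// SPDX-License-Identifier: GPL-2.0+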
Cc: Thomas Gleixner Cc: Kate Stewart Cc: Philippe Ombredanne Signed-off-by: Greg Kroah-Hartman Acked-by: Felipe Balbi Acked-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/atm/cxacru.c | 1 + drivers/usb/atm/speedtch.c | 1 + drivers/usb/atm/ueagle-atm.c | 1 + drivers/usb/atm/usbatm.c | 1 + drivers/usb/atm/usbatm.h | 1 + drivers/usb/atm/xusbatm.c | 1 + drivers/usb/c67x00/c67x00-drv.c | 1 + drivers/usb/c67x00/c67x00-hcd.c | 1 + drivers/usb/c67x00/c67x00-hcd.h | 1 + drivers/usb/c67x00/c67x00-ll-hpi.c | 1 + drivers/usb/c67x00/c67x00-sched.c | 1 + drivers/usb/c67x00/c67x00.h | 1 + drivers/usb/chipidea/bits.h | 1 + drivers/usb/chipidea/ci.h | 1 + drivers/usb/chipidea/ci_hdrc_imx.c | 1 + drivers/usb/chipidea/ci_hdrc_imx.h | 1 + drivers/usb/chipidea/ci_hdrc_msm.c | 1 + drivers/usb/chipidea/ci_hdrc_pci.c | 1 + drivers/usb/chipidea/ci_hdrc_usb2.c | 1 + drivers/usb/chipidea/ci_hdrc_zevio.c | 1 + drivers/usb/chipidea/core.c | 1 + drivers/usb/chipidea/host.c | 1 + drivers/usb/chipidea/otg.c | 1 + drivers/usb/chipidea/otg.h | 1 + drivers/usb/chipidea/otg_fsm.c | 1 + drivers/usb/chipidea/otg_fsm.h | 1 + drivers/usb/chipidea/udc.c | 1 + drivers/usb/chipidea/udc.h | 1 + drivers/usb/chipidea/ulpi.c | 1 + drivers/usb/chipidea/usbmisc_imx.c | 1 + drivers/usb/class/cdc-acm.c | 1 + drivers/usb/class/cdc-wdm.c | 1 + drivers/usb/class/usblp.c | 1 + drivers/usb/class/usbtmc.c | 1 + drivers/usb/common/common.c | 1 + drivers/usb/common/led.c | 1 + drivers/usb/common/ulpi.c | 1 + drivers/usb/common/usb-otg-fsm.c | 1 + drivers/usb/core/devices.c | 1 + drivers/usb/core/devio.c | 1 + drivers/usb/core/hcd-pci.c | 1 + drivers/usb/core/hcd.c | 1 + drivers/usb/core/hub.h | 1 + drivers/usb/core/ledtrig-usbport.c | 1 + drivers/usb/core/of.c | 1 + drivers/usb/core/otg_whitelist.h | 1 + drivers/usb/core/port.c | 1 + drivers/usb/core/quirks.c | 1 + drivers/usb/core/usb-acpi.c | 1 + drivers/usb/dwc2/core.c | 1 + drivers/usb/dwc2/core.h | 1 + drivers/usb/dwc2/core_intr.c | 1 + drivers/usb/dwc2/debug.h | 1 + drivers/usb/dwc2/debugfs.c | 1 + drivers/usb/dwc2/gadget.c | 1 + drivers/usb/dwc2/hcd.c | 1 + drivers/usb/dwc2/hcd.h | 1 + drivers/usb/dwc2/hcd_ddma.c | 1 + drivers/usb/dwc2/hcd_intr.c | 1 + drivers/usb/dwc2/hcd_queue.c | 1 + drivers/usb/dwc2/hw.h | 1 + drivers/usb/dwc2/params.c | 1 + drivers/usb/dwc2/pci.c | 1 + drivers/usb/dwc2/platform.c | 1 + drivers/usb/dwc3/core.c | 1 + drivers/usb/dwc3/core.h | 1 + drivers/usb/dwc3/debug.h | 1 + drivers/usb/dwc3/debugfs.c | 1 + drivers/usb/dwc3/drd.c | 1 + drivers/usb/dwc3/dwc3-exynos.c | 1 + drivers/usb/dwc3/dwc3-keystone.c | 1 + drivers/usb/dwc3/dwc3-of-simple.c | 1 + drivers/usb/dwc3/dwc3-omap.c | 1 + drivers/usb/dwc3/dwc3-pci.c | 1 + drivers/usb/dwc3/dwc3-st.c | 1 + drivers/usb/dwc3/ep0.c | 1 + drivers/usb/dwc3/gadget.c | 1 + drivers/usb/dwc3/gadget.h | 1 + drivers/usb/dwc3/host.c | 1 + drivers/usb/dwc3/io.h | 1 + drivers/usb/dwc3/trace.c | 1 + drivers/usb/dwc3/trace.h | 1 + drivers/usb/dwc3/ulpi.c | 1 + drivers/usb/early/ehci-dbgp.c | 1 + drivers/usb/early/xhci-dbc.c | 1 + drivers/usb/early/xhci-dbc.h | 1 + drivers/usb/gadget/composite.c | 1 + drivers/usb/gadget/config.c | 1 + drivers/usb/gadget/configfs.c | 1 + drivers/usb/gadget/epautoconf.c | 1 + drivers/usb/gadget/function/f_acm.c | 1 + drivers/usb/gadget/function/f_ecm.c | 1 + drivers/usb/gadget/function/f_eem.c | 1 + drivers/usb/gadget/function/f_fs.c | 1 + drivers/usb/gadget/function/f_hid.c | 1 + drivers/usb/gadget/function/f_loopback.c | 1 + drivers/usb/gadget/function/f_mass_storage.c | 1 + 
drivers/usb/gadget/function/f_midi.c | 1 + drivers/usb/gadget/function/f_ncm.c | 1 + drivers/usb/gadget/function/f_obex.c | 1 + drivers/usb/gadget/function/f_phonet.c | 1 + drivers/usb/gadget/function/f_printer.c | 1 + drivers/usb/gadget/function/f_rndis.c | 1 + drivers/usb/gadget/function/f_serial.c | 1 + drivers/usb/gadget/function/f_sourcesink.c | 1 + drivers/usb/gadget/function/f_subset.c | 1 + drivers/usb/gadget/function/f_tcm.c | 1 + drivers/usb/gadget/function/f_uac1.c | 1 + drivers/usb/gadget/function/f_uac1_legacy.c | 1 + drivers/usb/gadget/function/f_uac2.c | 1 + drivers/usb/gadget/function/f_uvc.c | 1 + drivers/usb/gadget/function/f_uvc.h | 1 + drivers/usb/gadget/function/rndis.c | 1 + drivers/usb/gadget/function/rndis.h | 1 + drivers/usb/gadget/function/storage_common.c | 1 + drivers/usb/gadget/function/u_audio.c | 1 + drivers/usb/gadget/function/u_audio.h | 1 + drivers/usb/gadget/function/u_ecm.h | 1 + drivers/usb/gadget/function/u_eem.h | 1 + drivers/usb/gadget/function/u_ether.c | 1 + drivers/usb/gadget/function/u_ether.h | 1 + drivers/usb/gadget/function/u_ether_configfs.h | 1 + drivers/usb/gadget/function/u_fs.h | 1 + drivers/usb/gadget/function/u_gether.h | 1 + drivers/usb/gadget/function/u_hid.h | 1 + drivers/usb/gadget/function/u_midi.h | 1 + drivers/usb/gadget/function/u_ncm.h | 1 + drivers/usb/gadget/function/u_phonet.h | 1 + drivers/usb/gadget/function/u_printer.h | 1 + drivers/usb/gadget/function/u_rndis.h | 1 + drivers/usb/gadget/function/u_serial.c | 1 + drivers/usb/gadget/function/u_serial.h | 1 + drivers/usb/gadget/function/u_tcm.h | 1 + drivers/usb/gadget/function/u_uac1.h | 1 + drivers/usb/gadget/function/u_uac1_legacy.c | 1 + drivers/usb/gadget/function/u_uac1_legacy.h | 1 + drivers/usb/gadget/function/u_uac2.h | 1 + drivers/usb/gadget/function/u_uvc.h | 1 + drivers/usb/gadget/function/uvc.h | 1 + drivers/usb/gadget/function/uvc_configfs.c | 1 + drivers/usb/gadget/function/uvc_configfs.h | 1 + drivers/usb/gadget/function/uvc_queue.c | 1 + drivers/usb/gadget/function/uvc_v4l2.c | 1 + drivers/usb/gadget/function/uvc_v4l2.h | 1 + drivers/usb/gadget/function/uvc_video.c | 1 + drivers/usb/gadget/function/uvc_video.h | 1 + drivers/usb/gadget/functions.c | 1 + drivers/usb/gadget/legacy/acm_ms.c | 1 + drivers/usb/gadget/legacy/audio.c | 1 + drivers/usb/gadget/legacy/cdc2.c | 1 + drivers/usb/gadget/legacy/dbgp.c | 1 + drivers/usb/gadget/legacy/ether.c | 1 + drivers/usb/gadget/legacy/g_ffs.c | 1 + drivers/usb/gadget/legacy/gmidi.c | 1 + drivers/usb/gadget/legacy/hid.c | 1 + drivers/usb/gadget/legacy/inode.c | 1 + drivers/usb/gadget/legacy/mass_storage.c | 1 + drivers/usb/gadget/legacy/multi.c | 1 + drivers/usb/gadget/legacy/ncm.c | 1 + drivers/usb/gadget/legacy/nokia.c | 1 + drivers/usb/gadget/legacy/printer.c | 1 + drivers/usb/gadget/legacy/serial.c | 1 + drivers/usb/gadget/legacy/tcm_usb_gadget.c | 1 + drivers/usb/gadget/legacy/webcam.c | 1 + drivers/usb/gadget/legacy/zero.c | 1 + drivers/usb/gadget/u_f.c | 1 + drivers/usb/gadget/u_f.h | 1 + drivers/usb/gadget/u_os_desc.h | 1 + drivers/usb/gadget/udc/amd5536udc.h | 1 + drivers/usb/gadget/udc/amd5536udc_pci.c | 1 + drivers/usb/gadget/udc/at91_udc.c | 1 + drivers/usb/gadget/udc/at91_udc.h | 1 + drivers/usb/gadget/udc/atmel_usba_udc.c | 1 + drivers/usb/gadget/udc/atmel_usba_udc.h | 1 + drivers/usb/gadget/udc/bcm63xx_udc.c | 1 + drivers/usb/gadget/udc/bdc/bdc.h | 1 + drivers/usb/gadget/udc/bdc/bdc_cmd.c | 1 + drivers/usb/gadget/udc/bdc/bdc_cmd.h | 1 + drivers/usb/gadget/udc/bdc/bdc_core.c | 1 + 
drivers/usb/gadget/udc/bdc/bdc_dbg.c | 1 + drivers/usb/gadget/udc/bdc/bdc_dbg.h | 1 + drivers/usb/gadget/udc/bdc/bdc_ep.c | 1 + drivers/usb/gadget/udc/bdc/bdc_ep.h | 1 + drivers/usb/gadget/udc/bdc/bdc_pci.c | 1 + drivers/usb/gadget/udc/bdc/bdc_udc.c | 1 + drivers/usb/gadget/udc/core.c | 1 + drivers/usb/gadget/udc/dummy_hcd.c | 1 + drivers/usb/gadget/udc/fotg210-udc.c | 1 + drivers/usb/gadget/udc/fotg210.h | 1 + drivers/usb/gadget/udc/fsl_mxc_udc.c | 1 + drivers/usb/gadget/udc/fsl_qe_udc.c | 1 + drivers/usb/gadget/udc/fsl_qe_udc.h | 1 + drivers/usb/gadget/udc/fsl_udc_core.c | 1 + drivers/usb/gadget/udc/fsl_usb2_udc.h | 1 + drivers/usb/gadget/udc/fusb300_udc.c | 1 + drivers/usb/gadget/udc/fusb300_udc.h | 1 + drivers/usb/gadget/udc/goku_udc.c | 1 + drivers/usb/gadget/udc/goku_udc.h | 1 + drivers/usb/gadget/udc/gr_udc.c | 1 + drivers/usb/gadget/udc/gr_udc.h | 1 + drivers/usb/gadget/udc/lpc32xx_udc.c | 1 + drivers/usb/gadget/udc/m66592-udc.c | 1 + drivers/usb/gadget/udc/m66592-udc.h | 1 + drivers/usb/gadget/udc/mv_u3d.h | 1 + drivers/usb/gadget/udc/mv_u3d_core.c | 1 + drivers/usb/gadget/udc/mv_udc.h | 1 + drivers/usb/gadget/udc/mv_udc_core.c | 1 + drivers/usb/gadget/udc/net2272.c | 1 + drivers/usb/gadget/udc/net2272.h | 1 + drivers/usb/gadget/udc/net2280.c | 1 + drivers/usb/gadget/udc/net2280.h | 1 + drivers/usb/gadget/udc/omap_udc.c | 1 + drivers/usb/gadget/udc/pch_udc.c | 1 + drivers/usb/gadget/udc/pxa25x_udc.c | 1 + drivers/usb/gadget/udc/pxa25x_udc.h | 1 + drivers/usb/gadget/udc/pxa27x_udc.c | 1 + drivers/usb/gadget/udc/pxa27x_udc.h | 1 + drivers/usb/gadget/udc/r8a66597-udc.c | 1 + drivers/usb/gadget/udc/r8a66597-udc.h | 1 + drivers/usb/gadget/udc/renesas_usb3.c | 1 + drivers/usb/gadget/udc/s3c-hsudc.c | 1 + drivers/usb/gadget/udc/s3c2410_udc.c | 1 + drivers/usb/gadget/udc/s3c2410_udc.h | 1 + drivers/usb/gadget/udc/snps_udc_core.c | 1 + drivers/usb/gadget/udc/snps_udc_plat.c | 1 + drivers/usb/gadget/udc/trace.c | 1 + drivers/usb/gadget/udc/trace.h | 1 + drivers/usb/gadget/udc/udc-xilinx.c | 1 + drivers/usb/gadget/usbstring.c | 1 + drivers/usb/host/bcma-hcd.c | 1 + drivers/usb/host/ehci-atmel.c | 1 + drivers/usb/host/ehci-dbg.c | 1 + drivers/usb/host/ehci-exynos.c | 1 + drivers/usb/host/ehci-fsl.c | 1 + drivers/usb/host/ehci-fsl.h | 1 + drivers/usb/host/ehci-grlib.c | 1 + drivers/usb/host/ehci-hcd.c | 1 + drivers/usb/host/ehci-hub.c | 1 + drivers/usb/host/ehci-mem.c | 1 + drivers/usb/host/ehci-mv.c | 1 + drivers/usb/host/ehci-mxc.c | 1 + drivers/usb/host/ehci-omap.c | 1 + drivers/usb/host/ehci-orion.c | 1 + drivers/usb/host/ehci-pci.c | 1 + drivers/usb/host/ehci-platform.c | 1 + drivers/usb/host/ehci-pmcmsp.c | 1 + drivers/usb/host/ehci-ppc-of.c | 1 + drivers/usb/host/ehci-ps3.c | 1 + drivers/usb/host/ehci-q.c | 1 + drivers/usb/host/ehci-sched.c | 1 + drivers/usb/host/ehci-sh.c | 1 + drivers/usb/host/ehci-spear.c | 1 + drivers/usb/host/ehci-st.c | 1 + drivers/usb/host/ehci-sysfs.c | 1 + drivers/usb/host/ehci-tegra.c | 1 + drivers/usb/host/ehci-tilegx.c | 1 + drivers/usb/host/ehci-timer.c | 1 + drivers/usb/host/ehci-w90x900.c | 1 + drivers/usb/host/ehci-xilinx-of.c | 1 + drivers/usb/host/ehci.h | 1 + drivers/usb/host/fhci-dbg.c | 1 + drivers/usb/host/fhci-hcd.c | 1 + drivers/usb/host/fhci-hub.c | 1 + drivers/usb/host/fhci-mem.c | 1 + drivers/usb/host/fhci-q.c | 1 + drivers/usb/host/fhci-sched.c | 1 + drivers/usb/host/fhci-tds.c | 1 + drivers/usb/host/fhci.h | 1 + drivers/usb/host/fotg210-hcd.c | 1 + drivers/usb/host/fsl-mph-dr-of.c | 1 + drivers/usb/host/hwa-hc.c | 1 + 
drivers/usb/host/imx21-dbg.c | 1 + drivers/usb/host/imx21-hcd.c | 1 + drivers/usb/host/imx21-hcd.h | 1 + drivers/usb/host/isp116x-hcd.c | 1 + drivers/usb/host/isp1362-hcd.c | 1 + drivers/usb/host/max3421-hcd.c | 1 + drivers/usb/host/ohci-at91.c | 1 + drivers/usb/host/ohci-da8xx.c | 1 + drivers/usb/host/ohci-dbg.c | 1 + drivers/usb/host/ohci-exynos.c | 1 + drivers/usb/host/ohci-hcd.c | 1 + drivers/usb/host/ohci-hub.c | 1 + drivers/usb/host/ohci-mem.c | 1 + drivers/usb/host/ohci-nxp.c | 1 + drivers/usb/host/ohci-omap.c | 1 + drivers/usb/host/ohci-pci.c | 1 + drivers/usb/host/ohci-platform.c | 1 + drivers/usb/host/ohci-ppc-of.c | 1 + drivers/usb/host/ohci-ps3.c | 1 + drivers/usb/host/ohci-pxa27x.c | 1 + drivers/usb/host/ohci-q.c | 1 + drivers/usb/host/ohci-s3c2410.c | 1 + drivers/usb/host/ohci-sa1111.c | 1 + drivers/usb/host/ohci-sm501.c | 1 + drivers/usb/host/ohci-spear.c | 1 + drivers/usb/host/ohci-st.c | 1 + drivers/usb/host/ohci-tilegx.c | 1 + drivers/usb/host/ohci-tmio.c | 1 + drivers/usb/host/ohci.h | 1 + drivers/usb/host/oxu210hp-hcd.c | 1 + drivers/usb/host/pci-quirks.c | 1 + drivers/usb/host/r8a66597-hcd.c | 1 + drivers/usb/host/r8a66597.h | 1 + drivers/usb/host/sl811-hcd.c | 1 + drivers/usb/host/sl811_cs.c | 1 + drivers/usb/host/ssb-hcd.c | 1 + drivers/usb/host/u132-hcd.c | 1 + drivers/usb/host/uhci-hcd.c | 1 + drivers/usb/host/whci/asl.c | 1 + drivers/usb/host/whci/debug.c | 1 + drivers/usb/host/whci/hcd.c | 1 + drivers/usb/host/whci/hw.c | 1 + drivers/usb/host/whci/init.c | 1 + drivers/usb/host/whci/int.c | 1 + drivers/usb/host/whci/pzl.c | 1 + drivers/usb/host/whci/qset.c | 1 + drivers/usb/host/whci/whcd.h | 1 + drivers/usb/host/whci/whci-hc.h | 1 + drivers/usb/host/whci/wusb.c | 1 + drivers/usb/host/xhci-dbg.c | 1 + drivers/usb/host/xhci-ext-caps.h | 1 + drivers/usb/host/xhci-hub.c | 1 + drivers/usb/host/xhci-mem.c | 1 + drivers/usb/host/xhci-mtk-sch.c | 1 + drivers/usb/host/xhci-mtk.c | 1 + drivers/usb/host/xhci-mtk.h | 1 + drivers/usb/host/xhci-mvebu.c | 1 + drivers/usb/host/xhci-mvebu.h | 1 + drivers/usb/host/xhci-pci.c | 1 + drivers/usb/host/xhci-plat.c | 1 + drivers/usb/host/xhci-plat.h | 1 + drivers/usb/host/xhci-rcar.c | 1 + drivers/usb/host/xhci-rcar.h | 1 + drivers/usb/host/xhci-ring.c | 1 + drivers/usb/host/xhci-tegra.c | 1 + drivers/usb/host/xhci-trace.c | 1 + drivers/usb/host/xhci-trace.h | 1 + drivers/usb/host/xhci.c | 1 + drivers/usb/host/xhci.h | 1 + drivers/usb/image/mdc800.c | 1 + drivers/usb/image/microtek.c | 1 + drivers/usb/isp1760/isp1760-core.c | 1 + drivers/usb/isp1760/isp1760-core.h | 1 + drivers/usb/isp1760/isp1760-regs.h | 1 + drivers/usb/isp1760/isp1760-udc.c | 1 + drivers/usb/isp1760/isp1760-udc.h | 1 + drivers/usb/misc/adutux.c | 1 + drivers/usb/misc/appledisplay.c | 1 + drivers/usb/misc/chaoskey.c | 1 + drivers/usb/misc/cypress_cy7c63.c | 1 + drivers/usb/misc/cytherm.c | 1 + drivers/usb/misc/ehset.c | 1 + drivers/usb/misc/emi26.c | 1 + drivers/usb/misc/emi62.c | 1 + drivers/usb/misc/ezusb.c | 1 + drivers/usb/misc/ftdi-elan.c | 1 + drivers/usb/misc/idmouse.c | 1 + drivers/usb/misc/iowarrior.c | 1 + drivers/usb/misc/isight_firmware.c | 1 + drivers/usb/misc/ldusb.c | 1 + drivers/usb/misc/legousbtower.c | 1 + drivers/usb/misc/lvstest.c | 1 + drivers/usb/misc/rio500.c | 1 + drivers/usb/misc/rio500_usb.h | 1 + drivers/usb/misc/sisusbvga/sisusb.c | 1 + drivers/usb/misc/sisusbvga/sisusb.h | 1 + drivers/usb/misc/sisusbvga/sisusb_con.c | 1 + drivers/usb/misc/sisusbvga/sisusb_init.c | 1 + drivers/usb/misc/sisusbvga/sisusb_init.h | 1 + 
drivers/usb/misc/sisusbvga/sisusb_struct.h | 1 + drivers/usb/misc/trancevibrator.c | 1 + drivers/usb/misc/usb251xb.c | 1 + drivers/usb/misc/usb3503.c | 1 + drivers/usb/misc/usb4604.c | 1 + drivers/usb/misc/usb_u132.h | 1 + drivers/usb/misc/usblcd.c | 1 + drivers/usb/misc/usbsevseg.c | 1 + drivers/usb/misc/usbtest.c | 1 + drivers/usb/misc/uss720.c | 1 + drivers/usb/misc/yurex.c | 1 + drivers/usb/mon/mon_main.c | 1 + drivers/usb/mtu3/mtu3.h | 1 + drivers/usb/mtu3/mtu3_core.c | 1 + drivers/usb/mtu3/mtu3_dr.c | 1 + drivers/usb/mtu3/mtu3_dr.h | 1 + drivers/usb/mtu3/mtu3_gadget.c | 1 + drivers/usb/mtu3/mtu3_gadget_ep0.c | 1 + drivers/usb/mtu3/mtu3_host.c | 1 + drivers/usb/mtu3/mtu3_hw_regs.h | 1 + drivers/usb/mtu3/mtu3_plat.c | 1 + drivers/usb/mtu3/mtu3_qmu.c | 1 + drivers/usb/mtu3/mtu3_qmu.h | 1 + drivers/usb/musb/am35x.c | 1 + drivers/usb/musb/blackfin.c | 1 + drivers/usb/musb/blackfin.h | 1 + drivers/usb/musb/cppi_dma.c | 1 + drivers/usb/musb/da8xx.c | 1 + drivers/usb/musb/davinci.c | 1 + drivers/usb/musb/davinci.h | 1 + drivers/usb/musb/jz4740.c | 1 + drivers/usb/musb/musb_am335x.c | 1 + drivers/usb/musb/musb_core.c | 1 + drivers/usb/musb/musb_core.h | 1 + drivers/usb/musb/musb_cppi41.c | 1 + drivers/usb/musb/musb_debug.h | 1 + drivers/usb/musb/musb_debugfs.c | 1 + drivers/usb/musb/musb_dma.h | 1 + drivers/usb/musb/musb_dsps.c | 1 + drivers/usb/musb/musb_gadget.c | 1 + drivers/usb/musb/musb_gadget.h | 1 + drivers/usb/musb/musb_gadget_ep0.c | 1 + drivers/usb/musb/musb_host.c | 1 + drivers/usb/musb/musb_host.h | 1 + drivers/usb/musb/musb_io.h | 1 + drivers/usb/musb/musb_regs.h | 1 + drivers/usb/musb/musb_trace.c | 1 + drivers/usb/musb/musb_trace.h | 1 + drivers/usb/musb/musb_virthub.c | 1 + drivers/usb/musb/musbhsdma.c | 1 + drivers/usb/musb/musbhsdma.h | 1 + drivers/usb/musb/omap2430.c | 1 + drivers/usb/musb/omap2430.h | 1 + drivers/usb/musb/sunxi.c | 1 + drivers/usb/musb/tusb6010.c | 1 + drivers/usb/musb/tusb6010.h | 1 + drivers/usb/musb/tusb6010_omap.c | 1 + drivers/usb/musb/ux500.c | 1 + drivers/usb/musb/ux500_dma.c | 1 + drivers/usb/phy/of.c | 1 + drivers/usb/phy/phy-ab8500-usb.c | 1 + drivers/usb/phy/phy-am335x-control.c | 1 + drivers/usb/phy/phy-am335x.c | 1 + drivers/usb/phy/phy-fsl-usb.c | 1 + drivers/usb/phy/phy-fsl-usb.h | 1 + drivers/usb/phy/phy-generic.c | 1 + drivers/usb/phy/phy-gpio-vbus-usb.c | 1 + drivers/usb/phy/phy-isp1301-omap.c | 1 + drivers/usb/phy/phy-isp1301.c | 1 + drivers/usb/phy/phy-keystone.c | 1 + drivers/usb/phy/phy-mv-usb.c | 1 + drivers/usb/phy/phy-mv-usb.h | 1 + drivers/usb/phy/phy-mxs-usb.c | 1 + drivers/usb/phy/phy-omap-otg.c | 1 + drivers/usb/phy/phy-tahvo.c | 1 + drivers/usb/phy/phy-tegra-usb.c | 1 + drivers/usb/phy/phy-twl6030-usb.c | 1 + drivers/usb/phy/phy-ulpi-viewport.c | 1 + drivers/usb/phy/phy-ulpi.c | 1 + drivers/usb/phy/phy.c | 1 + drivers/usb/renesas_usbhs/common.c | 1 + drivers/usb/renesas_usbhs/common.h | 1 + drivers/usb/renesas_usbhs/fifo.c | 1 + drivers/usb/renesas_usbhs/fifo.h | 1 + drivers/usb/renesas_usbhs/mod.c | 1 + drivers/usb/renesas_usbhs/mod.h | 1 + drivers/usb/renesas_usbhs/mod_gadget.c | 1 + drivers/usb/renesas_usbhs/mod_host.c | 1 + drivers/usb/renesas_usbhs/pipe.c | 1 + drivers/usb/renesas_usbhs/pipe.h | 1 + drivers/usb/renesas_usbhs/rcar2.c | 1 + drivers/usb/renesas_usbhs/rcar3.c | 1 + drivers/usb/serial/aircable.c | 1 + drivers/usb/serial/ark3116.c | 1 + drivers/usb/serial/belkin_sa.c | 1 + drivers/usb/serial/belkin_sa.h | 1 + drivers/usb/serial/bus.c | 1 + drivers/usb/serial/ch341.c | 1 + drivers/usb/serial/console.c | 1 + 
drivers/usb/serial/cp210x.c | 1 + drivers/usb/serial/cyberjack.c | 1 + drivers/usb/serial/cypress_m8.c | 1 + drivers/usb/serial/digi_acceleport.c | 1 + drivers/usb/serial/empeg.c | 1 + drivers/usb/serial/f81232.c | 1 + drivers/usb/serial/f81534.c | 1 + drivers/usb/serial/ftdi_sio.c | 1 + drivers/usb/serial/garmin_gps.c | 1 + drivers/usb/serial/generic.c | 1 + drivers/usb/serial/io_16654.h | 1 + drivers/usb/serial/io_edgeport.c | 1 + drivers/usb/serial/io_edgeport.h | 1 + drivers/usb/serial/io_ionsp.h | 1 + drivers/usb/serial/io_ti.c | 1 + drivers/usb/serial/io_ti.h | 1 + drivers/usb/serial/io_usbvend.h | 1 + drivers/usb/serial/ipaq.c | 1 + drivers/usb/serial/ipw.c | 1 + drivers/usb/serial/ir-usb.c | 1 + drivers/usb/serial/iuu_phoenix.c | 1 + drivers/usb/serial/iuu_phoenix.h | 1 + drivers/usb/serial/keyspan.c | 1 + drivers/usb/serial/keyspan_pda.c | 1 + drivers/usb/serial/kl5kusb105.c | 1 + drivers/usb/serial/kobil_sct.c | 1 + drivers/usb/serial/mct_u232.c | 1 + drivers/usb/serial/mct_u232.h | 1 + drivers/usb/serial/metro-usb.c | 1 + drivers/usb/serial/mos7720.c | 1 + drivers/usb/serial/mos7840.c | 1 + drivers/usb/serial/mxuport.c | 1 + drivers/usb/serial/navman.c | 1 + drivers/usb/serial/omninet.c | 1 + drivers/usb/serial/opticon.c | 1 + drivers/usb/serial/option.c | 1 + drivers/usb/serial/oti6858.c | 1 + drivers/usb/serial/oti6858.h | 1 + drivers/usb/serial/pl2303.c | 1 + drivers/usb/serial/pl2303.h | 1 + drivers/usb/serial/qcaux.c | 1 + drivers/usb/serial/qcserial.c | 1 + drivers/usb/serial/quatech2.c | 1 + drivers/usb/serial/safe_serial.c | 1 + drivers/usb/serial/sierra.c | 1 + drivers/usb/serial/spcp8x5.c | 1 + drivers/usb/serial/ssu100.c | 1 + drivers/usb/serial/symbolserial.c | 1 + drivers/usb/serial/ti_usb_3410_5052.c | 1 + drivers/usb/serial/upd78f0730.c | 1 + drivers/usb/serial/usb-serial-simple.c | 1 + drivers/usb/serial/usb-serial.c | 1 + drivers/usb/serial/usb_debug.c | 1 + drivers/usb/serial/usb_wwan.c | 1 + drivers/usb/serial/visor.c | 1 + drivers/usb/serial/visor.h | 1 + drivers/usb/serial/whiteheat.c | 1 + drivers/usb/serial/whiteheat.h | 1 + drivers/usb/serial/wishbone-serial.c | 1 + drivers/usb/serial/xsens_mt.c | 1 + drivers/usb/storage/alauda.c | 1 + drivers/usb/storage/cypress_atacb.c | 1 + drivers/usb/storage/datafab.c | 1 + drivers/usb/storage/debug.c | 1 + drivers/usb/storage/debug.h | 1 + drivers/usb/storage/ene_ub6250.c | 1 + drivers/usb/storage/freecom.c | 1 + drivers/usb/storage/initializers.c | 1 + drivers/usb/storage/initializers.h | 1 + drivers/usb/storage/isd200.c | 1 + drivers/usb/storage/jumpshot.c | 1 + drivers/usb/storage/karma.c | 1 + drivers/usb/storage/onetouch.c | 1 + drivers/usb/storage/option_ms.c | 1 + drivers/usb/storage/protocol.c | 1 + drivers/usb/storage/protocol.h | 1 + drivers/usb/storage/realtek_cr.c | 1 + drivers/usb/storage/scsiglue.c | 1 + drivers/usb/storage/scsiglue.h | 1 + drivers/usb/storage/sddr09.c | 1 + drivers/usb/storage/sddr55.c | 1 + drivers/usb/storage/shuttle_usbat.c | 1 + drivers/usb/storage/transport.c | 1 + drivers/usb/storage/transport.h | 1 + drivers/usb/storage/uas.c | 1 + drivers/usb/storage/unusual_alauda.h | 1 + drivers/usb/storage/unusual_cypress.h | 1 + drivers/usb/storage/unusual_datafab.h | 1 + drivers/usb/storage/unusual_devs.h | 1 + drivers/usb/storage/unusual_ene_ub6250.h | 1 + drivers/usb/storage/unusual_freecom.h | 1 + drivers/usb/storage/unusual_isd200.h | 1 + drivers/usb/storage/unusual_jumpshot.h | 1 + drivers/usb/storage/unusual_karma.h | 1 + drivers/usb/storage/unusual_onetouch.h | 1 + 
drivers/usb/storage/unusual_realtek.h | 1 + drivers/usb/storage/unusual_sddr09.h | 1 + drivers/usb/storage/unusual_sddr55.h | 1 + drivers/usb/storage/unusual_uas.h | 1 + drivers/usb/storage/unusual_usbat.h | 1 + drivers/usb/storage/usb.c | 1 + drivers/usb/storage/usb.h | 1 + drivers/usb/storage/usual-tables.c | 1 + drivers/usb/typec/typec.c | 1 + drivers/usb/typec/typec_wcove.c | 1 + drivers/usb/typec/ucsi/ucsi.c | 1 + drivers/usb/typec/ucsi/ucsi_acpi.c | 1 + drivers/usb/usb-skeleton.c | 1 + drivers/usb/usbip/stub.h | 1 + drivers/usb/usbip/stub_dev.c | 1 + drivers/usb/usbip/stub_main.c | 1 + drivers/usb/usbip/stub_rx.c | 1 + drivers/usb/usbip/stub_tx.c | 1 + drivers/usb/usbip/usbip_common.c | 1 + drivers/usb/usbip/usbip_common.h | 1 + drivers/usb/usbip/usbip_event.c | 1 + drivers/usb/usbip/vhci.h | 1 + drivers/usb/usbip/vhci_hcd.c | 1 + drivers/usb/usbip/vhci_rx.c | 1 + drivers/usb/usbip/vhci_sysfs.c | 1 + drivers/usb/usbip/vhci_tx.c | 1 + drivers/usb/usbip/vudc.h | 1 + drivers/usb/usbip/vudc_dev.c | 1 + drivers/usb/usbip/vudc_main.c | 1 + drivers/usb/usbip/vudc_rx.c | 1 + drivers/usb/usbip/vudc_sysfs.c | 1 + drivers/usb/usbip/vudc_transfer.c | 1 + drivers/usb/usbip/vudc_tx.c | 1 + drivers/usb/wusbcore/cbaf.c | 1 + drivers/usb/wusbcore/crypto.c | 1 + drivers/usb/wusbcore/dev-sysfs.c | 1 + drivers/usb/wusbcore/devconnect.c | 1 + drivers/usb/wusbcore/mmc.c | 1 + drivers/usb/wusbcore/pal.c | 1 + drivers/usb/wusbcore/reservation.c | 1 + drivers/usb/wusbcore/rh.c | 1 + drivers/usb/wusbcore/security.c | 1 + drivers/usb/wusbcore/wa-hc.c | 1 + drivers/usb/wusbcore/wa-hc.h | 1 + drivers/usb/wusbcore/wa-nep.c | 1 + drivers/usb/wusbcore/wa-rpipe.c | 1 + drivers/usb/wusbcore/wa-xfer.c | 1 + drivers/usb/wusbcore/wusbhc.c | 1 + drivers/usb/wusbcore/wusbhc.h | 1 + include/linux/usb/association.h | 1 + include/linux/usb/audio-v2.h | 1 + include/linux/usb/audio.h | 1 + include/linux/usb/c67x00.h | 1 + include/linux/usb/cdc-wdm.h | 1 + include/linux/usb/cdc.h | 1 + include/linux/usb/cdc_ncm.h | 1 + include/linux/usb/composite.h | 1 + include/linux/usb/ehci_def.h | 1 + include/linux/usb/ehci_pdriver.h | 1 + include/linux/usb/g_hid.h | 1 + include/linux/usb/gadget.h | 1 + include/linux/usb/gpio_vbus.h | 1 + include/linux/usb/hcd.h | 1 + include/linux/usb/input.h | 1 + include/linux/usb/isp1301.h | 1 + include/linux/usb/m66592.h | 1 + include/linux/usb/musb-ux500.h | 1 + include/linux/usb/net2280.h | 1 + include/linux/usb/of.h | 1 + include/linux/usb/ohci_pdriver.h | 1 + include/linux/usb/otg-fsm.h | 1 + include/linux/usb/phy_companion.h | 1 + include/linux/usb/r8a66597.h | 1 + include/linux/usb/renesas_usbhs.h | 1 + include/linux/usb/rndis_host.h | 1 + include/linux/usb/samsung_usb_phy.h | 1 + include/linux/usb/serial.h | 1 + include/linux/usb/storage.h | 1 + include/linux/usb/tegra_usb_phy.h | 1 + include/linux/usb/tilegx.h | 1 + include/linux/usb/ulpi.h | 1 + include/linux/usb/usb338x.h | 1 + include/linux/usb/usbnet.h | 1 + include/linux/usb/wusb-wa.h | 1 + include/linux/usb/wusb.h | 1 + include/linux/usb/xhci-dbgp.h | 1 + include/linux/usbdevice_fs.h | 1 + 651 files changed, 651 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/atm/cxacru.c b/drivers/usb/atm/cxacru.c index e4f177da164d..87d16578c5a4 100644 --- a/drivers/usb/atm/cxacru.c +++ b/drivers/usb/atm/cxacru.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /****************************************************************************** * cxacru.c - driver for USB ADSL modems based on * Conexant AccessRunner chipset diff 
--git a/drivers/usb/atm/speedtch.c b/drivers/usb/atm/speedtch.c index 091db9b281f5..cc65bec5b331 100644 --- a/drivers/usb/atm/speedtch.c +++ b/drivers/usb/atm/speedtch.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /****************************************************************************** * speedtch.c - Alcatel SpeedTouch USB xDSL modem driver * diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index ba7616395db2..ab75690044bb 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /*- * Copyright (c) 2003, 2004 * Damien Bergamini . All rights reserved. diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c index 8607af758bbd..ce1c57cb413f 100644 --- a/drivers/usb/atm/usbatm.c +++ b/drivers/usb/atm/usbatm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /****************************************************************************** * usbatm.c - Generic USB xDSL driver core * diff --git a/drivers/usb/atm/usbatm.h b/drivers/usb/atm/usbatm.h index f3eecd967a8a..72f9d3b8adb6 100644 --- a/drivers/usb/atm/usbatm.h +++ b/drivers/usb/atm/usbatm.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /****************************************************************************** * usbatm.h - Generic USB xDSL driver core * diff --git a/drivers/usb/atm/xusbatm.c b/drivers/usb/atm/xusbatm.c index c73c1ec3005e..7c55032a2c2c 100644 --- a/drivers/usb/atm/xusbatm.c +++ b/drivers/usb/atm/xusbatm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /****************************************************************************** * xusbatm.c - dumb usbatm-based driver for modems initialized in userspace * diff --git a/drivers/usb/c67x00/c67x00-drv.c b/drivers/usb/c67x00/c67x00-drv.c index 5796c8820514..3e4b46f8aa67 100644 --- a/drivers/usb/c67x00/c67x00-drv.c +++ b/drivers/usb/c67x00/c67x00-drv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * c67x00-drv.c: Cypress C67X00 USB Common infrastructure * diff --git a/drivers/usb/c67x00/c67x00-hcd.c b/drivers/usb/c67x00/c67x00-hcd.c index 30d3f346686e..705c40a0097d 100644 --- a/drivers/usb/c67x00/c67x00-hcd.c +++ b/drivers/usb/c67x00/c67x00-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * c67x00-hcd.c: Cypress C67X00 USB Host Controller Driver * diff --git a/drivers/usb/c67x00/c67x00-hcd.h b/drivers/usb/c67x00/c67x00-hcd.h index cf8a455a6403..05ab1ba3b327 100644 --- a/drivers/usb/c67x00/c67x00-hcd.h +++ b/drivers/usb/c67x00/c67x00-hcd.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * c67x00-hcd.h: Cypress C67X00 USB HCD * diff --git a/drivers/usb/c67x00/c67x00-ll-hpi.c b/drivers/usb/c67x00/c67x00-ll-hpi.c index b58151841e10..285f40aa16bf 100644 --- a/drivers/usb/c67x00/c67x00-ll-hpi.c +++ b/drivers/usb/c67x00/c67x00-ll-hpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * c67x00-ll-hpi.c: Cypress C67X00 USB Low level interface using HPI * diff --git a/drivers/usb/c67x00/c67x00-sched.c b/drivers/usb/c67x00/c67x00-sched.c index 0b68cd6874d2..233a8be2d8e4 100644 --- a/drivers/usb/c67x00/c67x00-sched.c +++ b/drivers/usb/c67x00/c67x00-sched.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * c67x00-sched.c: Cypress C67X00 USB Host Controller Driver - TD scheduling * diff --git a/drivers/usb/c67x00/c67x00.h b/drivers/usb/c67x00/c67x00.h index a26e9ded0f32..31339d7f7a4b 100644 --- a/drivers/usb/c67x00/c67x00.h +++ b/drivers/usb/c67x00/c67x00.h @@ -1,3 +1,4 @@ +// 
SPDX-License-Identifier: GPL-2.0+ /* * c67x00.h: Cypress C67X00 USB register and field definitions * diff --git a/drivers/usb/chipidea/bits.h b/drivers/usb/chipidea/bits.h index e462f55c8b99..018a28e2e7d8 100644 --- a/drivers/usb/chipidea/bits.h +++ b/drivers/usb/chipidea/bits.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bits.h - register bits of the ChipIdea USB IP core * diff --git a/drivers/usb/chipidea/ci.h b/drivers/usb/chipidea/ci.h index 6743f85b1b7a..fa6dd9d9f5c6 100644 --- a/drivers/usb/chipidea/ci.h +++ b/drivers/usb/chipidea/ci.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ci.h - common structures, functions, and macros of the ChipIdea driver * diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c index 5f4a8157fad8..9a2f7416ff9e 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.c +++ b/drivers/usb/chipidea/ci_hdrc_imx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright 2012 Freescale Semiconductor, Inc. * Copyright (C) 2012 Marek Vasut diff --git a/drivers/usb/chipidea/ci_hdrc_imx.h b/drivers/usb/chipidea/ci_hdrc_imx.h index d666c9f036ba..98a8b8c817ff 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.h +++ b/drivers/usb/chipidea/ci_hdrc_imx.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright 2012 Freescale Semiconductor, Inc. * diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c index bb626120296f..85aac4e881aa 100644 --- a/drivers/usb/chipidea/ci_hdrc_msm.c +++ b/drivers/usb/chipidea/ci_hdrc_msm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2010, Code Aurora Forum. All rights reserved. * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/usb/chipidea/ci_hdrc_pci.c b/drivers/usb/chipidea/ci_hdrc_pci.c index 39414e4b2d81..a9dd661026bf 100644 --- a/drivers/usb/chipidea/ci_hdrc_pci.c +++ b/drivers/usb/chipidea/ci_hdrc_pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ci_hdrc_pci.c - MIPS USB IP core family device controller * diff --git a/drivers/usb/chipidea/ci_hdrc_usb2.c b/drivers/usb/chipidea/ci_hdrc_usb2.c index 99425db9ba62..41c5f8b9c0b9 100644 --- a/drivers/usb/chipidea/ci_hdrc_usb2.c +++ b/drivers/usb/chipidea/ci_hdrc_usb2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Marvell Technology Group Ltd. 
* diff --git a/drivers/usb/chipidea/ci_hdrc_zevio.c b/drivers/usb/chipidea/ci_hdrc_zevio.c index 1264de505527..b8c75304d805 100644 --- a/drivers/usb/chipidea/ci_hdrc_zevio.c +++ b/drivers/usb/chipidea/ci_hdrc_zevio.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Daniel Tang * diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c index 43ea5fb87b9a..708c2eaba81c 100644 --- a/drivers/usb/chipidea/core.c +++ b/drivers/usb/chipidea/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * core.c - ChipIdea USB IP core family device controller * diff --git a/drivers/usb/chipidea/host.c b/drivers/usb/chipidea/host.c index 18cb8e46262d..a49e8567ac7d 100644 --- a/drivers/usb/chipidea/host.c +++ b/drivers/usb/chipidea/host.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * host.c - ChipIdea USB host controller driver * diff --git a/drivers/usb/chipidea/otg.c b/drivers/usb/chipidea/otg.c index 10236fe71522..ef22fcc63f33 100644 --- a/drivers/usb/chipidea/otg.c +++ b/drivers/usb/chipidea/otg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * otg.c - ChipIdea USB IP core OTG driver * diff --git a/drivers/usb/chipidea/otg.h b/drivers/usb/chipidea/otg.h index 9ecb598e48f0..b8e2415bfa7c 100644 --- a/drivers/usb/chipidea/otg.h +++ b/drivers/usb/chipidea/otg.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013-2014 Freescale Semiconductor, Inc. * diff --git a/drivers/usb/chipidea/otg_fsm.c b/drivers/usb/chipidea/otg_fsm.c index 5ea0246f650d..485aaa1747e8 100644 --- a/drivers/usb/chipidea/otg_fsm.c +++ b/drivers/usb/chipidea/otg_fsm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * otg_fsm.c - ChipIdea USB IP core OTG FSM driver * diff --git a/drivers/usb/chipidea/otg_fsm.h b/drivers/usb/chipidea/otg_fsm.h index 6366fe398ba6..08e74aeea21d 100644 --- a/drivers/usb/chipidea/otg_fsm.h +++ b/drivers/usb/chipidea/otg_fsm.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Freescale Semiconductor, Inc. * diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index daffa5ed396d..9e75e0921b3f 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * udc.c - ChipIdea UDC driver * diff --git a/drivers/usb/chipidea/udc.h b/drivers/usb/chipidea/udc.h index 2ecd1174d66c..840275342873 100644 --- a/drivers/usb/chipidea/udc.h +++ b/drivers/usb/chipidea/udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * udc.h - ChipIdea UDC structures * diff --git a/drivers/usb/chipidea/ulpi.c b/drivers/usb/chipidea/ulpi.c index 1219583dc1b2..c4e1900b9777 100644 --- a/drivers/usb/chipidea/ulpi.c +++ b/drivers/usb/chipidea/ulpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Linaro Ltd. * diff --git a/drivers/usb/chipidea/usbmisc_imx.c b/drivers/usb/chipidea/usbmisc_imx.c index 9f4a0185dd60..e3842238adeb 100644 --- a/drivers/usb/chipidea/usbmisc_imx.c +++ b/drivers/usb/chipidea/usbmisc_imx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright 2012 Freescale Semiconductor, Inc. 
* diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 18c923a4c16e..ee33370b012d 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * cdc-acm.c * diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 3e865dbf878c..80529ac4b083 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * cdc-wdm.c * diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index fb87c17ed6fa..af771fa25800 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * usblp.c * diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c index 6adceacaa919..a1475fafc5f4 100644 --- a/drivers/usb/class/usbtmc.c +++ b/drivers/usb/class/usbtmc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /** * drivers/usb/class/usbtmc.c - USB Test & Measurement class driver * diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c index 552ff7ac5a6b..2b45b0517b3d 100644 --- a/drivers/usb/common/common.c +++ b/drivers/usb/common/common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Provides code common for host and device side USB. * diff --git a/drivers/usb/common/led.c b/drivers/usb/common/led.c index df23da00a901..fd5538c8e067 100644 --- a/drivers/usb/common/led.c +++ b/drivers/usb/common/led.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * LED Triggers for USB Activity * diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c index 4aa5195db8ea..73c38258eec7 100644 --- a/drivers/usb/common/ulpi.c +++ b/drivers/usb/common/ulpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * ulpi.c - USB ULPI PHY bus * diff --git a/drivers/usb/common/usb-otg-fsm.c b/drivers/usb/common/usb-otg-fsm.c index b8fe31e409a5..f960d5374ee0 100644 --- a/drivers/usb/common/usb-otg-fsm.c +++ b/drivers/usb/common/usb-otg-fsm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * OTG Finite State Machine from OTG spec * diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c index 55dea2e7828f..1c8b6faa7e66 100644 --- a/drivers/usb/core/devices.c +++ b/drivers/usb/core/devices.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * devices.c * (C) Copyright 1999 Randy Dunlap. 
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index e9326f31db8d..f67e9fce6847 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /*****************************************************************************/ /* diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index ea829ad798c0..07d5919ccae1 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * (C) Copyright David Brownell 2000-2002 * diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 67aa3d039b9b..16a19faae376 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h index 34c1a7e22aae..7d29bf3f095b 100644 --- a/drivers/usb/core/hub.h +++ b/drivers/usb/core/hub.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * usb hub driver head file * diff --git a/drivers/usb/core/ledtrig-usbport.c b/drivers/usb/core/ledtrig-usbport.c index 1af877942110..f832a2b854ff 100644 --- a/drivers/usb/core/ledtrig-usbport.c +++ b/drivers/usb/core/ledtrig-usbport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB port LED trigger * diff --git a/drivers/usb/core/of.c b/drivers/usb/core/of.c index 3863bb1ce8c5..a95416d28aa6 100644 --- a/drivers/usb/core/of.c +++ b/drivers/usb/core/of.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * of.c The helpers for hcd device tree support * diff --git a/drivers/usb/core/otg_whitelist.h b/drivers/usb/core/otg_whitelist.h index 085049d37d7a..4629f6e93954 100644 --- a/drivers/usb/core/otg_whitelist.h +++ b/drivers/usb/core/otg_whitelist.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * drivers/usb/core/otg_whitelist.h * diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c index 460c855be0d0..bd757a951a18 100644 --- a/drivers/usb/core/port.c +++ b/drivers/usb/core/port.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * usb port device code * diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 3f08a9be65a8..270c5105d3c6 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB device quirk handling logic and table * diff --git a/drivers/usb/core/usb-acpi.c b/drivers/usb/core/usb-acpi.c index ef9cf4a21afe..90afee5079dc 100644 --- a/drivers/usb/core/usb-acpi.c +++ b/drivers/usb/core/usb-acpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB-ACPI glue code * diff --git a/drivers/usb/dwc2/core.c b/drivers/usb/dwc2/core.c index 1b6612c2cdda..82a7d98c3436 100644 --- a/drivers/usb/dwc2/core.c +++ b/drivers/usb/dwc2/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * core.c - DesignWare HS OTG Controller common routines * diff --git a/drivers/usb/dwc2/core.h b/drivers/usb/dwc2/core.h index 730d7eb449bb..f66c94130cac 100644 --- a/drivers/usb/dwc2/core.h +++ b/drivers/usb/dwc2/core.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * core.h - DesignWare HS OTG Controller common declarations * diff --git a/drivers/usb/dwc2/core_intr.c b/drivers/usb/dwc2/core_intr.c index b8bcb007c92a..ab3fa1630853 100644 --- a/drivers/usb/dwc2/core_intr.c +++ b/drivers/usb/dwc2/core_intr.c @@ -1,3 +1,4 @@ +// 
SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * core_intr.c - DesignWare HS OTG Controller common interrupt handling * diff --git a/drivers/usb/dwc2/debug.h b/drivers/usb/dwc2/debug.h index 8222783e6822..7e2442bcbb2e 100644 --- a/drivers/usb/dwc2/debug.h +++ b/drivers/usb/dwc2/debug.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * debug.h - Designware USB2 DRD controller debug header * diff --git a/drivers/usb/dwc2/debugfs.c b/drivers/usb/dwc2/debugfs.c index 794b959a7c8c..c6492cca872f 100644 --- a/drivers/usb/dwc2/debugfs.c +++ b/drivers/usb/dwc2/debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * debugfs.c - Designware USB2 DRD controller debugfs * diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 603c21612051..40fd4e747e20 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * Copyright (c) 2011 Samsung Electronics Co., Ltd. * http://www.samsung.com diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index e8fee0f969f0..69eb40cd1b47 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * hcd.c - DesignWare HS OTG Controller host-mode routines * diff --git a/drivers/usb/dwc2/hcd.h b/drivers/usb/dwc2/hcd.h index 11c3c145b793..78e9e01051b5 100644 --- a/drivers/usb/dwc2/hcd.h +++ b/drivers/usb/dwc2/hcd.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * hcd.h - DesignWare HS OTG Controller host-mode declarations * diff --git a/drivers/usb/dwc2/hcd_ddma.c b/drivers/usb/dwc2/hcd_ddma.c index b8bdf545c3a7..28c8898b3b66 100644 --- a/drivers/usb/dwc2/hcd_ddma.c +++ b/drivers/usb/dwc2/hcd_ddma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * hcd_ddma.c - DesignWare HS OTG Controller descriptor DMA routines * diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c index 28a8210710b1..916d991b96b8 100644 --- a/drivers/usb/dwc2/hcd_intr.c +++ b/drivers/usb/dwc2/hcd_intr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * hcd_intr.c - DesignWare HS OTG Controller host-mode interrupt handling * diff --git a/drivers/usb/dwc2/hcd_queue.c b/drivers/usb/dwc2/hcd_queue.c index 3ae8b1bbaa55..f472de238ac2 100644 --- a/drivers/usb/dwc2/hcd_queue.c +++ b/drivers/usb/dwc2/hcd_queue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * hcd_queue.c - DesignWare HS OTG Controller host queuing routines * diff --git a/drivers/usb/dwc2/hw.h b/drivers/usb/dwc2/hw.h index 4592012c4743..2c906d8ee465 100644 --- a/drivers/usb/dwc2/hw.h +++ b/drivers/usb/dwc2/hw.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * hw.h - DesignWare HS OTG Controller hardware definitions * diff --git a/drivers/usb/dwc2/params.c b/drivers/usb/dwc2/params.c index fe770a2834d2..ef73af6e03a9 100644 --- a/drivers/usb/dwc2/params.c +++ b/drivers/usb/dwc2/params.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * Copyright (C) 2004-2016 Synopsys, Inc. 
* diff --git a/drivers/usb/dwc2/pci.c b/drivers/usb/dwc2/pci.c index fdeb8c7bf30a..3ecc951a1aea 100644 --- a/drivers/usb/dwc2/pci.c +++ b/drivers/usb/dwc2/pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * pci.c - DesignWare HS OTG Controller PCI driver * diff --git a/drivers/usb/dwc2/platform.c b/drivers/usb/dwc2/platform.c index daf0d37acb37..3e26550d13dd 100644 --- a/drivers/usb/dwc2/platform.c +++ b/drivers/usb/dwc2/platform.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * platform.c - DesignWare HS OTG Controller platform driver * diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index dabfa16fa267..d52f087cd8bb 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * core.c - DesignWare USB3 DRD Controller Core file * diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index e33cc10121be..5ac60463e8c5 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * core.h - DesignWare USB3 DRD Core Header * diff --git a/drivers/usb/dwc3/debug.h b/drivers/usb/dwc3/debug.h index 5e9c070ec874..2778d2d1e9b6 100644 --- a/drivers/usb/dwc3/debug.h +++ b/drivers/usb/dwc3/debug.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * debug.h - DesignWare USB3 DRD Controller Debug Header * diff --git a/drivers/usb/dwc3/debugfs.c b/drivers/usb/dwc3/debugfs.c index 4e09be80e59f..b104a8786896 100644 --- a/drivers/usb/dwc3/debugfs.c +++ b/drivers/usb/dwc3/debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * debugfs.c - DesignWare USB3 DRD Controller DebugFS file * diff --git a/drivers/usb/dwc3/drd.c b/drivers/usb/dwc3/drd.c index 2765c51c7ef5..a7e6803ba2e1 100644 --- a/drivers/usb/dwc3/drd.c +++ b/drivers/usb/dwc3/drd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * drd.c - DesignWare USB3 DRD Controller Dual-role support * diff --git a/drivers/usb/dwc3/dwc3-exynos.c b/drivers/usb/dwc3/dwc3-exynos.c index e089df72f766..24294c3e3c9b 100644 --- a/drivers/usb/dwc3/dwc3-exynos.c +++ b/drivers/usb/dwc3/dwc3-exynos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * dwc3-exynos.c - Samsung EXYNOS DWC3 Specific Glue layer * diff --git a/drivers/usb/dwc3/dwc3-keystone.c b/drivers/usb/dwc3/dwc3-keystone.c index d2ed9523e77c..7310646aec33 100644 --- a/drivers/usb/dwc3/dwc3-keystone.c +++ b/drivers/usb/dwc3/dwc3-keystone.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * dwc3-keystone.c - Keystone Specific Glue layer * diff --git a/drivers/usb/dwc3/dwc3-of-simple.c b/drivers/usb/dwc3/dwc3-of-simple.c index ceea1619f8aa..3269a5f64823 100644 --- a/drivers/usb/dwc3/dwc3-of-simple.c +++ b/drivers/usb/dwc3/dwc3-of-simple.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * dwc3-of-simple.c - OF glue layer for simple integrations * diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index 3530795bbb8f..d86aaba9c618 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * dwc3-omap.c - OMAP Specific Glue layer * diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index 261ee805763f..dd893208761f 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * dwc3-pci.c - PCI Specific glue layer * diff --git a/drivers/usb/dwc3/dwc3-st.c 
b/drivers/usb/dwc3/dwc3-st.c index 505676fd3ba4..3d635df22568 100644 --- a/drivers/usb/dwc3/dwc3-st.c +++ b/drivers/usb/dwc3/dwc3-st.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /** * dwc3-st.c Support for dwc3 platform devices on ST Microelectronics platforms * diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index a4dceae4bdc7..7f5bdb318f89 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ep0.c - DesignWare USB3 DRD Controller Endpoint 0 Handling * diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 66dc99b9525a..8bba7ae12aae 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * gadget.c - DesignWare USB3 DRD Controller Gadget Framework Link * diff --git a/drivers/usb/dwc3/gadget.h b/drivers/usb/dwc3/gadget.h index 4a3227543255..fe502e542c39 100644 --- a/drivers/usb/dwc3/gadget.h +++ b/drivers/usb/dwc3/gadget.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * gadget.h - DesignWare USB3 DRD Gadget Header * diff --git a/drivers/usb/dwc3/host.c b/drivers/usb/dwc3/host.c index 76f0b0df37c1..8eddbe17e99e 100644 --- a/drivers/usb/dwc3/host.c +++ b/drivers/usb/dwc3/host.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * host.c - DesignWare USB3 DRD Controller Host Glue * diff --git a/drivers/usb/dwc3/io.h b/drivers/usb/dwc3/io.h index c69b06696824..085db0412ce0 100644 --- a/drivers/usb/dwc3/io.h +++ b/drivers/usb/dwc3/io.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * io.h - DesignWare USB3 DRD IO Header * diff --git a/drivers/usb/dwc3/trace.c b/drivers/usb/dwc3/trace.c index 6cd166412ad0..31acdc3ce436 100644 --- a/drivers/usb/dwc3/trace.c +++ b/drivers/usb/dwc3/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * trace.c - DesignWare USB3 DRD Controller Trace Support * diff --git a/drivers/usb/dwc3/trace.h b/drivers/usb/dwc3/trace.h index 6504b116da04..9e6e10b2ea5c 100644 --- a/drivers/usb/dwc3/trace.h +++ b/drivers/usb/dwc3/trace.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * trace.h - DesignWare USB3 DRD Controller Trace Support * diff --git a/drivers/usb/dwc3/ulpi.c b/drivers/usb/dwc3/ulpi.c index e87ce8e9edee..5a6edbfc9e88 100644 --- a/drivers/usb/dwc3/ulpi.c +++ b/drivers/usb/dwc3/ulpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * ulpi.c - DesignWare USB3 Controller's ULPI PHY interface * diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c index e2654443e8eb..d633c2abe5a4 100644 --- a/drivers/usb/early/ehci-dbgp.c +++ b/drivers/usb/early/ehci-dbgp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Standalone EHCI usb debug driver * diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c index 12fe70beae69..6901a08a6866 100644 --- a/drivers/usb/early/xhci-dbc.c +++ b/drivers/usb/early/xhci-dbc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * xhci-dbc.c - xHCI debug capability early driver * diff --git a/drivers/usb/early/xhci-dbc.h b/drivers/usb/early/xhci-dbc.h index 2df0f6e613fe..24118d3d6e07 100644 --- a/drivers/usb/early/xhci-dbc.h +++ b/drivers/usb/early/xhci-dbc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xhci-dbc.h - xHCI debug capability early driver * diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 1f1ab6fa3f9d..a9c3904dbe1b 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ 
-1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * composite.c - infrastructure for Composite USB Gadgets * diff --git a/drivers/usb/gadget/config.c b/drivers/usb/gadget/config.c index 17a6077b89a4..80a75c038696 100644 --- a/drivers/usb/gadget/config.c +++ b/drivers/usb/gadget/config.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * usb/gadget/config.c -- simplify building config descriptors * diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index aeb9f3c40521..4ddf063b9f47 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/drivers/usb/gadget/epautoconf.c b/drivers/usb/gadget/epautoconf.c index 30fdab0ae383..3a3b3027f234 100644 --- a/drivers/usb/gadget/epautoconf.c +++ b/drivers/usb/gadget/epautoconf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * epautoconf.c -- endpoint autoconfiguration for usb gadget drivers * diff --git a/drivers/usb/gadget/function/f_acm.c b/drivers/usb/gadget/function/f_acm.c index 5e3828d9dac7..5a2229784bd6 100644 --- a/drivers/usb/gadget/function/f_acm.c +++ b/drivers/usb/gadget/function/f_acm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_acm.c -- USB CDC serial (ACM) function driver * diff --git a/drivers/usb/gadget/function/f_ecm.c b/drivers/usb/gadget/function/f_ecm.c index 4c488d15b6f6..d2a83dd78cb9 100644 --- a/drivers/usb/gadget/function/f_ecm.c +++ b/drivers/usb/gadget/function/f_ecm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_ecm.c -- USB CDC Ethernet (ECM) link function driver * diff --git a/drivers/usb/gadget/function/f_eem.c b/drivers/usb/gadget/function/f_eem.c index 007ec6e4a5d4..bfb4cacc8b8e 100644 --- a/drivers/usb/gadget/function/f_eem.c +++ b/drivers/usb/gadget/function/f_eem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_eem.c -- USB CDC Ethernet (EEM) link function driver * diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 8b342587f8ad..eb4196fa9959 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_fs.c -- user mode file system API for USB composite function controllers * diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index d8e359ef6eb1..fc1d9282c1d9 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_hid.c -- USB HID function driver * diff --git a/drivers/usb/gadget/function/f_loopback.c b/drivers/usb/gadget/function/f_loopback.c index e70093835e14..2dea6e63ef4d 100644 --- a/drivers/usb/gadget/function/f_loopback.c +++ b/drivers/usb/gadget/function/f_loopback.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_loopback.c - USB peripheral loopback configuration driver * diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 5153e29870c3..697224237976 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* * f_mass_storage.c -- Mass Storage USB Composite Function * diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 5d3d7941d2c2..09a69b2c8125 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ 
b/drivers/usb/gadget/function/f_midi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_midi.c -- USB MIDI class function driver * diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index 45b334ceaf2e..a45e6174a387 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_ncm.c -- USB CDC Network (NCM) link function driver * diff --git a/drivers/usb/gadget/function/f_obex.c b/drivers/usb/gadget/function/f_obex.c index d43e86cea74f..34e81d888b24 100644 --- a/drivers/usb/gadget/function/f_obex.c +++ b/drivers/usb/gadget/function/f_obex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_obex.c -- USB CDC OBEX function driver * diff --git a/drivers/usb/gadget/function/f_phonet.c b/drivers/usb/gadget/function/f_phonet.c index 710b6885c55b..b0c2ad17e3e3 100644 --- a/drivers/usb/gadget/function/f_phonet.c +++ b/drivers/usb/gadget/function/f_phonet.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f_phonet.c -- USB CDC Phonet function * diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index ea0da35a44e2..fff806d1e4e9 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_printer.c - USB printer function driver * diff --git a/drivers/usb/gadget/function/f_rndis.c b/drivers/usb/gadget/function/f_rndis.c index c7c5b3ce1d98..355e2308bd7a 100644 --- a/drivers/usb/gadget/function/f_rndis.c +++ b/drivers/usb/gadget/function/f_rndis.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_rndis.c -- RNDIS link function driver * diff --git a/drivers/usb/gadget/function/f_serial.c b/drivers/usb/gadget/function/f_serial.c index cb00ada21d9c..9dae136d81e4 100644 --- a/drivers/usb/gadget/function/f_serial.c +++ b/drivers/usb/gadget/function/f_serial.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_serial.c - generic USB serial function driver * diff --git a/drivers/usb/gadget/function/f_sourcesink.c b/drivers/usb/gadget/function/f_sourcesink.c index 8784fa12ea2c..4581b172ced7 100644 --- a/drivers/usb/gadget/function/f_sourcesink.c +++ b/drivers/usb/gadget/function/f_sourcesink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_sourcesink.c - USB peripheral source/sink configuration driver * diff --git a/drivers/usb/gadget/function/f_subset.c b/drivers/usb/gadget/function/f_subset.c index 434b983f3b4c..8e13415cf91e 100644 --- a/drivers/usb/gadget/function/f_subset.c +++ b/drivers/usb/gadget/function/f_subset.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_subset.c -- "CDC Subset" Ethernet link function driver * diff --git a/drivers/usb/gadget/function/f_tcm.c b/drivers/usb/gadget/function/f_tcm.c index c9d741dfeff4..e4e86705a15e 100644 --- a/drivers/usb/gadget/function/f_tcm.c +++ b/drivers/usb/gadget/function/f_tcm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Target based USB-Gadget * * UAS protocol handling, target callbacks, configfs handling, diff --git a/drivers/usb/gadget/function/f_uac1.c b/drivers/usb/gadget/function/f_uac1.c index 29efbedc91f9..9dc33cbcc06c 100644 --- a/drivers/usb/gadget/function/f_uac1.c +++ b/drivers/usb/gadget/function/f_uac1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_uac1.c -- USB Audio Class 1.0 Function (using u_audio API) * diff --git a/drivers/usb/gadget/function/f_uac1_legacy.c 
b/drivers/usb/gadget/function/f_uac1_legacy.c index 5d229e72912e..83d33ee3c40a 100644 --- a/drivers/usb/gadget/function/f_uac1_legacy.c +++ b/drivers/usb/gadget/function/f_uac1_legacy.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_audio.c -- USB Audio class function driver * diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index f05c3f3e6103..452b250be960 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_uac2.c -- USB Audio Class 2.0 Function * diff --git a/drivers/usb/gadget/function/f_uvc.c b/drivers/usb/gadget/function/f_uvc.c index f8a1881609a2..67af76361e7f 100644 --- a/drivers/usb/gadget/function/f_uvc.c +++ b/drivers/usb/gadget/function/f_uvc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * uvc_gadget.c -- USB Video Class Gadget driver * diff --git a/drivers/usb/gadget/function/f_uvc.h b/drivers/usb/gadget/function/f_uvc.h index d0a73bdcbba1..b78d1af0aa9c 100644 --- a/drivers/usb/gadget/function/f_uvc.h +++ b/drivers/usb/gadget/function/f_uvc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * f_uvc.h -- USB Video Class Gadget driver * diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index d6341045c631..ac8a5476d968 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * RNDIS MSG parser * diff --git a/drivers/usb/gadget/function/rndis.h b/drivers/usb/gadget/function/rndis.h index 21e0430ffb98..f3bbd224cc16 100644 --- a/drivers/usb/gadget/function/rndis.h +++ b/drivers/usb/gadget/function/rndis.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * RNDIS Definitions for Remote NDIS * diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c index 8fbf6861690d..1beb7ce507ce 100644 --- a/drivers/usb/gadget/function/storage_common.c +++ b/drivers/usb/gadget/function/storage_common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * storage_common.c -- Common definitions for mass storage functionality * diff --git a/drivers/usb/gadget/function/u_audio.c b/drivers/usb/gadget/function/u_audio.c index 3971bbab88bd..efc5a918842b 100644 --- a/drivers/usb/gadget/function/u_audio.c +++ b/drivers/usb/gadget/function/u_audio.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_audio.c -- interface to USB gadget "ALSA sound card" utilities * diff --git a/drivers/usb/gadget/function/u_audio.h b/drivers/usb/gadget/function/u_audio.h index 07e13784cbb8..fae06fa2ef5d 100644 --- a/drivers/usb/gadget/function/u_audio.h +++ b/drivers/usb/gadget/function/u_audio.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_audio.h -- interface to USB gadget "ALSA sound card" utilities * diff --git a/drivers/usb/gadget/function/u_ecm.h b/drivers/usb/gadget/function/u_ecm.h index 262cc03cc2c0..320cd890bb0e 100644 --- a/drivers/usb/gadget/function/u_ecm.h +++ b/drivers/usb/gadget/function/u_ecm.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_ecm.h * diff --git a/drivers/usb/gadget/function/u_eem.h b/drivers/usb/gadget/function/u_eem.h index e3ae97874c4f..b756664f9c5e 100644 --- a/drivers/usb/gadget/function/u_eem.h +++ b/drivers/usb/gadget/function/u_eem.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_eem.h * diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c 
index bdbc3fdc7c4f..247eb69cb99b 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_ether.c -- Ethernet-over-USB link layer utilities for Gadget stack * diff --git a/drivers/usb/gadget/function/u_ether.h b/drivers/usb/gadget/function/u_ether.h index c77145bd6b5b..015084eda722 100644 --- a/drivers/usb/gadget/function/u_ether.h +++ b/drivers/usb/gadget/function/u_ether.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_ether.h -- interface to USB gadget "ethernet link" utilities * diff --git a/drivers/usb/gadget/function/u_ether_configfs.h b/drivers/usb/gadget/function/u_ether_configfs.h index e4c3f84af4c3..0f303fc288fa 100644 --- a/drivers/usb/gadget/function/u_ether_configfs.h +++ b/drivers/usb/gadget/function/u_ether_configfs.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_ether_configfs.h * diff --git a/drivers/usb/gadget/function/u_fs.h b/drivers/usb/gadget/function/u_fs.h index 79f70ebf85dc..79a585ce51c1 100644 --- a/drivers/usb/gadget/function/u_fs.h +++ b/drivers/usb/gadget/function/u_fs.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_fs.h * diff --git a/drivers/usb/gadget/function/u_gether.h b/drivers/usb/gadget/function/u_gether.h index d4078426ba5d..b9643d83b5cc 100644 --- a/drivers/usb/gadget/function/u_gether.h +++ b/drivers/usb/gadget/function/u_gether.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_gether.h * diff --git a/drivers/usb/gadget/function/u_hid.h b/drivers/usb/gadget/function/u_hid.h index aaa0e368a159..5c9cae57b41a 100644 --- a/drivers/usb/gadget/function/u_hid.h +++ b/drivers/usb/gadget/function/u_hid.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_hid.h * diff --git a/drivers/usb/gadget/function/u_midi.h b/drivers/usb/gadget/function/u_midi.h index 22510189758e..0b3e14bb9021 100644 --- a/drivers/usb/gadget/function/u_midi.h +++ b/drivers/usb/gadget/function/u_midi.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_midi.h * diff --git a/drivers/usb/gadget/function/u_ncm.h b/drivers/usb/gadget/function/u_ncm.h index ce0f3a78ca13..e90054039f40 100644 --- a/drivers/usb/gadget/function/u_ncm.h +++ b/drivers/usb/gadget/function/u_ncm.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_ncm.h * diff --git a/drivers/usb/gadget/function/u_phonet.h b/drivers/usb/gadget/function/u_phonet.h index 98ced18779ea..5387ed46f415 100644 --- a/drivers/usb/gadget/function/u_phonet.h +++ b/drivers/usb/gadget/function/u_phonet.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_phonet.h - interface to Phonet * diff --git a/drivers/usb/gadget/function/u_printer.h b/drivers/usb/gadget/function/u_printer.h index 8d30b7577f87..65737e58259a 100644 --- a/drivers/usb/gadget/function/u_printer.h +++ b/drivers/usb/gadget/function/u_printer.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_printer.h * diff --git a/drivers/usb/gadget/function/u_rndis.h b/drivers/usb/gadget/function/u_rndis.h index efdb7ac381d9..87d365d9cf5d 100644 --- a/drivers/usb/gadget/function/u_rndis.h +++ b/drivers/usb/gadget/function/u_rndis.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_rndis.h * diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c index 961457ef5a1c..5188c4842f86 100644 --- a/drivers/usb/gadget/function/u_serial.c +++ b/drivers/usb/gadget/function/u_serial.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_serial.c - utilities 
for USB gadget "serial port"/TTY support * diff --git a/drivers/usb/gadget/function/u_serial.h b/drivers/usb/gadget/function/u_serial.h index c20210c0babd..44617b205bc7 100644 --- a/drivers/usb/gadget/function/u_serial.h +++ b/drivers/usb/gadget/function/u_serial.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_serial.h - interface to USB gadget "serial port"/TTY utilities * diff --git a/drivers/usb/gadget/function/u_tcm.h b/drivers/usb/gadget/function/u_tcm.h index 0bd751e0483f..1ec6f702d400 100644 --- a/drivers/usb/gadget/function/u_tcm.h +++ b/drivers/usb/gadget/function/u_tcm.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_tcm.h * diff --git a/drivers/usb/gadget/function/u_uac1.h b/drivers/usb/gadget/function/u_uac1.h index 6f188fd8633f..84e05cc42659 100644 --- a/drivers/usb/gadget/function/u_uac1.h +++ b/drivers/usb/gadget/function/u_uac1.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_uac1.h - Utility definitions for UAC1 function * diff --git a/drivers/usb/gadget/function/u_uac1_legacy.c b/drivers/usb/gadget/function/u_uac1_legacy.c index fa4684a1c54c..0f66c4b4f772 100644 --- a/drivers/usb/gadget/function/u_uac1_legacy.c +++ b/drivers/usb/gadget/function/u_uac1_legacy.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_uac1.c -- ALSA audio utilities for Gadget stack * diff --git a/drivers/usb/gadget/function/u_uac1_legacy.h b/drivers/usb/gadget/function/u_uac1_legacy.h index d715b1af56a4..ad77043567cd 100644 --- a/drivers/usb/gadget/function/u_uac1_legacy.h +++ b/drivers/usb/gadget/function/u_uac1_legacy.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * u_uac1.h -- interface to USB gadget "ALSA AUDIO" utilities * diff --git a/drivers/usb/gadget/function/u_uac2.h b/drivers/usb/gadget/function/u_uac2.h index 19eeb83538a5..6ad31b29df8a 100644 --- a/drivers/usb/gadget/function/u_uac2.h +++ b/drivers/usb/gadget/function/u_uac2.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_uac2.h * diff --git a/drivers/usb/gadget/function/u_uvc.h b/drivers/usb/gadget/function/u_uvc.h index 4676b60a5063..ec5b5e6839b3 100644 --- a/drivers/usb/gadget/function/u_uvc.h +++ b/drivers/usb/gadget/function/u_uvc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_uvc.h * diff --git a/drivers/usb/gadget/function/uvc.h b/drivers/usb/gadget/function/uvc.h index 11d70dead32b..f6388ddad19f 100644 --- a/drivers/usb/gadget/function/uvc.h +++ b/drivers/usb/gadget/function/uvc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * uvc_gadget.h -- USB Video Class Gadget driver * diff --git a/drivers/usb/gadget/function/uvc_configfs.c b/drivers/usb/gadget/function/uvc_configfs.c index 844cb738bafd..66f1312d50e7 100644 --- a/drivers/usb/gadget/function/uvc_configfs.c +++ b/drivers/usb/gadget/function/uvc_configfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * uvc_configfs.c * diff --git a/drivers/usb/gadget/function/uvc_configfs.h b/drivers/usb/gadget/function/uvc_configfs.h index 085e67be7c71..f604bcb241ea 100644 --- a/drivers/usb/gadget/function/uvc_configfs.h +++ b/drivers/usb/gadget/function/uvc_configfs.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * uvc_configfs.h * diff --git a/drivers/usb/gadget/function/uvc_queue.c b/drivers/usb/gadget/function/uvc_queue.c index 6377e9fee6e5..6c819d9fa7af 100644 --- a/drivers/usb/gadget/function/uvc_queue.c +++ b/drivers/usb/gadget/function/uvc_queue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * uvc_queue.c -- USB Video Class driver - Buffers 
management * diff --git a/drivers/usb/gadget/function/uvc_v4l2.c b/drivers/usb/gadget/function/uvc_v4l2.c index 66124024278b..c1fb5249f4a4 100644 --- a/drivers/usb/gadget/function/uvc_v4l2.c +++ b/drivers/usb/gadget/function/uvc_v4l2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * uvc_v4l2.c -- USB Video Class Gadget driver * diff --git a/drivers/usb/gadget/function/uvc_v4l2.h b/drivers/usb/gadget/function/uvc_v4l2.h index ad6ca0671740..20cf417a56c0 100644 --- a/drivers/usb/gadget/function/uvc_v4l2.h +++ b/drivers/usb/gadget/function/uvc_v4l2.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * uvc_v4l2.h -- USB Video Class Gadget driver * diff --git a/drivers/usb/gadget/function/uvc_video.c b/drivers/usb/gadget/function/uvc_video.c index 0f01c04d7cbd..29d81ac02391 100644 --- a/drivers/usb/gadget/function/uvc_video.c +++ b/drivers/usb/gadget/function/uvc_video.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * uvc_video.c -- USB Video Class Gadget driver * diff --git a/drivers/usb/gadget/function/uvc_video.h b/drivers/usb/gadget/function/uvc_video.h index ef00f06fa00b..1c48a8bdca02 100644 --- a/drivers/usb/gadget/function/uvc_video.h +++ b/drivers/usb/gadget/function/uvc_video.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * uvc_video.h -- USB Video Class Gadget driver * diff --git a/drivers/usb/gadget/functions.c b/drivers/usb/gadget/functions.c index b13f839e7368..203361a64212 100644 --- a/drivers/usb/gadget/functions.c +++ b/drivers/usb/gadget/functions.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/drivers/usb/gadget/legacy/acm_ms.c b/drivers/usb/gadget/legacy/acm_ms.c index c39de65a448b..eed2cb6483b6 100644 --- a/drivers/usb/gadget/legacy/acm_ms.c +++ b/drivers/usb/gadget/legacy/acm_ms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * acm_ms.c -- Composite driver, with ACM and mass storage support * diff --git a/drivers/usb/gadget/legacy/audio.c b/drivers/usb/gadget/legacy/audio.c index 1f5cdbe162df..d0bffaa0bc09 100644 --- a/drivers/usb/gadget/legacy/audio.c +++ b/drivers/usb/gadget/legacy/audio.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * audio.c -- Audio gadget driver * diff --git a/drivers/usb/gadget/legacy/cdc2.c b/drivers/usb/gadget/legacy/cdc2.c index 51c08682de84..074c0d4efcf9 100644 --- a/drivers/usb/gadget/legacy/cdc2.c +++ b/drivers/usb/gadget/legacy/cdc2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * cdc2.c -- CDC Composite driver, with ECM and ACM support * diff --git a/drivers/usb/gadget/legacy/dbgp.c b/drivers/usb/gadget/legacy/dbgp.c index 99ca3dabc4f3..e1d566c9918a 100644 --- a/drivers/usb/gadget/legacy/dbgp.c +++ b/drivers/usb/gadget/legacy/dbgp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * dbgp.c -- EHCI Debug Port device gadget * diff --git a/drivers/usb/gadget/legacy/ether.c b/drivers/usb/gadget/legacy/ether.c index 25a2c2e48592..8eb0043f6697 100644 --- a/drivers/usb/gadget/legacy/ether.c +++ b/drivers/usb/gadget/legacy/ether.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * ether.c -- Ethernet gadget driver, with CDC and non-CDC options * diff --git a/drivers/usb/gadget/legacy/g_ffs.c b/drivers/usb/gadget/legacy/g_ffs.c index 6da7316f8e87..95db302cec7d 100644 --- a/drivers/usb/gadget/legacy/g_ffs.c +++ b/drivers/usb/gadget/legacy/g_ffs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * g_ffs.c -- user mode file system API for USB composite function controllers * diff --git 
a/drivers/usb/gadget/legacy/gmidi.c b/drivers/usb/gadget/legacy/gmidi.c index 0bf39c3ccdb1..c03674b02718 100644 --- a/drivers/usb/gadget/legacy/gmidi.c +++ b/drivers/usb/gadget/legacy/gmidi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * gmidi.c -- USB MIDI Gadget Driver * diff --git a/drivers/usb/gadget/legacy/hid.c b/drivers/usb/gadget/legacy/hid.c index a71a884f79fc..c9fb9a3c034f 100644 --- a/drivers/usb/gadget/legacy/hid.c +++ b/drivers/usb/gadget/legacy/hid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * hid.c -- HID Composite driver * diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 5c28bee327e1..ae4fe683f7e0 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * inode.c -- user mode filesystem api for usb gadget controllers * diff --git a/drivers/usb/gadget/legacy/mass_storage.c b/drivers/usb/gadget/legacy/mass_storage.c index fcba59782f26..3700cd272d75 100644 --- a/drivers/usb/gadget/legacy/mass_storage.c +++ b/drivers/usb/gadget/legacy/mass_storage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * mass_storage.c -- Mass Storage USB Gadget * diff --git a/drivers/usb/gadget/legacy/multi.c b/drivers/usb/gadget/legacy/multi.c index a70a406580ea..7dc276c68b54 100644 --- a/drivers/usb/gadget/legacy/multi.c +++ b/drivers/usb/gadget/legacy/multi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * multi.c -- Multifunction Composite driver * diff --git a/drivers/usb/gadget/legacy/ncm.c b/drivers/usb/gadget/legacy/ncm.c index 0aba68253e3d..7bfd306ea1ee 100644 --- a/drivers/usb/gadget/legacy/ncm.c +++ b/drivers/usb/gadget/legacy/ncm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * ncm.c -- NCM gadget driver * diff --git a/drivers/usb/gadget/legacy/nokia.c b/drivers/usb/gadget/legacy/nokia.c index b1e535f4022e..f10dd6e19cc9 100644 --- a/drivers/usb/gadget/legacy/nokia.c +++ b/drivers/usb/gadget/legacy/nokia.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * nokia.c -- Nokia Composite Gadget Driver * diff --git a/drivers/usb/gadget/legacy/printer.c b/drivers/usb/gadget/legacy/printer.c index 4c9cfff34a03..6e1eef41ad86 100644 --- a/drivers/usb/gadget/legacy/printer.c +++ b/drivers/usb/gadget/legacy/printer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * printer.c -- Printer gadget driver * diff --git a/drivers/usb/gadget/legacy/serial.c b/drivers/usb/gadget/legacy/serial.c index 9d89adce756d..e84cb9f97b5d 100644 --- a/drivers/usb/gadget/legacy/serial.c +++ b/drivers/usb/gadget/legacy/serial.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * serial.c -- USB gadget serial driver * diff --git a/drivers/usb/gadget/legacy/tcm_usb_gadget.c b/drivers/usb/gadget/legacy/tcm_usb_gadget.c index 0b0bb98319cd..1089cb118b66 100644 --- a/drivers/usb/gadget/legacy/tcm_usb_gadget.c +++ b/drivers/usb/gadget/legacy/tcm_usb_gadget.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Target based USB-Gadget * * UAS protocol handling, target callbacks, configfs handling, diff --git a/drivers/usb/gadget/legacy/webcam.c b/drivers/usb/gadget/legacy/webcam.c index 82c13fce9232..a3929a38ede9 100644 --- a/drivers/usb/gadget/legacy/webcam.c +++ b/drivers/usb/gadget/legacy/webcam.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * webcam.c -- USB webcam gadget driver * diff --git a/drivers/usb/gadget/legacy/zero.c b/drivers/usb/gadget/legacy/zero.c index 3acc589dae98..6c77940091c5 100644 --- 
a/drivers/usb/gadget/legacy/zero.c +++ b/drivers/usb/gadget/legacy/zero.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * zero.c -- Gadget Zero, for USB development * diff --git a/drivers/usb/gadget/u_f.c b/drivers/usb/gadget/u_f.c index 18839732c840..f52fb321d266 100644 --- a/drivers/usb/gadget/u_f.c +++ b/drivers/usb/gadget/u_f.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_f.c -- USB function utilities for Gadget stack * diff --git a/drivers/usb/gadget/u_f.h b/drivers/usb/gadget/u_f.h index 7d53a4773d1a..35aaaaa80926 100644 --- a/drivers/usb/gadget/u_f.h +++ b/drivers/usb/gadget/u_f.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_f.h * diff --git a/drivers/usb/gadget/u_os_desc.h b/drivers/usb/gadget/u_os_desc.h index 947b7ddff691..cfa53a204de1 100644 --- a/drivers/usb/gadget/u_os_desc.h +++ b/drivers/usb/gadget/u_os_desc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * u_os_desc.h * diff --git a/drivers/usb/gadget/udc/amd5536udc.h b/drivers/usb/gadget/udc/amd5536udc.h index 4fe22d432af2..5a92388ef8bb 100644 --- a/drivers/usb/gadget/udc/amd5536udc.h +++ b/drivers/usb/gadget/udc/amd5536udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * amd5536.h -- header for AMD 5536 UDC high/full speed USB device controller * diff --git a/drivers/usb/gadget/udc/amd5536udc_pci.c b/drivers/usb/gadget/udc/amd5536udc_pci.c index 57a13f080a79..cf9117e84534 100644 --- a/drivers/usb/gadget/udc/amd5536udc_pci.c +++ b/drivers/usb/gadget/udc/amd5536udc_pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * amd5536udc_pci.c -- AMD 5536 UDC high/full speed USB device controller * diff --git a/drivers/usb/gadget/udc/at91_udc.c b/drivers/usb/gadget/udc/at91_udc.c index 8bc78418d40e..972f78409df7 100644 --- a/drivers/usb/gadget/udc/at91_udc.c +++ b/drivers/usb/gadget/udc/at91_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * at91_udc -- driver for at91-series USB peripheral controller * diff --git a/drivers/usb/gadget/udc/at91_udc.h b/drivers/usb/gadget/udc/at91_udc.h index 9bbe72764f31..9581a868032e 100644 --- a/drivers/usb/gadget/udc/at91_udc.h +++ b/drivers/usb/gadget/udc/at91_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2004 by Thomas Rathbone, HP Labs * Copyright (C) 2005 by Ivan Kokshaysky diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index a884c022df7a..12543decf9ab 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Driver for the Atmel USBA high speed USB device controller * diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.h b/drivers/usb/gadget/udc/atmel_usba_udc.h index f8ebe0389bd4..10df5e4aaeb2 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.h +++ b/drivers/usb/gadget/udc/atmel_usba_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Driver for the Atmel USBA high speed USB device controller * diff --git a/drivers/usb/gadget/udc/bcm63xx_udc.c b/drivers/usb/gadget/udc/bcm63xx_udc.c index f78503203f42..403cb339fd7b 100644 --- a/drivers/usb/gadget/udc/bcm63xx_udc.c +++ b/drivers/usb/gadget/udc/bcm63xx_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bcm63xx_udc.c -- BCM63xx UDC high/full speed USB device controller * diff --git a/drivers/usb/gadget/udc/bdc/bdc.h b/drivers/usb/gadget/udc/bdc/bdc.h index 6df0352cdc50..960620bccc25 100644 --- a/drivers/usb/gadget/udc/bdc/bdc.h +++ 
b/drivers/usb/gadget/udc/bdc/bdc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc.h - header for the BRCM BDC USB3.0 device controller * diff --git a/drivers/usb/gadget/udc/bdc/bdc_cmd.c b/drivers/usb/gadget/udc/bdc/bdc_cmd.c index 6e920f1dce02..ad3240375f87 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_cmd.c +++ b/drivers/usb/gadget/udc/bdc/bdc_cmd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_cmd.c - BRCM BDC USB3.0 device controller * diff --git a/drivers/usb/gadget/udc/bdc/bdc_cmd.h b/drivers/usb/gadget/udc/bdc/bdc_cmd.h index 61d0e3bf9853..64648fbef233 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_cmd.h +++ b/drivers/usb/gadget/udc/bdc/bdc_cmd.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_cmd.h - header for the BDC debug functions * diff --git a/drivers/usb/gadget/udc/bdc/bdc_core.c b/drivers/usb/gadget/udc/bdc/bdc_core.c index 7a8af4b916cf..2ab6a6b45f9e 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_core.c +++ b/drivers/usb/gadget/udc/bdc/bdc_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_core.c - BRCM BDC USB3.0 device controller core operations * diff --git a/drivers/usb/gadget/udc/bdc/bdc_dbg.c b/drivers/usb/gadget/udc/bdc/bdc_dbg.c index ac98f6f681b7..11216cd6cb94 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_dbg.c +++ b/drivers/usb/gadget/udc/bdc/bdc_dbg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_dbg.c - BRCM BDC USB3.0 device controller debug functions * diff --git a/drivers/usb/gadget/udc/bdc/bdc_dbg.h b/drivers/usb/gadget/udc/bdc/bdc_dbg.h index 338a6c701315..f62d59b30a3e 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_dbg.h +++ b/drivers/usb/gadget/udc/bdc/bdc_dbg.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_dbg.h - header for the BDC debug functions * diff --git a/drivers/usb/gadget/udc/bdc/bdc_ep.c b/drivers/usb/gadget/udc/bdc/bdc_ep.c index bfd8f7ade935..e9fda8e6e87d 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_ep.c +++ b/drivers/usb/gadget/udc/bdc/bdc_ep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_ep.c - BRCM BDC USB3.0 device controller endpoint related functions * diff --git a/drivers/usb/gadget/udc/bdc/bdc_ep.h b/drivers/usb/gadget/udc/bdc/bdc_ep.h index 8a6b36cbf2ea..db52fc78c8bf 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_ep.h +++ b/drivers/usb/gadget/udc/bdc/bdc_ep.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_ep.h - header for the BDC debug functions * diff --git a/drivers/usb/gadget/udc/bdc/bdc_pci.c b/drivers/usb/gadget/udc/bdc/bdc_pci.c index 02968842b359..8eca33c545b1 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_pci.c +++ b/drivers/usb/gadget/udc/bdc/bdc_pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_pci.c - BRCM BDC USB3.0 device controller PCI interface file. 
* diff --git a/drivers/usb/gadget/udc/bdc/bdc_udc.c b/drivers/usb/gadget/udc/bdc/bdc_udc.c index c84346146456..492b8b872d2c 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_udc.c +++ b/drivers/usb/gadget/udc/bdc/bdc_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * bdc_udc.c - BRCM BDC USB3.0 device controller gagdet ops * diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 54b02eca0456..4fb5ca05278e 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * udc.c - Core UDC Framework * diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 9dd26ff2a4ff..b5ded16d1f42 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * dummy_hcd.c -- Dummy/Loopback USB host and device emulator driver. * diff --git a/drivers/usb/gadget/udc/fotg210-udc.c b/drivers/usb/gadget/udc/fotg210-udc.c index 78d0204e3e20..6a7e0e26a1d1 100644 --- a/drivers/usb/gadget/udc/fotg210-udc.c +++ b/drivers/usb/gadget/udc/fotg210-udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * FOTG210 UDC Driver supports Bulk transfer so far * diff --git a/drivers/usb/gadget/udc/fotg210.h b/drivers/usb/gadget/udc/fotg210.h index bbf991bcbe7c..2c825a884ebc 100644 --- a/drivers/usb/gadget/udc/fotg210.h +++ b/drivers/usb/gadget/udc/fotg210.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Faraday FOTG210 USB OTG controller * diff --git a/drivers/usb/gadget/udc/fsl_mxc_udc.c b/drivers/usb/gadget/udc/fsl_mxc_udc.c index f16e149c5b3e..089fbfc44da7 100644 --- a/drivers/usb/gadget/udc/fsl_mxc_udc.c +++ b/drivers/usb/gadget/udc/fsl_mxc_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2009 * Guennadi Liakhovetski, DENX Software Engineering, diff --git a/drivers/usb/gadget/udc/fsl_qe_udc.c b/drivers/usb/gadget/udc/fsl_qe_udc.c index a3e72d690eef..228577c6c180 100644 --- a/drivers/usb/gadget/udc/fsl_qe_udc.c +++ b/drivers/usb/gadget/udc/fsl_qe_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * driver/usb/gadget/fsl_qe_udc.c * diff --git a/drivers/usb/gadget/udc/fsl_qe_udc.h b/drivers/usb/gadget/udc/fsl_qe_udc.h index 7026919fc901..2b1aec81c397 100644 --- a/drivers/usb/gadget/udc/fsl_qe_udc.h +++ b/drivers/usb/gadget/udc/fsl_qe_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * drivers/usb/gadget/qe_udc.h * diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c index 6f2f71c054be..9be768d04cfa 100644 --- a/drivers/usb/gadget/udc/fsl_udc_core.c +++ b/drivers/usb/gadget/udc/fsl_udc_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2004-2007,2011-2012 Freescale Semiconductor, Inc. * All rights reserved. diff --git a/drivers/usb/gadget/udc/fsl_usb2_udc.h b/drivers/usb/gadget/udc/fsl_usb2_udc.h index e92b8408b6f6..e5a25ef5803b 100644 --- a/drivers/usb/gadget/udc/fsl_usb2_udc.h +++ b/drivers/usb/gadget/udc/fsl_usb2_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2004,2012 Freescale Semiconductor, Inc * All rights reserved. 
diff --git a/drivers/usb/gadget/udc/fusb300_udc.c b/drivers/usb/gadget/udc/fusb300_udc.c index e0c1b0099265..e05946c421ed 100644 --- a/drivers/usb/gadget/udc/fusb300_udc.c +++ b/drivers/usb/gadget/udc/fusb300_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Fusb300 UDC (USB gadget) * diff --git a/drivers/usb/gadget/udc/fusb300_udc.h b/drivers/usb/gadget/udc/fusb300_udc.h index ad39f892d200..4b055ef31cc1 100644 --- a/drivers/usb/gadget/udc/fusb300_udc.h +++ b/drivers/usb/gadget/udc/fusb300_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Fusb300 UDC (USB gadget) * diff --git a/drivers/usb/gadget/udc/goku_udc.c b/drivers/usb/gadget/udc/goku_udc.c index cccad51eb999..11dfcc82abdf 100644 --- a/drivers/usb/gadget/udc/goku_udc.c +++ b/drivers/usb/gadget/udc/goku_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Toshiba TC86C001 ("Goku-S") USB Device Controller driver * diff --git a/drivers/usb/gadget/udc/goku_udc.h b/drivers/usb/gadget/udc/goku_udc.h index 86d2adafe149..6ac811feeee4 100644 --- a/drivers/usb/gadget/udc/goku_udc.h +++ b/drivers/usb/gadget/udc/goku_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Toshiba TC86C001 ("Goku-S") USB Device Controller driver * diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c index 48117a539146..ed818b118a82 100644 --- a/drivers/usb/gadget/udc/gr_udc.c +++ b/drivers/usb/gadget/udc/gr_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * USB Peripheral Controller driver for Aeroflex Gaisler GRUSBDC. * diff --git a/drivers/usb/gadget/udc/gr_udc.h b/drivers/usb/gadget/udc/gr_udc.h index 4297c4e8021f..6c08ddf03521 100644 --- a/drivers/usb/gadget/udc/gr_udc.h +++ b/drivers/usb/gadget/udc/gr_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * USB Peripheral Controller driver for Aeroflex Gaisler GRUSBDC. * diff --git a/drivers/usb/gadget/udc/lpc32xx_udc.c b/drivers/usb/gadget/udc/lpc32xx_udc.c index 8f32b5ee7734..7dcd0904bf25 100644 --- a/drivers/usb/gadget/udc/lpc32xx_udc.c +++ b/drivers/usb/gadget/udc/lpc32xx_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * USB Gadget driver for LPC32xx * diff --git a/drivers/usb/gadget/udc/m66592-udc.c b/drivers/usb/gadget/udc/m66592-udc.c index 3b8dbed7e2e9..ca5c98226ad2 100644 --- a/drivers/usb/gadget/udc/m66592-udc.c +++ b/drivers/usb/gadget/udc/m66592-udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * M66592 UDC (USB gadget) * diff --git a/drivers/usb/gadget/udc/m66592-udc.h b/drivers/usb/gadget/udc/m66592-udc.h index 96d49d7bfb6b..4a62b4fda942 100644 --- a/drivers/usb/gadget/udc/m66592-udc.h +++ b/drivers/usb/gadget/udc/m66592-udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * M66592 UDC (USB gadget) * diff --git a/drivers/usb/gadget/udc/mv_u3d.h b/drivers/usb/gadget/udc/mv_u3d.h index e32a787ac373..4c7812429920 100644 --- a/drivers/usb/gadget/udc/mv_u3d.h +++ b/drivers/usb/gadget/udc/mv_u3d.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 Marvell International Ltd. All rights reserved. * diff --git a/drivers/usb/gadget/udc/mv_u3d_core.c b/drivers/usb/gadget/udc/mv_u3d_core.c index 772049afe166..6f336fe8bbef 100644 --- a/drivers/usb/gadget/udc/mv_u3d_core.c +++ b/drivers/usb/gadget/udc/mv_u3d_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 Marvell International Ltd. All rights reserved. 
* diff --git a/drivers/usb/gadget/udc/mv_udc.h b/drivers/usb/gadget/udc/mv_udc.h index be77f207dbaf..4acf7edf4d86 100644 --- a/drivers/usb/gadget/udc/mv_udc.h +++ b/drivers/usb/gadget/udc/mv_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2011 Marvell International Ltd. All rights reserved. * diff --git a/drivers/usb/gadget/udc/mv_udc_core.c b/drivers/usb/gadget/udc/mv_udc_core.c index 4103bf7cf52a..df4065cf5fcd 100644 --- a/drivers/usb/gadget/udc/mv_udc_core.c +++ b/drivers/usb/gadget/udc/mv_udc_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2011 Marvell International Ltd. All rights reserved. * Author: Chao Xie diff --git a/drivers/usb/gadget/udc/net2272.c b/drivers/usb/gadget/udc/net2272.c index 8f85a51bd2b3..a3018f93df58 100644 --- a/drivers/usb/gadget/udc/net2272.c +++ b/drivers/usb/gadget/udc/net2272.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for PLX NET2272 USB device controller * diff --git a/drivers/usb/gadget/udc/net2272.h b/drivers/usb/gadget/udc/net2272.h index 69bc9c3c6ce4..f0212cf042a2 100644 --- a/drivers/usb/gadget/udc/net2272.h +++ b/drivers/usb/gadget/udc/net2272.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * PLX NET2272 high/full speed USB device controller * diff --git a/drivers/usb/gadget/udc/net2280.c b/drivers/usb/gadget/udc/net2280.c index f608c1f85e61..a0b2ab0c04f7 100644 --- a/drivers/usb/gadget/udc/net2280.c +++ b/drivers/usb/gadget/udc/net2280.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for the PLX NET2280 USB device controller. * Specs and errata are available from . diff --git a/drivers/usb/gadget/udc/net2280.h b/drivers/usb/gadget/udc/net2280.h index 1088c3745999..18a881e7f93f 100644 --- a/drivers/usb/gadget/udc/net2280.h +++ b/drivers/usb/gadget/udc/net2280.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * NetChip 2280 high/full speed USB device controller. * Unlike many such controllers, this one talks PCI. diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c index f05ba6825bfe..5531ea492ed2 100644 --- a/drivers/usb/gadget/udc/omap_udc.c +++ b/drivers/usb/gadget/udc/omap_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * omap_udc.c -- for OMAP full speed udc; most chips support OTG. * diff --git a/drivers/usb/gadget/udc/pch_udc.c b/drivers/usb/gadget/udc/pch_udc.c index 84dcbcd756f0..cc24334504b8 100644 --- a/drivers/usb/gadget/udc/pch_udc.c +++ b/drivers/usb/gadget/udc/pch_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 LAPIS Semiconductor Co., Ltd. 
* diff --git a/drivers/usb/gadget/udc/pxa25x_udc.c b/drivers/usb/gadget/udc/pxa25x_udc.c index 974b778e033c..1f36abdfac96 100644 --- a/drivers/usb/gadget/udc/pxa25x_udc.c +++ b/drivers/usb/gadget/udc/pxa25x_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Intel PXA25x and IXP4xx on-chip full speed USB device controllers * diff --git a/drivers/usb/gadget/udc/pxa25x_udc.h b/drivers/usb/gadget/udc/pxa25x_udc.h index a458bec2536d..1532e7e71f99 100644 --- a/drivers/usb/gadget/udc/pxa25x_udc.h +++ b/drivers/usb/gadget/udc/pxa25x_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Intel PXA25x on-chip full speed USB device controller * diff --git a/drivers/usb/gadget/udc/pxa27x_udc.c b/drivers/usb/gadget/udc/pxa27x_udc.c index d48e239660c3..14606f340325 100644 --- a/drivers/usb/gadget/udc/pxa27x_udc.c +++ b/drivers/usb/gadget/udc/pxa27x_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Handles the Intel 27x USB Device Controller (UDC) * diff --git a/drivers/usb/gadget/udc/pxa27x_udc.h b/drivers/usb/gadget/udc/pxa27x_udc.h index cea2cb79b30c..cfdece686abe 100644 --- a/drivers/usb/gadget/udc/pxa27x_udc.h +++ b/drivers/usb/gadget/udc/pxa27x_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/drivers/usb/gadget/pxa27x_udc.h * Intel PXA27x on-chip full speed USB device controller diff --git a/drivers/usb/gadget/udc/r8a66597-udc.c b/drivers/usb/gadget/udc/r8a66597-udc.c index bb844b94df10..75c9e94ecd59 100644 --- a/drivers/usb/gadget/udc/r8a66597-udc.c +++ b/drivers/usb/gadget/udc/r8a66597-udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * R8A66597 UDC (USB gadget) * diff --git a/drivers/usb/gadget/udc/r8a66597-udc.h b/drivers/usb/gadget/udc/r8a66597-udc.h index 45c4b2df1785..0f6d41e61841 100644 --- a/drivers/usb/gadget/udc/r8a66597-udc.h +++ b/drivers/usb/gadget/udc/r8a66597-udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * R8A66597 UDC * diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index cb461b8f4509..3d4b885e7958 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Renesas USB3.0 Peripheral driver (USB gadget) * diff --git a/drivers/usb/gadget/udc/s3c-hsudc.c b/drivers/usb/gadget/udc/s3c-hsudc.c index 42587b738a1f..9707b945eef2 100644 --- a/drivers/usb/gadget/udc/s3c-hsudc.c +++ b/drivers/usb/gadget/udc/s3c-hsudc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* linux/drivers/usb/gadget/s3c-hsudc.c * * Copyright (c) 2010 Samsung Electronics Co., Ltd. 
diff --git a/drivers/usb/gadget/udc/s3c2410_udc.c b/drivers/usb/gadget/udc/s3c2410_udc.c index 394abd5d65c0..ed874cabd339 100644 --- a/drivers/usb/gadget/udc/s3c2410_udc.c +++ b/drivers/usb/gadget/udc/s3c2410_udc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/drivers/usb/gadget/s3c2410_udc.c * diff --git a/drivers/usb/gadget/udc/s3c2410_udc.h b/drivers/usb/gadget/udc/s3c2410_udc.h index 93bf225f1969..cfabc83c2244 100644 --- a/drivers/usb/gadget/udc/s3c2410_udc.h +++ b/drivers/usb/gadget/udc/s3c2410_udc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/drivers/usb/gadget/s3c2410_udc.h * Samsung on-chip full speed USB device controllers diff --git a/drivers/usb/gadget/udc/snps_udc_core.c b/drivers/usb/gadget/udc/snps_udc_core.c index 2f5e788dd978..a9569080c30d 100644 --- a/drivers/usb/gadget/udc/snps_udc_core.c +++ b/drivers/usb/gadget/udc/snps_udc_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * amd5536.c -- AMD 5536 UDC high/full speed USB device controller * diff --git a/drivers/usb/gadget/udc/snps_udc_plat.c b/drivers/usb/gadget/udc/snps_udc_plat.c index e8a5fdaee37d..800a35b48ab1 100644 --- a/drivers/usb/gadget/udc/snps_udc_plat.c +++ b/drivers/usb/gadget/udc/snps_udc_plat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * snps_udc_plat.c - Synopsys UDC Platform Driver * diff --git a/drivers/usb/gadget/udc/trace.c b/drivers/usb/gadget/udc/trace.c index 8c551ab91ad8..fbc139292245 100644 --- a/drivers/usb/gadget/udc/trace.c +++ b/drivers/usb/gadget/udc/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * trace.c - USB Gadget Framework Trace Support * diff --git a/drivers/usb/gadget/udc/trace.h b/drivers/usb/gadget/udc/trace.h index da29874b5366..06b162bcdb54 100644 --- a/drivers/usb/gadget/udc/trace.h +++ b/drivers/usb/gadget/udc/trace.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * udc.c - Core UDC Framework * diff --git a/drivers/usb/gadget/udc/udc-xilinx.c b/drivers/usb/gadget/udc/udc-xilinx.c index de207a90571e..374a75d68365 100644 --- a/drivers/usb/gadget/udc/udc-xilinx.c +++ b/drivers/usb/gadget/udc/udc-xilinx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Xilinx USB peripheral controller driver * diff --git a/drivers/usb/gadget/usbstring.c b/drivers/usb/gadget/usbstring.c index 73a4dfba0edb..6c7a4e0c45c8 100644 --- a/drivers/usb/gadget/usbstring.c +++ b/drivers/usb/gadget/usbstring.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1+ /* * Copyright (C) 2003 David Brownell * diff --git a/drivers/usb/host/bcma-hcd.c b/drivers/usb/host/bcma-hcd.c index 5f425c89faf1..0f595c630600 100644 --- a/drivers/usb/host/bcma-hcd.c +++ b/drivers/usb/host/bcma-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Broadcom specific Advanced Microcontroller Bus * Broadcom USB-core driver (BCMA bus glue) diff --git a/drivers/usb/host/ehci-atmel.c b/drivers/usb/host/ehci-atmel.c index 2a8b9bdc0e57..50beacc581de 100644 --- a/drivers/usb/host/ehci-atmel.c +++ b/drivers/usb/host/ehci-atmel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Driver for EHCI UHP on Atmel chips * diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c index cbb9b8e12c3c..7fb21d01b3d0 100644 --- a/drivers/usb/host/ehci-dbg.c +++ b/drivers/usb/host/ehci-dbg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2001-2002 by David Brownell * diff --git a/drivers/usb/host/ehci-exynos.c b/drivers/usb/host/ehci-exynos.c index 26b641100639..0a131659fc33 100644 --- 
a/drivers/usb/host/ehci-exynos.c +++ b/drivers/usb/host/ehci-exynos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * SAMSUNG EXYNOS USB HOST EHCI Controller * diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c index d025cc06dda7..7c4bb32230d2 100644 --- a/drivers/usb/host/ehci-fsl.c +++ b/drivers/usb/host/ehci-fsl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright 2005-2009 MontaVista Software, Inc. * Copyright 2008,2012,2015 Freescale Semiconductor, Inc. diff --git a/drivers/usb/host/ehci-fsl.h b/drivers/usb/host/ehci-fsl.h index 1a8a60a57cf2..21a6f10b5e3a 100644 --- a/drivers/usb/host/ehci-fsl.h +++ b/drivers/usb/host/ehci-fsl.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* Copyright (C) 2005-2010,2012 Freescale Semiconductor, Inc. * Copyright (c) 2005 MontaVista Software * diff --git a/drivers/usb/host/ehci-grlib.c b/drivers/usb/host/ehci-grlib.c index 21650044b09e..a8cffcf13451 100644 --- a/drivers/usb/host/ehci-grlib.c +++ b/drivers/usb/host/ehci-grlib.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for Aeroflex Gaisler GRLIB GRUSBHC EHCI host controller * diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index c560a01f4971..74d82148954d 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Enhanced Host Controller Interface (EHCI) driver for USB. * diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c index df169c8e7225..b94793b56270 100644 --- a/drivers/usb/host/ehci-hub.c +++ b/drivers/usb/host/ehci-hub.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2001-2004 by David Brownell * diff --git a/drivers/usb/host/ehci-mem.c b/drivers/usb/host/ehci-mem.c index 9b7e63977215..212d2042fb9f 100644 --- a/drivers/usb/host/ehci-mem.c +++ b/drivers/usb/host/ehci-mem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2001 by David Brownell * diff --git a/drivers/usb/host/ehci-mv.c b/drivers/usb/host/ehci-mv.c index 849806a75f1c..c9e15225a30f 100644 --- a/drivers/usb/host/ehci-mv.c +++ b/drivers/usb/host/ehci-mv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2011 Marvell International Ltd. All rights reserved. * Author: Chao Xie diff --git a/drivers/usb/host/ehci-mxc.c b/drivers/usb/host/ehci-mxc.c index c7a9b31eeaef..67adea97c26c 100644 --- a/drivers/usb/host/ehci-mxc.c +++ b/drivers/usb/host/ehci-mxc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2008 Sascha Hauer , Pengutronix * Copyright (c) 2009 Daniel Mack diff --git a/drivers/usb/host/ehci-omap.c b/drivers/usb/host/ehci-omap.c index 4d308533bc83..dd319d3219b6 100644 --- a/drivers/usb/host/ehci-omap.c +++ b/drivers/usb/host/ehci-omap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * ehci-omap.c - driver for USBHOST on OMAP3/4 processors * diff --git a/drivers/usb/host/ehci-orion.c b/drivers/usb/host/ehci-orion.c index 1aec87ec68df..199a6d2778dd 100644 --- a/drivers/usb/host/ehci-orion.c +++ b/drivers/usb/host/ehci-orion.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/host/ehci-orion.c * diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c index 93326974ff4b..f6015f6ca488 100644 --- a/drivers/usb/host/ehci-pci.c +++ b/drivers/usb/host/ehci-pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * EHCI HCD (Host Controller Driver) PCI Bus Glue. 
* diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c index a41acd661c46..080014197f25 100644 --- a/drivers/usb/host/ehci-platform.c +++ b/drivers/usb/host/ehci-platform.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic platform ehci driver * diff --git a/drivers/usb/host/ehci-pmcmsp.c b/drivers/usb/host/ehci-pmcmsp.c index 342816a7f8b1..9a05bff230bb 100644 --- a/drivers/usb/host/ehci-pmcmsp.c +++ b/drivers/usb/host/ehci-pmcmsp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PMC MSP EHCI (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ehci-ppc-of.c b/drivers/usb/host/ehci-ppc-of.c index 1a10c8d542ca..576f7d79ad4e 100644 --- a/drivers/usb/host/ehci-ppc-of.c +++ b/drivers/usb/host/ehci-ppc-of.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * EHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ehci-ps3.c b/drivers/usb/host/ehci-ps3.c index 7934ff9b35e1..c74066790e69 100644 --- a/drivers/usb/host/ehci-ps3.c +++ b/drivers/usb/host/ehci-ps3.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PS3 EHCI Host Controller driver * diff --git a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c index 8f3f055c05fa..c0074f212a09 100644 --- a/drivers/usb/host/ehci-q.c +++ b/drivers/usb/host/ehci-q.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2001-2004 by David Brownell * diff --git a/drivers/usb/host/ehci-sched.c b/drivers/usb/host/ehci-sched.c index 6bc6304672bc..ebbc2c60de89 100644 --- a/drivers/usb/host/ehci-sched.c +++ b/drivers/usb/host/ehci-sched.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2001-2004 by David Brownell * Copyright (c) 2003 Michal Sojka, for high-speed iso transfers diff --git a/drivers/usb/host/ehci-sh.c b/drivers/usb/host/ehci-sh.c index 5caf88d679e4..d565f24ca7f5 100644 --- a/drivers/usb/host/ehci-sh.c +++ b/drivers/usb/host/ehci-sh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * SuperH EHCI host controller driver * diff --git a/drivers/usb/host/ehci-spear.c b/drivers/usb/host/ehci-spear.c index 1f25c7985f5b..d12259d06f53 100644 --- a/drivers/usb/host/ehci-spear.c +++ b/drivers/usb/host/ehci-spear.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Driver for EHCI HCD on SPEAr SOC * diff --git a/drivers/usb/host/ehci-st.c b/drivers/usb/host/ehci-st.c index be4a2788fc58..336e9fa5274f 100644 --- a/drivers/usb/host/ehci-st.c +++ b/drivers/usb/host/ehci-st.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ST EHCI driver * diff --git a/drivers/usb/host/ehci-sysfs.c b/drivers/usb/host/ehci-sysfs.c index 5216f2b09d63..16669619cfc5 100644 --- a/drivers/usb/host/ehci-sysfs.c +++ b/drivers/usb/host/ehci-sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2007 by Alan Stern * diff --git a/drivers/usb/host/ehci-tegra.c b/drivers/usb/host/ehci-tegra.c index 9a3d7db5be57..fe8423e17877 100644 --- a/drivers/usb/host/ehci-tegra.c +++ b/drivers/usb/host/ehci-tegra.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * EHCI-compliant USB host controller driver for NVIDIA Tegra SoCs * diff --git a/drivers/usb/host/ehci-tilegx.c b/drivers/usb/host/ehci-tilegx.c index bdb93b6a356f..d41b3d217253 100644 --- a/drivers/usb/host/ehci-tilegx.c +++ b/drivers/usb/host/ehci-tilegx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright 2012 Tilera Corporation. All Rights Reserved. 
* diff --git a/drivers/usb/host/ehci-timer.c b/drivers/usb/host/ehci-timer.c index 0b6cdb723192..047a5b131717 100644 --- a/drivers/usb/host/ehci-timer.c +++ b/drivers/usb/host/ehci-timer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2012 by Alan Stern * diff --git a/drivers/usb/host/ehci-w90x900.c b/drivers/usb/host/ehci-w90x900.c index 63b9d0c67963..da2c99d3ba6b 100644 --- a/drivers/usb/host/ehci-w90x900.c +++ b/drivers/usb/host/ehci-w90x900.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/driver/usb/host/ehci-w90x900.c * diff --git a/drivers/usb/host/ehci-xilinx-of.c b/drivers/usb/host/ehci-xilinx-of.c index f54480850bb8..886b05678de9 100644 --- a/drivers/usb/host/ehci-xilinx-of.c +++ b/drivers/usb/host/ehci-xilinx-of.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * EHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h index a8e36170d8b8..1794d6254cfc 100644 --- a/drivers/usb/host/ehci.h +++ b/drivers/usb/host/ehci.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2001-2002 by David Brownell * diff --git a/drivers/usb/host/fhci-dbg.c b/drivers/usb/host/fhci-dbg.c index b58e7a60913a..9935f10ad407 100644 --- a/drivers/usb/host/fhci-dbg.c +++ b/drivers/usb/host/fhci-dbg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Freescale QUICC Engine USB Host Controller Driver * diff --git a/drivers/usb/host/fhci-hcd.c b/drivers/usb/host/fhci-hcd.c index 55a0ae6f2d74..763131134ab1 100644 --- a/drivers/usb/host/fhci-hcd.c +++ b/drivers/usb/host/fhci-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Freescale QUICC Engine USB Host Controller Driver * diff --git a/drivers/usb/host/fhci-hub.c b/drivers/usb/host/fhci-hub.c index 60d55eb3de0d..d50a9ca15830 100644 --- a/drivers/usb/host/fhci-hub.c +++ b/drivers/usb/host/fhci-hub.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Freescale QUICC Engine USB Host Controller Driver * diff --git a/drivers/usb/host/fhci-mem.c b/drivers/usb/host/fhci-mem.c index b0b88f57a5ac..532a5960ff48 100644 --- a/drivers/usb/host/fhci-mem.c +++ b/drivers/usb/host/fhci-mem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Freescale QUICC Engine USB Host Controller Driver * diff --git a/drivers/usb/host/fhci-q.c b/drivers/usb/host/fhci-q.c index 03be7494a476..664e1f98d68f 100644 --- a/drivers/usb/host/fhci-q.c +++ b/drivers/usb/host/fhci-q.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Freescale QUICC Engine USB Host Controller Driver * diff --git a/drivers/usb/host/fhci-sched.c b/drivers/usb/host/fhci-sched.c index 2f162faabbca..c8f3de90f464 100644 --- a/drivers/usb/host/fhci-sched.c +++ b/drivers/usb/host/fhci-sched.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Freescale QUICC Engine USB Host Controller Driver * diff --git a/drivers/usb/host/fhci-tds.c b/drivers/usb/host/fhci-tds.c index f82ad5df1b0d..fa54315064da 100644 --- a/drivers/usb/host/fhci-tds.c +++ b/drivers/usb/host/fhci-tds.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Freescale QUICC Engine USB Host Controller Driver * diff --git a/drivers/usb/host/fhci.h b/drivers/usb/host/fhci.h index 3fc82c1c3c73..257c04c8af0c 100644 --- a/drivers/usb/host/fhci.h +++ b/drivers/usb/host/fhci.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Freescale QUICC Engine USB Host Controller Driver * diff --git a/drivers/usb/host/fotg210-hcd.c b/drivers/usb/host/fotg210-hcd.c index 
33a4f7ed0d7d..302b66fc6cfa 100644 --- a/drivers/usb/host/fotg210-hcd.c +++ b/drivers/usb/host/fotg210-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* Faraday FOTG210 EHCI-like driver * * Copyright (c) 2013 Faraday Technology Corporation diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c index ba557cdba8ef..c749cbd8dd3c 100644 --- a/drivers/usb/host/fsl-mph-dr-of.c +++ b/drivers/usb/host/fsl-mph-dr-of.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Setup platform devices needed by the Freescale multi-port host * and/or dual-role USB controller modules based on the description diff --git a/drivers/usb/host/hwa-hc.c b/drivers/usb/host/hwa-hc.c index da3b18038d23..e5fda058b5d6 100644 --- a/drivers/usb/host/hwa-hc.c +++ b/drivers/usb/host/hwa-hc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Host Wire Adapter: * Driver glue, HWA-specific functions, bridges to WAHC and WUSBHC diff --git a/drivers/usb/host/imx21-dbg.c b/drivers/usb/host/imx21-dbg.c index 4f320d050da7..d6a72acceacf 100644 --- a/drivers/usb/host/imx21-dbg.c +++ b/drivers/usb/host/imx21-dbg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2009 by Martin Fuzzey * diff --git a/drivers/usb/host/imx21-hcd.c b/drivers/usb/host/imx21-hcd.c index 39ae7fb64b6f..4dbf28bc2652 100644 --- a/drivers/usb/host/imx21-hcd.c +++ b/drivers/usb/host/imx21-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * USB Host Controller Driver for IMX21 * diff --git a/drivers/usb/host/imx21-hcd.h b/drivers/usb/host/imx21-hcd.h index 05122f8a6983..768e714bcb30 100644 --- a/drivers/usb/host/imx21-hcd.h +++ b/drivers/usb/host/imx21-hcd.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Macros and prototypes for i.MX21 * diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c index ebf2ff239539..5f9234b9cf7b 100644 --- a/drivers/usb/host/isp116x-hcd.c +++ b/drivers/usb/host/isp116x-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ISP116x HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/isp1362-hcd.c b/drivers/usb/host/isp1362-hcd.c index a822de4f36a1..c7e60b662e9b 100644 --- a/drivers/usb/host/isp1362-hcd.c +++ b/drivers/usb/host/isp1362-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ISP1362 HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/max3421-hcd.c b/drivers/usb/host/max3421-hcd.c index 02bbf8938bb9..afa321ab55fc 100644 --- a/drivers/usb/host/max3421-hcd.c +++ b/drivers/usb/host/max3421-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * MAX3421 Host Controller driver for USB. * diff --git a/drivers/usb/host/ohci-at91.c b/drivers/usb/host/ohci-at91.c index 5302f988e7e6..5ad9e9bdc8ee 100644 --- a/drivers/usb/host/ohci-at91.c +++ b/drivers/usb/host/ohci-at91.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-da8xx.c b/drivers/usb/host/ohci-da8xx.c index 05da2cb59612..2056573c2b12 100644 --- a/drivers/usb/host/ohci-da8xx.c +++ b/drivers/usb/host/ohci-da8xx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-dbg.c b/drivers/usb/host/ohci-dbg.c index c3eded317495..ac7d4ac34b02 100644 --- a/drivers/usb/host/ohci-dbg.c +++ b/drivers/usb/host/ohci-dbg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. 
* diff --git a/drivers/usb/host/ohci-exynos.c b/drivers/usb/host/ohci-exynos.c index 6865b919403f..a12cbb295425 100644 --- a/drivers/usb/host/ohci-exynos.c +++ b/drivers/usb/host/ohci-exynos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * SAMSUNG EXYNOS USB HOST OHCI Controller * diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 15ec8f90aa6d..9902bc3c15fe 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * Open Host Controller Interface (OHCI) driver for USB. * diff --git a/drivers/usb/host/ohci-hub.c b/drivers/usb/host/ohci-hub.c index 248eb7702463..fb7aaa3b9d06 100644 --- a/drivers/usb/host/ohci-hub.c +++ b/drivers/usb/host/ohci-hub.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-mem.c b/drivers/usb/host/ohci-mem.c index ed8a762b8670..b3da3f12e5b1 100644 --- a/drivers/usb/host/ohci-mem.c +++ b/drivers/usb/host/ohci-mem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-nxp.c b/drivers/usb/host/ohci-nxp.c index 6df8e2ed40fd..5509b50bc417 100644 --- a/drivers/usb/host/ohci-nxp.c +++ b/drivers/usb/host/ohci-nxp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * driver for NXP USB Host devices * diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c index 91393ec7d850..0201c49bc4fc 100644 --- a/drivers/usb/host/ohci-omap.c +++ b/drivers/usb/host/ohci-omap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c index a84aebe9b0a9..fbcd34911025 100644 --- a/drivers/usb/host/ohci-pci.c +++ b/drivers/usb/host/ohci-pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-platform.c b/drivers/usb/host/ohci-platform.c index 05ebde6adf1e..56be56c1ab63 100644 --- a/drivers/usb/host/ohci-platform.c +++ b/drivers/usb/host/ohci-platform.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic platform ohci driver * diff --git a/drivers/usb/host/ohci-ppc-of.c b/drivers/usb/host/ohci-ppc-of.c index 4f87a5c61b08..76a9b40b08f1 100644 --- a/drivers/usb/host/ohci-ppc-of.c +++ b/drivers/usb/host/ohci-ppc-of.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-ps3.c b/drivers/usb/host/ohci-ps3.c index 71d8bc4c27f6..bb0375d9eef0 100644 --- a/drivers/usb/host/ohci-ps3.c +++ b/drivers/usb/host/ohci-ps3.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PS3 OHCI Host Controller driver * diff --git a/drivers/usb/host/ohci-pxa27x.c b/drivers/usb/host/ohci-pxa27x.c index 21c010ffb03c..3e2474959735 100644 --- a/drivers/usb/host/ohci-pxa27x.c +++ b/drivers/usb/host/ohci-pxa27x.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-q.c b/drivers/usb/host/ohci-q.c index 641fed609911..b2ec8c399363 100644 --- a/drivers/usb/host/ohci-q.c +++ b/drivers/usb/host/ohci-q.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. 
* diff --git a/drivers/usb/host/ohci-s3c2410.c b/drivers/usb/host/ohci-s3c2410.c index b006b93126f7..4511e27e9da8 100644 --- a/drivers/usb/host/ohci-s3c2410.c +++ b/drivers/usb/host/ohci-s3c2410.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-sa1111.c b/drivers/usb/host/ohci-sa1111.c index 8758c73215d7..ebec9a7699e3 100644 --- a/drivers/usb/host/ohci-sa1111.c +++ b/drivers/usb/host/ohci-sa1111.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-sm501.c b/drivers/usb/host/ohci-sm501.c index d4e0f7cd96fa..c9233cddf9a2 100644 --- a/drivers/usb/host/ohci-sm501.c +++ b/drivers/usb/host/ohci-sm501.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-spear.c b/drivers/usb/host/ohci-spear.c index 56478ed2f932..b3554f70bd27 100644 --- a/drivers/usb/host/ohci-spear.c +++ b/drivers/usb/host/ohci-spear.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci-st.c b/drivers/usb/host/ohci-st.c index 02816a1515a1..697e6d95bb7e 100644 --- a/drivers/usb/host/ohci-st.c +++ b/drivers/usb/host/ohci-st.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ST OHCI driver * diff --git a/drivers/usb/host/ohci-tilegx.c b/drivers/usb/host/ohci-tilegx.c index e1b208da460a..e5a9f68cd648 100644 --- a/drivers/usb/host/ohci-tilegx.c +++ b/drivers/usb/host/ohci-tilegx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright 2012 Tilera Corporation. All Rights Reserved. * diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c index 16d081a093bb..5702964408db 100644 --- a/drivers/usb/host/ohci-tmio.c +++ b/drivers/usb/host/ohci-tmio.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * OHCI HCD(Host Controller Driver) for USB. * diff --git a/drivers/usb/host/ohci.h b/drivers/usb/host/ohci.h index 12742d002d2d..508a803139dd 100644 --- a/drivers/usb/host/ohci.h +++ b/drivers/usb/host/ohci.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * OHCI HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c index 9f8c61eafd45..08a72f658305 100644 --- a/drivers/usb/host/oxu210hp-hcd.c +++ b/drivers/usb/host/oxu210hp-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2008 Rodolfo Giometti * Copyright (c) 2008 Eurotech S.p.A. diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 6731f8d8d4c9..161536717025 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains code to reset and initialize USB host controllers. * Some of it includes work-arounds for PCI hardware and BIOS quirks. 
diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c index 0f3d2aedaec5..46d595f1b2eb 100644 --- a/drivers/usb/host/r8a66597-hcd.c +++ b/drivers/usb/host/r8a66597-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * R8A66597 HCD (Host Controller Driver) * diff --git a/drivers/usb/host/r8a66597.h b/drivers/usb/host/r8a66597.h index b8406c07f363..cc8241c7d653 100644 --- a/drivers/usb/host/r8a66597.h +++ b/drivers/usb/host/r8a66597.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * R8A66597 HCD (Host Controller Driver) * diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index 24ad1d6cec25..601fb00603cc 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * SL811HS HCD (Host Controller Driver) for USB. * diff --git a/drivers/usb/host/sl811_cs.c b/drivers/usb/host/sl811_cs.c index 88a9bffe93df..72136373ffab 100644 --- a/drivers/usb/host/sl811_cs.c +++ b/drivers/usb/host/sl811_cs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCMCIA driver for SL811HS (as found in REX-CFU1U) * Filename: sl811_cs.c diff --git a/drivers/usb/host/ssb-hcd.c b/drivers/usb/host/ssb-hcd.c index 62b6b7804c66..2f9087dc4cab 100644 --- a/drivers/usb/host/ssb-hcd.c +++ b/drivers/usb/host/ssb-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Sonics Silicon Backplane * Broadcom USB-core driver (SSB bus glue) diff --git a/drivers/usb/host/u132-hcd.c b/drivers/usb/host/u132-hcd.c index c38855aed62c..228d22bfb36e 100644 --- a/drivers/usb/host/u132-hcd.c +++ b/drivers/usb/host/u132-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Host Controller Driver for the Elan Digital Systems U132 adapter * diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index c3267a78c94e..babeefd84ffd 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Universal Host Controller Interface driver for USB. * diff --git a/drivers/usb/host/whci/asl.c b/drivers/usb/host/whci/asl.c index 773249306031..81a6286f50cf 100644 --- a/drivers/usb/host/whci/asl.c +++ b/drivers/usb/host/whci/asl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) asynchronous schedule management. * diff --git a/drivers/usb/host/whci/debug.c b/drivers/usb/host/whci/debug.c index 774b89d28fae..3cbd84893b6f 100644 --- a/drivers/usb/host/whci/debug.c +++ b/drivers/usb/host/whci/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) debug. * diff --git a/drivers/usb/host/whci/hcd.c b/drivers/usb/host/whci/hcd.c index cf84269c3e6d..eb30567fa6d1 100644 --- a/drivers/usb/host/whci/hcd.c +++ b/drivers/usb/host/whci/hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) driver. * diff --git a/drivers/usb/host/whci/hw.c b/drivers/usb/host/whci/hw.c index 6afa2e379160..2a89686d6971 100644 --- a/drivers/usb/host/whci/hw.c +++ b/drivers/usb/host/whci/hw.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) hardware access helpers. * diff --git a/drivers/usb/host/whci/init.c b/drivers/usb/host/whci/init.c index ad8eb575c30a..48a6f50df24a 100644 --- a/drivers/usb/host/whci/init.c +++ b/drivers/usb/host/whci/init.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) initialization. 
* diff --git a/drivers/usb/host/whci/int.c b/drivers/usb/host/whci/int.c index 0c086b2790d1..15a2df0b29ab 100644 --- a/drivers/usb/host/whci/int.c +++ b/drivers/usb/host/whci/int.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) interrupt handling. * diff --git a/drivers/usb/host/whci/pzl.c b/drivers/usb/host/whci/pzl.c index 33c5580b4d25..bafac6a88551 100644 --- a/drivers/usb/host/whci/pzl.c +++ b/drivers/usb/host/whci/pzl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) periodic schedule management. * diff --git a/drivers/usb/host/whci/qset.c b/drivers/usb/host/whci/qset.c index c0e6812426b3..1a92d0e492a0 100644 --- a/drivers/usb/host/whci/qset.c +++ b/drivers/usb/host/whci/qset.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) qset management. * diff --git a/drivers/usb/host/whci/whcd.h b/drivers/usb/host/whci/whcd.h index c80c7d93bc4a..4712972682fe 100644 --- a/drivers/usb/host/whci/whcd.h +++ b/drivers/usb/host/whci/whcd.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) private header. * diff --git a/drivers/usb/host/whci/whci-hc.h b/drivers/usb/host/whci/whci-hc.h index 4d4cbc0730bf..5dfbc9837b00 100644 --- a/drivers/usb/host/whci/whci-hc.h +++ b/drivers/usb/host/whci/whci-hc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) data structures. * diff --git a/drivers/usb/host/whci/wusb.c b/drivers/usb/host/whci/wusb.c index 8d2762682869..8c8d8bc8eac4 100644 --- a/drivers/usb/host/whci/wusb.c +++ b/drivers/usb/host/whci/wusb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless Host Controller (WHC) WUSB operations. * diff --git a/drivers/usb/host/xhci-dbg.c b/drivers/usb/host/xhci-dbg.c index 2c83b37ae8f2..83904170be5c 100644 --- a/drivers/usb/host/xhci-dbg.c +++ b/drivers/usb/host/xhci-dbg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver * diff --git a/drivers/usb/host/xhci-ext-caps.h b/drivers/usb/host/xhci-ext-caps.h index 28deea584884..259963bbe3aa 100644 --- a/drivers/usb/host/xhci-ext-caps.h +++ b/drivers/usb/host/xhci-ext-caps.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver * diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 3693b1f487e5..53209113e170 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver * diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 57be885fb544..4df7c2ea3e1c 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver * diff --git a/drivers/usb/host/xhci-mtk-sch.c b/drivers/usb/host/xhci-mtk-sch.c index bfc51bc902b8..6945350671e2 100644 --- a/drivers/usb/host/xhci-mtk-sch.c +++ b/drivers/usb/host/xhci-mtk-sch.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2015 MediaTek Inc. 
* Author: diff --git a/drivers/usb/host/xhci-mtk.c b/drivers/usb/host/xhci-mtk.c index 19d27ce31fdc..371524ea0739 100644 --- a/drivers/usb/host/xhci-mtk.c +++ b/drivers/usb/host/xhci-mtk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * MediaTek xHCI Host Controller Driver * diff --git a/drivers/usb/host/xhci-mtk.h b/drivers/usb/host/xhci-mtk.h index 45ff5c67efb5..303723e3f885 100644 --- a/drivers/usb/host/xhci-mtk.h +++ b/drivers/usb/host/xhci-mtk.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2015 MediaTek Inc. * Author: diff --git a/drivers/usb/host/xhci-mvebu.c b/drivers/usb/host/xhci-mvebu.c index 85908a3ecb8f..fe7a2f84faeb 100644 --- a/drivers/usb/host/xhci-mvebu.c +++ b/drivers/usb/host/xhci-mvebu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Marvell * Author: Gregory CLEMENT diff --git a/drivers/usb/host/xhci-mvebu.h b/drivers/usb/host/xhci-mvebu.h index 301fc984cae6..619792ae75b8 100644 --- a/drivers/usb/host/xhci-mvebu.h +++ b/drivers/usb/host/xhci-mvebu.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Marvell * diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 76f392954733..4281bbee0544 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver PCI Bus Glue. * diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index 1969e56a8d8d..72505e602527 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xhci-plat.c - xHCI host controller driver platform Bus Glue. * diff --git a/drivers/usb/host/xhci-plat.h b/drivers/usb/host/xhci-plat.h index 29b227895b07..ac8f8eb0bf49 100644 --- a/drivers/usb/host/xhci-plat.h +++ b/drivers/usb/host/xhci-plat.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xhci-plat.h - xHCI host controller driver platform Bus Glue. 
* diff --git a/drivers/usb/host/xhci-rcar.c b/drivers/usb/host/xhci-rcar.c index 198bc188ab25..bde650594b70 100644 --- a/drivers/usb/host/xhci-rcar.c +++ b/drivers/usb/host/xhci-rcar.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver for R-Car SoCs * diff --git a/drivers/usb/host/xhci-rcar.h b/drivers/usb/host/xhci-rcar.h index d247951147a1..162706528c4c 100644 --- a/drivers/usb/host/xhci-rcar.h +++ b/drivers/usb/host/xhci-rcar.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/host/xhci-rcar.h * diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 521d19e82494..fb07211babd8 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver * diff --git a/drivers/usb/host/xhci-tegra.c b/drivers/usb/host/xhci-tegra.c index 39b6e93130d3..40e18a7ffce9 100644 --- a/drivers/usb/host/xhci-tegra.c +++ b/drivers/usb/host/xhci-tegra.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVIDIA Tegra xHCI host controller driver * diff --git a/drivers/usb/host/xhci-trace.c b/drivers/usb/host/xhci-trace.c index 367b630bdb3c..0be3e83025ae 100644 --- a/drivers/usb/host/xhci-trace.c +++ b/drivers/usb/host/xhci-trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver * diff --git a/drivers/usb/host/xhci-trace.h b/drivers/usb/host/xhci-trace.h index 754dfb0e1a02..846f2859fa4a 100644 --- a/drivers/usb/host/xhci-trace.h +++ b/drivers/usb/host/xhci-trace.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver * diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 05db6e977ba1..ab9dc0a5af9e 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver * diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 86df906aec46..dd8bcdf64a9f 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xHCI host controller driver diff --git a/drivers/usb/image/mdc800.c b/drivers/usb/image/mdc800.c index 185c4e2db2a1..bdbaf6bc4f38 100644 --- a/drivers/usb/image/mdc800.c +++ b/drivers/usb/image/mdc800.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * copyright (C) 1999/2000 by Henning Zabel * diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c index 0b21ba757bba..9f2f563c82ed 100644 --- a/drivers/usb/image/microtek.c +++ b/drivers/usb/image/microtek.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Driver for Microtek Scanmaker X6 USB scanner, and possibly others. 
drivers/usb/isp1760/isp1760-core.c | GPL-2.0 | bfa402cf3a27..8157d18135b8
drivers/usb/isp1760/isp1760-core.h | GPL-2.0 | c70f8368a794..47985161ee77
drivers/usb/isp1760/isp1760-regs.h | GPL-2.0 | b67095c9a9d4..eecdb76c132c
drivers/usb/isp1760/isp1760-udc.c | GPL-2.0 | ad566f27dffe..6f1b4fdee802
drivers/usb/isp1760/isp1760-udc.h | GPL-2.0 | 26899ed81145..ea1598df1f11
drivers/usb/misc/adutux.c | GPL-2.0+ | 1c0ada75c35d..092db5ae5fa1
drivers/usb/misc/appledisplay.c | GPL-2.0+ | 8efdc500e790..b75defc52c24
drivers/usb/misc/chaoskey.c | GPL-2.0 | abec6e604a62..8a22b4997c5f
drivers/usb/misc/cypress_cy7c63.c | GPL-2.0 | 5c93a888c40e..819ad8dc2376
drivers/usb/misc/cytherm.c | GPL-2.0 | 63207c42acf6..a65f8817ecf8
drivers/usb/misc/ehset.c | GPL-2.0 | c31b4a33e6bb..8b8e25424aa1
drivers/usb/misc/emi26.c | GPL-2.0 | 8950fa5e973d..81836e9d787f
drivers/usb/misc/emi62.c | GPL-2.0 | 1d9be4431b72..fad894a63c52
drivers/usb/misc/ezusb.c | GPL-2.0 | 837208f14f86..c9be0d484e5c
drivers/usb/misc/ftdi-elan.c | GPL-2.0 | 424ff12f3b51..c1fbc2a32eb2
drivers/usb/misc/idmouse.c | GPL-2.0+ | 39d8fedfaf3b..8d144903f05e
drivers/usb/misc/iowarrior.c | GPL-2.0 | be5881303681..ad3109490c0f
drivers/usb/misc/isight_firmware.c | GPL-2.0 | 1c61830e96f9..91c028f16d31
drivers/usb/misc/ldusb.c | GPL-2.0+ | 680bddb3ce05..0f01dd3b59d0
drivers/usb/misc/legousbtower.c | GPL-2.0+ | 5628f678ab59..cd4d49d8aea5
drivers/usb/misc/lvstest.c | GPL-2.0 | ddddd6387f66..5e5d128e16c0
drivers/usb/misc/rio500.c | GPL-2.0+ | ddfebb144aaa..84bd682e9e61
drivers/usb/misc/rio500_usb.h | GPL-2.0+ | 359abc98e706..ce2ac1099d86
drivers/usb/misc/sisusbvga/sisusb.c | (GPL-2.0 OR BSD-3-Clause) | 30774e0aeadd..3e65bdc2615c
drivers/usb/misc/sisusbvga/sisusb.h | (GPL-2.0 OR BSD-3-Clause) | 55492a5930bd..20f03ad0ea16
drivers/usb/misc/sisusbvga/sisusb_con.c | (GPL-2.0 OR BSD-3-Clause) | f019d80ca9e4..73f7bde78e11
drivers/usb/misc/sisusbvga/sisusb_init.c | (GPL-2.0 OR BSD-3-Clause) | bf0032ca35ed..6a30e8bd9221
drivers/usb/misc/sisusbvga/sisusb_init.h | (GPL-2.0+ OR BSD-3-Clause) | e79a616f0d26..1782c759c4ad
drivers/usb/misc/sisusbvga/sisusb_struct.h | (GPL-2.0+ OR BSD-3-Clause) | 1c4240e802c1..706d77090e00
drivers/usb/misc/trancevibrator.c | GPL-2.0+ | 1862ed15ce28..405726b8ebe6
drivers/usb/misc/usb251xb.c | GPL-2.0+ | 9c8f7e2b6740..5234bc3e0f01
drivers/usb/misc/usb3503.c | GPL-2.0+ | 8e7737d7ac0a..b5416f887fbf
drivers/usb/misc/usb4604.c | GPL-2.0+ | e9f37fb746ac..40fa85807aae
drivers/usb/misc/usb_u132.h | GPL-2.0 | dc2e5a31caec..4dbfea3237f7
drivers/usb/misc/usblcd.c | GPL-2.0 | 0f5ad896c7e3..9ba4a4e68d91
drivers/usb/misc/usbsevseg.c | GPL-2.0 | 3f6a28045b53..4990d5757019
drivers/usb/misc/usbtest.c | GPL-2.0 | 3639e00a51a0..aeda01f037c1
drivers/usb/misc/uss720.c | GPL-2.0+ | 8a13b2fcf3e1..876a7a32defc
drivers/usb/misc/yurex.c | GPL-2.0 | 58abdf28620a..1aec1d25ee44
drivers/usb/mon/mon_main.c | GPL-2.0 | 46847340b819..9812d102a005
drivers/usb/mtu3/mtu3.h | GPL-2.0 | d80e4e813248..dfdace0d81a0
drivers/usb/mtu3/mtu3_core.c | GPL-2.0 | 7c149a7da14e..dbbf0ee1bd5f
drivers/usb/mtu3/mtu3_dr.c | GPL-2.0 | ec442cd5a1ad..ff842835329f
drivers/usb/mtu3/mtu3_dr.h | GPL-2.0 | 0f0cbac00192..0a54c0bc4752
drivers/usb/mtu3/mtu3_gadget.c | GPL-2.0 | b495471f689f..8679a5bc484e
drivers/usb/mtu3/mtu3_gadget_ep0.c | GPL-2.0 | 020b25314a68..678432f6ef74
drivers/usb/mtu3/mtu3_host.c | GPL-2.0 | ec76b86dd887..4c570de20d56
drivers/usb/mtu3/mtu3_hw_regs.h | GPL-2.0 | 6953436a1688..8507363a7e44
drivers/usb/mtu3/mtu3_plat.c | GPL-2.0 | 9ff33579b42e..3808b2e7bcb8
drivers/usb/mtu3/mtu3_qmu.c | GPL-2.0 | 0b4b412b1d0d..9f273dad0fd9
drivers/usb/mtu3/mtu3_qmu.h | GPL-2.0 | 4dafa16bf120..05cfdfe8be4c
drivers/usb/musb/am35x.c | GPL-2.0 | 2cf990e85bb2..efacff81fd32
drivers/usb/musb/blackfin.c | GPL-2.0+ | 7c580df75dc1..432fd5795877
drivers/usb/musb/blackfin.h | GPL-2.0 | 231f2d23b8a1..c0ae8ce81b10
drivers/usb/musb/cppi_dma.c | GPL-2.0 | a13bd3625043..b4d6d9bb3239
drivers/usb/musb/da8xx.c | GPL-2.0 | 1ed5b0e0c2ca..6446921bd98e
drivers/usb/musb/davinci.c | GPL-2.0 | 3a7048e84e1c..e25890379c8e
drivers/usb/musb/davinci.h | GPL-2.0 | 371baa0ee509..2e507cedf2f4
drivers/usb/musb/jz4740.c | GPL-2.0+ | 40c68c23d553..354d143ad740
drivers/usb/musb/musb_am335x.c | GPL-2.0 | 1e58ed2361cc..5f04f8e3a640
drivers/usb/musb/musb_core.c | GPL-2.0 | 82a5089caa34..e568690410fd
drivers/usb/musb/musb_core.h | GPL-2.0 | e8573975743d..ca823231bbb3
drivers/usb/musb/musb_cppi41.c | GPL-2.0 | 1ec0a4947b6b..d0dd4f470bbe
drivers/usb/musb/musb_debug.h | GPL-2.0 | 9a78877a8afe..345a359de57a
drivers/usb/musb/musb_debugfs.c | GPL-2.0 | 952733ceaac8..b91d4b60b8c7
drivers/usb/musb/musb_dma.h | GPL-2.0 | 04c3bd86bd62..7fea3455cd3b
drivers/usb/musb/musb_dsps.c | GPL-2.0 | 4d95fe714f27..8c292820330c
drivers/usb/musb/musb_gadget.c | GPL-2.0 | bc6d1717c9ec..2c47704adbd3
drivers/usb/musb/musb_gadget.h | GPL-2.0 | 0314dfc770c7..c8c9d5565848
drivers/usb/musb/musb_gadget_ep0.c | GPL-2.0 | 844a309fe895..1586f3be37be
drivers/usb/musb/musb_host.c | GPL-2.0 | b17450a59882..262b425c231d
drivers/usb/musb/musb_host.h | GPL-2.0 | 7bbf01bf4bb0..2546cdd5d3b9
drivers/usb/musb/musb_io.h | GPL-2.0 | 17a80ae20674..2ebf033ced87
drivers/usb/musb/musb_regs.h | GPL-2.0 | cff5bcf0d00f..31f92798b408
drivers/usb/musb/musb_trace.c | GPL-2.0 | 70973d901a21..037509918844
drivers/usb/musb/musb_trace.h | GPL-2.0 | f031c9e74322..669cd1df5bf8
drivers/usb/musb/musb_virthub.c | GPL-2.0 | 0b4595439d51..de00a6ccd7b3
drivers/usb/musb/musbhsdma.c | GPL-2.0 | 3620073da58c..d1bb309070a4
drivers/usb/musb/musbhsdma.h | GPL-2.0 | a3dcbd55e436..51289c0b277e
drivers/usb/musb/omap2430.c | GPL-2.0 | 456f3e6ecf03..f68053165b05
drivers/usb/musb/omap2430.h | GPL-2.0 | 1b5e83a9840e..f484ba592d40
drivers/usb/musb/sunxi.c | GPL-2.0+ | dc353e24d53c..ecc9e1a60f46
drivers/usb/musb/tusb6010.c | GPL-2.0 | f8fce7df654f..d844066a411d
drivers/usb/musb/tusb6010.h | GPL-2.0 | 72cdad23ced9..fc8c01d994c6
drivers/usb/musb/tusb6010_omap.c | GPL-2.0 | e8060e49b0f4..9db5eb9ba215
drivers/usb/musb/ux500.c | GPL-2.0+ | 5a572500c418..b29930253acb
drivers/usb/musb/ux500_dma.c | GPL-2.0+ | c92a295049ad..1a14b0e15ba3
drivers/usb/phy/of.c | GPL-2.0+ | 66ffa82457a8..3b0ebdb63488
drivers/usb/phy/phy-ab8500-usb.c | GPL-2.0+ | 61bf2285d5b1..c1394c524c6b
drivers/usb/phy/phy-am335x-control.c | GPL-2.0 | 5f5f19813fde..a3cb25cb74f8
drivers/usb/phy/phy-am335x.c | GPL-2.0 | 7e5aece769da..b36fa8b953d0
drivers/usb/phy/phy-fsl-usb.c | GPL-2.0+ | cf8f40ae6e01..0d9f2e27ce34
drivers/usb/phy/phy-fsl-usb.h | GPL-2.0+ | 23149954a09c..5db4668661cd
drivers/usb/phy/phy-generic.c | GPL-2.0+ | 1cc02eb3fc65..9e67a0d58fa1
drivers/usb/phy/phy-gpio-vbus-usb.c | GPL-2.0 | f66120db8a41..d7a3aeaa5ae6
drivers/usb/phy/phy-isp1301-omap.c | GPL-2.0+ | 00bcc6c6db17..e0e1759463ba
drivers/usb/phy/phy-isp1301.c | GPL-2.0 | f333024660b4..22499abc8272
drivers/usb/phy/phy-keystone.c | GPL-2.0+ | 01d4e4cdbc79..0670414ccc38
drivers/usb/phy/phy-mv-usb.c | GPL-2.0+ | fc9ed047d25d..50041d72427d
drivers/usb/phy/phy-mv-usb.h | GPL-2.0+ | 551da6eb0ba8..6150f6bba30b
drivers/usb/phy/phy-mxs-usb.c | GPL-2.0+ | 269fc030656e..5de62677df98
drivers/usb/phy/phy-omap-otg.c | GPL-2.0 | 800d1d90753d..1ce4a846d9b0
drivers/usb/phy/phy-tahvo.c | GPL-2.0 | bf2c364867a0..3a437bf5e004
drivers/usb/phy/phy-tegra-usb.c | GPL-2.0 | ccc2bf5274b4..1ebfbdef4529
drivers/usb/phy/phy-twl6030-usb.c | GPL-2.0+ | b5dc077ed7d3..ddcb0dff4d5e
drivers/usb/phy/phy-ulpi-viewport.c | GPL-2.0 | 18bb8264b5a0..394778a5219c
drivers/usb/phy/phy-ulpi.c | GPL-2.0+ | f48a7a21e3c2..1a594b356ad8
drivers/usb/phy/phy.c | GPL-2.0+ | 89f4ac4cd93e..3405e8e30a01
drivers/usb/renesas_usbhs/common.c | GPL-1.0+ | 490ebc7df7b2..e1bc8a84c118
drivers/usb/renesas_usbhs/common.h | GPL-1.0+ | 8c5fc12ad778..416331c6990a
drivers/usb/renesas_usbhs/fifo.c | GPL-1.0+ | 50285b01da92..e99c323b2f9e
drivers/usb/renesas_usbhs/fifo.h | GPL-1.0+ | 8b98507d7abc..7a741234c24b
drivers/usb/renesas_usbhs/mod.c | GPL-1.0+ | 28965ef4f824..c0a0789d8b1e
drivers/usb/renesas_usbhs/mod.h | GPL-1.0+ | 1ef5bf604070..5355a13045d9
drivers/usb/renesas_usbhs/mod_gadget.c | GPL-1.0+ | c068b673420b..019bbc8bf9b2
drivers/usb/renesas_usbhs/mod_host.c | GPL-1.0+ | e256351cb72d..1ab0ac83b00c
drivers/usb/renesas_usbhs/pipe.c | GPL-1.0+ | d811f0550c04..3c500aaadf35
drivers/usb/renesas_usbhs/pipe.h | GPL-1.0+ | 95185fdb29b1..ed32cb11fe09
drivers/usb/renesas_usbhs/rcar2.c | GPL-1.0+ | 277160bc6f25..b03b3cb36b49
drivers/usb/renesas_usbhs/rcar3.c | GPL-2.0 | f436e9d51127..11a13887a71d
drivers/usb/serial/aircable.c | GPL-2.0 | 569c2200ba42..2ea3fc79acd6
drivers/usb/serial/ark3116.c | GPL-2.0+ | 0adbd38b4eea..a0cc93b0c64d
drivers/usb/serial/belkin_sa.c | GPL-2.0+ | 15bc71853db5..6e0c98bde745
drivers/usb/serial/belkin_sa.h | GPL-2.0+ | c74b58ab56f9..345c57f63831
drivers/usb/serial/bus.c | GPL-2.0 | 8936a83c96cd..db49e2a4346b
drivers/usb/serial/ch341.c | GPL-2.0 | 351745aec0e1..0543836f053e
drivers/usb/serial/console.c | GPL-2.0 | 43a862a90a77..4f80a422d4e5
drivers/usb/serial/cp210x.c | GPL-2.0 | 412f812522ee..2bc560a8d9a0
drivers/usb/serial/cyberjack.c | GPL-2.0+ | 47fbd9f0c0c7..830e793dfe44
drivers/usb/serial/cypress_m8.c | GPL-2.0+ | 90110de715e0..a43ee56d3032
drivers/usb/serial/digi_acceleport.c | GPL-2.0+ | 2ce39af32cfa..fc30c1d04d6e
drivers/usb/serial/empeg.c | GPL-2.0 | 90e603d5f660..b270ff3a721a
drivers/usb/serial/f81232.c | GPL-2.0 | 972f5a5fe577..869bfd05e02e
drivers/usb/serial/f81534.c | GPL-2.0+ | cb8214860192..cc535426208f
drivers/usb/serial/ftdi_sio.c | GPL-2.0+ | 49d1b2d4606d..70791b672af6
drivers/usb/serial/garmin_gps.c | GPL-2.0+ | 4f793c86978e..e06bde8aaac4
drivers/usb/serial/generic.c | GPL-2.0 | 35cb8c0e584f..6d64595df6fd
drivers/usb/serial/io_16654.h | GPL-2.0+ | a53abc9530ff..9a594aa65cd8
drivers/usb/serial/io_edgeport.c | GPL-2.0+ | bdf8bd814a9a..9ccfe291d6ad
drivers/usb/serial/io_edgeport.h | GPL-2.0+ | ad9c1d47a619..657158133a88
drivers/usb/serial/io_ionsp.h | GPL-2.0+ | 5cc591bae54d..e9d5cde2e4a6
drivers/usb/serial/io_ti.c | GPL-2.0+ | 6cefb9cb133d..c6fc62447b25
drivers/usb/serial/io_ti.h | GPL-2.0+ | 1bd67b24f916..42eb4afc4550
drivers/usb/serial/io_usbvend.h | GPL-2.0+ | 6f6a856bc37c..fec5a0aec075
drivers/usb/serial/ipaq.c | GPL-2.0+ | cde0dcdce9c4..52da1a82d7f0
drivers/usb/serial/ipw.c | GPL-2.0+ | 8b1cf18a668b..4a534e8037d5
drivers/usb/serial/ir-usb.c | GPL-2.0+ | f9734a96d516..8109bcfed9f4
drivers/usb/serial/iuu_phoenix.c | GPL-2.0+ | 18fc992a245f..c0565a352674
drivers/usb/serial/iuu_phoenix.h | GPL-2.0+ | b82630a3b8fd..d473cde2aae7
drivers/usb/serial/keyspan.c | GPL-2.0+ | 5662d324edd2..c5e34999bea4
drivers/usb/serial/keyspan_pda.c | GPL-2.0+ | 196908dd25a1..04c26f4a9542
drivers/usb/serial/kl5kusb105.c | GPL-2.0+ | 595415e59d5d..e71978840780
drivers/usb/serial/kobil_sct.c | GPL-2.0+ | 2f8fa3589b20..86625e9f3a8b
drivers/usb/serial/mct_u232.c | GPL-2.0+ | 70f346f1aa86..ef4f9067499a
drivers/usb/serial/mct_u232.h | GPL-2.0+ | d325bb8cb583..bd3f40770adb
drivers/usb/serial/metro-usb.c | GPL-2.0 | 36f65df41721..2832810c9693
drivers/usb/serial/mos7720.c | GPL-2.0 | a453965f9e9a..b7075252e9ca
drivers/usb/serial/mos7840.c | GPL-2.0+ | e8669aae14b3..24b7a4a6f8f1
drivers/usb/serial/mxuport.c | GPL-2.0+ | 3aef091fe88b..1e9430b077f2
drivers/usb/serial/navman.c | GPL-2.0 | 2a97cdc078d5..ab387a58992b
drivers/usb/serial/omninet.c | GPL-2.0 | efcd7feed6f4..8e1406aecafb
drivers/usb/serial/opticon.c | GPL-2.0 | 58657d64678b..66e6f77d27c2
drivers/usb/serial/option.c | GPL-2.0 | ba672cf4e888..eb7d55a8e0ca
drivers/usb/serial/oti6858.c | GPL-2.0 | b11eead469ee..c6adf41b62da
drivers/usb/serial/oti6858.h | GPL-2.0+ | 704ac3a532b3..9fb62f03005d
drivers/usb/serial/pl2303.c | GPL-2.0 | a585b477415d..5a12f5d81fc2
drivers/usb/serial/pl2303.h | GPL-2.0+ | 3b5a15d1dc0d..35db75405f9d
drivers/usb/serial/qcaux.c | GPL-2.0 | 6e9f8af96959..f4a5921cc500
drivers/usb/serial/qcserial.c | GPL-2.0 | 9f9d3a904464..6968170beda7
drivers/usb/serial/quatech2.c | GPL-2.0 | 60e17d1444c3..58fe44db82e8
drivers/usb/serial/safe_serial.c | GPL-2.0+ | 27d7a7016298..3dd85f025cbe
drivers/usb/serial/sierra.c | GPL-2.0 | 4c4ac4705ac0..1adbd5a99cf8
drivers/usb/serial/spcp8x5.c | GPL-2.0+ | 5167b6564c8b..48d330ff03fa
drivers/usb/serial/ssu100.c | GPL-2.0 | 5aa7bbbeba3d..a7db9aff97a1
drivers/usb/serial/symbolserial.c | GPL-2.0 | 0d1727232d0c..25d966165211
drivers/usb/serial/ti_usb_3410_5052.c | GPL-2.0+ | 8fc3854e5e69..eb184a78db94
drivers/usb/serial/upd78f0730.c | GPL-2.0 | 6819a3486e5d..facad97ff487
drivers/usb/serial/usb-serial-simple.c | GPL-2.0 | e98b6e57b703..0d692f187d85
drivers/usb/serial/usb-serial.c | GPL-2.0 | 20639d5b8886..3e27c844345c
drivers/usb/serial/usb_debug.c | GPL-2.0 | 12f4c5a91e62..9fbd6352de4f
drivers/usb/serial/usb_wwan.c | GPL-2.0 | 59bfcb3da116..9e48be615b2f
drivers/usb/serial/visor.c | GPL-2.0 | 9f3317a940ef..2ea07ee08c4f
drivers/usb/serial/visor.h | GPL-2.0+ | 4c456dd69ce5..e87de8c0c239
drivers/usb/serial/whiteheat.c | GPL-2.0+ | 55cebc1e6fec..1873f1046c58
drivers/usb/serial/whiteheat.h | GPL-2.0+ | 38065df4d2d8..024cc9266ecb
drivers/usb/serial/wishbone-serial.c | GPL-2.0+ | 4fed4a0bd702..8f227c485bfd
drivers/usb/serial/xsens_mt.c | GPL-2.0 | 3837d5113bb2..ef5f3f655a42
drivers/usb/storage/alauda.c | GPL-2.0+ | 878b4b8761f5..ad71ff132080
drivers/usb/storage/cypress_atacb.c | GPL-2.0+ | 5e4af44d7d9f..a326ad73de16
drivers/usb/storage/datafab.c | GPL-2.0+ | 723197af6ec5..f408d26700ce
drivers/usb/storage/debug.c | GPL-2.0+ | 8d20804a59e6..182d2f8e9b2b
drivers/usb/storage/debug.h | GPL-2.0+ | 8ab73299b650..69dd4c480fb2
drivers/usb/storage/ene_ub6250.c | GPL-2.0+ | 28100374f7bd..fc733fa14415
drivers/usb/storage/freecom.c | GPL-2.0+ | c0a5d954414b..3b1ffadd07bf
drivers/usb/storage/initializers.c | GPL-2.0+ | d9d8c17e05d1..9b574caa80ac
drivers/usb/storage/initializers.h | GPL-2.0+ | 039abf4d1cb7..9e6f6efb57f4
drivers/usb/storage/isd200.c | GPL-2.0+ | 6a7720e66595..0f8603cf2755
ISD200 ASIC * diff --git a/drivers/usb/storage/jumpshot.c b/drivers/usb/storage/jumpshot.c index 011e5270690a..a1ead5b9d559 100644 --- a/drivers/usb/storage/jumpshot.c +++ b/drivers/usb/storage/jumpshot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for Lexar "Jumpshot" Compact Flash reader * diff --git a/drivers/usb/storage/karma.c b/drivers/usb/storage/karma.c index b05ba4929f00..7ecf2f83f8c9 100644 --- a/drivers/usb/storage/karma.c +++ b/drivers/usb/storage/karma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for Rio Karma * diff --git a/drivers/usb/storage/onetouch.c b/drivers/usb/storage/onetouch.c index acc3d03d8c1e..27873d0ad130 100644 --- a/drivers/usb/storage/onetouch.c +++ b/drivers/usb/storage/onetouch.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Support for the Maxtor OneTouch USB hard drive's button * diff --git a/drivers/usb/storage/option_ms.c b/drivers/usb/storage/option_ms.c index 4a73cd4783ae..e23e47413271 100644 --- a/drivers/usb/storage/option_ms.c +++ b/drivers/usb/storage/option_ms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for Option High Speed Mobile Devices. * diff --git a/drivers/usb/storage/protocol.c b/drivers/usb/storage/protocol.c index 74c38870a17e..b8b8e7066a04 100644 --- a/drivers/usb/storage/protocol.c +++ b/drivers/usb/storage/protocol.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * diff --git a/drivers/usb/storage/protocol.h b/drivers/usb/storage/protocol.h index a55666880b7b..abedf13a8cf1 100644 --- a/drivers/usb/storage/protocol.h +++ b/drivers/usb/storage/protocol.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * Protocol Functions Header File diff --git a/drivers/usb/storage/realtek_cr.c b/drivers/usb/storage/realtek_cr.c index ec83b3b5efa9..7d02a7c5cdd6 100644 --- a/drivers/usb/storage/realtek_cr.c +++ b/drivers/usb/storage/realtek_cr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for Realtek RTS51xx USB card reader * diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index 8cd2926fb1fe..878922fb54b8 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * SCSI layer glue code diff --git a/drivers/usb/storage/scsiglue.h b/drivers/usb/storage/scsiglue.h index d0a331dd9bc5..add14c47ce1d 100644 --- a/drivers/usb/storage/scsiglue.h +++ b/drivers/usb/storage/scsiglue.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * SCSI Connecting Glue Header File diff --git a/drivers/usb/storage/sddr09.c b/drivers/usb/storage/sddr09.c index 44f8ffccd031..b37cb07dfc80 100644 --- a/drivers/usb/storage/sddr09.c +++ b/drivers/usb/storage/sddr09.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for SanDisk SDDR-09 SmartMedia reader * diff --git a/drivers/usb/storage/sddr55.c b/drivers/usb/storage/sddr55.c index b973da42b593..705c9724ab9d 100644 --- a/drivers/usb/storage/sddr55.c +++ b/drivers/usb/storage/sddr55.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for SanDisk SDDR-55 SmartMedia reader * diff --git a/drivers/usb/storage/shuttle_usbat.c b/drivers/usb/storage/shuttle_usbat.c index 3b0294e4df93..3e9da1d257b6 100644 --- a/drivers/usb/storage/shuttle_usbat.c +++ b/drivers/usb/storage/shuttle_usbat.c 
@@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for SCM Microsystems (a.k.a. Shuttle) USB-ATAPI cable * diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index a3ccb899df60..b31568e8f6c3 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * diff --git a/drivers/usb/storage/transport.h b/drivers/usb/storage/transport.h index dae3ecd2e6cf..98591db5fb6a 100644 --- a/drivers/usb/storage/transport.h +++ b/drivers/usb/storage/transport.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * Transport Functions Header File diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index bd4671d5dfc3..d86754b65ef1 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB Attached SCSI * Note that this is not the same as the USB Mass Storage driver diff --git a/drivers/usb/storage/unusual_alauda.h b/drivers/usb/storage/unusual_alauda.h index 763bc03032a1..311c5a21ac13 100644 --- a/drivers/usb/storage/unusual_alauda.h +++ b/drivers/usb/storage/unusual_alauda.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for the Alauda-based card readers * diff --git a/drivers/usb/storage/unusual_cypress.h b/drivers/usb/storage/unusual_cypress.h index e9a2eb88869a..285370f13ee4 100644 --- a/drivers/usb/storage/unusual_cypress.h +++ b/drivers/usb/storage/unusual_cypress.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for devices based on the Cypress USB/ATA bridge * with support for ATACB diff --git a/drivers/usb/storage/unusual_datafab.h b/drivers/usb/storage/unusual_datafab.h index 5049b6bbe5d5..03c0e72d7b9d 100644 --- a/drivers/usb/storage/unusual_datafab.h +++ b/drivers/usb/storage/unusual_datafab.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for the Datafab USB Compact Flash reader * diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index eb06d88b41d6..33f691916762 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * Unusual Devices File diff --git a/drivers/usb/storage/unusual_ene_ub6250.h b/drivers/usb/storage/unusual_ene_ub6250.h index 5667f5d365c6..c9d9e52d884d 100644 --- a/drivers/usb/storage/unusual_ene_ub6250.h +++ b/drivers/usb/storage/unusual_ene_ub6250.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * * This program is free software; you can redistribute it and/or modify it diff --git a/drivers/usb/storage/unusual_freecom.h b/drivers/usb/storage/unusual_freecom.h index 1f5aab42ece2..06088feb8f96 100644 --- a/drivers/usb/storage/unusual_freecom.h +++ b/drivers/usb/storage/unusual_freecom.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for the Freecom USB/IDE adaptor * diff --git a/drivers/usb/storage/unusual_isd200.h b/drivers/usb/storage/unusual_isd200.h index 9b6862ec3d4f..b924e3d75960 100644 --- a/drivers/usb/storage/unusual_isd200.h +++ b/drivers/usb/storage/unusual_isd200.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for In-System Design, Inc. 
ISD200 ASIC * diff --git a/drivers/usb/storage/unusual_jumpshot.h b/drivers/usb/storage/unusual_jumpshot.h index 413e64fa6b95..1ca977374ba4 100644 --- a/drivers/usb/storage/unusual_jumpshot.h +++ b/drivers/usb/storage/unusual_jumpshot.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for the Lexar "Jumpshot" Compact Flash reader * diff --git a/drivers/usb/storage/unusual_karma.h b/drivers/usb/storage/unusual_karma.h index e6fad3aeae20..84910d13c80a 100644 --- a/drivers/usb/storage/unusual_karma.h +++ b/drivers/usb/storage/unusual_karma.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for the Rio Karma * diff --git a/drivers/usb/storage/unusual_onetouch.h b/drivers/usb/storage/unusual_onetouch.h index 425dc22f345a..28d39c4e20e2 100644 --- a/drivers/usb/storage/unusual_onetouch.h +++ b/drivers/usb/storage/unusual_onetouch.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for the Maxtor OneTouch USB hard drive's button * diff --git a/drivers/usb/storage/unusual_realtek.h b/drivers/usb/storage/unusual_realtek.h index 8fe624ad302a..736e4f7ebe10 100644 --- a/drivers/usb/storage/unusual_realtek.h +++ b/drivers/usb/storage/unusual_realtek.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for Realtek RTS51xx USB card reader * diff --git a/drivers/usb/storage/unusual_sddr09.h b/drivers/usb/storage/unusual_sddr09.h index d9d38ac4abf9..1098646d4b75 100644 --- a/drivers/usb/storage/unusual_sddr09.h +++ b/drivers/usb/storage/unusual_sddr09.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for SanDisk SDDR-09 SmartMedia reader * diff --git a/drivers/usb/storage/unusual_sddr55.h b/drivers/usb/storage/unusual_sddr55.h index ebb1d1c6c467..b98c778051b8 100644 --- a/drivers/usb/storage/unusual_sddr55.h +++ b/drivers/usb/storage/unusual_sddr55.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for SanDisk SDDR-55 SmartMedia reader * diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index cde115359793..2df3319efd66 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Attached SCSI devices - Unusual Devices File * diff --git a/drivers/usb/storage/unusual_usbat.h b/drivers/usb/storage/unusual_usbat.h index 2044ad5ef5e4..ac6bae08ac79 100644 --- a/drivers/usb/storage/unusual_usbat.h +++ b/drivers/usb/storage/unusual_usbat.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Unusual Devices File for SCM Microsystems (a.k.a. 
Shuttle) USB-ATAPI cable * diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c index 1ae2b8554a88..7b1524be01bb 100644 --- a/drivers/usb/storage/usb.c +++ b/drivers/usb/storage/usb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * diff --git a/drivers/usb/storage/usb.h b/drivers/usb/storage/usb.h index 8fae28b40bb4..aa3da222f6b1 100644 --- a/drivers/usb/storage/usb.h +++ b/drivers/usb/storage/usb.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * Main Header File diff --git a/drivers/usb/storage/usual-tables.c b/drivers/usb/storage/usual-tables.c index 499669bcf700..90ac516c1f12 100644 --- a/drivers/usb/storage/usual-tables.c +++ b/drivers/usb/storage/usual-tables.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage devices * Usual Tables File for usb-storage and libusual diff --git a/drivers/usb/typec/typec.c b/drivers/usb/typec/typec.c index 24e355ba109d..51fbd9a3ef9f 100644 --- a/drivers/usb/typec/typec.c +++ b/drivers/usb/typec/typec.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB Type-C Connector Class * diff --git a/drivers/usb/typec/typec_wcove.c b/drivers/usb/typec/typec_wcove.c index 4ce30a967962..e845cdd0f765 100644 --- a/drivers/usb/typec/typec_wcove.c +++ b/drivers/usb/typec/typec_wcove.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * typec_wcove.c - WhiskeyCove PMIC USB Type-C PHY driver * diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 714c5bcedf2b..8a9cbe9354ea 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB Type-C Connector System Software Interface driver * diff --git a/drivers/usb/typec/ucsi/ucsi_acpi.c b/drivers/usb/typec/ucsi/ucsi_acpi.c index cabd47612b0a..e848cc508230 100644 --- a/drivers/usb/typec/ucsi/ucsi_acpi.c +++ b/drivers/usb/typec/ucsi/ucsi_acpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * UCSI ACPI driver * diff --git a/drivers/usb/usb-skeleton.c b/drivers/usb/usb-skeleton.c index bb0bd732e29a..cc31a1b6c7bd 100644 --- a/drivers/usb/usb-skeleton.c +++ b/drivers/usb/usb-skeleton.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB Skeleton driver - 2.2 * diff --git a/drivers/usb/usbip/stub.h b/drivers/usb/usbip/stub.h index 910f027773aa..1001a9c1b6fc 100644 --- a/drivers/usb/usbip/stub.h +++ b/drivers/usb/usbip/stub.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c index c653ce533430..4ff622549781 100644 --- a/drivers/usb/usbip/stub_dev.c +++ b/drivers/usb/usbip/stub_dev.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c index 7170404e8979..e07166534c62 100644 --- a/drivers/usb/usbip/stub_main.c +++ b/drivers/usb/usbip/stub_main.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 191b176ffedf..095429b533d8 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * diff --git a/drivers/usb/usbip/stub_tx.c 
b/drivers/usb/usbip/stub_tx.c index be50cef645d8..814b4e6a945b 100644 --- a/drivers/usb/usbip/stub_tx.c +++ b/drivers/usb/usbip/stub_tx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 2281f3562870..edd14fd92377 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h index 3050fc99a417..d0bd3ae24465 100644 --- a/drivers/usb/usbip/usbip_common.h +++ b/drivers/usb/usbip/usbip_common.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/usbip/usbip_event.c b/drivers/usb/usbip/usbip_event.c index f1635662c299..ffeb0f53fc68 100644 --- a/drivers/usb/usbip/usbip_event.c +++ b/drivers/usb/usbip/usbip_event.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015 Nobuo Iwata diff --git a/drivers/usb/usbip/vhci.h b/drivers/usb/usbip/vhci.h index 5cfb59e98e44..04240e450e3f 100644 --- a/drivers/usb/usbip/vhci.h +++ b/drivers/usb/usbip/vhci.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015 Nobuo Iwata diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 11b9a22799cc..0e16a7dcfb38 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Nobuo Iwata diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c index ef2f2d5ca6b2..2389ba443caa 100644 --- a/drivers/usb/usbip/vhci_rx.c +++ b/drivers/usb/usbip/vhci_rx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * diff --git a/drivers/usb/usbip/vhci_sysfs.c b/drivers/usb/usbip/vhci_sysfs.c index 1b9f60a22e0b..2a05f3198b32 100644 --- a/drivers/usb/usbip/vhci_sysfs.c +++ b/drivers/usb/usbip/vhci_sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Nobuo Iwata diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c index 3e7878fe2fd4..8b127d98ce93 100644 --- a/drivers/usb/usbip/vhci_tx.c +++ b/drivers/usb/usbip/vhci_tx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * diff --git a/drivers/usb/usbip/vudc.h b/drivers/usb/usbip/vudc.h index 25e01b09c4c3..44fb24193acd 100644 --- a/drivers/usb/usbip/vudc.h +++ b/drivers/usb/usbip/vudc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2015 Karol Kosik * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/usbip/vudc_dev.c b/drivers/usb/usbip/vudc_dev.c index 968471b62cbc..0c07348820ea 100644 --- a/drivers/usb/usbip/vudc_dev.c +++ b/drivers/usb/usbip/vudc_dev.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2015 Karol Kosik * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/usbip/vudc_main.c b/drivers/usb/usbip/vudc_main.c index 9e655714e389..63aee6bb0dd9 100644 --- 
a/drivers/usb/usbip/vudc_main.c +++ b/drivers/usb/usbip/vudc_main.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2015 Karol Kosik * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/usbip/vudc_rx.c b/drivers/usb/usbip/vudc_rx.c index e429b59f6f8a..2faccf62b17e 100644 --- a/drivers/usb/usbip/vudc_rx.c +++ b/drivers/usb/usbip/vudc_rx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2015 Karol Kosik * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/usbip/vudc_sysfs.c b/drivers/usb/usbip/vudc_sysfs.c index 0f98f2c7475f..7d978b824ed4 100644 --- a/drivers/usb/usbip/vudc_sysfs.c +++ b/drivers/usb/usbip/vudc_sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2015 Karol Kosik * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/usbip/vudc_transfer.c b/drivers/usb/usbip/vudc_transfer.c index 718f1595a18e..80feece59b21 100644 --- a/drivers/usb/usbip/vudc_transfer.c +++ b/drivers/usb/usbip/vudc_transfer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2015 Karol Kosik * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/usbip/vudc_tx.c b/drivers/usb/usbip/vudc_tx.c index 234661782fa0..63d8bb810ff8 100644 --- a/drivers/usb/usbip/vudc_tx.c +++ b/drivers/usb/usbip/vudc_tx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2015 Karol Kosik * Copyright (C) 2015-2016 Samsung Electronics diff --git a/drivers/usb/wusbcore/cbaf.c b/drivers/usb/wusbcore/cbaf.c index aa4e440e9975..5a3ee0d6cf31 100644 --- a/drivers/usb/wusbcore/cbaf.c +++ b/drivers/usb/wusbcore/cbaf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB - Cable Based Association * diff --git a/drivers/usb/wusbcore/crypto.c b/drivers/usb/wusbcore/crypto.c index 062c205f0046..011ce6a35137 100644 --- a/drivers/usb/wusbcore/crypto.c +++ b/drivers/usb/wusbcore/crypto.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Ultra Wide Band * AES-128 CCM Encryption diff --git a/drivers/usb/wusbcore/dev-sysfs.c b/drivers/usb/wusbcore/dev-sysfs.c index 78212f8180ce..02457f7e2ee3 100644 --- a/drivers/usb/wusbcore/dev-sysfs.c +++ b/drivers/usb/wusbcore/dev-sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * WUSB devices * sysfs bindings diff --git a/drivers/usb/wusbcore/devconnect.c b/drivers/usb/wusbcore/devconnect.c index bf9551735938..18826c98b7d6 100644 --- a/drivers/usb/wusbcore/devconnect.c +++ b/drivers/usb/wusbcore/devconnect.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * WUSB Wire Adapter: Control/Data Streaming Interface (WUSB[8]) * Device Connect handling diff --git a/drivers/usb/wusbcore/mmc.c b/drivers/usb/wusbcore/mmc.c index 3f485df96226..235099df783e 100644 --- a/drivers/usb/wusbcore/mmc.c +++ b/drivers/usb/wusbcore/mmc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * WUSB Wire Adapter: Control/Data Streaming Interface (WUSB[8]) * MMC (Microscheduled Management Command) handling diff --git a/drivers/usb/wusbcore/pal.c b/drivers/usb/wusbcore/pal.c index 090f27371a8f..6e47b055ad47 100644 --- a/drivers/usb/wusbcore/pal.c +++ b/drivers/usb/wusbcore/pal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB Host Controller * UWB Protocol Adaptation Layer (PAL) glue. 
diff --git a/drivers/usb/wusbcore/reservation.c b/drivers/usb/wusbcore/reservation.c index 7b1b2e2fb673..9463afe44c09 100644 --- a/drivers/usb/wusbcore/reservation.c +++ b/drivers/usb/wusbcore/reservation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * WUSB cluster reservation management * diff --git a/drivers/usb/wusbcore/rh.c b/drivers/usb/wusbcore/rh.c index a082fe62b1f0..0fb11cde5ff8 100644 --- a/drivers/usb/wusbcore/rh.c +++ b/drivers/usb/wusbcore/rh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB Host Controller * Root Hub operations diff --git a/drivers/usb/wusbcore/security.c b/drivers/usb/wusbcore/security.c index 724490580c1e..c55a74de0945 100644 --- a/drivers/usb/wusbcore/security.c +++ b/drivers/usb/wusbcore/security.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB Host Controller * Security support: encryption enablement, etc diff --git a/drivers/usb/wusbcore/wa-hc.c b/drivers/usb/wusbcore/wa-hc.c index d01496fd27fe..80e539931e34 100644 --- a/drivers/usb/wusbcore/wa-hc.c +++ b/drivers/usb/wusbcore/wa-hc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wire Adapter Host Controller Driver * Common items to HWA and DWA based HCDs diff --git a/drivers/usb/wusbcore/wa-hc.h b/drivers/usb/wusbcore/wa-hc.h index edc7267157f3..ab5fc274a73f 100644 --- a/drivers/usb/wusbcore/wa-hc.h +++ b/drivers/usb/wusbcore/wa-hc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * HWA Host Controller Driver * Wire Adapter Control/Data Streaming Iface (WUSB1.0[8]) diff --git a/drivers/usb/wusbcore/wa-nep.c b/drivers/usb/wusbcore/wa-nep.c index e3819fc182b0..4cfc5ba70e62 100644 --- a/drivers/usb/wusbcore/wa-nep.c +++ b/drivers/usb/wusbcore/wa-nep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * WUSB Wire Adapter: Control/Data Streaming Interface (WUSB[8]) * Notification EndPoint support diff --git a/drivers/usb/wusbcore/wa-rpipe.c b/drivers/usb/wusbcore/wa-rpipe.c index c7ecdbe19a32..aff01f19f09e 100644 --- a/drivers/usb/wusbcore/wa-rpipe.c +++ b/drivers/usb/wusbcore/wa-rpipe.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * WUSB Wire Adapter * rpipe management diff --git a/drivers/usb/wusbcore/wa-xfer.c b/drivers/usb/wusbcore/wa-xfer.c index bee2404353d9..2a9f95f7d653 100644 --- a/drivers/usb/wusbcore/wa-xfer.c +++ b/drivers/usb/wusbcore/wa-xfer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * WUSB Wire Adapter * Data transfer and URB enqueing diff --git a/drivers/usb/wusbcore/wusbhc.c b/drivers/usb/wusbcore/wusbhc.c index 5338e42533c8..ebd07e906906 100644 --- a/drivers/usb/wusbcore/wusbhc.c +++ b/drivers/usb/wusbcore/wusbhc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB Host Controller * sysfs glue, wusbcore module support and life cycle management diff --git a/drivers/usb/wusbcore/wusbhc.h b/drivers/usb/wusbcore/wusbhc.h index 8c5bd000739b..6ccef2d0c7b3 100644 --- a/drivers/usb/wusbcore/wusbhc.h +++ b/drivers/usb/wusbcore/wusbhc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB Host Controller * Common infrastructure for WHCI and HWA WUSB-HC drivers diff --git a/include/linux/usb/association.h b/include/linux/usb/association.h index 0a4a18b3c1bb..d7f3cb9b9db5 100644 --- a/include/linux/usb/association.h +++ b/include/linux/usb/association.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB - Cable Based Association * diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h index fd73bc0e9027..3119d0ace7aa 
100644 --- a/include/linux/usb/audio-v2.h +++ b/include/linux/usb/audio-v2.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2010 Daniel Mack * diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h index 3d84619110a4..170acd500ea1 100644 --- a/include/linux/usb/audio.h +++ b/include/linux/usb/audio.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * -- USB Audio definitions. * diff --git a/include/linux/usb/c67x00.h b/include/linux/usb/c67x00.h index 83c6b45470ca..2fc39e3b7281 100644 --- a/include/linux/usb/c67x00.h +++ b/include/linux/usb/c67x00.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * usb_c67x00.h: platform definitions for the Cypress C67X00 USB chip * diff --git a/include/linux/usb/cdc-wdm.h b/include/linux/usb/cdc-wdm.h index 0b3f4295c025..9b895f93d8de 100644 --- a/include/linux/usb/cdc-wdm.h +++ b/include/linux/usb/cdc-wdm.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB CDC Device Management subdriver * diff --git a/include/linux/usb/cdc.h b/include/linux/usb/cdc.h index b5706f94ee9e..35d784cf32a4 100644 --- a/include/linux/usb/cdc.h +++ b/include/linux/usb/cdc.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB CDC common helpers * diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 1a59699cf82a..1646c06989df 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) /* * Copyright (C) ST-Ericsson 2010-2012 * Contact: Alexey Orishko diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index f665d2ceac20..cef0e44601f8 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * composite.h -- framework for usb gadgets which are composite devices * diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h index e479033bd782..a15ce99dfc2d 100644 --- a/include/linux/usb/ehci_def.h +++ b/include/linux/usb/ehci_def.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2001-2002 by David Brownell * diff --git a/include/linux/usb/ehci_pdriver.h b/include/linux/usb/ehci_pdriver.h index db0431b39a63..dd742afdc03f 100644 --- a/include/linux/usb/ehci_pdriver.h +++ b/include/linux/usb/ehci_pdriver.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2012 Hauke Mehrtens * diff --git a/include/linux/usb/g_hid.h b/include/linux/usb/g_hid.h index 50f5745df28c..7581e488c237 100644 --- a/include/linux/usb/g_hid.h +++ b/include/linux/usb/g_hid.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * g_hid.h -- Header file for USB HID gadget driver * diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 523e5a2b5015..0142f3af0da6 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * * diff --git a/include/linux/usb/gpio_vbus.h b/include/linux/usb/gpio_vbus.h index 837bba604a0b..804fb06cf6d6 100644 --- a/include/linux/usb/gpio_vbus.h +++ b/include/linux/usb/gpio_vbus.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * A simple GPIO VBUS sensing driver for B peripheral only devices * with internal transceivers. 
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index a1f03ebfde47..176900528822 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2001-2002 by David Brownell * diff --git a/include/linux/usb/input.h b/include/linux/usb/input.h index 0e010b220e85..974befa72ac0 100644 --- a/include/linux/usb/input.h +++ b/include/linux/usb/input.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2005 Dmitry Torokhov * diff --git a/include/linux/usb/isp1301.h b/include/linux/usb/isp1301.h index d3a851c28b6a..dedb3b2473e8 100644 --- a/include/linux/usb/isp1301.h +++ b/include/linux/usb/isp1301.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NXP ISP1301 USB transceiver driver * diff --git a/include/linux/usb/m66592.h b/include/linux/usb/m66592.h index a4ba31ab2fed..2dfe68183495 100644 --- a/include/linux/usb/m66592.h +++ b/include/linux/usb/m66592.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * M66592 driver platform data * diff --git a/include/linux/usb/musb-ux500.h b/include/linux/usb/musb-ux500.h index 1e2c7130f6e1..c4b7ad9850ca 100644 --- a/include/linux/usb/musb-ux500.h +++ b/include/linux/usb/musb-ux500.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2013 ST-Ericsson AB * diff --git a/include/linux/usb/net2280.h b/include/linux/usb/net2280.h index 725120224472..08b85caecfaf 100644 --- a/include/linux/usb/net2280.h +++ b/include/linux/usb/net2280.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * NetChip 2280 high/full speed USB device controller. * Unlike many such controllers, this one talks PCI. diff --git a/include/linux/usb/of.h b/include/linux/usb/of.h index 4031f47629ec..6cbe7a5c2b57 100644 --- a/include/linux/usb/of.h +++ b/include/linux/usb/of.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * OF helpers for usb devices. * diff --git a/include/linux/usb/ohci_pdriver.h b/include/linux/usb/ohci_pdriver.h index 012f2b7eb2b6..7eb16cf587ee 100644 --- a/include/linux/usb/ohci_pdriver.h +++ b/include/linux/usb/ohci_pdriver.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2012 Hauke Mehrtens * diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h index a0a8f878503c..e78eb577d0fa 100644 --- a/include/linux/usb/otg-fsm.h +++ b/include/linux/usb/otg-fsm.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* Copyright (C) 2007,2008 Freescale Semiconductor, Inc. 
* * This program is free software; you can redistribute it and/or modify it diff --git a/include/linux/usb/phy_companion.h b/include/linux/usb/phy_companion.h index edd2ec23d282..407f530061cd 100644 --- a/include/linux/usb/phy_companion.h +++ b/include/linux/usb/phy_companion.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * phy-companion.h -- phy companion to indicate the comparator part of PHY * diff --git a/include/linux/usb/r8a66597.h b/include/linux/usb/r8a66597.h index 55805f9dcf21..c0753d026bbf 100644 --- a/include/linux/usb/r8a66597.h +++ b/include/linux/usb/r8a66597.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * R8A66597 driver platform data * diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h index d2f41e4dd7a8..67102f3d59d4 100644 --- a/include/linux/usb/renesas_usbhs.h +++ b/include/linux/usb/renesas_usbhs.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-1.0+ /* * Renesas USB * diff --git a/include/linux/usb/rndis_host.h b/include/linux/usb/rndis_host.h index d44ef85db177..809bccd08455 100644 --- a/include/linux/usb/rndis_host.h +++ b/include/linux/usb/rndis_host.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Host Side support for RNDIS Networking Links * Copyright (C) 2005 by David Brownell diff --git a/include/linux/usb/samsung_usb_phy.h b/include/linux/usb/samsung_usb_phy.h index 916782699f1c..dc0071741695 100644 --- a/include/linux/usb/samsung_usb_phy.h +++ b/include/linux/usb/samsung_usb_phy.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2012 Samsung Electronics Co.Ltd * http://www.samsung.com/ diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index e2f0ab07eea5..106551a5616e 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter stuff * diff --git a/include/linux/usb/storage.h b/include/linux/usb/storage.h index 305ee8db7faf..e0240f864548 100644 --- a/include/linux/usb/storage.h +++ b/include/linux/usb/storage.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #ifndef __LINUX_USB_STORAGE_H #define __LINUX_USB_STORAGE_H diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h index 1de16c324ec8..d641ea1660b7 100644 --- a/include/linux/usb/tegra_usb_phy.h +++ b/include/linux/usb/tegra_usb_phy.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Google, Inc. * diff --git a/include/linux/usb/tilegx.h b/include/linux/usb/tilegx.h index 2d65e3435680..817908573fe8 100644 --- a/include/linux/usb/tilegx.h +++ b/include/linux/usb/tilegx.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright 2012 Tilera Corporation. All Rights Reserved. * diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h index 5f07407a367a..c515765adab7 100644 --- a/include/linux/usb/ulpi.h +++ b/include/linux/usb/ulpi.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ulpi.h -- ULPI defines and function prorotypes * diff --git a/include/linux/usb/usb338x.h b/include/linux/usb/usb338x.h index 11525d8d89a7..7189e3387bf9 100644 --- a/include/linux/usb/usb338x.h +++ b/include/linux/usb/usb338x.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * USB 338x super/high/full speed USB device controller. * Unlike many such controllers, this one talks PCI. 
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 97116379db5f..a69877734c4e 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * USB Networking Link Interface * diff --git a/include/linux/usb/wusb-wa.h b/include/linux/usb/wusb-wa.h index c1257130769b..64a840b5106e 100644 --- a/include/linux/usb/wusb-wa.h +++ b/include/linux/usb/wusb-wa.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB Wire Adapter constants and structures. * diff --git a/include/linux/usb/wusb.h b/include/linux/usb/wusb.h index eeb28329fa3c..9e4a3213f2c2 100644 --- a/include/linux/usb/wusb.h +++ b/include/linux/usb/wusb.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wireless USB Standard Definitions * Event Size Tables diff --git a/include/linux/usb/xhci-dbgp.h b/include/linux/usb/xhci-dbgp.h index 80c1cca1f529..0a37f1283bf0 100644 --- a/include/linux/usb/xhci-dbgp.h +++ b/include/linux/usb/xhci-dbgp.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Standalone xHCI debug capability driver * diff --git a/include/linux/usbdevice_fs.h b/include/linux/usbdevice_fs.h index 04a26285416c..79aab0065ec8 100644 --- a/include/linux/usbdevice_fs.h +++ b/include/linux/usbdevice_fs.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /*****************************************************************************/ /* -- cgit v1.2.3 From a8497b31fee650846fcc5a1e41f4ef22318a9925 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 03:00:18 -0700 Subject: tty: cyclades: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Moves timer structures from global to attached to struct cyclades_port. 
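As a minimal sketch of the conversion pattern (illustrative only: the "my_port" structure, fields and callbacks below are hypothetical stand-ins for the driver state; timer_setup(), from_timer() and mod_timer() are the real kernel APIs):

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    struct my_port {
            int line;
            struct timer_list rx_timer;     /* timer now embedded in the state struct */
    };

    /* New-style callback: takes the timer pointer, not an unsigned long cookie. */
    static void my_rx_restart(struct timer_list *t)
    {
            /* from_timer() is container_of() for timers: recover the enclosing
             * my_port from its embedded timer_list member. */
            struct my_port *port = from_timer(port, t, rx_timer);

            /* ... restart reception on port->line ... */
    }

    static void my_port_init(struct my_port *port)
    {
            /* timer_setup() replaces setup_timer(); the state pointer is no
             * longer cast to unsigned long. */
            timer_setup(&port->rx_timer, my_rx_restart, 0);
            mod_timer(&port->rx_timer, jiffies + 1);        /* arm as before */
    }

The gain is type safety: the callback can no longer be handed a stale or mistyped cookie, because the only thing passed around is the timer itself.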
Cc: Jiri Slaby Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- drivers/tty/cyclades.c | 16 ++++++---------- include/linux/cyclades.h | 3 +++ 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index d272bc4e7fb5..f30f7a90995c 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -286,8 +286,7 @@ static long cyz_polling_cycle = CZ_DEF_POLL; static DEFINE_TIMER(cyz_timerlist, cyz_poll, 0, 0); #else /* CONFIG_CYZ_INTR */ -static void cyz_rx_restart(unsigned long); -static struct timer_list cyz_rx_full_timer[NR_PORTS]; +static void cyz_rx_restart(struct timer_list *); #endif /* CONFIG_CYZ_INTR */ static void cyy_writeb(struct cyclades_port *port, u32 reg, u8 val) @@ -992,10 +991,8 @@ static void cyz_handle_rx(struct cyclades_port *info) else char_count = rx_put - rx_get + rx_bufsize; if (char_count >= readl(&buf_ctrl->rx_threshold) && - !timer_pending(&cyz_rx_full_timer[ info->line])) - mod_timer(&cyz_rx_full_timer[info->line], - jiffies + 1); + !timer_pending(&info->rx_full_timer)) + mod_timer(&info->rx_full_timer, jiffies + 1); #endif info->idle_stats.recv_idle = jiffies; tty_schedule_flip(&info->port); @@ -1197,9 +1194,9 @@ static irqreturn_t cyz_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } /* cyz_interrupt */ -static void cyz_rx_restart(unsigned long arg) +static void cyz_rx_restart(struct timer_list *t) { - struct cyclades_port *info = (struct cyclades_port *)arg; + struct cyclades_port *info = from_timer(info, t, rx_full_timer); struct cyclades_card *card = info->card; int retval; __u32 channel = info->line - card->first_line; @@ -3097,8 +3094,7 @@ static int cy_init_card(struct cyclades_card *cinfo) else info->xmit_fifo_size = 4 * CYZ_FIFO_SIZE; #ifdef CONFIG_CYZ_INTR - setup_timer(&cyz_rx_full_timer[port], - cyz_rx_restart, (unsigned long)info); + timer_setup(&info->rx_full_timer, cyz_rx_restart, 0); #endif } else { unsigned short chip_number; diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h index 19ae518f5471..3638eb2d52f6 100644 --- a/include/linux/cyclades.h +++ b/include/linux/cyclades.h @@ -156,6 +156,9 @@ struct cyclades_port { struct cyclades_icount icount; struct completion shutdown_wait; int throttle; +#ifdef CONFIG_CYZ_INTR + struct timer_list rx_full_timer; +#endif }; #define CLOSING_WAIT_DELAY 30*HZ -- cgit v1.2.3 From 88022d7201e96b43f1754b0358fc6bcd8dbdcde1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 5 Nov 2017 02:21:12 +0800 Subject: blk-mq: don't handle failure in .get_budget It is enough to just check if we can get the budget via .get_budget(). And we don't need to deal with device state change in .get_budget(). For SCSI, one issue to be fixed is that we have to call scsi_mq_uninit_cmd() to free allocated resources if the SCSI device fails to handle the request. And it isn't enough to simply call blk_mq_end_request() to do that if this request is marked as RQF_DONTPREP.
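As a sketch of the simplified budget contract (illustrative only: blk_mq_get_dispatch_budget() and blk_mq_put_dispatch_budget() are the real helpers changed below, while the my_* driver pieces are hypothetical):

    #include <linux/blk-mq.h>

    /* .get_budget() is now a plain reservation: true if a slot was taken. */
    static bool my_get_budget(struct blk_mq_hw_ctx *hctx)
    {
            /* No device-state checks and no blk_status_t here any more. */
            return my_try_reserve_slot(hctx->queue->queuedata);
    }

    static void my_do_dispatch(struct blk_mq_hw_ctx *hctx)
    {
            for (;;) {
                    struct request *rq;

                    if (!blk_mq_get_dispatch_budget(hctx))
                            break;          /* out of budget: stop dispatching */

                    rq = my_fetch_request(hctx);
                    if (!rq) {
                            /* Nothing queued: hand the unused budget back. */
                            blk_mq_put_dispatch_budget(hctx);
                            break;
                    }

                    /* Per-request failures are reported from .queue_rq(), where
                     * the driver can also free whatever it allocated. */
                    my_queue_rq(hctx, rq);
            }
    }

Keeping .get_budget() free of side effects beyond the reservation means a failed dispatch never has to unwind half-initialized request state from inside the budget path.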
Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 14 ++------------ block/blk-mq.c | 17 ++++------------- block/blk-mq.h | 5 ++--- drivers/scsi/scsi_lib.c | 11 +++-------- include/linux/blk-mq.h | 2 +- 5 files changed, 12 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7775f6b12fa9..13a27d4d1671 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,23 +94,18 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (e->type->ops.mq.has_work && !e->type->ops.mq.has_work(hctx)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* @@ -146,22 +141,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 13cdccef543c..c9fa4b294664 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1137,13 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget) { - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) - break; - if (ret != BLK_STS_OK) - goto fail_rq; - } + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + break; list_del_init(&rq->queuelist); @@ -1170,7 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, break; } - fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1642,12 +1636,10 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) { + if (!blk_mq_get_dispatch_budget(hctx)) { blk_mq_put_driver_tag(rq); goto insert; - } else if (ret != BLK_STS_OK) - goto fail_rq; + } new_cookie = request_to_qc_t(hctx, rq); @@ -1665,7 +1657,6 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: - fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; diff --git a/block/blk-mq.h b/block/blk-mq.h index 522b420dedc0..f97aceff76e9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -147,14 +147,13 @@ static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) q->mq_ops->put_budget(hctx); } -static inline blk_status_t blk_mq_get_dispatch_budget( - struct blk_mq_hw_ctx *hctx) +static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; if (q->mq_ops->get_budget) return q->mq_ops->get_budget(hctx); - return BLK_STS_OK; + return true; } #endif diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 22a7e4c47207..286ea983c9e3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1955,27 
+1955,22 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) put_device(&sdev->sdev_gendev); } -static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - blk_status_t ret; - - ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); - if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) - return ret; if (!get_device(&sdev->sdev_gendev)) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - return BLK_STS_OK; + return true; out_put_device: put_device(&sdev->sdev_gendev); out: - return BLK_STS_RESOURCE; + return false; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2e3079eecdd..674641527da7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -92,7 +92,7 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); -typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v1.2.3 From c02762eb20cb57ec5b7c037b056c37d5838c803f Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 16:03:17 -0500 Subject: net/mlx5: QCAM register firmware command support The QCAM register provides capability bits for all the QoS registers using the ACCESS_REG command. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 10 ++++++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/port.c | 12 +++++++ include/linux/mlx5/device.h | 14 ++++++++ include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 40 +++++++++++++++++++++- 6 files changed, 79 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 2c71557d1cee..5ef1b56b6a96 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -106,6 +106,13 @@ static int mlx5_get_mcam_reg(struct mlx5_core_dev *dev) MLX5_MCAM_REGS_FIRST_128); } +static int mlx5_get_qcam_reg(struct mlx5_core_dev *dev) +{ + return mlx5_query_qcam_reg(dev, dev->caps.qcam, + MLX5_QCAM_FEATURE_ENHANCED_FEATURES, + MLX5_QCAM_REGS_FIRST_128); +} + int mlx5_query_hca_caps(struct mlx5_core_dev *dev) { int err; @@ -182,6 +189,9 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, mcam_reg)) mlx5_get_mcam_reg(dev); + if (MLX5_CAP_GEN(dev, qcam_reg)) + mlx5_get_qcam_reg(dev); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 8f00de2fe283..ff4a0b889a6f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -122,6 +122,8 @@ int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group, u8 access_reg_group); int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcap, u8 feature_group, u8 access_reg_group); +int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, + u8 feature_group, u8
access_reg_group); void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev); void mlx5_lag_remove(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index e07061f565d6..b6553be841f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -98,6 +98,18 @@ int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcam, u8 feature_group, return mlx5_core_access_reg(dev, in, sz, mcam, sz, MLX5_REG_MCAM, 0, 0); } +int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, + u8 feature_group, u8 access_reg_group) +{ + u32 in[MLX5_ST_SZ_DW(qcam_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(qcam_reg); + + MLX5_SET(qcam_reg, in, feature_group, feature_group); + MLX5_SET(qcam_reg, in, access_reg_group, access_reg_group); + + return mlx5_core_access_reg(mdev, in, sz, qcam, sz, MLX5_REG_QCAM, 0, 0); +} + struct mlx5_reg_pcap { u8 rsvd0; u8 port_num; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index e32dbc4934db..6d79b3f79458 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1000,6 +1000,14 @@ enum mlx5_mcam_feature_groups { MLX5_MCAM_FEATURE_ENHANCED_FEATURES = 0x0, }; +enum mlx5_qcam_reg_groups { + MLX5_QCAM_REGS_FIRST_128 = 0x0, +}; + +enum mlx5_qcam_feature_groups { + MLX5_QCAM_FEATURE_ENHANCED_FEATURES = 0x0, +}; + /* GET Dev Caps macros */ #define MLX5_CAP_GEN(mdev, cap) \ MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) @@ -1108,6 +1116,12 @@ enum mlx5_mcam_feature_groups { #define MLX5_CAP_MCAM_FEATURE(mdev, fld) \ MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld) +#define MLX5_CAP_QCAM_REG(mdev, fld) \ + MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_access_reg_cap_mask.reg_cap.fld) + +#define MLX5_CAP_QCAM_FEATURE(mdev, fld) \ + MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_feature_cap_mask.feature_cap.fld) + #define MLX5_CAP_FPGA(mdev, cap) \ MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 08c77b7e59cb..ed5be52282ea 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -109,6 +109,7 @@ enum { enum { MLX5_REG_QETCR = 0x4005, MLX5_REG_QTCT = 0x400a, + MLX5_REG_QCAM = 0x4019, MLX5_REG_DCBX_PARAM = 0x4020, MLX5_REG_DCBX_APP = 0x4021, MLX5_REG_FPGA_CAP = 0x4022, @@ -798,6 +799,7 @@ struct mlx5_core_dev { u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; u32 mcam[MLX5_ST_SZ_DW(mcam_reg)]; u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; + u32 qcam[MLX5_ST_SZ_DW(qcam_reg)]; } caps; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 69772347f866..f127c5b310c5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -838,7 +838,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cc_modify_allowed[0x1]; u8 start_pad[0x1]; u8 cache_line_128byte[0x1]; - u8 reserved_at_165[0xb]; + u8 reserved_at_165[0xa]; + u8 qcam_reg[0x1]; u8 gid_table_size[0x10]; u8 out_of_seq_cnt[0x1]; @@ -7890,6 +7891,43 @@ struct mlx5_ifc_mcam_reg_bits { u8 reserved_at_1c0[0x80]; }; +struct mlx5_ifc_qcam_access_reg_cap_mask { + u8 qcam_access_reg_cap_mask_127_to_20[0x6C]; + u8 qpdpm[0x1]; + u8 qcam_access_reg_cap_mask_18_to_4[0x0F]; + u8 qdpm[0x1]; + u8 qpts[0x1]; + u8 qcap[0x1]; + u8 qcam_access_reg_cap_mask_0[0x1]; +}; + +struct mlx5_ifc_qcam_qos_feature_cap_mask { + u8 qcam_qos_feature_cap_mask_127_to_1[0x7F]; + 
u8 qpts_trust_both[0x1]; +}; + +struct mlx5_ifc_qcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + u8 reserved_at_20[0x20]; + + union { + struct mlx5_ifc_qcam_access_reg_cap_mask reg_cap; + u8 reserved_at_0[0x80]; + } qos_access_reg_cap_mask; + + u8 reserved_at_c0[0x80]; + + union { + struct mlx5_ifc_qcam_qos_feature_cap_mask feature_cap; + u8 reserved_at_0[0x80]; + } qos_feature_cap_mask; + + u8 reserved_at_1c0[0x80]; +}; + struct mlx5_ifc_pcap_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; -- cgit v1.2.3 From 71c70eb21c33c60433b95e72a59d40bb128db649 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 2 Aug 2017 21:36:23 -0500 Subject: net/mlx5: Add MLX5_SET16 and MLX5_GET16 Add MLX5_SET16 and MLX5_GET16 for 16-bit structure fields in firmware commands. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Eli Cohen Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 6d79b3f79458..409ffb14298a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -49,11 +49,15 @@ #define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) #define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) #define __mlx5_bit_off(typ, fld) (offsetof(struct mlx5_ifc_##typ##_bits, fld)) +#define __mlx5_16_off(typ, fld) (__mlx5_bit_off(typ, fld) / 16) #define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) #define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64) +#define __mlx5_16_bit_off(typ, fld) (16 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0xf)) #define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f)) #define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) #define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << __mlx5_dw_bit_off(typ, fld)) +#define __mlx5_mask16(typ, fld) ((u16)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_16_mask(typ, fld) (__mlx5_mask16(typ, fld) << __mlx5_16_bit_off(typ, fld)) #define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits) #define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) @@ -116,6 +120,19 @@ __mlx5_mask(typ, fld)) ___t; \ }) +#define MLX5_GET16(typ, p, fld) ((be16_to_cpu(*((__be16 *)(p) +\ +__mlx5_16_off(typ, fld))) >> __mlx5_16_bit_off(typ, fld)) & \ +__mlx5_mask16(typ, fld)) + +#define MLX5_SET16(typ, p, fld, v) do { \ + u16 _v = v; \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 16); \ + *((__be16 *)(p) + __mlx5_16_off(typ, fld)) = \ + cpu_to_be16((be16_to_cpu(*((__be16 *)(p) + __mlx5_16_off(typ, fld))) & \ + (~__mlx5_16_mask(typ, fld))) | (((_v) & __mlx5_mask16(typ, fld)) \ + << __mlx5_16_bit_off(typ, fld))); \ +} while (0) + /* Big endian getters */ #define MLX5_GET64_BE(typ, p, fld) (*((__be64 *)(p) +\ __mlx5_64_off(typ, fld))) -- cgit v1.2.3 From 415a64aa8dc6b4fc478609c549ca652d95a12f13 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 16:08:46 -0500 Subject: net/mlx5: QPTS and QPDPM register firmware command support The QPTS register allows changing the priority trust state between pcp and dscp. Add support to get/set trust state from device. When the port is in pcp/dscp trust state, packets are routed by hardware to the matching priority based on their pcp/dscp value respectively. The QPDPM register allows changing the dscp to priority mapping.
Add support to get/set dscp to priority mapping from device. Note that to change a dscp mapping, the "e" bit of this dscp structure must be set in the QPDPM firmware command. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 99 ++++++++++++++++++++++++++ include/linux/mlx5/driver.h | 7 ++ include/linux/mlx5/mlx5_ifc.h | 20 ++++++ include/linux/mlx5/port.h | 5 ++ 4 files changed, 131 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index b6553be841f9..c37d00cd472a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -971,3 +971,102 @@ int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode) return mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out), MLX5_REG_MTPPSE, 0, 1); } + +int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state) +{ + u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {}; + int err; + + MLX5_SET(qpts_reg, in, local_port, 1); + MLX5_SET(qpts_reg, in, trust_state, trust_state); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QPTS, 0, 1); + return err; +} + +int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state) +{ + u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {}; + int err; + + MLX5_SET(qpts_reg, in, local_port, 1); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QPTS, 0, 0); + if (!err) + *trust_state = MLX5_GET(qpts_reg, out, trust_state); + + return err; +} + +int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio) +{ + int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); + void *qpdpm_dscp; + void *out; + void *in; + int err; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpdpm_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0); + if (err) + goto out; + + memcpy(in, out, sz); + MLX5_SET(qpdpm_reg, in, local_port, 1); + + /* Update the corresponding dscp entry */ + qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, in, dscp[dscp]); + MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, prio, prio); + MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, e, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 1); + +out: + kfree(in); + kfree(out); + return err; +} + +/* dscp2prio[i]: priority that dscp i mapped to */ +#define MLX5E_SUPPORTED_DSCP 64 +int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio) +{ + int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); + void *qpdpm_dscp; + void *out; + void *in; + int err; + int i; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpdpm_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0); + if (err) + goto out; + + for (i = 0; i < (MLX5E_SUPPORTED_DSCP); i++) { + qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, out, dscp[i]); + dscp2prio[i] = MLX5_GET16(qpdpm_dscp_reg, qpdpm_dscp, prio); + } + +out: + kfree(in); + kfree(out); + return err; +} diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ed5be52282ea..a886b51511ab 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -107,8 +107,10 @@ enum { }; enum { + 
MLX5_REG_QPTS = 0x4002, MLX5_REG_QETCR = 0x4005, MLX5_REG_QTCT = 0x400a, + MLX5_REG_QPDPM = 0x4013, MLX5_REG_QCAM = 0x4019, MLX5_REG_DCBX_PARAM = 0x4020, MLX5_REG_DCBX_APP = 0x4021, @@ -142,6 +144,11 @@ enum { MLX5_REG_MCAM = 0x907f, }; +enum mlx5_qpts_trust_state { + MLX5_QPTS_TRUST_PCP = 1, + MLX5_QPTS_TRUST_DSCP = 2, +}; + enum mlx5_dcbx_oper_mode { MLX5E_DCBX_PARAM_VER_OPER_HOST = 0x0, MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f127c5b310c5..3e5363f760dd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8578,6 +8578,26 @@ struct mlx5_ifc_qetc_reg_bits { struct mlx5_ifc_ets_global_config_reg_bits global_configuration; }; +struct mlx5_ifc_qpdpm_dscp_reg_bits { + u8 e[0x1]; + u8 reserved_at_01[0x0b]; + u8 prio[0x04]; +}; + +struct mlx5_ifc_qpdpm_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + struct mlx5_ifc_qpdpm_dscp_reg_bits dscp[64]; +}; + +struct mlx5_ifc_qpts_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x2d]; + u8 trust_state[0x3]; +}; + struct mlx5_ifc_qtct_reg_bits { u8 reserved_at_0[0x8]; u8 port_number[0x8]; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index c59af8ab753a..035f0d4dc9fe 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -179,4 +179,9 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); + +int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); +int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); +int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); +int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); #endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From eb3b05fb0ff2cf0097e70b6c5e3afbe854a8deb7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 4 Nov 2017 21:49:26 +0100 Subject: i2c: nuc900: remove platform_data, too Commit 7da62cb1853025 ("i2c: nuc900: remove driver") removed the driver, we should remove the platform_data as well. Signed-off-by: Wolfram Sang --- include/linux/platform_data/i2c-nuc900.h | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 include/linux/platform_data/i2c-nuc900.h (limited to 'include/linux') diff --git a/include/linux/platform_data/i2c-nuc900.h b/include/linux/platform_data/i2c-nuc900.h deleted file mode 100644 index 9ffb12d06e91..000000000000 --- a/include/linux/platform_data/i2c-nuc900.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef __ASM_ARCH_NUC900_I2C_H -#define __ASM_ARCH_NUC900_I2C_H - -struct nuc900_platform_i2c { - int bus_num; - unsigned long bus_freq; -}; - -#endif /* __ASM_ARCH_NUC900_I2C_H */ -- cgit v1.2.3 From f4e63525ee35f9c02e9f51f90571718363e9a9a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:16 -0700 Subject: net: bpf: rename ndo_xdp to ndo_bpf ndo_xdp is a control path callback for setting up XDP in the driver. We can reuse it for other forms of communication between the eBPF stack and the drivers. Rename the callback and associated structures and definitions. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 4 +-- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +-- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +-- .../net/ethernet/netronome/nfp/nfp_net_common.c | 4 +-- drivers/net/ethernet/qlogic/qede/qede.h | 2 +- drivers/net/ethernet/qlogic/qede/qede_filter.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 +-- drivers/net/tun.c | 4 +-- drivers/net/virtio_net.c | 4 +-- include/linux/netdevice.h | 23 ++++++++------- net/core/dev.c | 34 +++++++++++----------- net/core/rtnetlink.c | 4 +-- 17 files changed, 56 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4e3d569bf32e..96416f5d97f3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7775,7 +7775,7 @@ static const struct net_device_ops bnxt_netdev_ops = { #endif .ndo_udp_tunnel_add = bnxt_udp_tunnel_add, .ndo_udp_tunnel_del = bnxt_udp_tunnel_del, - .ndo_xdp = bnxt_xdp, + .ndo_bpf = bnxt_xdp, .ndo_bridge_getlink = bnxt_bridge_getlink, .ndo_bridge_setlink = bnxt_bridge_setlink, .ndo_get_phys_port_name = bnxt_get_phys_port_name diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 06ce63c00821..261e5847557a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -208,7 +208,7 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) return 0; } -int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp) +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct bnxt *bp = netdev_priv(dev); int rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 12a5ad66b564..414b748038ca 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -16,6 +16,6 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts); bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, struct page *page, u8 **data_ptr, unsigned int *len, u8 *event); -int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp); +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp); #endif diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 71989e180289..a063c36c4c58 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1741,7 +1741,7 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) return 0; } -static int nicvf_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) { struct nicvf *nic = netdev_priv(netdev); @@ -1774,7 +1774,7 @@ static const struct net_device_ops nicvf_netdev_ops = { .ndo_tx_timeout = nicvf_tx_timeout, .ndo_fix_features = nicvf_fix_features, .ndo_set_features = nicvf_set_features, - .ndo_xdp = nicvf_xdp, + .ndo_bpf = nicvf_xdp, }; static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git 
a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index dfecaeda0654..05b94d87a6c3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11648,12 +11648,12 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, } /** - * i40e_xdp - implements ndo_xdp for i40e + * i40e_xdp - implements ndo_bpf for i40e * @dev: netdevice * @xdp: XDP command **/ static int i40e_xdp(struct net_device *dev, - struct netdev_xdp *xdp) + struct netdev_bpf *xdp) { struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_vsi *vsi = np->vsi; @@ -11705,7 +11705,7 @@ static const struct net_device_ops i40e_netdev_ops = { .ndo_features_check = i40e_features_check, .ndo_bridge_getlink = i40e_ndo_bridge_getlink, .ndo_bridge_setlink = i40e_ndo_bridge_setlink, - .ndo_xdp = i40e_xdp, + .ndo_bpf = i40e_xdp, }; /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 507977994a03..e5dcb25be398 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10004,7 +10004,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) return 0; } -static int ixgbe_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct ixgbe_adapter *adapter = netdev_priv(dev); @@ -10113,7 +10113,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_udp_tunnel_add = ixgbe_add_udp_tunnel_port, .ndo_udp_tunnel_del = ixgbe_del_udp_tunnel_port, .ndo_features_check = ixgbe_features_check, - .ndo_xdp = ixgbe_xdp, + .ndo_bpf = ixgbe_xdp, .ndo_xdp_xmit = ixgbe_xdp_xmit, .ndo_xdp_flush = ixgbe_xdp_flush, }; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index d611df2f274d..736a6ccaf05e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2916,7 +2916,7 @@ static u32 mlx4_xdp_query(struct net_device *dev) return prog_id; } -static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2958,7 +2958,7 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_xdp = mlx4_xdp, + .ndo_bpf = mlx4_xdp, }; static const struct net_device_ops mlx4_netdev_ops_master = { @@ -2995,7 +2995,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_xdp = mlx4_xdp, + .ndo_bpf = mlx4_xdp, }; struct mlx4_en_bond { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 28ae00b3eb88..3b7b7bb84eb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3831,7 +3831,7 @@ static u32 mlx5e_xdp_query(struct net_device *dev) return prog_id; } -static int mlx5e_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -3883,7 +3883,7 @@ static const struct net_device_ops mlx5e_netdev_ops = { 
.ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif .ndo_tx_timeout = mlx5e_tx_timeout, - .ndo_xdp = mlx5e_xdp, + .ndo_bpf = mlx5e_xdp, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = mlx5e_netpoll, #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 185a3dd35a3f..f6c6ad4e8a59 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3378,7 +3378,7 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, return 0; } -static int nfp_net_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) { struct nfp_net *nn = netdev_priv(netdev); @@ -3441,7 +3441,7 @@ const struct net_device_ops nfp_net_netdev_ops = { .ndo_get_phys_port_name = nfp_port_get_phys_port_name, .ndo_udp_tunnel_add = nfp_net_add_vxlan_port, .ndo_udp_tunnel_del = nfp_net_del_vxlan_port, - .ndo_xdp = nfp_net_xdp, + .ndo_bpf = nfp_net_xdp, }; /** diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index adb700512baa..a3a70ade411f 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -503,7 +503,7 @@ void qede_fill_rss_params(struct qede_dev *edev, void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti); void qede_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *ti); -int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp); +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp); #ifdef CONFIG_DCB void qede_set_dcbnl_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index f79e36e4060a..c1a0708a7d7c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1065,7 +1065,7 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) return 0; } -int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp) +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct qede_dev *edev = netdev_priv(dev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index e5ee9f274a71..8f9b3eb82137 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -556,7 +556,7 @@ static const struct net_device_ops qede_netdev_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_xdp = qede_xdp, + .ndo_bpf = qede_xdp, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = qede_rx_flow_steer, #endif @@ -594,7 +594,7 @@ static const struct net_device_ops qede_netdev_vf_xdp_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_xdp = qede_xdp, + .ndo_bpf = qede_xdp, }; /* ------------------------------------------------------------------------- diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8125956f62a1..1a326b697221 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1141,7 +1141,7 @@ static u32 tun_xdp_query(struct net_device *dev) return 0; } -static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ 
-1185,7 +1185,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, - .ndo_xdp = tun_xdp, + .ndo_bpf = tun_xdp, }; static void tun_flow_init(struct tun_struct *tun) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fc059f193e7d..edf984406ba0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2088,7 +2088,7 @@ static u32 virtnet_xdp_query(struct net_device *dev) return 0; } -static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2115,7 +2115,7 @@ static const struct net_device_ops virtnet_netdev = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, #endif - .ndo_xdp = virtnet_xdp, + .ndo_bpf = virtnet_xdp, .ndo_xdp_xmit = virtnet_xdp_xmit, .ndo_xdp_flush = virtnet_xdp_flush, .ndo_features_check = passthru_features_check, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7de7656550c2..9af9feaaeb64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -779,10 +779,10 @@ enum tc_setup_type { TC_SETUP_CBS, }; -/* These structures hold the attributes of xdp state that are being passed - * to the netdevice through the xdp op. +/* These structures hold the attributes of bpf state that are being passed + * to the netdevice through the bpf op. */ -enum xdp_netdev_command { +enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are @@ -801,8 +801,8 @@ enum xdp_netdev_command { struct netlink_ext_ack; -struct netdev_xdp { - enum xdp_netdev_command command; +struct netdev_bpf { + enum bpf_netdev_command command; union { /* XDP_SETUP_PROG */ struct { @@ -1124,9 +1124,10 @@ struct dev_ifalias { * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. - * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); + * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the - * netdevice. See definition of enum xdp_netdev_command for details. + * netdevice and manage BPF offload. See definition of + * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. 
@@ -1315,8 +1316,8 @@ struct net_device_ops { struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); - int (*ndo_xdp)(struct net_device *dev, - struct netdev_xdp *xdp); + int (*ndo_bpf)(struct net_device *dev, + struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); void (*ndo_xdp_flush)(struct net_device *dev); @@ -3311,10 +3312,10 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id); +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1423cf4d695c..10cde58d3275 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4545,7 +4545,7 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; @@ -7090,26 +7090,26 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; /* Query must always succeed. 
*/ - WARN_ON(xdp_op(dev, &xdp) < 0); + WARN_ON(bpf_op(dev, &xdp) < 0); if (prog_id) *prog_id = xdp.prog_id; return xdp.prog_attached; } -static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, +static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -7120,7 +7120,7 @@ static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, xdp.flags = flags; xdp.prog = prog; - return xdp_op(dev, &xdp); + return bpf_op(dev, &xdp); } /** @@ -7137,24 +7137,24 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - xdp_op_t xdp_op, xdp_chk; + bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); - xdp_op = xdp_chk = ops->ndo_xdp; - if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) + bpf_op = bpf_chk = ops->ndo_bpf; + if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; - if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) - xdp_op = generic_xdp_install; - if (xdp_op == xdp_chk) - xdp_chk = generic_xdp_install; + if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) + bpf_op = generic_xdp_install; + if (bpf_op == bpf_chk) + bpf_chk = generic_xdp_install; if (fd >= 0) { - if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, xdp_op, NULL)) + __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); @@ -7162,7 +7162,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); } - err = dev_xdp_install(dev, xdp_op, extack, flags, prog); + err = dev_xdp_install(dev, bpf_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8a8c51937edf..dc5ad84ac096 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1270,10 +1270,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) *prog_id = generic_xdp_prog->aux->id; return XDP_ATTACHED_SKB; } - if (!ops->ndo_xdp) + if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id); + return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:17 -0700 Subject: bpf: offload: add infrastructure for loading programs for a specific netdev The fact that we don't know which device the program is going to be used on is quite limiting in the current eBPF infrastructure. We have to reverse or limit the changes which the kernel makes to the loaded bytecode if we want it to be offloaded to a networking device. We also have to invent new APIs for debugging and troubleshooting support. Make it possible to load programs for a specific netdev. This helps us to bring the debug information closer to the core eBPF infrastructure (e.g. we will be able to reuse the verifier log in device JIT). It allows device JITs to perform translation on the original bytecode.
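As a rough user-space illustration (not part of this patch), a loader could request a device-bound program by filling the new prog_target_ifindex attribute at BPF_PROG_LOAD time; the raw bpf(2) invocation and the trivial two-instruction XDP program below are hypothetical stand-ins:

  #include <linux/bpf.h>
  #include <net/if.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Sketch: load a minimal XDP program bound to one netdev by passing
   * its ifindex in the new prog_target_ifindex field of union bpf_attr.
   */
  static int load_dev_bound_prog(const char *ifname)
  {
  	struct bpf_insn insns[] = {
  		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .imm = 2 }, /* r0 = XDP_PASS */
  		{ .code = BPF_JMP | BPF_EXIT },                    /* return r0 */
  	};
  	union bpf_attr attr;

  	memset(&attr, 0, sizeof(attr));
  	attr.prog_type = BPF_PROG_TYPE_XDP;
  	attr.insns = (__u64)(unsigned long)insns;
  	attr.insn_cnt = 2;
  	attr.license = (__u64)(unsigned long)"GPL";
  	/* 0 keeps today's behaviour; non-zero binds the program to that device */
  	attr.prog_target_ifindex = if_nametoindex(ifname);

  	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  }

Note that with this patch such a load requires CAP_SYS_ADMIN, and the translation step fails with -EOPNOTSUPP unless the target device's ndo_bpf handles the new BPF_OFFLOAD_* commands.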
__bpf_prog_get() when called to get a reference for an attachment point will now refuse to give it if program has a device assigned. Following patches will add a version of that function which passes the expected netdev in. @type argument in __bpf_prog_get() is renamed to attach_type to make it clearer that it's only set on attachment. All calls to ndo_bpf are protected by rtnl, only verifier callbacks are not. We need a wait queue to make sure netdev doesn't get destroyed while verifier is still running and calling its driver. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf.h | 36 +++++++++ include/linux/bpf_verifier.h | 10 +++ include/linux/netdevice.h | 14 ++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 1 + kernel/bpf/core.c | 10 ++- kernel/bpf/offload.c | 182 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 17 +++- kernel/bpf/verifier.c | 15 +++- 9 files changed, 278 insertions(+), 8 deletions(-) create mode 100644 kernel/bpf/offload.c (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 520aeebe0d93..e45d43f9ec92 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -15,6 +15,7 @@ #include #include #include +#include struct perf_event; struct bpf_prog; @@ -182,6 +183,16 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); }; +struct bpf_dev_offload { + struct bpf_prog *prog; + struct net_device *netdev; + void *dev_priv; + struct list_head offloads; + bool dev_state; + bool verifier_running; + wait_queue_head_t verifier_done; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -199,6 +210,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_dev_offload *offload; union { struct work_struct work; struct rcu_head rcu; @@ -317,6 +329,7 @@ extern const struct file_operations bpf_prog_fops; #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; @@ -491,6 +504,29 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, } #endif /* CONFIG_BPF_SYSCALL */ +int bpf_prog_offload_compile(struct bpf_prog *prog); +void bpf_prog_offload_destroy(struct bpf_prog *prog); + +#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); + +static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +{ + return aux->offload; +} +#else +static inline int bpf_prog_offload_init(struct bpf_prog *prog, + union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +{ + return false; +} +#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ + #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3b0976aaac75..e45011dbc02d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -153,6 +153,7 @@ struct bpf_verifier_env { struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ + const 
struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ @@ -169,6 +170,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return env->cur_state->regs; } +#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); +#else +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + return -EOPNOTSUPP; +} +#endif + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9af9feaaeb64..fda527ccb263 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -797,8 +797,13 @@ enum bpf_netdev_command { * is equivalent to XDP_ATTACHED_DRV. */ XDP_QUERY_PROG, + /* BPF program for offload callbacks, invoked at program load time. */ + BPF_OFFLOAD_VERIFIER_PREP, + BPF_OFFLOAD_TRANSLATE, + BPF_OFFLOAD_DESTROY, }; +struct bpf_ext_analyzer_ops; struct netlink_ext_ack; struct netdev_bpf { @@ -815,6 +820,15 @@ struct netdev_bpf { u8 prog_attached; u32 prog_id; }; + /* BPF_OFFLOAD_VERIFIER_PREP */ + struct { + struct bpf_prog *prog; + const struct bpf_ext_analyzer_ops *ops; /* callee set */ + } verifier; + /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ + struct { + struct bpf_prog *prog; + } offload; }; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9820677c2ff..80d191a93fb0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -260,6 +260,7 @@ union bpf_attr { __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_target_ifindex; /* ifindex of netdev to prep for */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 16e95c8e749e..e691da0b3bab 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7fe448799d76..8a6c37762330 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1380,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. 
*/ - fp = bpf_int_jit_compile(fp); + if (!bpf_prog_is_dev_bound(fp->aux)) { + fp = bpf_int_jit_compile(fp); + } else { + *err = bpf_prog_offload_compile(fp); + if (*err) + return fp; + } bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct *work) struct bpf_prog_aux *aux; aux = container_of(work, struct bpf_prog_aux, work); + if (bpf_prog_is_dev_bound(aux)) + bpf_prog_offload_destroy(aux->prog); bpf_jit_free(aux->prog); } diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c new file mode 100644 index 000000000000..5553e0e2f8b1 --- /dev/null +++ b/kernel/bpf/offload.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include +#include + +/* protected by RTNL */ +static LIST_HEAD(bpf_prog_offload_devs); + +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_dev_offload *offload; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->prog_flags) + return -EINVAL; + + offload = kzalloc(sizeof(*offload), GFP_USER); + if (!offload) + return -ENOMEM; + + offload->prog = prog; + init_waitqueue_head(&offload->verifier_done); + + rtnl_lock(); + offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex); + if (!offload->netdev) { + rtnl_unlock(); + kfree(offload); + return -EINVAL; + } + + prog->aux->offload = offload; + list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + rtnl_unlock(); + + return 0; +} + +static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, + struct netdev_bpf *data) +{ + struct net_device *netdev = prog->aux->offload->netdev; + + ASSERT_RTNL(); + + if (!netdev) + return -ENODEV; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + data->command = cmd; + + return netdev->netdev_ops->ndo_bpf(netdev, data); +} + +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + struct netdev_bpf data = {}; + int err; + + data.verifier.prog = env->prog; + + rtnl_lock(); + err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); + if (err) + goto exit_unlock; + + env->dev_ops = data.verifier.ops; + + env->prog->aux->offload->dev_state = true; + env->prog->aux->offload->verifier_running = true; +exit_unlock: + rtnl_unlock(); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + + data.offload.prog = prog; + + if (offload->verifier_running) + wait_event(offload->verifier_done, !offload->verifier_running); + + if (offload->dev_state) + WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + + offload->dev_state = false; + list_del_init(&offload->offloads); + offload->netdev = NULL; +} + +void bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + __bpf_prog_offload_destroy(prog); + rtnl_unlock(); + + kfree(offload); +} + +static int bpf_prog_offload_translate(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + int ret; + + data.offload.prog = prog; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); + rtnl_unlock(); + + return ret; +} + +static unsigned int 
bpf_prog_warn_on_exec(const void *ctx, + const struct bpf_insn *insn) +{ + WARN(1, "attempt to execute device eBPF program on the host!"); + return 0; +} + +int bpf_prog_offload_compile(struct bpf_prog *prog) +{ + prog->bpf_func = bpf_prog_warn_on_exec; + + return bpf_prog_offload_translate(prog); +} + +const struct bpf_prog_ops bpf_offload_prog_ops = { +}; + +static int bpf_offload_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dev_offload *offload, *tmp; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_UNREGISTER: + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, + offloads) { + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block bpf_offload_notifier = { + .notifier_call = bpf_offload_notification, +}; + +static int __init bpf_offload_init(void) +{ + register_netdevice_notifier(&bpf_offload_notifier); + return 0; +} + +subsys_initcall(bpf_offload_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 323be2473c4b..1574b9f0f24e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; - prog->aux->ops = bpf_prog_types[type]; + if (!bpf_prog_is_dev_bound(prog->aux)) + prog->aux->ops = bpf_prog_types[type]; + else + prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; return 0; } @@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (type && prog->type != *type) { + if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_name +#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex static int bpf_prog_load(union bpf_attr *attr) { @@ -1152,6 +1155,12 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; + if (attr->prog_target_ifindex) { + err = bpf_prog_offload_init(prog, attr); + if (err) + goto free_prog; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04357ad5a812..51aabb32ad67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3736,10 +3736,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) - return 0; + if (env->analyzer_ops && env->analyzer_ops->insn_hook) + return env->analyzer_ops->insn_hook(env, insn_idx, + prev_insn_idx); + if (env->dev_ops && env->dev_ops->insn_hook) + return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); + return 0; } static int do_check(struct bpf_verifier_env *env) @@ -4516,6 +4519,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + if (env->prog->aux->offload) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto err_unlock; + } + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:18 -0700 Subject: bpf: report offload info to user space Extend struct bpf_prog_info to contain information about program being bound to a device. Since the netdev may get destroyed while program still exists we need a flag to indicate the program is loaded for a device, even if the device is gone. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/offload.c | 12 ++++++++++++ kernel/bpf/syscall.c | 5 +++++ 4 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e45d43f9ec92..98bacd0fa5cc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -506,6 +506,7 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 80d191a93fb0..4455dd195201 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -895,6 +895,10 @@ enum sk_action { #define BPF_TAG_SIZE 8 +enum bpf_prog_status { + BPF_PROG_STATUS_DEV_BOUND = (1 << 0), +}; + struct bpf_prog_info { __u32 type; __u32 id; @@ -908,6 +912,8 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 status; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 5553e0e2f8b1..2816feb38be1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -144,6 +144,18 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + u32 ifindex; + + rtnl_lock(); + ifindex = offload->netdev ? offload->netdev->ifindex : 0; + rtnl_unlock(); + + return ifindex; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1574b9f0f24e..3217c20ea91b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1592,6 +1592,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + info.status |= BPF_PROG_STATUS_DEV_BOUND; + info.ifindex = bpf_prog_offload_ifindex(prog); + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:20 -0700 Subject: xdp: allow attaching programs loaded for specific device Pass the netdev pointer to bpf_prog_get_type(). This way BPF code can decide whether the device matches what the code was loaded/translated for. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++---- net/core/dev.c | 6 +++++- 3 files changed, 44 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 98bacd0fa5cc..c397934f91dd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -335,6 +335,8 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); @@ -428,6 +430,14 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, + enum bpf_prog_type type, + struct net_device *netdev) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3217c20ea91b..68f9123acd39 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,22 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) +static bool bpf_prog_can_attach(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, + struct net_device *netdev) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + if (prog->type != *attach_type) + return false; + if (offload && offload->netdev != netdev) + return false; + + return true; +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, + struct net_device *netdev) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1065,7 +1080,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { + if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1078,12 +1093,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL); + return __bpf_prog_get(ufd, NULL, NULL); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1091,6 +1106,16 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) } EXPORT_SYMBOL_GPL(bpf_prog_get_type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev) +{ + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex diff --git a/net/core/dev.c b/net/core/dev.c index 10cde58d3275..30b5fe32c525 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7157,7 +7157,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - prog = 
bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (bpf_op == ops->ndo_bpf) + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + dev); + else + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } -- cgit v1.2.3 From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:30 -0700 Subject: bpf: remove old offload/analyzer Thanks to the ability to load a program for a specific device, running verifier twice is no longer needed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 5 --- kernel/bpf/verifier.c | 75 -------------------------------------------- net/core/filter.c | 42 ------------------------- 3 files changed, 122 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e45011dbc02d..07b96aaca256 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -152,9 +152,7 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ - const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ - void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -179,7 +177,4 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) } #endif -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv); - #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51aabb32ad67..add845fe788a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -949,9 +949,6 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (env->analyzer_ops) - return 0; - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -3736,9 +3733,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (env->analyzer_ops && env->analyzer_ops->insn_hook) - return env->analyzer_ops->insn_hook(env, insn_idx, - prev_insn_idx); if (env->dev_ops && env->dev_ops->insn_hook) return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); @@ -4601,72 +4595,3 @@ err_free_env: kfree(env); return ret; } - -static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { -#ifdef CONFIG_NET - [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, - [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, -#endif -}; - -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv) -{ - struct bpf_verifier_env *env; - int ret; - - if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || - !bpf_analyzer_ops[prog->type]) - return -EOPNOTSUPP; - - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); - if (!env) - return -ENOMEM; - - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - prog->len); - ret = -ENOMEM; - if 
(!env->insn_aux_data) - goto err_free_env; - env->prog = prog; - env->ops = bpf_analyzer_ops[env->prog->type]; - env->analyzer_ops = ops; - env->analyzer_priv = priv; - - /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); - - env->strict_alignment = false; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - env->strict_alignment = true; - - env->explored_states = kcalloc(env->prog->len, - sizeof(struct bpf_verifier_state_list *), - GFP_KERNEL); - ret = -ENOMEM; - if (!env->explored_states) - goto skip_full_check; - - ret = check_cfg(env); - if (ret < 0) - goto skip_full_check; - - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - -skip_full_check: - while (!pop_stack(env, NULL, NULL)); - free_states(env); - - mutex_unlock(&bpf_verifier_lock); - vfree(env->insn_aux_data); -err_free_env: - kfree(env); - return ret; -} -EXPORT_SYMBOL_GPL(bpf_analyzer); diff --git a/net/core/filter.c b/net/core/filter.c index a0112168d6f9..1afa17935954 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3777,25 +3777,6 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } -static bool -tc_cls_act_is_valid_access_analyzer(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case offsetof(struct sk_buff, len): - return true; - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; -} - static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3830,21 +3811,6 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -static bool xdp_is_valid_access_analyzer(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; -} - void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -4516,10 +4482,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .gen_prologue = tc_cls_act_prologue, }; -const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { - .is_valid_access = tc_cls_act_is_valid_access_analyzer, -}; - const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -4530,10 +4492,6 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, }; -const struct bpf_verifier_ops xdp_analyzer_ops = { - .is_valid_access = xdp_is_valid_access_analyzer, -}; - const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -- cgit v1.2.3 From 1f2556916d974cfb62b6af51660186b5f58bd869 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Fri, 3 Nov 2017 16:38:48 -0700 Subject: tcp: higher throughput under reordering with adaptive RACK reordering wnd Currently, TCP RACK loss detection does not work well if packets are being reordered beyond its static reordering window (min_rtt/4). Under such reordering it may falsely trigger loss recoveries and reduce TCP throughput significantly.
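For reference, the window computed in tcp_rack_detect_loss() before this patch is effectively the static clamp below (a simplified excerpt of the hunk this patch modifies):

  u32 reo_wnd = 1000;	/* 1 ms floor, in usec */

  /* Static window: min_rtt/4 (with the 1 ms floor), no matter how
   * much reordering the path actually exhibits.
   */
  if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
  	reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);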
This patch improves that by increasing and reducing the reordering window based on DSACK, which is now supported in major TCP implementations. It makes RACK's reo_wnd adaptive based on DSACK and no. of recoveries. - If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded by srtt), since there is possibility that spurious retransmission was due to reordering delay longer than reo_wnd. - Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) no. of successful recoveries (accounts for full DSACK-based loss recovery undo). After that, reset it to default (min_rtt/4). - At max, reo_wnd is incremented only once per rtt. So that the new DSACK on which we are reacting, is due to the spurious retx (approx) after the reo_wnd has been updated last time. - reo_wnd is tracked in terms of steps (of min_rtt/4), rather than absolute value to account for change in rtt. In our internal testing, we observed significant increase in throughput, in scenarios where reordering exceeds min_rtt/4 (previous static value). Signed-off-by: Priyaranjan Jha Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 1 + include/linux/tcp.h | 9 +++++-- include/net/tcp.h | 2 ++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 7 +++++ net/ipv4/tcp_minisocks.c | 4 +++ net/ipv4/tcp_recovery.c | 48 ++++++++++++++++++++++++++++++++-- 7 files changed, 68 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e6661b205f72..54410a1d4065 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -454,6 +454,7 @@ tcp_recovery - INTEGER RACK: 0x1 enables the RACK loss detection for fast detection of lost retransmissions and tail drops. + RACK: 0x2 makes RACK's reordering window static (min_rtt/4). Default: 0x1 diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8c431385b272..22f40c96a15b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -210,8 +210,13 @@ struct tcp_sock { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ - u8 advanced; /* mstamp advanced since last lost marking */ - u8 reord; /* reordering detected */ + u32 last_delivered; /* tp->delivered at last reo_wnd adj */ + u8 reo_wnd_steps; /* Allowed reordering window */ +#define TCP_RACK_RECOVERY_THRESH 16 + u8 reo_wnd_persist:5, /* No. 
of recovery since last adj */ + dsack_seen:1, /* Whether DSACK seen after last adj */ + advanced:1, /* mstamp advanced since last lost marking */ + reord:1; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ diff --git a/include/net/tcp.h b/include/net/tcp.h index c2bf2a822b10..babfd4da1515 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -246,6 +246,7 @@ extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ +#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1901,6 +1902,7 @@ extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); +extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a7a0f316eb86..c4cb19ed4628 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk) tcp_assign_congestion_control(sk); tp->tsoffset = 0; + tp->rack.reo_wnd_steps = 1; sk->sk_state = TCP_CLOSE; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8393b405ea98..0ada8bfc2ebd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -856,6 +856,7 @@ void tcp_disable_fack(struct tcp_sock *tp) static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; + tp->rack.dsack_seen = 1; } static void tcp_update_reordering(struct sock *sk, const int metric, @@ -2408,6 +2409,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS(sock_net(sk), mib_idx); + } else if (tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_persist--; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2427,6 +2430,8 @@ static bool tcp_try_undo_dsack(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { + tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, + tp->rack.reo_wnd_persist + 1); DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); @@ -3644,6 +3649,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); + tcp_rack_update_reo_wnd(sk, &rs); + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); /* If needed, reset TLP/RTO timer; RACK may later override this. 
*/ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3c65c1a3f944..4bb86580decd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -551,6 +551,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->syn_data_acked = 0; newtp->rack.mstamp = 0; newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index ac3e9c6d3a3d..d3ea89020c69 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -44,6 +44,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); + u32 min_rtt = tcp_min_rtt(tp); struct sk_buff *skb, *n; u32 reo_wnd; @@ -54,8 +55,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) - reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); + reo_wnd = min(reo_wnd, tp->srtt_us >> 3); + } list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { @@ -160,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk) if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } + +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. + * + * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. + */ +void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} -- cgit v1.2.3 From ecf8fecb7828648cba0e42de7464a7e600c93459 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:31 -0500 Subject: device_cgroup: prepare code for bpf-based device controller This is non-functional change to prepare the device cgroup code for adding eBPF-based controller for cgroups v2. 
The patch performs the following changes: 1) __devcgroup_inode_permission() and devcgroup_inode_mknod() are moving to the device-cgroup.h and converting into static inline. 2) __devcgroup_check_permission() is exported. 3) devcgroup_check_permission() wrapper is introduced to be used by both existing and new bpf-based implementations. Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/device_cgroup.h | 61 ++++++++++++++++++++++++++++++++++++++++--- security/device_cgroup.c | 47 ++------------------------------- 2 files changed, 59 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index cdbc344a92e4..2d93d7ecd479 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,17 +1,70 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#define DEVCG_ACC_MKNOD 1 +#define DEVCG_ACC_READ 2 +#define DEVCG_ACC_WRITE 4 +#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) + +#define DEVCG_DEV_BLOCK 1 +#define DEVCG_DEV_CHAR 2 +#define DEVCG_DEV_ALL 4 /* this represents all devices */ + +#ifdef CONFIG_CGROUP_DEVICE +extern int __devcgroup_check_permission(short type, u32 major, u32 minor, + short access); +#else +static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, + short access) +{ return 0; } +#endif + #ifdef CONFIG_CGROUP_DEVICE -extern int __devcgroup_inode_permission(struct inode *inode, int mask); -extern int devcgroup_inode_mknod(int mode, dev_t dev); +static inline int devcgroup_check_permission(short type, u32 major, u32 minor, + short access) +{ + return __devcgroup_check_permission(type, major, minor, access); +} + static inline int devcgroup_inode_permission(struct inode *inode, int mask) { + short type, access = 0; + if (likely(!inode->i_rdev)) return 0; - if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)) + + if (S_ISBLK(inode->i_mode)) + type = DEVCG_DEV_BLOCK; + else if (S_ISCHR(inode->i_mode)) + type = DEVCG_DEV_CHAR; + else return 0; - return __devcgroup_inode_permission(inode, mask); + + if (mask & MAY_WRITE) + access |= DEVCG_ACC_WRITE; + if (mask & MAY_READ) + access |= DEVCG_ACC_READ; + + return devcgroup_check_permission(type, imajor(inode), iminor(inode), + access); } + +static inline int devcgroup_inode_mknod(int mode, dev_t dev) +{ + short type; + + if (!S_ISBLK(mode) && !S_ISCHR(mode)) + return 0; + + if (S_ISBLK(mode)) + type = DEVCG_DEV_BLOCK; + else + type = DEVCG_DEV_CHAR; + + return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), + DEVCG_ACC_MKNOD); +} + #else static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 968c21557ba7..c65b39bafdfe 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -15,15 +15,6 @@ #include #include -#define DEVCG_ACC_MKNOD 1 -#define DEVCG_ACC_READ 2 -#define DEVCG_ACC_WRITE 4 -#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) - -#define DEVCG_DEV_BLOCK 1 -#define DEVCG_DEV_CHAR 2 -#define DEVCG_DEV_ALL 4 /* this represents all devices */ - static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { @@ -810,8 +801,8 @@ struct cgroup_subsys devices_cgrp_subsys = { * * returns 0 on success, -EPERM case the operation is not permitted */ -static int __devcgroup_check_permission(short type, u32 major, u32 minor, - short access) +int 
__devcgroup_check_permission(short type, u32 major, u32 minor, + short access) { struct dev_cgroup *dev_cgroup; bool rc; @@ -833,37 +824,3 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor, return 0; } - -int __devcgroup_inode_permission(struct inode *inode, int mask) -{ - short type, access = 0; - - if (S_ISBLK(inode->i_mode)) - type = DEVCG_DEV_BLOCK; - if (S_ISCHR(inode->i_mode)) - type = DEVCG_DEV_CHAR; - if (mask & MAY_WRITE) - access |= DEVCG_ACC_WRITE; - if (mask & MAY_READ) - access |= DEVCG_ACC_READ; - - return __devcgroup_check_permission(type, imajor(inode), iminor(inode), - access); -} - -int devcgroup_inode_mknod(int mode, dev_t dev) -{ - short type; - - if (!S_ISBLK(mode) && !S_ISCHR(mode)) - return 0; - - if (S_ISBLK(mode)) - type = DEVCG_DEV_BLOCK; - else - type = DEVCG_DEV_CHAR; - - return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), - DEVCG_ACC_MKNOD); - -} -- cgit v1.2.3 From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:32 -0500 Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2 Cgroup v2 lacks the device controller, provided by cgroup v1. This patch adds a new eBPF program type which, in combination with the previously added ability to attach multiple eBPF programs to a cgroup, will provide similar functionality, but with some additional flexibility. This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type. A program takes major and minor device numbers, device type (block/character) and access type (mknod/read/write) as parameters and returns an integer that defines whether the operation should be allowed or terminated with -EPERM. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 15 ++++++++++ include/linux/bpf_types.h | 3 ++ include/linux/device_cgroup.h | 8 ++++- include/uapi/linux/bpf.h | 15 ++++++++++ kernel/bpf/cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 +++++ kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 15 ++++++++++ 8 files changed, 130 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 87a7db9feb38..a7f16e0f8d68 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled.
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, } \ __ret; \ }) + +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + access, \ + BPF_CGROUP_DEVICE); \ + \ + __ret; \ +}) #else struct cgroup_bpf {}; @@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 53c5b9ad7220..978c1d9c9383 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 2d93d7ecd479..8557efe096dc 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 @@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, { return 0; } #endif -#ifdef CONFIG_CGROUP_DEVICE +#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { + int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); + + if (rc) + return -EPERM; + return __devcgroup_check_permission(type, major, minor, access); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4455dd195201..e880ae6434ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -141,6 +142,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -991,4 +993,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3db5a17fcfe8..b789ab78d28f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -522,3 +522,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type) +{ + struct cgroup *cgrp; + struct bpf_cgroup_dev_ctx ctx = { + .access_type = (access << 16) | dev_type, + .major = major, + .minor = minor, + }; + int allow = 1; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN); + rcu_read_unlock(); + + return !allow; +} +EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); + +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + default: + return NULL; + } +} + +static bool cgroup_dev_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +const struct bpf_prog_ops cg_dev_prog_ops = { +}; + +const struct bpf_verifier_ops cg_dev_verifier_ops = { + .get_func_proto = cgroup_dev_func_proto, + .is_valid_access = cgroup_dev_is_valid_access, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 416d70cdfc76..09badc37e864 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1326,6 +1326,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, true); @@ -1378,6 +1381,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, false); @@ -1420,6 +1426,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_DEVICE: break; default: return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index add845fe788a..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3124,6 +3124,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: break; default: return 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e92f62cf933a..b280f37cd057 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -131,6 +131,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -140,6 +141,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, 
BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -990,4 +992,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 63dcb81e5b9e1faadf4b55450141bc4446e5a3d3 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 3 Oct 2017 08:53:07 +0300 Subject: include/linux/fs.h: fix comment about struct address_space Before commit 9c5d760b8d22 ("mm: split gfp_mask and mapping flags into separate fields") the private_* fields of struct address_space were grouped together, and using "ditto" in the comments describing the later fields was correct. With the introduction of gfp_mask between private_lock and private_list, "ditto" references the wrong description. Fix it by repeating the full description. Signed-off-by: Mike Rapoport Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 31f8b2ea358c..ccbac0ed672c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -403,7 +403,7 @@ struct address_space { unsigned long flags; /* error bits */ spinlock_t private_lock; /* for use by the address_space */ gfp_t gfp_mask; /* implicit gfp mask for allocations */ - struct list_head private_list; /* ditto */ + struct list_head private_list; /* for use by the address_space */ void *private_data; /* ditto */ errseq_t wb_err; } __attribute__((aligned(sizeof(long)))) __randomize_layout; -- cgit v1.2.3 From 6afc662e68b5f988282ff20afd58a89b1c279dca Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Sep 2017 21:59:50 +0800 Subject: f2fs: support flexible inline xattr size In production, more and more features based on file encryption are being introduced, and their demand for xattr space is increasing. However, the inline xattr area has a fixed size of 200 bytes; once it is full, additional xattr data has to spill into a separate xattr block, which costs extra space and causes a performance regression during persisting. To resolve this, it is better to let the inline xattr size expand flexibly according to the user's requirements. This patch therefore introduces a new filesystem feature, 'flexible inline xattr', and a new mount option, 'inline_xattr_size=%u'; once mkfs enables the feature, the option can be used to make f2fs support a flexible inline xattr size. To support this, we add an extra attribute, i_inline_xattr_size, to the inode layout, indicating how much space the inline xattr borrows from the block address mapping space in the inode. With this, we can easily locate and store flexible-sized inline xattr data in the inode. Inode disk layout:
+----------------------+
| .i_mode              |
| ...                  |
| .i_ext               |
+----------------------+
| .i_extra_isize       |
| .i_inline_xattr_size |-----------+
| ...                  |           |
+----------------------+           |
| .i_addr              |           |
|  - block address or  |           |
|  - inline data       |           |
+----------------------+<---+      v
|      inline xattr    |    +---inline xattr range
+----------------------+<---+
| .i_nid               |
+----------------------+
|   node_footer        |
| (nid, ino, offset)   |
+----------------------+
Note that, as reported by Sheng Yong, we have to consider backward compatibility with the inline_data space of 200 bytes that has always been reserved. Previous inline data or directory always reserved 200 bytes in inode layout, even if inline_xattr is disabled. In order to keep inline_dentry's structure for backward compatibility, we get the space back only from inline_data. Signed-off-by: Chao Yu Reported-by: Sheng Yong Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 39 ++++++++++++++++++++++++++------------- fs/f2fs/inode.c | 21 +++++++++++++++++++++ fs/f2fs/namei.c | 13 +++++++++++++ fs/f2fs/node.c | 10 ++++++++-- fs/f2fs/super.c | 32 +++++++++++++++++++++++++++++++- fs/f2fs/sysfs.c | 7 +++++++ fs/f2fs/xattr.c | 18 +++++++++--------- include/linux/f2fs_fs.h | 5 +++-- 8 files changed, 118 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4034459f1f5d..13a96b8aa2d7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -93,6 +93,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -118,6 +119,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -397,11 +399,14 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 +#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ - (CUR_ADDRS_PER_INODE(inode) - \ - DEF_INLINE_RESERVED_SIZE - \ - F2FS_INLINE_XATTR_ADDRS)) +static inline int get_inline_xattr_addrs(struct inode *inode); +#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + F2FS_INLINE_XATTR_ADDRS(inode) - \ + DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ #define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ @@ -612,6 +617,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1069,6 +1075,7 @@ struct f2fs_sb_info { loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ + int inline_xattr_size; /* inline xattr size */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -2186,25 +2193,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; - return
CUR_ADDRS_PER_INODE(inode); + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); } -static inline void *inline_xattr_addr(struct page *page) +static inline void *inline_xattr_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS]); + F2FS_INLINE_XATTR_ADDRS(inode)]); } static inline int inline_xattr_size(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return F2FS_INLINE_XATTR_ADDRS << 2; - else - return 0; + return get_inline_xattr_addrs(inode) * sizeof(__le32); } static inline int f2fs_has_inline_data(struct inode *inode) @@ -2354,6 +2356,12 @@ static inline int get_extra_isize(struct inode *inode) return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -3023,6 +3031,11 @@ static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3617e7fca930..9684d53563f1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -232,6 +232,23 @@ static int do_read_inode(struct inode *inode) fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. 
+ */ + fi->i_inline_xattr_size = 0; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -384,6 +401,10 @@ int update_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + ri->i_inline_xattr_size = + cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_projid)) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 773d327f8fc1..28bdf8828e73 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -29,6 +29,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + int xattr_size = 0; int err; inode = new_inode(dir->i_sb); @@ -86,11 +87,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = sbi->inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + f2fs_init_extent_tree(inode, NULL); stat_inc_inline_xattr(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bdb7e92636b4..f44d83705245 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2191,8 +2191,8 @@ void recover_inline_xattr(struct inode *inode, struct page *page) goto update_inode; } - dst_addr = inline_xattr_addr(ipage); - src_addr = inline_xattr_addr(page); + dst_addr = inline_xattr_addr(inode, ipage); + src_addr = inline_xattr_addr(inode, page); inline_size = inline_xattr_size(inode); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -2281,6 +2281,12 @@ retry: dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + if (f2fs_sb_has_project_quota(sbi->sb) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 97fa81503e1f..213d2c1e5759 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -92,6 +92,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_noinline_xattr, + Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -141,6 +142,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_noinline_xattr, "noinline_xattr"}, + {Opt_inline_xattr_size, "inline_xattr_size=%u"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -383,6 +385,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_xattr: clear_opt(sbi, INLINE_XATTR); break; + case Opt_inline_xattr_size: + if 
(args->from && match_int(args, &arg)) + return -EINVAL; + set_opt(sbi, INLINE_XATTR_SIZE); + sbi->inline_xattr_size = arg; + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -604,6 +612,24 @@ static int parse_options(struct super_block *sb, char *options) F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } + + if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!test_opt(sbi, INLINE_XATTR)) { + f2fs_msg(sb, KERN_ERR, + "inline_xattr_size option should be " + "set with inline_xattr option"); + return -EINVAL; + } + if (!sbi->inline_xattr_size || + sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE - + DEF_INLINE_RESERVED_SIZE - + DEF_MIN_INLINE_SIZE) { + f2fs_msg(sb, KERN_ERR, + "inline xattr size is out of range"); + return -EINVAL; + } + } return 0; } @@ -1050,6 +1076,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_xattr"); else seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + sbi->inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1112,6 +1141,7 @@ static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; + sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1670,7 +1700,7 @@ static loff_t max_file_blocks(void) /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - - * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more * space in inode.i_addr, it will be more safe to reassign * result as zero. */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 48ebe6153cc5..e09e59cc678a 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -107,6 +107,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_chksum(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "flexible_inline_xattr"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -216,6 +219,7 @@ enum feat_id { FEAT_EXTRA_ATTR, FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, + FEAT_FLEXIBLE_INLINE_XATTR, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -228,6 +232,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_EXTRA_ATTR: case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: + case FEAT_FLEXIBLE_INLINE_XATTR: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -299,6 +304,7 @@ F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -346,6 +352,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(extra_attr), ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), + ATTR_LIST(flexible_inline_xattr), NULL, }; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 9fbcb7687d3b..52b46f598470 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -217,12 +217,12 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, - void **last_addr, int index, - size_t len, const char *name) +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) { struct f2fs_xattr_entry *entry; - unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + unsigned int inline_size = inline_xattr_size(inode); list_for_each_xattr(entry, base_addr) { if ((void *)entry + sizeof(__u32) > base_addr + inline_size || @@ -250,13 +250,13 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, void *inline_addr; if (ipage) { - inline_addr = inline_xattr_addr(ipage); + inline_addr = inline_xattr_addr(inode, ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); - inline_addr = inline_xattr_addr(page); + inline_addr = inline_xattr_addr(inode, page); } memcpy(txattr_addr, inline_addr, inline_size); f2fs_put_page(page, 1); @@ -309,7 +309,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (err) goto out; - *xe = __find_inline_xattr(txattr_addr, &last_addr, + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); if (*xe) goto check; @@ -404,7 +404,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *inline_addr; if (ipage) { - inline_addr = inline_xattr_addr(ipage); + inline_addr = inline_xattr_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); set_page_dirty(ipage); } else { @@ -413,7 +413,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, alloc_nid_failed(sbi, new_nid); return PTR_ERR(page); } - inline_addr = inline_xattr_addr(page); + inline_addr = inline_xattr_addr(inode, page); f2fs_wait_on_page_writeback(page, NODE, true); } memcpy(inline_addr, txattr_addr, inline_size); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2a0c453d7235..50a8ee501bf1 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -184,7 +184,8 @@ struct f2fs_extent { } __packed; #define F2FS_NAME_LEN 255 -#define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline 
xattrs */ +/* 200 bytes for inline xattrs by default */ +#define DEFAULT_INLINE_XATTR_ADDRS 50 #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ get_extra_isize(inode)) @@ -238,7 +239,7 @@ struct f2fs_inode { union { struct { __le16 i_extra_isize; /* extra inode attribute size */ - __le16 i_padding; /* padding */ + __le16 i_inline_xattr_size; /* inline xattr size, unit: 4 bytes */ __le32 i_projid; /* project id */ __le32 i_inode_checksum;/* inode meta checksum */ __le32 i_extra_end[0]; /* for attribute size calculation */ -- cgit v1.2.3 From 234a9689614272d93271b308adbd303b59d266e3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 Oct 2017 21:03:06 -0700 Subject: f2fs: add quota_ino feature infra This patch adds quota_ino feature infra to be used for quota files. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/sysfs.c | 7 +++++++ include/linux/f2fs_fs.h | 6 +++++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4a75f07f1dc8..9a1c7ffa6845 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -122,6 +122,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 +#define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -3070,6 +3071,11 @@ static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); } +static inline int f2fs_sb_has_quota_ino(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f0fdc89ce82f..9835348b6e5d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -110,6 +110,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_flexible_inline_xattr(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "quota_ino"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -227,6 +230,7 @@ enum feat_id { FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, FEAT_FLEXIBLE_INLINE_XATTR, + FEAT_QUOTA_INO, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -240,6 +244,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: case FEAT_FLEXIBLE_INLINE_XATTR: + case FEAT_QUOTA_INO: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -314,6 +319,7 @@ F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -364,6 +370,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), ATTR_LIST(flexible_inline_xattr), + ATTR_LIST(quota_ino), NULL, }; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 50a8ee501bf1..ce34007972c3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,9 @@ #define F2FS_NODE_INO(sbi) ((sbi)->node_ino_num) #define F2FS_META_INO(sbi) ((sbi)->meta_ino_num) +#define F2FS_QUOTA_INO 3 +#define F2FS_MAX_QUOTAS 3 + #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ #define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ @@ -108,7 +111,8 @@ struct f2fs_super_block { __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ struct f2fs_device devs[MAX_DEVICES]; /* device list */ - __u8 reserved[327]; /* valid reserved region */ + __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */ + __u8 reserved[315]; /* valid reserved region */ } __packed; /* -- cgit v1.2.3 From ea6767337f86312ebe473585899a159bf50ef1b7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 Oct 2017 09:14:28 -0700 Subject: f2fs: support quota sys files This patch supports hidden quota files in the system, which will be used for Android. It requires up-to-date f2fs-tools later than v1.9.0. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 9 ++- fs/f2fs/f2fs.h | 9 ++- fs/f2fs/recovery.c | 8 ++- fs/f2fs/super.c | 149 +++++++++++++++++++++++++++++++++++++++++++----- include/linux/f2fs_fs.h | 1 - 5 files changed, 157 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6b52d4b66c7b..78e1b2998bbd 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -615,6 +615,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; int err = 0; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; @@ -627,8 +630,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -656,7 +660,8 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a1c7ffa6845..484b5c6ac36b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1384,6 +1384,13 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) { size_t crc_offset = le32_to_cpu(cp->checksum_offset); @@ -2526,7 +2533,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9626758bc762..92c57ace1939 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -594,6 +594,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (s_flags & MS_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); @@ -604,7 +607,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -665,7 +668,8 @@ skip: out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif sbi->sb->s_flags = s_flags; /* 
Restore MS_RDONLY status */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96e145c34ba2..91eaa7b235eb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -213,6 +213,12 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quota options when quota turned on"); return -EINVAL; } + if (f2fs_sb_has_quota_ino(sb)) { + f2fs_msg(sb, KERN_INFO, + "QUOTA feature is enabled, so ignore qf_name"); + return 0; + } + qname = match_strdup(args); if (!qname) { f2fs_msg(sb, KERN_ERR, @@ -291,6 +297,18 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } } + + if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_INFO, + "QUOTA feature is enabled, so ignore jquota_fmt"); + sbi->s_jquota_fmt = 0; + } + if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -1; + } return 0; } #endif @@ -1172,6 +1190,9 @@ static void default_options(struct f2fs_sb_info *sbi) #endif } +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1238,6 +1259,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; +#ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) @@ -1245,9 +1267,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } else { /* dquot_resume needs RW */ sb->s_flags &= ~MS_RDONLY; - dquot_resume(sb, -1); + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if (f2fs_sb_has_quota_ino(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + goto restore_opts; + } } - +#endif /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1454,19 +1482,91 @@ static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) sbi->s_jquota_fmt, type); } -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) { - int i, ret; + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } for (i = 0; i < MAXQUOTAS; i++) { if (sbi->s_qf_names[i]) { - ret = f2fs_quota_on_mount(sbi, i); - if (ret < 0) - f2fs_msg(sbi->sb, KERN_ERR, - "Cannot turn on journaled " - "quota: error %d", ret); + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quotas: %d on %d", err, i); + } + } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(sb)); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_msg(sb, KERN_ERR, + "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int 
f2fs_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(F2FS_SB(sb), USRQUOTA), + test_opt(F2FS_SB(sb), GRPQUOTA), + test_opt(F2FS_SB(sb), PRJQUOTA), + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "fsck to fix.", type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; + } } } + return 0; } static int f2fs_quota_sync(struct super_block *sb, int type) @@ -1537,7 +1637,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); - if (err) + if (err || f2fs_sb_has_quota_ino(sb)) goto out_put; inode_lock(inode); @@ -2364,7 +2464,10 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; - sb->s_qcop = &f2fs_quotactl_ops; + if (f2fs_sb_has_quota_ino(sb)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif @@ -2535,10 +2638,24 @@ try_onemore: if (err) goto free_root_inode; +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", err); + goto free_sysfs; + } + } +#endif /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) - goto free_sysfs; + goto free_meta; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2572,7 +2689,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_sysfs; + goto free_meta; } } skip_recovery: @@ -2606,6 +2723,10 @@ skip_recovery: return 0; free_meta: +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by recover_orphan_inodes() @@ -2614,7 +2735,9 @@ free_meta: * falls into an infinite loop in sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); +#ifdef CONFIG_QUOTA free_sysfs: +#endif f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index ce34007972c3..43e98d30d2df 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,7 +36,6 @@ #define F2FS_NODE_INO(sbi) ((sbi)->node_ino_num) #define F2FS_META_INO(sbi) ((sbi)->meta_ino_num) -#define F2FS_QUOTA_INO 3 #define F2FS_MAX_QUOTAS 3 #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ -- cgit v1.2.3 From 08810a4119aaebf6318f209ec5dd9828e969cba4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 25 Oct 2017 14:12:29 +0200 Subject: PM / core: Add NEVER_SKIP and SMART_PREPARE driver flags The motivation for this change is to provide a way to work around a problem with the direct-complete mechanism used for avoiding system suspend/resume handling for devices in runtime suspend. 
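The f2fs_enable_quotas() function above uses a classic enable-or-unwind loop that is easy to miss in the diff; the following standalone sketch reduces it to that shape. The helpers are stubs standing in for f2fs_quota_enable() and dquot_quota_off(), and the flag values are illustrative rather than the kernel's DQUOT_* constants:

#include <stdio.h>

#define NTYPES		3	/* usr, grp, prj */
#define Q_USAGE		0x1	/* always track usage for a hidden quota file */
#define Q_LIMITS	0x2	/* enforce limits only if the mount asked */

static int enable_one(int type, unsigned int flags)
{
	printf("enable type %d flags %#x\n", type, flags);
	return 0;	/* stand-in: the kernel calls dquot_enable() here */
}

static void disable_one(int type)
{
	printf("disable type %d\n", type);	/* stand-in: dquot_quota_off() */
}

static int enable_all(const int limits_requested[NTYPES])
{
	int type, err;

	for (type = 0; type < NTYPES; type++) {
		err = enable_one(type, Q_USAGE |
				 (limits_requested[type] ? Q_LIMITS : 0));
		if (err) {
			/* Unwind what already succeeded so a failure never
			 * leaves the filesystem half-configured. */
			for (type--; type >= 0; type--)
				disable_one(type);
			return err;
		}
	}
	return 0;
}

The distinction the two flags encode is that usage tracking is unconditional once a hidden quota inode exists, while limit enforcement is opt-in via the usrquota/grpquota/prjquota mount options.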
The problem is that some middle layer code (the PCI bus type and the ACPI PM domain in particular) returns positive values from its system suspend ->prepare callbacks regardless of whether the driver's ->prepare returns a positive value or 0, which effectively prevents drivers from being able to control the direct-complete feature. Some drivers need that control, however, and the PCI bus type has grown its own flag to deal with this issue, but since it is not limited to PCI, it is better to address it by adding driver flags at the core level. To that end, add a driver_flags field to struct dev_pm_info for flags that can be set by device drivers at the probe time to inform the PM core and/or bus types, PM domains and so on on the capabilities and/or preferences of device drivers. Also add two static inline helpers for setting that field and testing it against a given set of flags and make the driver core clear it automatically on driver remove and probe failures. Define and document two PM driver flags related to the direct- complete feature: NEVER_SKIP and SMART_PREPARE that can be used, respectively, to indicate to the PM core that the direct-complete mechanism should never be used for the device and to inform the middle layer code (bus types, PM domains etc) that it can only request the PM core to use the direct-complete mechanism for the device (by returning a positive value from its ->prepare callback) if it also has been requested by the driver. While at it, make the core check pm_runtime_suspended() when setting power.direct_complete so that it doesn't need to be checked by ->prepare callbacks. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Reviewed-by: Ulf Hansson --- Documentation/driver-api/pm/devices.rst | 14 ++++++++++++++ Documentation/power/pci.txt | 19 +++++++++++++++++++ drivers/acpi/device_pm.c | 13 +++++++++---- drivers/base/dd.c | 2 ++ drivers/base/power/main.c | 4 +++- drivers/pci/pci-driver.c | 5 ++++- include/linux/device.h | 10 ++++++++++ include/linux/pm.h | 20 ++++++++++++++++++++ 8 files changed, 81 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index 4a18ef9997c0..8add5b302a89 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -354,6 +354,20 @@ the phases are: ``prepare``, ``suspend``, ``suspend_late``, ``suspend_noirq``. is because all such devices are initially set to runtime-suspended with runtime PM disabled. + This feature also can be controlled by device drivers by using the + ``DPM_FLAG_NEVER_SKIP`` and ``DPM_FLAG_SMART_PREPARE`` driver power + management flags. [Typically, they are set at the time the driver is + probed against the device in question by passing them to the + :c:func:`dev_pm_set_driver_flags` helper function.] If the first of + these flags is set, the PM core will not apply the direct-complete + procedure described above to the given device and, consequenty, to any + of its ancestors. The second flag, when set, informs the middle layer + code (bus types, device types, PM domains, classes) that it should take + the return value of the ``->prepare`` callback provided by the driver + into account and it may only return a positive value from its own + ``->prepare`` callback if the driver's one also has returned a positive + value. + 2. The ``->suspend`` methods should quiesce the device to stop it from performing I/O. 
They also may save the device registers and put it into the appropriate low-power state, depending on the bus type the device is diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt index a1b7f7158930..ab4e7d0540c1 100644 --- a/Documentation/power/pci.txt +++ b/Documentation/power/pci.txt @@ -961,6 +961,25 @@ dev_pm_ops to indicate that one suspend routine is to be pointed to by the .suspend(), .freeze(), and .poweroff() members and one resume routine is to be pointed to by the .resume(), .thaw(), and .restore() members. +3.1.19. Driver Flags for Power Management + +The PM core allows device drivers to set flags that influence the handling of +power management for the devices by the core itself and by middle layer code +including the PCI bus type. The flags should be set once at the driver probe +time with the help of the dev_pm_set_driver_flags() function and they should not +be updated directly afterwards. + +The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete +mechanism allowing device suspend/resume callbacks to be skipped if the device +is in runtime suspend when the system suspend starts. That also affects all of +the ancestors of the device, so this flag should only be used if absolutely +necessary. + +The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a +positive value from pci_pm_prepare() if the ->prepare callback provided by the +driver of the device returns a positive value. That allows the driver to opt +out from using the direct-complete mechanism dynamically. + 3.2. Device Runtime Power Management ------------------------------------ In addition to providing device power management callbacks PCI device drivers diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 17e8eb93a76c..b4dcc6144e6b 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -959,11 +959,16 @@ static bool acpi_dev_needs_resume(struct device *dev, struct acpi_device *adev) int acpi_subsys_prepare(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); - int ret; - ret = pm_generic_prepare(dev); - if (ret < 0) - return ret; + if (dev->driver && dev->driver->pm && dev->driver->pm->prepare) { + int ret = dev->driver->pm->prepare(dev); + + if (ret < 0) + return ret; + + if (!ret && dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_PREPARE)) + return 0; + } if (!adev || !pm_runtime_suspended(dev)) return 0; diff --git a/drivers/base/dd.c b/drivers/base/dd.c index ad44b40fe284..45575e134696 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -464,6 +464,7 @@ pinctrl_bind_failed: if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); + dev_pm_set_driver_flags(dev, 0); switch (ret) { case -EPROBE_DEFER: @@ -869,6 +870,7 @@ static void __device_release_driver(struct device *dev, struct device *parent) if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); + dev_pm_set_driver_flags(dev, 0); klist_remove(&dev->p->knode_driver); device_pm_check_callbacks(dev); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9bbbbb13a9db..c0135cd95ada 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1700,7 +1700,9 @@ unlock: * applies to suspend transitions, however. 
*/ spin_lock_irq(&dev->power.lock); - dev->power.direct_complete = ret > 0 && state.event == PM_EVENT_SUSPEND; + dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && + pm_runtime_suspended(dev) && ret > 0 && + !dev_pm_test_driver_flags(dev, DPM_FLAG_NEVER_SKIP); spin_unlock_irq(&dev->power.lock); return 0; } diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 11bd267fc137..68a32703b30a 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -689,8 +689,11 @@ static int pci_pm_prepare(struct device *dev) if (drv && drv->pm && drv->pm->prepare) { int error = drv->pm->prepare(dev); - if (error) + if (error < 0) return error; + + if (!error && dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_PREPARE)) + return 0; } return pci_dev_keep_suspended(to_pci_dev(dev)); } diff --git a/include/linux/device.h b/include/linux/device.h index c32e6f974d4a..fb9451599aca 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1070,6 +1070,16 @@ static inline void dev_pm_syscore_device(struct device *dev, bool val) #endif } +static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags) +{ + dev->power.driver_flags = flags; +} + +static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) +{ + return !!(dev->power.driver_flags & flags); +} + static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); diff --git a/include/linux/pm.h b/include/linux/pm.h index a0ceeccf2846..f10bad831bfa 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -550,6 +550,25 @@ struct pm_subsys_data { #endif }; +/* + * Driver flags to control system suspend/resume behavior. + * + * These flags can be set by device drivers at the probe time. They need not be + * cleared by the drivers as the driver core will take care of that. + * + * NEVER_SKIP: Do not skip system suspend/resume callbacks for the device. + * SMART_PREPARE: Check the return value of the driver's ->prepare callback. + * + * Setting SMART_PREPARE instructs bus types and PM domains which may want + * system suspend/resume callbacks to be skipped for the device to return 0 from + * their ->prepare callbacks if the driver's ->prepare callback returns 0 (in + * other words, the system suspend/resume callbacks can only be skipped for the + * device if its driver doesn't object against that). This flag has no effect + * if NEVER_SKIP is set. + */ +#define DPM_FLAG_NEVER_SKIP BIT(0) +#define DPM_FLAG_SMART_PREPARE BIT(1) + struct dev_pm_info { pm_message_t power_state; unsigned int can_wakeup:1; @@ -561,6 +580,7 @@ struct dev_pm_info { bool is_late_suspended:1; bool early_init:1; /* Owned by the PM core */ bool direct_complete:1; /* Owned by the PM core */ + u32 driver_flags; spinlock_t lock; #ifdef CONFIG_PM_SLEEP struct list_head entry; -- cgit v1.2.3 From c2eac4d3a115e2f511844e7bcf73f4e877fbf5da Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 25 Oct 2017 14:16:46 +0200 Subject: PCI / PM: Use the NEVER_SKIP driver flag Replace the PCI-specific flag PCI_DEV_FLAGS_NEEDS_RESUME with the PM core's DPM_FLAG_NEVER_SKIP one everywhere and drop it. Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Reviewed-by: Ulf Hansson --- drivers/gpu/drm/i915/i915_drv.c | 2 +- drivers/misc/mei/pci-me.c | 2 +- drivers/misc/mei/pci-txe.c | 2 +- drivers/pci/pci.c | 3 +-- include/linux/pci.h | 7 +------ 5 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 9f45cfeae775..f124de3a0668 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1304,7 +1304,7 @@ int i915_driver_load(struct pci_dev *pdev, const struct pci_device_id *ent) * becaue the HDA driver may require us to enable the audio power * domain during system suspend. */ - pdev->dev_flags |= PCI_DEV_FLAGS_NEEDS_RESUME; + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NEVER_SKIP); ret = i915_driver_init_early(dev_priv, ent); if (ret < 0) diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 4ff40d319676..f17e4b435fa9 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -223,7 +223,7 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * MEI requires to resume from runtime suspend mode * in order to perform link reset flow upon system suspend. */ - pdev->dev_flags |= PCI_DEV_FLAGS_NEEDS_RESUME; + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NEVER_SKIP); /* * For not wake-able HW runtime pm framework diff --git a/drivers/misc/mei/pci-txe.c b/drivers/misc/mei/pci-txe.c index e38a5f144373..f911a08e3579 100644 --- a/drivers/misc/mei/pci-txe.c +++ b/drivers/misc/mei/pci-txe.c @@ -141,7 +141,7 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * MEI requires to resume from runtime suspend mode * in order to perform link reset flow upon system suspend. */ - pdev->dev_flags |= PCI_DEV_FLAGS_NEEDS_RESUME; + dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NEVER_SKIP); /* * For not wake-able HW runtime pm framework diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6078dfc11b11..374f5686e2bc 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2166,8 +2166,7 @@ bool pci_dev_keep_suspended(struct pci_dev *pci_dev) if (!pm_runtime_suspended(dev) || pci_target_state(pci_dev, wakeup) != pci_dev->current_state - || platform_pci_need_resume(pci_dev) - || (pci_dev->dev_flags & PCI_DEV_FLAGS_NEEDS_RESUME)) + || platform_pci_need_resume(pci_dev)) return false; /* diff --git a/include/linux/pci.h b/include/linux/pci.h index f4f8ee5a7362..4b65fa4fb94e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -205,13 +205,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT = (__force pci_dev_flags_t) (1 << 9), /* Do not use FLR even if device advertises PCI_AF_CAP */ PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), - /* - * Resume before calling the driver's system suspend hooks, disabling - * the direct_complete optimization. - */ - PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11), /* Don't use Relaxed Ordering for TLPs directed at this device */ - PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 12), + PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 11), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From 0eab11c9ae3b3cc5dd76f20b81d0247647a6e96f Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 26 Oct 2017 12:12:08 +0200 Subject: PM / core: Add SMART_SUSPEND driver flag Define and document a SMART_SUSPEND flag to instruct bus types and PM domains that the system suspend callbacks provided by the driver can cope with runtime-suspended devices, so from the driver's perspective it should be safe to leave devices in runtime suspend during system suspend. Setting that flag may also cause middle-layer code (bus types, PM domains etc.) to skip invocations of the ->suspend_late and ->suspend_noirq callbacks provided by the driver if the device is in runtime suspend at the beginning of the "late" phase of the system-wide suspend transition, in which case the driver's system-wide resume callbacks may be invoked back-to-back with its ->runtime_suspend callback, so the driver has to be able to cope with that too. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Reviewed-by: Ulf Hansson --- Documentation/driver-api/pm/devices.rst | 20 ++++++++++++++++++++ drivers/base/power/main.c | 3 +++ include/linux/pm.h | 8 ++++++++ 3 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index 8add5b302a89..574dadd06dec 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -766,6 +766,26 @@ the state of devices (possibly except for resuming them from runtime suspend) from their ``->prepare`` and ``->suspend`` callbacks (or equivalent) *before* invoking device drivers' ``->suspend`` callbacks (or equivalent). +Some bus types and PM domains have a policy to resume all devices from runtime +suspend upfront in their ``->suspend`` callbacks, but that may not be really +necessary if the driver of the device can cope with runtime-suspended devices. +The driver can indicate that by setting ``DPM_FLAG_SMART_SUSPEND`` in +:c:member:`power.driver_flags` at the probe time, by passing it to the +:c:func:`dev_pm_set_driver_flags` helper. That also may cause middle-layer code +(bus types, PM domains etc.) to skip the ``->suspend_late`` and +``->suspend_noirq`` callbacks provided by the driver if the device remains in +runtime suspend at the beginning of the ``suspend_late`` phase of system-wide +suspend (or in the ``poweroff_late`` phase of hibernation), when runtime PM +has been disabled for it, under the assumption that its state should not change +after that point until the system-wide transition is over. If that happens, the +driver's system-wide resume callbacks, if present, may still be invoked during +the subsequent system-wide resume transition and the device's runtime power +management status may be set to "active" before enabling runtime PM for it, +so the driver must be prepared to cope with the invocation of its system-wide +resume callbacks back-to-back with its ``->runtime_suspend`` one (without the +intervening ``->runtime_resume`` and so on) and the final state of the device +must reflect the "active" status for runtime PM in that case. + During system-wide resume from a sleep state it's easiest to put devices into the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`. 
Refer to that document for more information regarding this particular issue as diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index c0135cd95ada..8d9024017645 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1652,6 +1652,9 @@ static int device_prepare(struct device *dev, pm_message_t state) if (dev->power.syscore) return 0; + WARN_ON(dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) && + !pm_runtime_enabled(dev)); + /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we diff --git a/include/linux/pm.h b/include/linux/pm.h index f10bad831bfa..43b5418e05bb 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -558,6 +558,7 @@ struct pm_subsys_data { * * NEVER_SKIP: Do not skip system suspend/resume callbacks for the device. * SMART_PREPARE: Check the return value of the driver's ->prepare callback. + * SMART_SUSPEND: No need to resume the device from runtime suspend. * * Setting SMART_PREPARE instructs bus types and PM domains which may want * system suspend/resume callbacks to be skipped for the device to return 0 from @@ -565,9 +566,16 @@ struct pm_subsys_data { * other words, the system suspend/resume callbacks can only be skipped for the * device if its driver doesn't object against that). This flag has no effect * if NEVER_SKIP is set. + * + * Setting SMART_SUSPEND instructs bus types and PM domains which may want to + * runtime resume the device upfront during system suspend that doing so is not + * necessary from the driver's perspective. It also may cause them to skip + * invocations of the ->suspend_late and ->suspend_noirq callbacks provided by + * the driver if they decide to leave the device in runtime suspend. */ #define DPM_FLAG_NEVER_SKIP BIT(0) #define DPM_FLAG_SMART_PREPARE BIT(1) +#define DPM_FLAG_SMART_SUSPEND BIT(2) struct dev_pm_info { pm_message_t power_state; -- cgit v1.2.3 From c4b65157aeefad29b2351a00a010e8c40ce7fd0e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 26 Oct 2017 12:12:22 +0200 Subject: PCI / PM: Take SMART_SUSPEND driver flag into account Make the PCI bus type take DPM_FLAG_SMART_SUSPEND into account in its system-wide PM callbacks and make sure that all code that should not run in parallel with pci_pm_runtime_resume() is executed in the "late" phases of system suspend, freeze and poweroff transitions. [Note that the pm_runtime_suspended() check in pci_dev_keep_suspended() is an optimization, because if it is not passed, all of the subsequent checks may be skipped and some of them incur much more overhead in general.] Also use the observation that if the device is in runtime suspend at the beginning of the "late" phase of a system-wide suspend-like transition, its state cannot change going forward (runtime PM is disabled for it at that time) until the transition is over and the subsequent system-wide PM callbacks should be skipped for it (as they generally assume the device to not be suspended), so add checks for that in pci_pm_suspend_late/noirq(), pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq(). Moreover, if pci_pm_resume_noirq() or pci_pm_restore_noirq() is called during the subsequent system-wide resume transition and if the device was left in runtime suspend previously, its runtime PM status needs to be changed to "active" as it is going to be put into the full-power state, so add checks for that too to these functions.
In turn, if pci_pm_thaw_noirq() runs after the device has been left in runtime suspend, the subsequent "thaw" callbacks need to be skipped for it (as they may not work correctly with a suspended device), so set the power.direct_complete flag for the device then to make the PM core skip those callbacks. In addition to the above add a core helper for checking if DPM_FLAG_SMART_SUSPEND is set and the device runtime PM status is "suspended" at the same time, which is done quite often in the new code (and will be done elsewhere going forward too). Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas --- Documentation/power/pci.txt | 14 ++++++ drivers/base/power/main.c | 6 +++ drivers/pci/pci-driver.c | 103 ++++++++++++++++++++++++++++++++++++-------- include/linux/pm.h | 2 + 4 files changed, 108 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt index ab4e7d0540c1..304162ea377e 100644 --- a/Documentation/power/pci.txt +++ b/Documentation/power/pci.txt @@ -980,6 +980,20 @@ positive value from pci_pm_prepare() if the ->prepare callback provided by the driver of the device returns a positive value. That allows the driver to opt out from using the direct-complete mechanism dynamically. +The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's +perspective the device can be safely left in runtime suspend during system +suspend. That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff() +to skip resuming the device from runtime suspend unless there are PCI-specific +reasons for doing that. Also, it causes pci_pm_suspend_late/noirq(), +pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early +if the device remains in runtime suspend in the beginning of the "late" phase +of the system-wide transition under way. Moreover, if the device is in +runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime +power management status will be changed to "active" (as it is going to be put +into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(), +the function will set the power.direct_complete flag for it (to make the PM core +skip the subsequent "thaw" callbacks for it) and return. + 3.2. Device Runtime Power Management ------------------------------------ In addition to providing device power management callbacks PCI device drivers diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 8d9024017645..6c6f1c74c24c 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1861,3 +1861,9 @@ void device_pm_check_callbacks(struct device *dev) !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irq(&dev->power.lock); } + +bool dev_pm_smart_suspend_and_suspended(struct device *dev) +{ + return dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) && + pm_runtime_status_suspended(dev); +} diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index c1aeeb10539e..d19bd54d337e 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -734,18 +734,25 @@ static int pci_pm_suspend(struct device *dev) if (!pm) { pci_pm_default_suspend(pci_dev); - goto Fixup; + return 0; } /* - * PCI devices suspended at run time need to be resumed at this point, - * because in general it is necessary to reconfigure them for system - * suspend. Namely, if the device is supposed to wake up the system - * from the sleep state, we may need to reconfigure it for this purpose. 
- * In turn, if the device is not supposed to wake up the system from the - * sleep state, we'll have to prevent it from signaling wake-up. + * PCI devices suspended at run time may need to be resumed at this + * point, because in general it may be necessary to reconfigure them for + * system suspend. Namely, if the device is expected to wake up the + * system from the sleep state, it may have to be reconfigured for this + * purpose, or if the device is not expected to wake up the system from + * the sleep state, it should be prevented from signaling wakeup events + * going forward. + * + * Also if the driver of the device does not indicate that its system + * suspend callbacks can cope with runtime-suspended devices, it is + * better to resume the device from runtime suspend here. */ - pm_runtime_resume(dev); + if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || + !pci_dev_keep_suspended(pci_dev)) + pm_runtime_resume(dev); pci_dev->state_saved = false; if (pm->suspend) { @@ -765,17 +772,27 @@ static int pci_pm_suspend(struct device *dev) } } - Fixup: - pci_fixup_device(pci_fixup_suspend, pci_dev); - return 0; } +static int pci_pm_suspend_late(struct device *dev) +{ + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + + pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); + + return pm_generic_suspend_late(dev); +} + static int pci_pm_suspend_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev, PMSG_SUSPEND); @@ -834,6 +851,14 @@ static int pci_pm_resume_noirq(struct device *dev) struct device_driver *drv = dev->driver; int error = 0; + /* + * Devices with DPM_FLAG_SMART_SUSPEND may be left in runtime suspend + * during system suspend, so update their runtime PM status to "active" + * as they are going to be put into D0 shortly. + */ + if (dev_pm_smart_suspend_and_suspended(dev)) + pm_runtime_set_active(dev); + pci_pm_default_resume_early(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) @@ -876,6 +901,7 @@ static int pci_pm_resume(struct device *dev) #else /* !CONFIG_SUSPEND */ #define pci_pm_suspend NULL +#define pci_pm_suspend_late NULL #define pci_pm_suspend_noirq NULL #define pci_pm_resume NULL #define pci_pm_resume_noirq NULL @@ -910,7 +936,8 @@ static int pci_pm_freeze(struct device *dev) * devices should not be touched during freeze/thaw transitions, * however. 
*/ - pm_runtime_resume(dev); + if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) + pm_runtime_resume(dev); pci_dev->state_saved = false; if (pm->freeze) { @@ -925,11 +952,22 @@ return 0; } +static int pci_pm_freeze_late(struct device *dev) +{ + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + + return pm_generic_freeze_late(dev); +} + static int pci_pm_freeze_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct device_driver *drv = dev->driver; + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev, PMSG_FREEZE); @@ -959,6 +997,16 @@ static int pci_pm_thaw_noirq(struct device *dev) struct device_driver *drv = dev->driver; int error = 0; + /* + * If the device is in runtime suspend, the code below may not work + * correctly with it, so skip that code and make the PM core skip all of + * the subsequent "thaw" callbacks for the device. + */ + if (dev_pm_smart_suspend_and_suspended(dev)) { + dev->power.direct_complete = true; + return 0; + } + if (pcibios_pm_ops.thaw_noirq) { error = pcibios_pm_ops.thaw_noirq(dev); if (error) @@ -1008,11 +1056,13 @@ static int pci_pm_poweroff(struct device *dev) if (!pm) { pci_pm_default_suspend(pci_dev); - goto Fixup; + return 0; } /* The reason to do that is the same as in pci_pm_suspend(). */ - pm_runtime_resume(dev); + if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || + !pci_dev_keep_suspended(pci_dev)) + pm_runtime_resume(dev); pci_dev->state_saved = false; if (pm->poweroff) { @@ -1024,17 +1074,27 @@ return error; } - Fixup: - pci_fixup_device(pci_fixup_suspend, pci_dev); - return 0; } +static int pci_pm_poweroff_late(struct device *dev) +{ + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + + pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); + + return pm_generic_poweroff_late(dev); +} + static int pci_pm_poweroff_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct device_driver *drv = dev->driver; + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + if (pci_has_legacy_pm_support(to_pci_dev(dev))) return pci_legacy_suspend_late(dev, PMSG_HIBERNATE); @@ -1076,6 +1136,10 @@ static int pci_pm_restore_noirq(struct device *dev) struct device_driver *drv = dev->driver; int error = 0; + /* This is analogous to the pci_pm_resume_noirq() case. 
*/ + if (dev_pm_smart_suspend_and_suspended(dev)) + pm_runtime_set_active(dev); + if (pcibios_pm_ops.restore_noirq) { error = pcibios_pm_ops.restore_noirq(dev); if (error) @@ -1124,10 +1188,12 @@ static int pci_pm_restore(struct device *dev) #else /* !CONFIG_HIBERNATE_CALLBACKS */ #define pci_pm_freeze NULL +#define pci_pm_freeze_late NULL #define pci_pm_freeze_noirq NULL #define pci_pm_thaw NULL #define pci_pm_thaw_noirq NULL #define pci_pm_poweroff NULL +#define pci_pm_poweroff_late NULL #define pci_pm_poweroff_noirq NULL #define pci_pm_restore NULL #define pci_pm_restore_noirq NULL @@ -1243,10 +1309,13 @@ static const struct dev_pm_ops pci_dev_pm_ops = { .prepare = pci_pm_prepare, .complete = pci_pm_complete, .suspend = pci_pm_suspend, + .suspend_late = pci_pm_suspend_late, .resume = pci_pm_resume, .freeze = pci_pm_freeze, + .freeze_late = pci_pm_freeze_late, .thaw = pci_pm_thaw, .poweroff = pci_pm_poweroff, + .poweroff_late = pci_pm_poweroff_late, .restore = pci_pm_restore, .suspend_noirq = pci_pm_suspend_noirq, .resume_noirq = pci_pm_resume_noirq, diff --git a/include/linux/pm.h b/include/linux/pm.h index 43b5418e05bb..65d39115f06d 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -765,6 +765,8 @@ extern int pm_generic_poweroff_late(struct device *dev); extern int pm_generic_poweroff(struct device *dev); extern void pm_generic_complete(struct device *dev); +extern bool dev_pm_smart_suspend_and_suspended(struct device *dev); + #else /* !CONFIG_PM_SLEEP */ #define device_pm_lock() do {} while (0) -- cgit v1.2.3 From 05087360fd7acf2cc9b7bbb243c12765c44c7693 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Oct 2017 10:10:16 +0200 Subject: ACPI / PM: Take SMART_SUSPEND driver flag into account Make the ACPI PM domain take DPM_FLAG_SMART_SUSPEND into account in its system suspend callbacks. [Note that the pm_runtime_suspended() check in acpi_dev_needs_resume() is an optimization, because if it is not passed, all of the subsequent checks may be skipped and some of them incur much more overhead in general.] Also use the observation that if the device is in runtime suspend at the beginning of the "late" phase of a system-wide suspend-like transition, its state cannot change going forward (runtime PM is disabled for it at that time) until the transition is over and the subsequent system-wide PM callbacks should be skipped for it (as they generally assume the device to not be suspended), so add checks for that in acpi_subsys_suspend_late/noirq() and acpi_subsys_freeze_late/noirq(). Moreover, if acpi_subsys_resume_noirq() is called during the subsequent system-wide resume transition and if the device was left in runtime suspend previously, its runtime PM status needs to be changed to "active" as it is going to be put into the full-power state going forward, so add a check for that in there too. In turn, if acpi_subsys_thaw_noirq() runs after the device has been left in runtime suspend, the subsequent "thaw" callbacks need to be skipped for it (as they may not work correctly with a suspended device), so set the power.direct_complete flag for the device then to make the PM core skip those callbacks. On top of the above, make the analogous changes in the acpi_lpss driver that uses the ACPI PM domain callbacks. Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman --- drivers/acpi/acpi_lpss.c | 13 +++++- drivers/acpi/device_pm.c | 113 +++++++++++++++++++++++++++++++++++++++++++---- include/linux/acpi.h | 10 +++++ 3 files changed, 126 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 04d32bdb5a95..de7385b824e1 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -849,8 +849,12 @@ static int acpi_lpss_resume(struct device *dev) #ifdef CONFIG_PM_SLEEP static int acpi_lpss_suspend_late(struct device *dev) { - int ret = pm_generic_suspend_late(dev); + int ret; + + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + ret = pm_generic_suspend_late(dev); return ret ? ret : acpi_lpss_suspend(dev, device_may_wakeup(dev)); } @@ -889,10 +893,17 @@ static struct dev_pm_domain acpi_lpss_pm_domain = { .complete = acpi_subsys_complete, .suspend = acpi_subsys_suspend, .suspend_late = acpi_lpss_suspend_late, + .suspend_noirq = acpi_subsys_suspend_noirq, + .resume_noirq = acpi_subsys_resume_noirq, .resume_early = acpi_lpss_resume_early, .freeze = acpi_subsys_freeze, + .freeze_late = acpi_subsys_freeze_late, + .freeze_noirq = acpi_subsys_freeze_noirq, + .thaw_noirq = acpi_subsys_thaw_noirq, .poweroff = acpi_subsys_suspend, .poweroff_late = acpi_lpss_suspend_late, + .poweroff_noirq = acpi_subsys_suspend_noirq, + .restore_noirq = acpi_subsys_resume_noirq, .restore_early = acpi_lpss_resume_early, #endif .runtime_suspend = acpi_lpss_runtime_suspend, diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index b4dcc6144e6b..3d6ec51d2bbc 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -936,7 +936,8 @@ static bool acpi_dev_needs_resume(struct device *dev, struct acpi_device *adev) u32 sys_target = acpi_target_system_state(); int ret, state; - if (device_may_wakeup(dev) != !!adev->wakeup.prepare_count) + if (!pm_runtime_suspended(dev) || !adev || + device_may_wakeup(dev) != !!adev->wakeup.prepare_count) return true; if (sys_target == ACPI_STATE_S0) @@ -970,9 +971,6 @@ int acpi_subsys_prepare(struct device *dev) return 0; } - if (!adev || !pm_runtime_suspended(dev)) - return 0; - return !acpi_dev_needs_resume(dev, adev); } EXPORT_SYMBOL_GPL(acpi_subsys_prepare); @@ -998,12 +996,17 @@ EXPORT_SYMBOL_GPL(acpi_subsys_complete); * acpi_subsys_suspend - Run the device driver's suspend callback. * @dev: Device to handle. * - * Follow PCI and resume devices suspended at run time before running their - * system suspend callbacks. + * Follow PCI and resume devices from runtime suspend before running their + * system suspend callbacks, unless the driver can cope with runtime-suspended + * devices during system suspend and there are no ACPI-specific reasons for + * resuming them. */ int acpi_subsys_suspend(struct device *dev) { - pm_runtime_resume(dev); + if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || + acpi_dev_needs_resume(dev, ACPI_COMPANION(dev))) + pm_runtime_resume(dev); + return pm_generic_suspend(dev); } EXPORT_SYMBOL_GPL(acpi_subsys_suspend); @@ -1017,11 +1020,47 @@ EXPORT_SYMBOL_GPL(acpi_subsys_suspend); */ int acpi_subsys_suspend_late(struct device *dev) { - int ret = pm_generic_suspend_late(dev); + int ret; + + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + + ret = pm_generic_suspend_late(dev); return ret ? 
ret : acpi_dev_suspend(dev, device_may_wakeup(dev)); } EXPORT_SYMBOL_GPL(acpi_subsys_suspend_late); +/** + * acpi_subsys_suspend_noirq - Run the device driver's "noirq" suspend callback. + * @dev: Device to suspend. + */ +int acpi_subsys_suspend_noirq(struct device *dev) +{ + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + + return pm_generic_suspend_noirq(dev); +} +EXPORT_SYMBOL_GPL(acpi_subsys_suspend_noirq); + +/** + * acpi_subsys_resume_noirq - Run the device driver's "noirq" resume callback. + * @dev: Device to handle. + */ +int acpi_subsys_resume_noirq(struct device *dev) +{ + /* + * Devices with DPM_FLAG_SMART_SUSPEND may be left in runtime suspend + * during system suspend, so update their runtime PM status to "active" + * as they will be put into D0 going forward. + */ + if (dev_pm_smart_suspend_and_suspended(dev)) + pm_runtime_set_active(dev); + + return pm_generic_resume_noirq(dev); +} +EXPORT_SYMBOL_GPL(acpi_subsys_resume_noirq); + /** * acpi_subsys_resume_early - Resume device using ACPI. * @dev: Device to Resume. @@ -1049,11 +1088,60 @@ int acpi_subsys_freeze(struct device *dev) * runtime-suspended devices should not be touched during freeze/thaw * transitions. */ - pm_runtime_resume(dev); + if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) + pm_runtime_resume(dev); + return pm_generic_freeze(dev); } EXPORT_SYMBOL_GPL(acpi_subsys_freeze); +/** + * acpi_subsys_freeze_late - Run the device driver's "late" freeze callback. + * @dev: Device to handle. + */ +int acpi_subsys_freeze_late(struct device *dev) +{ + + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + + return pm_generic_freeze_late(dev); +} +EXPORT_SYMBOL_GPL(acpi_subsys_freeze_late); + +/** + * acpi_subsys_freeze_noirq - Run the device driver's "noirq" freeze callback. + * @dev: Device to handle. + */ +int acpi_subsys_freeze_noirq(struct device *dev) +{ + + if (dev_pm_smart_suspend_and_suspended(dev)) + return 0; + + return pm_generic_freeze_noirq(dev); +} +EXPORT_SYMBOL_GPL(acpi_subsys_freeze_noirq); + +/** + * acpi_subsys_thaw_noirq - Run the device driver's "noirq" thaw callback. + * @dev: Device to handle. + */ +int acpi_subsys_thaw_noirq(struct device *dev) +{ + /* + * If the device is in runtime suspend, the "thaw" code may not work + * correctly with it, so skip the driver callback and make the PM core + * skip all of the subsequent "thaw" callbacks for the device. 
+ */ + if (dev_pm_smart_suspend_and_suspended(dev)) { + dev->power.direct_complete = true; + return 0; + } + + return pm_generic_thaw_noirq(dev); +} +EXPORT_SYMBOL_GPL(acpi_subsys_thaw_noirq); #endif /* CONFIG_PM_SLEEP */ static struct dev_pm_domain acpi_general_pm_domain = { @@ -1065,10 +1153,17 @@ static struct dev_pm_domain acpi_general_pm_domain = { .complete = acpi_subsys_complete, .suspend = acpi_subsys_suspend, .suspend_late = acpi_subsys_suspend_late, + .suspend_noirq = acpi_subsys_suspend_noirq, + .resume_noirq = acpi_subsys_resume_noirq, .resume_early = acpi_subsys_resume_early, .freeze = acpi_subsys_freeze, + .freeze_late = acpi_subsys_freeze_late, + .freeze_noirq = acpi_subsys_freeze_noirq, + .thaw_noirq = acpi_subsys_thaw_noirq, .poweroff = acpi_subsys_suspend, .poweroff_late = acpi_subsys_suspend_late, + .poweroff_noirq = acpi_subsys_suspend_noirq, + .restore_noirq = acpi_subsys_resume_noirq, .restore_early = acpi_subsys_resume_early, #endif }, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 0ada2a948b44..dc1ebfeeb5ec 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -885,17 +885,27 @@ int acpi_dev_suspend_late(struct device *dev); int acpi_subsys_prepare(struct device *dev); void acpi_subsys_complete(struct device *dev); int acpi_subsys_suspend_late(struct device *dev); +int acpi_subsys_suspend_noirq(struct device *dev); +int acpi_subsys_resume_noirq(struct device *dev); int acpi_subsys_resume_early(struct device *dev); int acpi_subsys_suspend(struct device *dev); int acpi_subsys_freeze(struct device *dev); +int acpi_subsys_freeze_late(struct device *dev); +int acpi_subsys_freeze_noirq(struct device *dev); +int acpi_subsys_thaw_noirq(struct device *dev); #else static inline int acpi_dev_resume_early(struct device *dev) { return 0; } static inline int acpi_subsys_prepare(struct device *dev) { return 0; } static inline void acpi_subsys_complete(struct device *dev) {} static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; } +static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; } +static inline int acpi_subsys_resume_noirq(struct device *dev) { return 0; } static inline int acpi_subsys_resume_early(struct device *dev) { return 0; } static inline int acpi_subsys_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_freeze(struct device *dev) { return 0; } +static inline int acpi_subsys_freeze_late(struct device *dev) { return 0; } +static inline int acpi_subsys_freeze_noirq(struct device *dev) { return 0; } +static inline int acpi_subsys_thaw_noirq(struct device *dev) { return 0; } #endif #ifdef CONFIG_ACPI -- cgit v1.2.3 From 10738ba8e02bc8cb1c4e832611621aa9666f4a24 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 21:06:32 -0700 Subject: ide: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: linux-ide@vger.kernel.org Signed-off-by: Kees Cook --- drivers/ide/ide-io.c | 4 ++-- drivers/ide/ide-probe.c | 2 +- include/linux/ide.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 3a234701d92c..6f25da56a169 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -611,9 +611,9 @@ static int drive_is_ready(ide_drive_t *drive) * logic that wants cleaning up. 
*/ -void ide_timer_expiry (unsigned long data) +void ide_timer_expiry (struct timer_list *t) { - ide_hwif_t *hwif = (ide_hwif_t *)data; + ide_hwif_t *hwif = from_timer(hwif, t, timer); ide_drive_t *uninitialized_var(drive); ide_handler_t *handler; unsigned long flags; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 01b2adfd8226..27a2488c7468 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1184,7 +1184,7 @@ static void ide_init_port_data(ide_hwif_t *hwif, unsigned int index) spin_lock_init(&hwif->lock); - setup_timer(&hwif->timer, &ide_timer_expiry, (unsigned long)hwif); + timer_setup(&hwif->timer, ide_timer_expiry, 0); init_completion(&hwif->gendev_rel_comp); diff --git a/include/linux/ide.h b/include/linux/ide.h index dc152e4b7f73..cc412175d036 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1211,7 +1211,7 @@ extern int ide_wait_not_busy(ide_hwif_t *hwif, unsigned long timeout); extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout); -extern void ide_timer_expiry(unsigned long); +extern void ide_timer_expiry(struct timer_list *t); extern irqreturn_t ide_intr(int irq, void *dev_id); extern void do_ide_request(struct request_queue *); extern void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq); -- cgit v1.2.3 From 971e4aeeaa126b207d1e85eec6dc6e29bead316e Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sat, 23 Sep 2017 22:13:14 +0200 Subject: mtd: Fix C++ comment in include/linux/mtd/mtd.h C++ comments look wrong in kernel tree. Fix one. Signed-off-by: Pavel Machek Acked-by: Boris Brezillon Signed-off-by: Richard Weinberger --- include/linux/mtd/mtd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 6cd0f6b7658b..849543f1f233 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -267,7 +267,7 @@ struct mtd_info { */ unsigned int bitflip_threshold; - // Kernel-only stuff starts here. + /* Kernel-only stuff starts here. */ const char *name; int index; -- cgit v1.2.3 From 24a0c654d7d6063301c51361f911369264342b3c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 Oct 2017 15:38:54 -0500 Subject: PCI: Add for_each_pci_bridge() helper The following pattern is often used: list_for_each_entry(dev, &bus->devices, bus_list) { if (pci_is_bridge(dev)) { ... } } Add a for_each_pci_bridge() helper to make that code easier to write and read by reducing indentation level. It also saves one or a few lines of code in each occurrence. Convert PCI core parts here at the same time. 
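As a quick illustration, a hypothetical hotplug-style caller before and after the conversion (the helper itself is taken from the include/linux/pci.h hunk below; the surrounding functions are made up for the example):

	/* Before: open-coded walk with an explicit bridge check. */
	static void add_bridges_old(struct pci_bus *bus)
	{
		struct pci_dev *dev;

		list_for_each_entry(dev, &bus->devices, bus_list) {
			if (pci_is_bridge(dev))
				pci_hp_add_bridge(dev);
		}
	}

	/* After: the filter moves into the iterator, dropping one
	 * level of indentation. */
	static void add_bridges_new(struct pci_bus *bus)
	{
		struct pci_dev *dev;

		for_each_pci_bridge(dev, bus)
			pci_hp_add_bridge(dev);
	}

Note that the helper's definition uses the if (!pci_is_bridge(dev)) {} else form rather than a plain if: the empty then-branch means a body attached to the macro, braced or not, binds as callers expect, with no dangling-else ambiguity.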
Signed-off-by: Andy Shevchenko [bhelgaas: fold in http://lkml.kernel.org/r/20171013165352.25550-1-andriy.shevchenko@linux.intel.com] Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/acpiphp_glue.c | 15 ++++++--------- drivers/pci/hotplug/cpci_hotplug_pci.c | 7 ++----- drivers/pci/hotplug/pciehp_pci.c | 5 ++--- drivers/pci/hotplug/shpchp_pci.c | 6 ++---- drivers/pci/probe.c | 6 ++---- drivers/pci/setup-bus.c | 7 +++---- drivers/pcmcia/cardbus.c | 5 ++--- include/linux/pci.h | 4 ++++ 8 files changed, 23 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 5ed2dcaa8e27..5db6f1839dad 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -462,18 +462,15 @@ static void enable_slot(struct acpiphp_slot *slot) acpiphp_rescan_slot(slot); max = acpiphp_max_busnr(bus); for (pass = 0; pass < 2; pass++) { - list_for_each_entry(dev, &bus->devices, bus_list) { + for_each_pci_bridge(dev, bus) { if (PCI_SLOT(dev->devfn) != slot->device) continue; - if (pci_is_bridge(dev)) { - max = pci_scan_bridge(bus, dev, max, pass); - if (pass && dev->subordinate) { - check_hotplug_bridge(slot, dev); - pcibios_resource_survey_bus(dev->subordinate); - __pci_bus_size_bridges(dev->subordinate, - &add_list); - } + max = pci_scan_bridge(bus, dev, max, pass); + if (pass && dev->subordinate) { + check_hotplug_bridge(slot, dev); + pcibios_resource_survey_bus(dev->subordinate); + __pci_bus_size_bridges(dev->subordinate, &add_list); } } } diff --git a/drivers/pci/hotplug/cpci_hotplug_pci.c b/drivers/pci/hotplug/cpci_hotplug_pci.c index 80c80017197d..f616358fa938 100644 --- a/drivers/pci/hotplug/cpci_hotplug_pci.c +++ b/drivers/pci/hotplug/cpci_hotplug_pci.c @@ -286,14 +286,11 @@ int cpci_configure_slot(struct slot *slot) } parent = slot->dev->bus; - list_for_each_entry(dev, &parent->devices, bus_list) { - if (PCI_SLOT(dev->devfn) != PCI_SLOT(slot->devfn)) - continue; - if (pci_is_bridge(dev)) + for_each_pci_bridge(dev, parent) { + if (PCI_SLOT(dev->devfn) == PCI_SLOT(slot->devfn)) pci_hp_add_bridge(dev); } - pci_assign_unassigned_bridge_resources(parent->self); pci_bus_add_devices(parent); diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c index 19f30a9f461d..c3af027ee1a6 100644 --- a/drivers/pci/hotplug/pciehp_pci.c +++ b/drivers/pci/hotplug/pciehp_pci.c @@ -60,9 +60,8 @@ int pciehp_configure_device(struct slot *p_slot) goto out; } - list_for_each_entry(dev, &parent->devices, bus_list) - if (pci_is_bridge(dev)) - pci_hp_add_bridge(dev); + for_each_pci_bridge(dev, parent) + pci_hp_add_bridge(dev); pci_assign_unassigned_bridge_resources(bridge); pcie_bus_configure_settings(parent); diff --git a/drivers/pci/hotplug/shpchp_pci.c b/drivers/pci/hotplug/shpchp_pci.c index f8cd3a27e351..ea63db58b4b1 100644 --- a/drivers/pci/hotplug/shpchp_pci.c +++ b/drivers/pci/hotplug/shpchp_pci.c @@ -61,10 +61,8 @@ int shpchp_configure_device(struct slot *p_slot) goto out; } - list_for_each_entry(dev, &parent->devices, bus_list) { - if (PCI_SLOT(dev->devfn) != p_slot->device) - continue; - if (pci_is_bridge(dev)) + for_each_pci_bridge(dev, parent) { + if (PCI_SLOT(dev->devfn) == p_slot->device) pci_hp_add_bridge(dev); } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index ff94b69738a8..cdc2f83c11c5 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2421,10 +2421,8 @@ unsigned int pci_scan_child_bus(struct pci_bus *bus) } for (pass = 0; pass < 2; pass++) - 
list_for_each_entry(dev, &bus->devices, bus_list) { - if (pci_is_bridge(dev)) - max = pci_scan_bridge(bus, dev, max, pass); - } + for_each_pci_bridge(dev, bus) + max = pci_scan_bridge(bus, dev, max, pass); /* * Make sure a hotplug bridge has at least the minimum requested diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 958da7db9033..7ca03407404c 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1921,10 +1921,9 @@ void pci_assign_unassigned_bus_resources(struct pci_bus *bus) want additional resources */ down_read(&pci_bus_sem); - list_for_each_entry(dev, &bus->devices, bus_list) - if (pci_is_bridge(dev) && pci_has_subordinate(dev)) - __pci_bus_size_bridges(dev->subordinate, - &add_list); + for_each_pci_bridge(dev, bus) + if (pci_has_subordinate(dev)) + __pci_bus_size_bridges(dev->subordinate, &add_list); up_read(&pci_bus_sem); __pci_bus_assign_resources(bus, &add_list, NULL); BUG_ON(!list_empty(&add_list)); diff --git a/drivers/pcmcia/cardbus.c b/drivers/pcmcia/cardbus.c index 4fe4cc4ae19a..5c0170597037 100644 --- a/drivers/pcmcia/cardbus.c +++ b/drivers/pcmcia/cardbus.c @@ -77,9 +77,8 @@ int __ref cb_alloc(struct pcmcia_socket *s) max = bus->busn_res.start; for (pass = 0; pass < 2; pass++) - list_for_each_entry(dev, &bus->devices, bus_list) - if (pci_is_bridge(dev)) - max = pci_scan_bridge(bus, dev, max, pass); + for_each_pci_bridge(dev, bus) + max = pci_scan_bridge(bus, dev, max, pass); /* * Size all resources below the CardBus controller. diff --git a/include/linux/pci.h b/include/linux/pci.h index f4f8ee5a7362..3dbe947b4152 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -596,6 +596,10 @@ static inline bool pci_is_bridge(struct pci_dev *dev) dev->hdr_type == PCI_HEADER_TYPE_CARDBUS; } +#define for_each_pci_bridge(dev, bus) \ + list_for_each_entry(dev, &bus->devices, bus_list) \ + if (!pci_is_bridge(dev)) {} else + static inline struct pci_dev *pci_upstream_bridge(struct pci_dev *dev) { dev = pci_physfn(dev); -- cgit v1.2.3 From d0c2e691d1cbe43662df3a08a4933f13acc352c3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 6 Nov 2017 07:17:37 -0600 Subject: objtool: Add a comment for the unreachable annotation macros Add a comment for the unreachable annotation macros to explain their purpose and the '__COUNTER__' label hack. Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1570e48d9f87e0fc6f0126c32e7e1de6e109cb67.1509974104.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 202710420d6d..f8734fca54ce 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -187,6 +187,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_STACK_VALIDATION +/* + * These macros help objtool understand GCC code flow for unreachable code. + * The __COUNTER__ based labels are a hack to make each instance of the macros + * unique, to convince GCC not to merge duplicate inline asm statements. 
+ */ #define annotate_reachable() ({ \ asm("%c0:\n\t" \ ".pushsection .discard.reachable\n\t" \ -- cgit v1.2.3 From 10259821ac47dbefa6f83ae57f1fa9f1f2c54b3d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 6 Nov 2017 07:17:38 -0600 Subject: objtool: Make unreachable annotation inline asms explicitly volatile Add 'volatile' to the unreachable annotation macro inline asm statements. They're already implicitly volatile because they don't have output constraints, but it's clearer and more robust to make them explicitly volatile. Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/28659257b7a6adf4a7f65920dad70b2b0226e996.1509974104.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index f8734fca54ce..4fac29cdffd1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -193,16 +193,16 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * unique, to convince GCC not to merge duplicate inline asm statements. */ #define annotate_reachable() ({ \ - asm("%c0:\n\t" \ - ".pushsection .discard.reachable\n\t" \ - ".long %c0b - .\n\t" \ - ".popsection\n\t" : : "i" (__COUNTER__)); \ + asm volatile("%c0:\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ }) #define annotate_unreachable() ({ \ - asm("%c0:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long %c0b - .\n\t" \ - ".popsection\n\t" : : "i" (__COUNTER__)); \ + asm volatile("%c0:\n\t" \ + ".pushsection .discard.unreachable\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ }) #define ASM_UNREACHABLE \ "999:\n\t" \ -- cgit v1.2.3 From f791dd2589d7e217625d7e411621a5bac71cbf69 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 3 Nov 2017 18:59:48 +0800 Subject: locking/rwlocks: Fix comments - fix the list of locking API headers in kernel/locking/spinlock.c - fix an #endif comment Signed-off-by: Cheng Jian Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: huawei.libin@huawei.com Cc: xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1509706788-152547-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar --- include/linux/rwlock_api_smp.h | 2 +- kernel/locking/spinlock.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 5b9b84b20407..86ebb4bf9c6e 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -211,7 +211,7 @@ static inline void __raw_write_lock(rwlock_t *lock) LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } -#endif /* CONFIG_PREEMPT */ +#endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ static inline void __raw_write_unlock(rwlock_t *lock) { diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index b96343374a87..1fd1a7543cdd 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -30,7 +30,8 @@ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) /* * The __lock_function inlines are taken from - * include/linux/spinlock_api_smp.h + * spinlock : include/linux/spinlock_api_smp.h + * rwlock : include/linux/rwlock_api_smp.h */ #else -- cgit v1.2.3 From 
d8aa7eea78a1401cce39b3bb61ead0150044a3df Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:44 -0500 Subject: x86/mm: Add Secure Encrypted Virtualization (SEV) support Provide support for Secure Encrypted Virtualization (SEV). This initial support defines a flag that is used by the kernel to determine if it is running with SEV active. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Borislav Petkov Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20171020143059.3291-3-brijesh.singh@amd.com --- arch/x86/include/asm/mem_encrypt.h | 6 ++++++ arch/x86/mm/mem_encrypt.c | 26 ++++++++++++++++++++++++++ include/linux/mem_encrypt.h | 7 +++++-- 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 6a77c63540f7..2b024741bce9 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -47,6 +47,9 @@ void __init mem_encrypt_init(void); void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); +bool sme_active(void); +bool sev_active(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -64,6 +67,9 @@ static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(void) { } static inline void __init sme_enable(struct boot_params *bp) { } +static inline bool sme_active(void) { return false; } +static inline bool sev_active(void) { return false; } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 16c5f37933a2..add836d3d174 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -42,6 +42,8 @@ static char sme_cmdline_off[] __initdata = "off"; u64 sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +static bool sev_enabled __section(.data); + /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); @@ -192,6 +194,30 @@ void __init sme_early_init(void) protection_map[i] = pgprot_encrypted(protection_map[i]); } +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * sme_active() and sev_active() functions are used for this. When a + * distinction isn't needed, the mem_encrypt_active() function can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. 
+ */ +bool sme_active(void) +{ + return sme_me_mask && !sev_enabled; +} +EXPORT_SYMBOL_GPL(sme_active); + +bool sev_active(void) +{ + return sme_me_mask && sev_enabled; +} +EXPORT_SYMBOL_GPL(sev_active); + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 265a9cd21cb4..b310a9c18113 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -23,11 +23,14 @@ #define sme_me_mask 0ULL +static inline bool sme_active(void) { return false; } +static inline bool sev_active(void) { return false; } + #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ -static inline bool sme_active(void) +static inline bool mem_encrypt_active(void) { - return !!sme_me_mask; + return sme_me_mask; } static inline u64 sme_get_me_mask(void) -- cgit v1.2.3 From 1d2e733b13b450e5854f4a8f8efcd77fa7362d62 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:51 -0500 Subject: resource: Provide resource struct in resource walk callback In preparation for a new function that will need additional resource information during the resource walk, update the resource walk callback to pass the resource structure. Since the current callback start and end arguments are pulled from the resource structure, the callback functions can obtain them from the resource structure directly. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20171020143059.3291-10-brijesh.singh@amd.com --- arch/powerpc/kernel/machine_kexec_file_64.c | 12 +++++++++--- arch/x86/kernel/crash.c | 18 +++++++++--------- arch/x86/kernel/pmem.c | 2 +- include/linux/ioport.h | 4 ++-- include/linux/kexec.h | 2 +- kernel/kexec_file.c | 5 +++-- kernel/resource.c | 9 +++++---- 7 files changed, 30 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c index 992c0d258e5d..e4395f937d63 100644 --- a/arch/powerpc/kernel/machine_kexec_file_64.c +++ b/arch/powerpc/kernel/machine_kexec_file_64.c @@ -91,11 +91,13 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) * and that value will be returned. If all free regions are visited without * func returning non-zero, then zero will be returned. */ -int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) +int arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) { int ret = 0; u64 i; phys_addr_t mstart, mend; + struct resource res = { }; if (kbuf->top_down) { for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0, @@ -105,7 +107,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) * range while in kexec, end points to the last byte * in the range. */ - ret = func(mstart, mend - 1, kbuf); + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); if (ret) break; } @@ -117,7 +121,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) * range while in kexec, end points to the last byte * in the range. 
*/ - ret = func(mstart, mend - 1, kbuf); + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); if (ret) break; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 44404e2307bb..815008c9ca18 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE -static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } -static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) { struct crash_elf_data *ced = arg; Elf64_Ehdr *ehdr; @@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) ehdr = ced->ehdr; /* Exclude unwanted mem ranges */ - ret = elf_header_exclude_ranges(ced, start, end); + ret = elf_header_exclude_ranges(ced, res->start, res->end); if (ret) return ret; @@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) return 0; } -static int memmap_entry_callback(u64 start, u64 end, void *arg) +static int memmap_entry_callback(struct resource *res, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; struct e820_entry ei; - ei.addr = start; - ei.size = end - start + 1; + ei.addr = res->start; + ei.size = res->end - res->start + 1; ei.type = cmd->type; add_e820_entry(params, &ei); @@ -619,12 +619,12 @@ out: return ret; } -static int determine_backup_region(u64 start, u64 end, void *arg) +static int determine_backup_region(struct resource *res, void *arg) { struct kimage *image = arg; - image->arch.backup_src_start = start; - image->arch.backup_src_sz = end - start + 1; + image->arch.backup_src_start = res->start; + image->arch.backup_src_sz = res->end - res->start + 1; /* Expecting only one range for backup region */ return 1; diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 3fe690067802..6b07faaa1579 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -7,7 +7,7 @@ #include #include -static int found(u64 start, u64 end, void *data) +static int found(struct resource *res, void *data) { return 1; } diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 83c8d6530f0f..c0070d7c4b99 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -272,10 +272,10 @@ walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); extern int walk_system_ram_res(u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)); + int (*func)(struct resource *, void *)); extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, - void *arg, int (*func)(u64, u64, void *)); + void *arg, int (*func)(struct resource *, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1c08c925cefb..f16f6ceb3875 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -160,7 +160,7 @@ struct kexec_buf { }; int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(u64, u64, void *)); + int (*func)(struct resource *, void *)); extern int 
kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); #endif /* CONFIG_KEXEC_FILE */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9f48f4412297..e5bcd94c1efb 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -406,9 +406,10 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, return 1; } -static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +static int locate_mem_hole_callback(struct resource *res, void *arg) { struct kexec_buf *kbuf = (struct kexec_buf *)arg; + u64 start = res->start, end = res->end; unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ @@ -437,7 +438,7 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg) * func returning non-zero, then zero will be returned. */ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(u64, u64, void *)) + int (*func)(struct resource *, void *)) { if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, diff --git a/kernel/resource.c b/kernel/resource.c index 7323c1b636cd..8430042fa77b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -402,14 +402,15 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, bool first_level_children_only, - void *arg, int (*func)(u64, u64, void *)) + void *arg, + int (*func)(struct resource *, void *)) { u64 orig_end = res->end; int ret = -1; while ((res->start < res->end) && !find_next_iomem_res(res, desc, first_level_children_only)) { - ret = (*func)(res->start, res->end, arg); + ret = (*func)(res, arg); if (ret) break; @@ -435,7 +436,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, * and set it in 'desc' of a target resource entry. */ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, - u64 end, void *arg, int (*func)(u64, u64, void *)) + u64 end, void *arg, int (*func)(struct resource *, void *)) { struct resource res; @@ -454,7 +455,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)) + int (*func)(struct resource *, void *)) { struct resource res; -- cgit v1.2.3 From 0e4c12b45aa88e74fdda117896d2b61c4e510cb9 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:52 -0500 Subject: x86/mm, resource: Use PAGE_KERNEL protection for ioremap of memory pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order for memory pages to be properly mapped when SEV is active, it's necessary to use the PAGE_KERNEL protection attribute as the base protection. This ensures that memory mapping of, e.g. ACPI tables, receives the proper mapping attributes. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Laura Abbott Cc: Kees Cook Cc: kvm@vger.kernel.org Cc: Jérôme Glisse Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Andrew Morton Cc: Dan Williams Cc: "Kirill A. 
Shutemov" Link: https://lkml.kernel.org/r/20171020143059.3291-11-brijesh.singh@amd.com --- arch/x86/mm/ioremap.c | 79 ++++++++++++++++++++++++++++++++++++++++++-------- include/linux/ioport.h | 3 ++ kernel/resource.c | 19 ++++++++++++ 3 files changed, 89 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 52cc0f4ed494..6e4573b1da34 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -27,6 +27,11 @@ #include "physaddr.h" +struct ioremap_mem_flags { + bool system_ram; + bool desc_other; +}; + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -56,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } -static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, - void *arg) +static bool __ioremap_check_ram(struct resource *res) { + unsigned long start_pfn, stop_pfn; unsigned long i; - for (i = 0; i < nr_pages; ++i) - if (pfn_valid(start_pfn + i) && - !PageReserved(pfn_to_page(start_pfn + i))) - return 1; + if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) + return false; - return 0; + start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; + stop_pfn = (res->end + 1) >> PAGE_SHIFT; + if (stop_pfn > start_pfn) { + for (i = 0; i < (stop_pfn - start_pfn); ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return true; + } + + return false; +} + +static int __ioremap_check_desc_other(struct resource *res) +{ + return (res->desc != IORES_DESC_NONE); +} + +static int __ioremap_res_check(struct resource *res, void *arg) +{ + struct ioremap_mem_flags *flags = arg; + + if (!flags->system_ram) + flags->system_ram = __ioremap_check_ram(res); + + if (!flags->desc_other) + flags->desc_other = __ioremap_check_desc_other(res); + + return flags->system_ram && flags->desc_other; +} + +/* + * To avoid multiple resource walks, this function walks resources marked as + * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a + * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). + */ +static void __ioremap_check_mem(resource_size_t addr, unsigned long size, + struct ioremap_mem_flags *flags) +{ + u64 start, end; + + start = (u64)addr; + end = start + size - 1; + memset(flags, 0, sizeof(*flags)); + + walk_mem_res(start, end, flags, __ioremap_res_check); } /* @@ -87,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, enum page_cache_mode pcm, void *caller) { unsigned long offset, vaddr; - resource_size_t pfn, last_pfn, last_addr; + resource_size_t last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; + struct ioremap_mem_flags mem_flags; struct vm_struct *area; enum page_cache_mode new_pcm; pgprot_t prot; @@ -108,13 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } + __ioremap_check_mem(phys_addr, size, &mem_flags); + /* * Don't allow anybody to remap normal RAM that we're using.. 
*/ - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) { + if (mem_flags.system_ram) { WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", &phys_addr, &last_addr); return NULL; @@ -146,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pcm = new_pcm; } + /* + * If the page being mapped is in memory and SEV is active then + * make sure the memory encryption attribute is enabled in the + * resulting mapping. + */ prot = PAGE_KERNEL_IO; + if (sev_active() && mem_flags.desc_other) + prot = pgprot_encrypted(prot); + switch (pcm) { case _PAGE_CACHE_MODE_UC: default: diff --git a/include/linux/ioport.h b/include/linux/ioport.h index c0070d7c4b99..93b4183cf53d 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -271,6 +271,9 @@ extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); extern int +walk_mem_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int diff --git a/kernel/resource.c b/kernel/resource.c index 8430042fa77b..54ba6de3757c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -397,6 +397,8 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, res->start = p->start; if (res->end > p->end) res->end = p->end; + res->flags = p->flags; + res->desc = p->desc; return 0; } @@ -467,6 +469,23 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, arg, func); } +/* + * This function calls the @func callback against all memory ranges, which + * are ranges marked as IORESOURCE_MEM and IORESOURCE_BUSY. + */ +int walk_mem_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + arg, func); +} + #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* -- cgit v1.2.3 From ac26963a1175c813e3ed21c0d2435b083173136e Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 20 Oct 2017 09:30:57 -0500 Subject: percpu: Introduce DEFINE_PER_CPU_DECRYPTED KVM guest defines three per-CPU variables (steal-time, apf_reason, and kvm_pic_eoi) which are shared between a guest and a hypervisor. When SEV is active, memory is encrypted with a guest-specific key, and if the guest OS wants to share the memory region with the hypervisor then it must clear the C-bit (i.e., set decrypted) before sharing it. DEFINE_PER_CPU_DECRYPTED can be used to define the per-CPU variables which will be shared between a guest and a hypervisor. 
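As a rough sketch of the intended use, modeled on the steal-time variable named above (the actual KVM guest conversion is a separate patch, so this is illustrative only):

	#include <linux/percpu-defs.h>
	#include <asm/kvm_para.h>

	/* Lands in .data..percpu..decrypted; the linker script keeps that
	 * section page-aligned so only this data is mapped with the C-bit
	 * clear when SEV is active. */
	static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);

If either CONFIG_VIRTUALIZATION or CONFIG_AMD_MEM_ENCRYPT is unset, the macro falls back to plain DEFINE_PER_CPU, so callers need no #ifdef of their own.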
Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov Acked-by: Tejun Heo Reviewed-by: Borislav Petkov Cc: linux-arch@vger.kernel.org Cc: Tom Lendacky Cc: kvm@vger.kernel.org Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Lameter Link: https://lkml.kernel.org/r/20171020143059.3291-16-brijesh.singh@amd.com --- include/asm-generic/vmlinux.lds.h | 19 +++++++++++++++++++ include/linux/percpu-defs.h | 15 +++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 63e56f6c1877..c58f3805e348 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -777,6 +777,24 @@ #define INIT_RAM_FS #endif +/* + * Memory encryption operates on a page basis. Since we need to clear + * the memory encryption mask for this section, it needs to be aligned + * on a page boundary and be a page-size multiple in length. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#ifdef CONFIG_AMD_MEM_ENCRYPT +#define PERCPU_DECRYPTED_SECTION \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..decrypted) \ + . = ALIGN(PAGE_SIZE); +#else +#define PERCPU_DECRYPTED_SECTION +#endif + + /* * Default discarded sections. * @@ -815,6 +833,7 @@ . = ALIGN(cacheline); \ *(.data..percpu) \ *(.data..percpu..shared_aligned) \ + PERCPU_DECRYPTED_SECTION \ VMLINUX_SYMBOL(__per_cpu_end) = .; /** diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 8f16299ca068..2d2096ba1cfe 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -172,6 +172,21 @@ #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "..read_mostly") +/* + * Declaration/definition used for per-CPU variables that should be accessed + * as decrypted when memory encryption is enabled in the guest. + */ +#if defined(CONFIG_VIRTUALIZATION) && defined(CONFIG_AMD_MEM_ENCRYPT) + +#define DECLARE_PER_CPU_DECRYPTED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..decrypted") + +#define DEFINE_PER_CPU_DECRYPTED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..decrypted") +#else +#define DEFINE_PER_CPU_DECRYPTED(type, name) DEFINE_PER_CPU(type, name) +#endif + /* * Intermodule exports for per-CPU variables. sparse forgets about * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to -- cgit v1.2.3 From 3c377ef1000d57cb1faf8b86ea77cfa47141db33 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 2 Nov 2017 10:57:39 +0200 Subject: usb: core: rename usb_get_status() 'type' argument to 'recip' This makes it a lot clearer that we're expecting a recipient as the argument. A follow-up patch will use the argument 'type' as the status type selector (standard or ptm). 
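For illustration, a typical post-rename call site (the wrapper function here is hypothetical; the call itself mirrors the usb_suspend_both() usage converted later in this series):

	/* Read the device-level status bitmap (self-powered, remote wakeup). */
	static int query_device_status(struct usb_device *udev)
	{
		u16 devstat;

		/* The second argument now plainly names a recipient. */
		return usb_get_status(udev, USB_RECIP_DEVICE, 0, &devstat);
	}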
Signed-off-by: Felipe Balbi
Signed-off-by: Greg Kroah-Hartman
---
 drivers/usb/core/message.c | 6 +++---
 include/linux/usb.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index f35cbfa2b87b..7e95db70bba0 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -918,7 +918,7 @@ int usb_get_device_descriptor(struct usb_device *dev, unsigned int size)
 /**
 * usb_get_status - issues a GET_STATUS call
 * @dev: the device whose status is being checked
- * @type: USB_RECIP_*; for device, interface, or endpoint
+ * @recip: USB_RECIP_*; for device, interface, or endpoint
 * @target: zero (for device), else interface or endpoint number
 * @data: pointer to two bytes of bitmap data
 * Context: !in_interrupt ()
@@ -937,7 +937,7 @@ int usb_get_device_descriptor(struct usb_device *dev, unsigned int size)
 * Returns 0 and the status value in *@data (in host byte order) on success,
 * or else the status code from the underlying usb_control_msg() call.
 */
-int usb_get_status(struct usb_device *dev, int type, int target, void *data)
+int usb_get_status(struct usb_device *dev, int recip, int target, void *data)
 {
 int ret;
 __le16 *status = kmalloc(sizeof(*status), GFP_KERNEL);
@@ -946,7 +946,7 @@ int usb_get_status(struct usb_device *dev, int type, int target, void *data)
 return -ENOMEM;
 ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0),
- USB_REQ_GET_STATUS, USB_DIR_IN | type, USB_STATUS_TYPE_STANDARD,
+ USB_REQ_GET_STATUS, USB_DIR_IN | recip, USB_STATUS_TYPE_STANDARD,
 target, status, sizeof(*status), USB_CTRL_GET_TIMEOUT);
 if (ret == 2) {
diff --git a/include/linux/usb.h b/include/linux/usb.h
index cb9fbd54386e..2e2c1d40081b 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1766,7 +1766,7 @@ extern int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe,
 extern int usb_get_descriptor(struct usb_device *dev, unsigned char desctype,
 unsigned char descindex, void *buf, int size);
 extern int usb_get_status(struct usb_device *dev,
- int type, int target, void *data);
+ int recip, int target, void *data);
 extern int usb_string(struct usb_device *dev, int index,
 char *buf, size_t size);
-- 
cgit v1.2.3 

From d9e1e1484ade396b3a979ba6c68798dbaceed1b9 Mon Sep 17 00:00:00 2001
From: Felipe Balbi
Date: Thu, 2 Nov 2017 10:57:40 +0200
Subject: usb: core: introduce a new usb_get_std_status() helper

This new helper is a simple wrapper around usb_get_status(). This patch
is in preparation for adding support for fetching PTM_STATUS types. No
functional changes.
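The call-site conversion, as the hunks below show, is mechanical; a
one-line sketch:

	/* before */
	ret = usb_get_status(udev, USB_RECIP_DEVICE, 0, &devstat);

	/* after: same semantics, but explicitly the standard status type */
	ret = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstat);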
Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/staging/wlan-ng/hfa384x_usb.c | 4 ++-- drivers/usb/core/driver.c | 4 ++-- drivers/usb/core/hub.c | 8 ++++---- drivers/usb/misc/usbtest.c | 8 ++++---- include/linux/usb.h | 7 +++++++ 5 files changed, 19 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/wlan-ng/hfa384x_usb.c b/drivers/staging/wlan-ng/hfa384x_usb.c index d1e8218f96fb..d24f5eba9375 100644 --- a/drivers/staging/wlan-ng/hfa384x_usb.c +++ b/drivers/staging/wlan-ng/hfa384x_usb.c @@ -2460,7 +2460,7 @@ int hfa384x_drvr_start(struct hfa384x *hw) * ok */ result = - usb_get_status(hw->usb, USB_RECIP_ENDPOINT, hw->endp_in, &status); + usb_get_std_status(hw->usb, USB_RECIP_ENDPOINT, hw->endp_in, &status); if (result < 0) { netdev_err(hw->wlandev->netdev, "Cannot get bulk in endpoint status.\n"); goto done; @@ -2469,7 +2469,7 @@ int hfa384x_drvr_start(struct hfa384x *hw) netdev_err(hw->wlandev->netdev, "Failed to reset bulk in endpoint.\n"); result = - usb_get_status(hw->usb, USB_RECIP_ENDPOINT, hw->endp_out, &status); + usb_get_std_status(hw->usb, USB_RECIP_ENDPOINT, hw->endp_out, &status); if (result < 0) { netdev_err(hw->wlandev->netdev, "Cannot get bulk out endpoint status.\n"); goto done; diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 4b941a3a746e..64262a9a8829 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -1340,8 +1340,8 @@ static int usb_suspend_both(struct usb_device *udev, pm_message_t msg) int err; u16 devstat; - err = usb_get_status(udev, USB_RECIP_DEVICE, 0, - &devstat); + err = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, + &devstat); if (err) { dev_err(&udev->dev, "Failed to suspend device, error %d\n", diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 3edcaf105692..7ccdd3d4db84 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1482,7 +1482,7 @@ static int hub_configure(struct usb_hub *hub, /* power budgeting mostly matters with bus-powered hubs, * and battery-powered root hubs (may provide just 8 mA). 
*/ - ret = usb_get_status(hdev, USB_RECIP_DEVICE, 0, &hubstatus); + ret = usb_get_std_status(hdev, USB_RECIP_DEVICE, 0, &hubstatus); if (ret) { message = "can't get hub status"; goto fail; @@ -3279,7 +3279,7 @@ static int finish_port_resume(struct usb_device *udev) */ if (status == 0) { devstatus = 0; - status = usb_get_status(udev, USB_RECIP_DEVICE, 0, &devstatus); + status = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstatus); /* If a normal resume failed, try doing a reset-resume */ if (status && !udev->reset_resume && udev->persist_enabled) { @@ -3303,7 +3303,7 @@ static int finish_port_resume(struct usb_device *udev) if (devstatus & (1 << USB_DEVICE_REMOTE_WAKEUP)) status = usb_disable_remote_wakeup(udev); } else { - status = usb_get_status(udev, USB_RECIP_INTERFACE, 0, + status = usb_get_std_status(udev, USB_RECIP_INTERFACE, 0, &devstatus); if (!status && devstatus & (USB_INTRF_STAT_FUNC_RW_CAP | USB_INTRF_STAT_FUNC_RW)) @@ -4866,7 +4866,7 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, && udev->bus_mA <= unit_load) { u16 devstat; - status = usb_get_status(udev, USB_RECIP_DEVICE, 0, + status = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstat); if (status) { dev_dbg(&udev->dev, "get status %d ?\n", status); diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c index aeda01f037c1..203f29894bce 100644 --- a/drivers/usb/misc/usbtest.c +++ b/drivers/usb/misc/usbtest.c @@ -1016,7 +1016,7 @@ static int ch9_postconfig(struct usbtest_dev *dev) /* FIXME fetch strings from at least the device descriptor */ /* [9.4.5] get_status always works */ - retval = usb_get_status(udev, USB_RECIP_DEVICE, 0, dev->buf); + retval = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, dev->buf); if (retval) { dev_err(&iface->dev, "get dev status --> %d\n", retval); return retval; @@ -1026,7 +1026,7 @@ static int ch9_postconfig(struct usbtest_dev *dev) * the device's remote wakeup feature ... 
if we can, test that here */
- retval = usb_get_status(udev, USB_RECIP_INTERFACE,
+ retval = usb_get_std_status(udev, USB_RECIP_INTERFACE,
 iface->altsetting[0].desc.bInterfaceNumber, dev->buf);
 if (retval) {
 dev_err(&iface->dev, "get interface status --> %d\n", retval);
@@ -1615,7 +1615,7 @@ static int verify_not_halted(struct usbtest_dev *tdev, int ep, struct urb *urb)
 u16 status;
 /* shouldn't look or act halted */
- retval = usb_get_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status);
+ retval = usb_get_std_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status);
 if (retval < 0) {
 ERROR(tdev, "ep %02x couldn't get no-halt status, %d\n",
 ep, retval);
@@ -1637,7 +1637,7 @@ static int verify_halted(struct usbtest_dev *tdev, int ep, struct urb *urb)
 u16 status;
 /* should look and act halted */
- retval = usb_get_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status);
+ retval = usb_get_std_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status);
 if (retval < 0) {
 ERROR(tdev, "ep %02x couldn't get halt status, %d\n",
 ep, retval);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 2e2c1d40081b..e86b6a2a35e4 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1767,6 +1767,13 @@ extern int usb_get_descriptor(struct usb_device *dev, unsigned char desctype,
 unsigned char descindex, void *buf, int size);
 extern int usb_get_status(struct usb_device *dev,
 int recip, int target, void *data);
+
+static inline int usb_get_std_status(struct usb_device *dev,
+ int recip, int target, void *data)
+{
+ return usb_get_status(dev, recip, target, data);
+}
+
 extern int usb_string(struct usb_device *dev, int index,
 char *buf, size_t size);
-- 
cgit v1.2.3 

From 2e43f0fe379c317d1ca27a69d860e397682ce957 Mon Sep 17 00:00:00 2001
From: Felipe Balbi
Date: Thu, 2 Nov 2017 10:57:41 +0200
Subject: usb: core: add a 'type' parameter to usb_get_status()

This new 'type' parameter allows interested drivers to request either
PTM or standard status.

Signed-off-by: Felipe Balbi
Signed-off-by: Greg Kroah-Hartman
---
 drivers/usb/core/message.c | 47 +++++++++++++++++++++++++++++++++++++++-------
 include/linux/usb.h | 5 +++--
 2 files changed, 43 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 7e95db70bba0..ebaea514161f 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -919,6 +919,7 @@ int usb_get_device_descriptor(struct usb_device *dev, unsigned int size)
 * usb_get_status - issues a GET_STATUS call
 * @dev: the device whose status is being checked
 * @recip: USB_RECIP_*; for device, interface, or endpoint
+ * @type: USB_STATUS_TYPE_*; for standard or PTM status types
 * @target: zero (for device), else interface or endpoint number
 * @data: pointer to two bytes of bitmap data
 * Context: !in_interrupt ()
@@ -937,24 +938,56 @@ int usb_get_device_descriptor(struct usb_device *dev, unsigned int size)
 * Returns 0 and the status value in *@data (in host byte order) on success,
 * or else the status code from the underlying usb_control_msg() call.
 */
-int usb_get_status(struct usb_device *dev, int recip, int target, void *data)
+int usb_get_status(struct usb_device *dev, int recip, int type, int target,
+ void *data)
 {
 int ret;
- __le16 *status = kmalloc(sizeof(*status), GFP_KERNEL);
+ void *status;
+ int length;
+
+ switch (type) {
+ case USB_STATUS_TYPE_STANDARD:
+ length = 2;
+ break;
+ case USB_STATUS_TYPE_PTM:
+ if (recip != USB_RECIP_DEVICE)
+ return -EINVAL;
+
+ length = 4;
+ break;
+ default:
+ return -EINVAL;
+ }
+ status = kmalloc(length, GFP_KERNEL);
 if (!status)
 return -ENOMEM;
 ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0),
 USB_REQ_GET_STATUS, USB_DIR_IN | recip, USB_STATUS_TYPE_STANDARD,
- target, status, sizeof(*status), USB_CTRL_GET_TIMEOUT);
+ target, status, length, USB_CTRL_GET_TIMEOUT);
+
+ switch (ret) {
+ case 4:
+ if (type != USB_STATUS_TYPE_PTM) {
+ ret = -EIO;
+ break;
+ }
+
+ *(u32 *) data = le32_to_cpu(*(__le32 *) status);
+ break;
+ case 2:
+ if (type != USB_STATUS_TYPE_STANDARD) {
+ ret = -EIO;
+ break;
+ }
- if (ret == 2) {
- *(u16 *) data = le16_to_cpu(*status);
- ret = 0;
- } else if (ret >= 0) {
+ *(u16 *) data = le16_to_cpu(*(__le16 *) status);
+ break;
+ default:
 ret = -EIO;
 }
+
 kfree(status);
 return ret;
 }
diff --git a/include/linux/usb.h b/include/linux/usb.h
index e86b6a2a35e4..6228d8177bfe 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1766,12 +1766,13 @@ extern int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe,
 extern int usb_get_descriptor(struct usb_device *dev, unsigned char desctype,
 unsigned char descindex, void *buf, int size);
 extern int usb_get_status(struct usb_device *dev,
- int recip, int target, void *data);
+ int recip, int type, int target, void *data);
 static inline int usb_get_std_status(struct usb_device *dev,
 int recip, int target, void *data)
 {
- return usb_get_status(dev, recip, target, data);
+ return usb_get_status(dev, recip, USB_STATUS_TYPE_STANDARD, target,
+ data);
 }
 extern int usb_string(struct usb_device *dev, int index,
 char *buf, size_t size);
-- 
cgit v1.2.3 

From f8f3e4acbde3dff2e433d02476034606e07ac742 Mon Sep 17 00:00:00 2001
From: Felipe Balbi
Date: Thu, 2 Nov 2017 10:57:42 +0200
Subject: usb: core: add a new usb_get_ptm_status() helper

Drivers who are interested in the PTM status type should use this new
helper to make sure they issue the correct GetStatus message.

Signed-off-by: Felipe Balbi
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/usb.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 6228d8177bfe..3555936025ec 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1775,6 +1775,12 @@ static inline int usb_get_std_status(struct usb_device *dev,
 data);
 }
+static inline int usb_get_ptm_status(struct usb_device *dev, void *data)
+{
+ return usb_get_status(dev, USB_RECIP_DEVICE, USB_STATUS_TYPE_PTM,
+ 0, data);
+}
+
 extern int usb_string(struct usb_device *dev, int index,
 char *buf, size_t size);
-- 
cgit v1.2.3 

From 5b143d2a6edeae59700420c948f7d793da3356a8 Mon Sep 17 00:00:00 2001
From: Sebastien Bourdelin
Date: Wed, 1 Nov 2017 13:14:33 -0400
Subject: bus: add driver for the Technologic Systems NBUS

This driver implements a GPIO bit-banged bus, called the NBUS by
Technologic Systems. It is used to communicate with the peripherals in
the FPGA on the TS-4600 SoM.
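A driver for one of those FPGA peripherals could then use the exported
API along these lines; a minimal sketch with a hypothetical register
address, assuming the peripheral was populated as a child of the
ts-nbus device (see of_platform_populate() in the probe routine below):

	#include <linux/module.h>
	#include <linux/platform_device.h>
	#include <linux/ts-nbus.h>

	#define MY_FPGA_CTRL	0x12	/* hypothetical FPGA register address */

	static int my_peripheral_probe(struct platform_device *pdev)
	{
		/* the parent is the ts-nbus device; its drvdata is the bus handle */
		struct ts_nbus *nbus = dev_get_drvdata(pdev->dev.parent);
		u16 val;
		int ret;

		ret = ts_nbus_read(nbus, MY_FPGA_CTRL, &val);
		if (ret < 0)
			return ret;

		/* set a hypothetical enable bit and write the register back */
		return ts_nbus_write(nbus, MY_FPGA_CTRL, val | 0x1);
	}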
Signed-off-by: Sebastien Bourdelin
Acked-by: Linus Walleij
Signed-off-by: Arnd Bergmann
---
 drivers/bus/Kconfig | 8 ++
 drivers/bus/Makefile | 1 +
 drivers/bus/ts-nbus.c | 375 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ts-nbus.h | 18 +++
 4 files changed, 402 insertions(+)
 create mode 100644 drivers/bus/ts-nbus.c
 create mode 100644 include/linux/ts-nbus.h

(limited to 'include/linux')

diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig
index ae3d8f3444b9..854348e646b3 100644
--- a/drivers/bus/Kconfig
+++ b/drivers/bus/Kconfig
@@ -157,6 +157,14 @@ config TEGRA_GMI
 Driver for the Tegra Generic Memory Interface bus which can be used
 to attach devices such as NOR, UART, FPGA and more.
+config TS_NBUS
+ tristate "Technologic Systems NBUS Driver"
+ depends on SOC_IMX28
+ depends on OF_GPIO && PWM
+ help
+ Driver for the Technologic Systems NBUS which is used to interface
+ with the peripherals in the FPGA of the TS-4600 SoM.
+
 config UNIPHIER_SYSTEM_BUS
 tristate "UniPhier System Bus driver"
 depends on ARCH_UNIPHIER && OF
diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile
index cc6364bec054..72377f77651c 100644
--- a/drivers/bus/Makefile
+++ b/drivers/bus/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SUNXI_RSB) += sunxi-rsb.o
 obj-$(CONFIG_SIMPLE_PM_BUS) += simple-pm-bus.o
 obj-$(CONFIG_TEGRA_ACONNECT) += tegra-aconnect.o
 obj-$(CONFIG_TEGRA_GMI) += tegra-gmi.o
+obj-$(CONFIG_TS_NBUS) += ts-nbus.o
 obj-$(CONFIG_UNIPHIER_SYSTEM_BUS) += uniphier-system-bus.o
 obj-$(CONFIG_VEXPRESS_CONFIG) += vexpress-config.o
diff --git a/drivers/bus/ts-nbus.c b/drivers/bus/ts-nbus.c
new file mode 100644
index 000000000000..073fd9011154
--- /dev/null
+++ b/drivers/bus/ts-nbus.c
@@ -0,0 +1,375 @@
+/*
+ * NBUS driver for TS-4600 based boards
+ *
+ * Copyright (c) 2016 - Savoir-faire Linux
+ * Author: Sebastien Bourdelin
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ *
+ * This driver implements a GPIO bit-banged bus, called the NBUS by Technologic
+ * Systems. It is used to communicate with the peripherals in the FPGA on the
+ * TS-4600 SoM.
+ */
+
+#include <linux/bitops.h>
+#include <linux/gpio/consumer.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pwm.h>
+#include <linux/ts-nbus.h>
+
+#define TS_NBUS_DIRECTION_IN 0
+#define TS_NBUS_DIRECTION_OUT 1
+#define TS_NBUS_WRITE_ADR 0
+#define TS_NBUS_WRITE_VAL 1
+
+struct ts_nbus {
+ struct pwm_device *pwm;
+ struct gpio_descs *data;
+ struct gpio_desc *csn;
+ struct gpio_desc *txrx;
+ struct gpio_desc *strobe;
+ struct gpio_desc *ale;
+ struct gpio_desc *rdy;
+ struct mutex lock;
+};
+
+/*
+ * request all gpios required by the bus.
+ */
+static int ts_nbus_init_pdata(struct platform_device *pdev, struct ts_nbus
+ *ts_nbus)
+{
+ ts_nbus->data = devm_gpiod_get_array(&pdev->dev, "ts,data",
+ GPIOD_OUT_HIGH);
+ if (IS_ERR(ts_nbus->data)) {
+ dev_err(&pdev->dev, "failed to retrieve ts,data-gpio from dts\n");
+ return PTR_ERR(ts_nbus->data);
+ }
+
+ ts_nbus->csn = devm_gpiod_get(&pdev->dev, "ts,csn", GPIOD_OUT_HIGH);
+ if (IS_ERR(ts_nbus->csn)) {
+ dev_err(&pdev->dev, "failed to retrieve ts,csn-gpio from dts\n");
+ return PTR_ERR(ts_nbus->csn);
+ }
+
+ ts_nbus->txrx = devm_gpiod_get(&pdev->dev, "ts,txrx", GPIOD_OUT_HIGH);
+ if (IS_ERR(ts_nbus->txrx)) {
+ dev_err(&pdev->dev, "failed to retrieve ts,txrx-gpio from dts\n");
+ return PTR_ERR(ts_nbus->txrx);
+ }
+
+ ts_nbus->strobe = devm_gpiod_get(&pdev->dev, "ts,strobe", GPIOD_OUT_HIGH);
+ if (IS_ERR(ts_nbus->strobe)) {
+ dev_err(&pdev->dev, "failed to retrieve ts,strobe-gpio from dts\n");
+ return PTR_ERR(ts_nbus->strobe);
+ }
+
+ ts_nbus->ale = devm_gpiod_get(&pdev->dev, "ts,ale", GPIOD_OUT_HIGH);
+ if (IS_ERR(ts_nbus->ale)) {
+ dev_err(&pdev->dev, "failed to retrieve ts,ale-gpio from dts\n");
+ return PTR_ERR(ts_nbus->ale);
+ }
+
+ ts_nbus->rdy = devm_gpiod_get(&pdev->dev, "ts,rdy", GPIOD_IN);
+ if (IS_ERR(ts_nbus->rdy)) {
+ dev_err(&pdev->dev, "failed to retrieve ts,rdy-gpio from dts\n");
+ return PTR_ERR(ts_nbus->rdy);
+ }
+
+ return 0;
+}
+
+/*
+ * the data gpios are used for reading and writing values; their directions
+ * should be adjusted accordingly.
+ */
+static void ts_nbus_set_direction(struct ts_nbus *ts_nbus, int direction)
+{
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ if (direction == TS_NBUS_DIRECTION_IN)
+ gpiod_direction_input(ts_nbus->data->desc[i]);
+ else
+ /* when used as output the default state of the data
+ * lines is set to high */
+ gpiod_direction_output(ts_nbus->data->desc[i], 1);
+ }
+}
+
+/*
+ * reset the bus to its initial state.
+ * The data, csn, strobe and ale lines must be zeroed to let the FPGA know a
+ * new transaction can be processed.
+ */
+static void ts_nbus_reset_bus(struct ts_nbus *ts_nbus)
+{
+ int i;
+ int values[8];
+
+ for (i = 0; i < 8; i++)
+ values[i] = 0;
+
+ gpiod_set_array_value_cansleep(8, ts_nbus->data->desc, values);
+ gpiod_set_value_cansleep(ts_nbus->csn, 0);
+ gpiod_set_value_cansleep(ts_nbus->strobe, 0);
+ gpiod_set_value_cansleep(ts_nbus->ale, 0);
+}
+
+/*
+ * let the FPGA know it can proceed.
+ */
+static void ts_nbus_start_transaction(struct ts_nbus *ts_nbus)
+{
+ gpiod_set_value_cansleep(ts_nbus->strobe, 1);
+}
+
+/*
+ * read a byte value from the data gpios.
+ * return 0 on success or negative errno on failure.
+ */
+static int ts_nbus_read_byte(struct ts_nbus *ts_nbus, u8 *val)
+{
+ struct gpio_descs *gpios = ts_nbus->data;
+ int ret, i;
+
+ *val = 0;
+ for (i = 0; i < 8; i++) {
+ ret = gpiod_get_value_cansleep(gpios->desc[i]);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ *val |= BIT(i);
+ }
+
+ return 0;
+}
+
+/*
+ * set the data gpios according to the byte value.
+ */
+static void ts_nbus_write_byte(struct ts_nbus *ts_nbus, u8 byte)
+{
+ struct gpio_descs *gpios = ts_nbus->data;
+ int i;
+ int values[8];
+
+ for (i = 0; i < 8; i++)
+ if (byte & BIT(i))
+ values[i] = 1;
+ else
+ values[i] = 0;
+
+ gpiod_set_array_value_cansleep(8, gpios->desc, values);
+}
+
+/*
+ * reading the bus consists of resetting the bus, then notifying the FPGA to
+ * send the data on the data gpios and return the read value.
+ * return 0 on success or negative errno on failure.
+ */
+static int ts_nbus_read_bus(struct ts_nbus *ts_nbus, u8 *val)
+{
+ ts_nbus_reset_bus(ts_nbus);
+ ts_nbus_start_transaction(ts_nbus);
+
+ return ts_nbus_read_byte(ts_nbus, val);
+}
+
+/*
+ * writing to the bus consists of resetting the bus, then defining the type of
+ * command (address/value), writing the data and notifying the FPGA to retrieve
+ * the value from the data gpios.
+ */
+static void ts_nbus_write_bus(struct ts_nbus *ts_nbus, int cmd, u8 val)
+{
+ ts_nbus_reset_bus(ts_nbus);
+
+ if (cmd == TS_NBUS_WRITE_ADR)
+ gpiod_set_value_cansleep(ts_nbus->ale, 1);
+
+ ts_nbus_write_byte(ts_nbus, val);
+ ts_nbus_start_transaction(ts_nbus);
+}
+
+/*
+ * read the value in the FPGA register at the given address.
+ * return 0 on success or negative errno on failure.
+ */
+int ts_nbus_read(struct ts_nbus *ts_nbus, u8 adr, u16 *val)
+{
+ int ret, i;
+ u8 byte;
+
+ /* bus access must be atomic */
+ mutex_lock(&ts_nbus->lock);
+
+ /* set the bus in read mode */
+ gpiod_set_value_cansleep(ts_nbus->txrx, 0);
+
+ /* write address */
+ ts_nbus_write_bus(ts_nbus, TS_NBUS_WRITE_ADR, adr);
+
+ /* set the data gpios direction as input before reading */
+ ts_nbus_set_direction(ts_nbus, TS_NBUS_DIRECTION_IN);
+
+ /* reading value MSB first */
+ do {
+ *val = 0;
+ byte = 0;
+ for (i = 1; i >= 0; i--) {
+ /* read a byte from the bus, leave on error */
+ ret = ts_nbus_read_bus(ts_nbus, &byte);
+ if (ret < 0)
+ goto err;
+
+ /* append the byte read to the final value */
+ *val |= byte << (i * 8);
+ }
+ gpiod_set_value_cansleep(ts_nbus->csn, 1);
+ ret = gpiod_get_value_cansleep(ts_nbus->rdy);
+ } while (ret);
+
+err:
+ /* restore the data gpios direction as output after reading */
+ ts_nbus_set_direction(ts_nbus, TS_NBUS_DIRECTION_OUT);
+
+ mutex_unlock(&ts_nbus->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ts_nbus_read);
+
+/*
+ * write the desired value in the FPGA register at the given address.
+ */
+int ts_nbus_write(struct ts_nbus *ts_nbus, u8 adr, u16 val)
+{
+ int i;
+
+ /* bus access must be atomic */
+ mutex_lock(&ts_nbus->lock);
+
+ /* set the bus in write mode */
+ gpiod_set_value_cansleep(ts_nbus->txrx, 1);
+
+ /* write address */
+ ts_nbus_write_bus(ts_nbus, TS_NBUS_WRITE_ADR, adr);
+
+ /* writing value MSB first */
+ for (i = 1; i >= 0; i--)
+ ts_nbus_write_bus(ts_nbus, TS_NBUS_WRITE_VAL, (u8)(val >> (i * 8)));
+
+ /* wait for completion */
+ gpiod_set_value_cansleep(ts_nbus->csn, 1);
+ while (gpiod_get_value_cansleep(ts_nbus->rdy) != 0) {
+ gpiod_set_value_cansleep(ts_nbus->csn, 0);
+ gpiod_set_value_cansleep(ts_nbus->csn, 1);
+ }
+
+ mutex_unlock(&ts_nbus->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ts_nbus_write);
+
+static int ts_nbus_probe(struct platform_device *pdev)
+{
+ struct pwm_device *pwm;
+ struct pwm_args pargs;
+ struct device *dev = &pdev->dev;
+ struct ts_nbus *ts_nbus;
+ int ret;
+
+ ts_nbus = devm_kzalloc(dev, sizeof(*ts_nbus), GFP_KERNEL);
+ if (!ts_nbus)
+ return -ENOMEM;
+
+ mutex_init(&ts_nbus->lock);
+
+ ret = ts_nbus_init_pdata(pdev, ts_nbus);
+ if (ret < 0)
+ return ret;
+
+ pwm = devm_pwm_get(dev, NULL);
+ if (IS_ERR(pwm)) {
+ ret = PTR_ERR(pwm);
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "unable to request PWM\n");
+ return ret;
+ }
+
+ pwm_get_args(pwm, &pargs);
+ if (!pargs.period) {
+ dev_err(&pdev->dev, "invalid PWM period\n");
+ return -EINVAL;
+ }
+
+ /*
+ * FIXME: pwm_apply_args() should be removed when switching to
+ * the atomic PWM API.
+ */
+ pwm_apply_args(pwm);
+ ret = pwm_config(pwm, pargs.period, pargs.period);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * we can now start the FPGA and populate the peripherals.
+ */
+ pwm_enable(pwm);
+ ts_nbus->pwm = pwm;
+
+ /*
+ * let the child nodes retrieve this instance of the ts-nbus.
+ */
+ dev_set_drvdata(dev, ts_nbus);
+
+ ret = of_platform_populate(dev->of_node, NULL, NULL, dev);
+ if (ret < 0)
+ return ret;
+
+ dev_info(dev, "initialized\n");
+
+ return 0;
+}
+
+static int ts_nbus_remove(struct platform_device *pdev)
+{
+ struct ts_nbus *ts_nbus = dev_get_drvdata(&pdev->dev);
+
+ /* shutdown the FPGA */
+ mutex_lock(&ts_nbus->lock);
+ pwm_disable(ts_nbus->pwm);
+ mutex_unlock(&ts_nbus->lock);
+
+ return 0;
+}
+
+static const struct of_device_id ts_nbus_of_match[] = {
+ { .compatible = "technologic,ts-nbus", },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ts_nbus_of_match);
+
+static struct platform_driver ts_nbus_driver = {
+ .probe = ts_nbus_probe,
+ .remove = ts_nbus_remove,
+ .driver = {
+ .name = "ts_nbus",
+ .of_match_table = ts_nbus_of_match,
+ },
+};
+
+module_platform_driver(ts_nbus_driver);
+
+MODULE_ALIAS("platform:ts_nbus");
+MODULE_AUTHOR("Sebastien Bourdelin ");
+MODULE_DESCRIPTION("Technologic Systems NBUS");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/ts-nbus.h b/include/linux/ts-nbus.h
new file mode 100644
index 000000000000..5bd4c822f7cf
--- /dev/null
+++ b/include/linux/ts-nbus.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016 - Savoir-faire Linux
+ * Author: Sebastien Bourdelin
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _TS_NBUS_H
+#define _TS_NBUS_H
+
+struct ts_nbus;
+
+extern int ts_nbus_read(struct ts_nbus *ts_nbus, u8 adr, u16 *val);
+extern int ts_nbus_write(struct ts_nbus *ts_nbus, u8 adr, u16 val);
+
+#endif /* _TS_NBUS_H */
-- 
cgit v1.2.3 

From 7c8d469877b16d2c1cecf101a0abb7b218db85bc Mon Sep 17 00:00:00 2001
From: Nicolai Stange
Date: Tue, 31 Oct 2017 00:15:47 +0100
Subject: debugfs: add support for more elaborate ->d_fsdata

Currently, the user provided fops, "real_fops", are stored directly into
->d_fsdata.

In order to be able to store more per-file state and thus prepare for
more granular file removal protection, wrap the real_fops into a
dynamically allocated container struct, debugfs_fsdata.

A struct debugfs_fsdata gets allocated at file creation and freed from
the newly introduced ->d_release().

Finally, move the implementation of debugfs_real_fops() out of the
public debugfs header such that struct debugfs_fsdata's declaration can
be kept private.
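The net effect for ->d_fsdata consumers is one extra indirection; a
sketch of the before/after access pattern taken from the hunks below:

	/* before this patch */
	const struct file_operations *real_fops = dentry->d_fsdata;

	/* after this patch */
	struct debugfs_fsdata *fsd = dentry->d_fsdata;
	const struct file_operations *real_fops = fsd->real_fops;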
Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 12 ++++++++++++ fs/debugfs/inode.c | 22 +++++++++++++++++++--- fs/debugfs/internal.h | 4 ++++ include/linux/debugfs.h | 20 +++----------------- 4 files changed, 38 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 6dabc4a10396..b6f5ddab66bf 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -97,6 +97,18 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish); #define F_DENTRY(filp) ((filp)->f_path.dentry) +const struct file_operations *debugfs_real_fops(const struct file *filp) + __must_hold(&debugfs_srcu) +{ + struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; + /* + * Neither the pointer to the struct file_operations, nor its + * contents ever change -- srcu_dereference() is not needed here. + */ + return fsd->real_fops; +} +EXPORT_SYMBOL_GPL(debugfs_real_fops); + static int open_proxy_open(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c59f015f386e..a9c3d3e9af39 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -185,6 +185,11 @@ static const struct super_operations debugfs_super_operations = { .evict_inode = debugfs_evict_inode, }; +static void debugfs_release_dentry(struct dentry *dentry) +{ + kfree(dentry->d_fsdata); +} + static struct vfsmount *debugfs_automount(struct path *path) { debugfs_automount_t f; @@ -194,6 +199,7 @@ static struct vfsmount *debugfs_automount(struct path *path) static const struct dentry_operations debugfs_dops = { .d_delete = always_delete_dentry, + .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; @@ -341,24 +347,34 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, { struct dentry *dentry; struct inode *inode; + struct debugfs_fsdata *fsd; + + fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); + if (!fsd) + return NULL; if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = start_creating(name, parent); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + kfree(fsd); return NULL; + } inode = debugfs_get_inode(dentry->d_sb); - if (unlikely(!inode)) + if (unlikely(!inode)) { + kfree(fsd); return failed_creating(dentry); + } inode->i_mode = mode; inode->i_private = data; inode->i_fop = proxy_fops; - dentry->d_fsdata = (void *)real_fops; + fsd->real_fops = real_fops; + dentry->d_fsdata = fsd; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index b3e8443a1f47..512601eed3ce 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -19,4 +19,8 @@ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; +struct debugfs_fsdata { + const struct file_operations *real_fops; +}; + #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index b93efc8feecd..cbee5f4a02a3 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -45,23 +45,6 @@ extern struct dentry *arch_debugfs_dir; extern struct srcu_struct debugfs_srcu; -/** - * debugfs_real_fops - getter for the real file operation - * @filp: a pointer to a struct file - * - * Must only be called under the protection established by - * debugfs_use_file_start(). 
- */
-static inline const struct file_operations *debugfs_real_fops(const struct file *filp)
- __must_hold(&debugfs_srcu)
-{
- /*
- * Neither the pointer to the struct file_operations, nor its
- * contents ever change -- srcu_dereference() is not needed here.
- */
- return filp->f_path.dentry->d_fsdata;
-}
-
 #define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \
 static int __fops ## _open(struct inode *inode, struct file *file) \
 { \
@@ -112,6 +95,9 @@ int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
 void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu);
+const struct file_operations *debugfs_real_fops(const struct file *filp)
+ __must_hold(&debugfs_srcu);
+
 ssize_t debugfs_attr_read(struct file *file, char __user *buf,
 size_t len, loff_t *ppos);
 ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
-- 
cgit v1.2.3 

From e9117a5a4bf65d8e99f060d356a04d27a60b436d Mon Sep 17 00:00:00 2001
From: Nicolai Stange
Date: Tue, 31 Oct 2017 00:15:48 +0100
Subject: debugfs: implement per-file removal protection

Since commit 49d200deaa68 ("debugfs: prevent access to removed files'
private data"), accesses to a file's private data are protected from
concurrent removal by covering all file_operations with a SRCU read
section and synchronizing with those before returning from
debugfs_remove() by means of synchronize_srcu().

As pointed out by Johannes Berg, there are debugfs files with forever
blocking file_operations. Their corresponding SRCU read side sections
would block any debugfs_remove() forever as well, even unrelated ones.
This results in a livelock. Because a remover can't cancel any indefinite
blocking within foreign files, this is a problem.

Resolve this by introducing support for more granular protection on a
per-file basis.

This is implemented by introducing an 'active_users' refcount_t to the
per-file struct debugfs_fsdata state. At file creation time, it is set to
one and a debugfs_remove() will drop that initial reference. The new
debugfs_file_get() and debugfs_file_put(), intended to be used in place of
former debugfs_use_file_start() and debugfs_use_file_finish(), increment
and decrement it respectively. Once the count drops to zero,
debugfs_file_put() will signal a completion which is possibly being
waited for from debugfs_remove(). Thus, as long as there is a
debugfs_file_get() not yet matched by a corresponding debugfs_file_put()
around, debugfs_remove() will block.

Actual users of debugfs_use_file_start() and -finish() will get converted
to the new debugfs_file_get() and debugfs_file_put() by followup patches.

Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data")
Reported-by: Johannes Berg
Signed-off-by: Nicolai Stange
Signed-off-by: Greg Kroah-Hartman
---
 fs/debugfs/file.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/debugfs/inode.c | 29 +++++++++++++++++++++++------
 fs/debugfs/internal.h | 2 ++
 include/linux/debugfs.h | 11 +++++++++++
 4 files changed, 84 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index b6f5ddab66bf..6644bfdea2f8 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -109,6 +109,54 @@ const struct file_operations *debugfs_real_fops(const struct file *filp)
 }
 EXPORT_SYMBOL_GPL(debugfs_real_fops);
+/**
+ * debugfs_file_get - mark the beginning of file data access
+ * @dentry: the dentry object whose data is being accessed.
+ * + * Up to a matching call to debugfs_file_put(), any successive call + * into the file removing functions debugfs_remove() and + * debugfs_remove_recursive() will block. Since associated private + * file data may only get freed after a successful return of any of + * the removal functions, you may safely access it after a successful + * call to debugfs_file_get() without worrying about lifetime issues. + * + * If -%EIO is returned, the file has already been removed and thus, + * it is not safe to access any of its data. If, on the other hand, + * it is allowed to access the file data, zero is returned. + */ +int debugfs_file_get(struct dentry *dentry) +{ + struct debugfs_fsdata *fsd = dentry->d_fsdata; + + /* Avoid starvation of removers. */ + if (d_unlinked(dentry)) + return -EIO; + + if (!refcount_inc_not_zero(&fsd->active_users)) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_GPL(debugfs_file_get); + +/** + * debugfs_file_put - mark the end of file data access + * @dentry: the dentry object formerly passed to + * debugfs_file_get(). + * + * Allow any ongoing concurrent call into debugfs_remove() or + * debugfs_remove_recursive() blocked by a former call to + * debugfs_file_get() to proceed and return to its caller. + */ +void debugfs_file_put(struct dentry *dentry) +{ + struct debugfs_fsdata *fsd = dentry->d_fsdata; + + if (refcount_dec_and_test(&fsd->active_users)) + complete(&fsd->active_users_drained); +} +EXPORT_SYMBOL_GPL(debugfs_file_put); + static int open_proxy_open(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a9c3d3e9af39..6449eb935540 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -374,6 +374,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_fop = proxy_fops; fsd->real_fops = real_fops; + refcount_set(&fsd->active_users, 1); dentry->d_fsdata = fsd; d_instantiate(dentry, inode); @@ -631,18 +632,34 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, } EXPORT_SYMBOL_GPL(debugfs_create_symlink); +static void __debugfs_remove_file(struct dentry *dentry, struct dentry *parent) +{ + struct debugfs_fsdata *fsd; + + simple_unlink(d_inode(parent), dentry); + d_delete(dentry); + fsd = dentry->d_fsdata; + init_completion(&fsd->active_users_drained); + if (!refcount_dec_and_test(&fsd->active_users)) + wait_for_completion(&fsd->active_users_drained); +} + static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; if (simple_positive(dentry)) { dget(dentry); - if (d_is_dir(dentry)) - ret = simple_rmdir(d_inode(parent), dentry); - else - simple_unlink(d_inode(parent), dentry); - if (!ret) - d_delete(dentry); + if (!d_is_reg(dentry)) { + if (d_is_dir(dentry)) + ret = simple_rmdir(d_inode(parent), dentry); + else + simple_unlink(d_inode(parent), dentry); + if (!ret) + d_delete(dentry); + } else { + __debugfs_remove_file(dentry, parent); + } dput(dentry); } return ret; diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index 512601eed3ce..0eea99432840 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -21,6 +21,8 @@ extern const struct file_operations debugfs_full_proxy_file_operations; struct debugfs_fsdata { const struct file_operations *real_fops; + refcount_t active_users; + struct completion active_users_drained; }; #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index cbee5f4a02a3..3b914d588148 100644 
--- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -98,6 +98,9 @@ void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu); const struct file_operations *debugfs_real_fops(const struct file *filp) __must_hold(&debugfs_srcu); +int debugfs_file_get(struct dentry *dentry); +void debugfs_file_put(struct dentry *dentry); + ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t debugfs_attr_write(struct file *file, const char __user *buf, @@ -236,6 +239,14 @@ static inline void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) { } +static inline int debugfs_file_get(struct dentry *dentry) +{ + return 0; +} + +static inline void debugfs_file_put(struct dentry *dentry) +{ } + static inline ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { -- cgit v1.2.3 From 055ab8e3e3d52e005d2047b14ce63551b3a8b8b5 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 31 Oct 2017 00:15:49 +0100 Subject: debugfs: debugfs_real_fops(): drop __must_hold sparse annotation Currently, debugfs_real_fops() is annotated with a __must_hold(&debugfs_srcu) sparse annotation. With the conversion of the SRCU based protection of users against concurrent file removals to a per-file refcount based scheme, this becomes wrong. Drop this annotation. Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 6 +----- include/linux/debugfs.h | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 6644bfdea2f8..08511678b782 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -98,13 +98,9 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish); #define F_DENTRY(filp) ((filp)->f_path.dentry) const struct file_operations *debugfs_real_fops(const struct file *filp) - __must_hold(&debugfs_srcu) { struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; - /* - * Neither the pointer to the struct file_operations, nor its - * contents ever change -- srcu_dereference() is not needed here. - */ + return fsd->real_fops; } EXPORT_SYMBOL_GPL(debugfs_real_fops); diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 3b914d588148..c5eda259b9d6 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -95,8 +95,7 @@ int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu); -const struct file_operations *debugfs_real_fops(const struct file *filp) - __must_hold(&debugfs_srcu); +const struct file_operations *debugfs_real_fops(const struct file *filp); int debugfs_file_get(struct dentry *dentry); void debugfs_file_put(struct dentry *dentry); -- cgit v1.2.3 From c9afbec27089cd6b4e621b639f41c7fc726c3bf1 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 31 Oct 2017 00:15:52 +0100 Subject: debugfs: purge obsolete SRCU based removal protection Purge the SRCU based file removal race protection in favour of the new, refcount based debugfs_file_get()/debugfs_file_put() API. 
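Converted file operations then bracket their accesses like this; a
minimal sketch of the pattern the conversion patches apply
(my_debugfs_read and its buffer are hypothetical):

	static ssize_t my_debugfs_read(struct file *file, char __user *buf,
				       size_t count, loff_t *ppos)
	{
		struct dentry *dentry = file->f_path.dentry;
		ssize_t ret;

		ret = debugfs_file_get(dentry);
		if (unlikely(ret))
			return ret;	/* the file has already been removed */

		/* private data is safe to use until debugfs_file_put() */
		ret = simple_read_from_buffer(buf, count, ppos, "ok\n", 3);
		debugfs_file_put(dentry);

		return ret;
	}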
Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data") Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 48 ------------------------------------------------ fs/debugfs/inode.c | 7 ------- include/linux/debugfs.h | 19 ------------------- lib/Kconfig.debug | 1 - 4 files changed, 75 deletions(-) (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index d3a972b45ff0..53f5c9a2af88 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include "internal.h" @@ -48,53 +47,6 @@ const struct file_operations debugfs_noop_file_operations = { .llseek = noop_llseek, }; -/** - * debugfs_use_file_start - mark the beginning of file data access - * @dentry: the dentry object whose data is being accessed. - * @srcu_idx: a pointer to some memory to store a SRCU index in. - * - * Up to a matching call to debugfs_use_file_finish(), any - * successive call into the file removing functions debugfs_remove() - * and debugfs_remove_recursive() will block. Since associated private - * file data may only get freed after a successful return of any of - * the removal functions, you may safely access it after a successful - * call to debugfs_use_file_start() without worrying about - * lifetime issues. - * - * If -%EIO is returned, the file has already been removed and thus, - * it is not safe to access any of its data. If, on the other hand, - * it is allowed to access the file data, zero is returned. - * - * Regardless of the return code, any call to - * debugfs_use_file_start() must be followed by a matching call - * to debugfs_use_file_finish(). - */ -int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) - __acquires(&debugfs_srcu) -{ - *srcu_idx = srcu_read_lock(&debugfs_srcu); - barrier(); - if (d_unlinked(dentry)) - return -EIO; - return 0; -} -EXPORT_SYMBOL_GPL(debugfs_use_file_start); - -/** - * debugfs_use_file_finish - mark the end of file data access - * @srcu_idx: the SRCU index "created" by a former call to - * debugfs_use_file_start(). - * - * Allow any ongoing concurrent call into debugfs_remove() or - * debugfs_remove_recursive() blocked by a former call to - * debugfs_use_file_start() to proceed and return to its caller. 
- */
-void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu)
-{
- srcu_read_unlock(&debugfs_srcu, srcu_idx);
-}
-EXPORT_SYMBOL_GPL(debugfs_use_file_finish);
-
 #define F_DENTRY(filp) ((filp)->f_path.dentry)
 const struct file_operations *debugfs_real_fops(const struct file *filp)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 6449eb935540..f587aded46b5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,14 +27,11 @@
 #include
 #include
 #include
-#include
 #include "internal.h"
 #define DEBUGFS_DEFAULT_MODE 0700
-DEFINE_SRCU(debugfs_srcu);
-
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
@@ -693,8 +690,6 @@ void debugfs_remove(struct dentry *dentry)
 inode_unlock(d_inode(parent));
 if (!ret)
 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-
- synchronize_srcu(&debugfs_srcu);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
@@ -768,8 +763,6 @@ void debugfs_remove_recursive(struct dentry *dentry)
 if (!__debugfs_remove(child, parent))
 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 inode_unlock(d_inode(parent));
-
- synchronize_srcu(&debugfs_srcu);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index c5eda259b9d6..9f821a43bb0d 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -23,7 +23,6 @@
 struct device;
 struct file_operations;
-struct srcu_struct;
 struct debugfs_blob_wrapper {
 void *data;
@@ -43,8 +42,6 @@ struct debugfs_regset32 {
 extern struct dentry *arch_debugfs_dir;
-extern struct srcu_struct debugfs_srcu;
-
 #define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \
 static int __fops ## _open(struct inode *inode, struct file *file) \
 { \
@@ -90,11 +87,6 @@ struct dentry *debugfs_create_automount(const char *name,
 void debugfs_remove(struct dentry *dentry);
 void debugfs_remove_recursive(struct dentry *dentry);
-int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
- __acquires(&debugfs_srcu);
-
-void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu);
-
 const struct file_operations *debugfs_real_fops(const struct file *filp);
 int debugfs_file_get(struct dentry *dentry);
@@ -227,17 +219,6 @@ static inline void debugfs_remove(struct dentry *dentry)
 static inline void debugfs_remove_recursive(struct dentry *dentry)
 { }
-static inline int debugfs_use_file_start(const struct dentry *dentry,
- int *srcu_idx)
- __acquires(&debugfs_srcu)
-{
- return 0;
-}
-
-static inline void debugfs_use_file_finish(int srcu_idx)
- __releases(&debugfs_srcu)
-{ }
-
 static inline int debugfs_file_get(struct dentry *dentry)
 {
 return 0;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index dfdad67d8f6c..57fb43f521c3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -280,7 +280,6 @@ config PAGE_OWNER
 config DEBUG_FS
 bool "Debug Filesystem"
- select SRCU
 help
 debugfs is a virtual file system that kernel developers use to put
 debugging files into. Enable this option to be able to read and
-- 
cgit v1.2.3 

From 3bce94fd5f4c05337dedbe218501fb9f8789fc40 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Tue, 7 Nov 2017 16:59:23 +0100
Subject: debugfs: add SPDX identifiers to all debugfs files

It's good to have SPDX identifiers in all files to make it easier to
audit the kernel tree for correct licenses.

Update the debugfs files with the correct SPDX license identifier based
on the license text in the file itself.
The SPDX identifier is a legally binding shorthand, which can be used instead of the full boiler plate text. This work is based on a script and data from Thomas Gleixner, Philippe Ombredanne, and Kate Stewart. Cc: Thomas Gleixner Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 1 + fs/debugfs/inode.c | 1 + fs/debugfs/internal.h | 1 + include/linux/debugfs.h | 1 + 4 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 65872340e301..133be3c689af 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * file.c - part of debugfs, a tiny little debug file system * diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 9dca4da059b3..7c107b3b9c93 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * inode.c - part of debugfs, a tiny little debug file system * diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index cb1e8139c398..b9567541b737 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * internal.h - declarations internal to debugfs * diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 9f821a43bb0d..7beab3fd2f1a 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * debugfs.h - a tiny little debug file system * -- cgit v1.2.3 From 2b2d8788dd565cbe1ab22da6a1bc63d0934a80eb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 7 Nov 2017 17:01:46 +0100 Subject: debugfs: Remove redundant license text Now that the SPDX tag is in all debugfs files, that identifies the license in a specific and legally-defined manner. So the extra GPL text wording can be removed as it is no longer needed at all. This is done on a quest to remove the 700+ different ways that files in the kernel describe the GPL license text. And there's unneeded stuff like the address (sometimes incorrect) for the FSF which is never needed. No copyright headers or other non-license-description text was removed. Cc: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 5 ----- fs/debugfs/inode.c | 5 ----- fs/debugfs/internal.h | 5 ----- include/linux/debugfs.h | 4 ---- 4 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 133be3c689af..cd12e6576b48 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -5,13 +5,8 @@ * Copyright (C) 2004 Greg Kroah-Hartman * Copyright (C) 2004 IBM Inc. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. - * */ #include diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7c107b3b9c93..63a998c3f252 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -5,13 +5,8 @@ * Copyright (C) 2004 Greg Kroah-Hartman * Copyright (C) 2004 IBM Inc. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * * debugfs is for people to use instead of /proc or /sys. 
* See ./Documentation/core-api/kernel-api.rst for more details. - * */ #include diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index b9567541b737..f0d73d86cc1a 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -3,11 +3,6 @@ * internal.h - declarations internal to debugfs * * Copyright (C) 2016 Nicolai Stange - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * */ #ifndef _DEBUGFS_INTERNAL_H_ diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 7beab3fd2f1a..f36ecc2a5712 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -5,10 +5,6 @@ * Copyright (C) 2004 Greg Kroah-Hartman * Copyright (C) 2004 IBM Inc. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. */ -- cgit v1.2.3 From 22700f3c6df55387cec2ee27c533a7b23c76dc51 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Oct 2017 17:31:43 -0400 Subject: SUNRPC: Improve ordering of transport processing Since it can take a while before a specific thread gets scheduled, it is better to just implement a first come first served queue mechanism. That way, if a thread is already scheduled and is idle, it can pick up the work to do from the queue. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc_xprt.c | 100 ++++++++++++++------------------------------- 2 files changed, 31 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 38f561b2dda3..23c4d6496aac 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -46,6 +46,7 @@ struct svc_pool { struct svc_pool_stats sp_stats; /* statistics on pool operation */ #define SP_TASK_PENDING (0) /* still work to do even if no * xprt is queued. */ +#define SP_CONGESTED (1) unsigned long sp_flags; } ____cacheline_aligned_in_smp; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 18e87791350f..80112c45aad1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -380,7 +380,6 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) struct svc_pool *pool; struct svc_rqst *rqstp = NULL; int cpu; - bool queued = false; if (!svc_xprt_has_something_to_do(xprt)) goto out; @@ -401,58 +400,25 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) atomic_long_inc(&pool->sp_stats.packets); -redo_search: + dprintk("svc: transport %p put into queue\n", xprt); + spin_lock_bh(&pool->sp_lock); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + pool->sp_stats.sockets_queued++; + spin_unlock_bh(&pool->sp_lock); + /* find a thread for this xprt */ rcu_read_lock(); list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - /* Do a lockless check first */ - if (test_bit(RQ_BUSY, &rqstp->rq_flags)) + if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) continue; - - /* - * Once the xprt has been queued, it can only be dequeued by - * the task that intends to service it. All we can do at that - * point is to try to wake this thread back up so that it can - * do so. 
- */ - if (!queued) { - spin_lock_bh(&rqstp->rq_lock); - if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) { - /* already busy, move on... */ - spin_unlock_bh(&rqstp->rq_lock); - continue; - } - - /* this one will do */ - rqstp->rq_xprt = xprt; - svc_xprt_get(xprt); - spin_unlock_bh(&rqstp->rq_lock); - } - rcu_read_unlock(); - atomic_long_inc(&pool->sp_stats.threads_woken); wake_up_process(rqstp->rq_task); - put_cpu(); - goto out; - } - rcu_read_unlock(); - - /* - * We didn't find an idle thread to use, so we need to queue the xprt. - * Do so and then search again. If we find one, we can't hook this one - * up to it directly but we can wake the thread up in the hopes that it - * will pick it up once it searches for a xprt to service. - */ - if (!queued) { - queued = true; - dprintk("svc: transport %p put into queue\n", xprt); - spin_lock_bh(&pool->sp_lock); - list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); - pool->sp_stats.sockets_queued++; - spin_unlock_bh(&pool->sp_lock); - goto redo_search; + goto out_unlock; } + set_bit(SP_CONGESTED, &pool->sp_flags); rqstp = NULL; +out_unlock: + rcu_read_unlock(); put_cpu(); out: trace_svc_xprt_do_enqueue(xprt, rqstp); @@ -721,38 +687,25 @@ rqst_should_sleep(struct svc_rqst *rqstp) static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) { - struct svc_xprt *xprt; struct svc_pool *pool = rqstp->rq_pool; long time_left = 0; /* rq_xprt should be clear on entry */ WARN_ON_ONCE(rqstp->rq_xprt); - /* Normally we will wait up to 5 seconds for any required - * cache information to be provided. - */ - rqstp->rq_chandle.thread_wait = 5*HZ; - - xprt = svc_xprt_dequeue(pool); - if (xprt) { - rqstp->rq_xprt = xprt; - - /* As there is a shortage of threads and this request - * had to be queued, don't allow the thread to wait so - * long for cache updates. - */ - rqstp->rq_chandle.thread_wait = 1*HZ; - clear_bit(SP_TASK_PENDING, &pool->sp_flags); - return xprt; - } + rqstp->rq_xprt = svc_xprt_dequeue(pool); + if (rqstp->rq_xprt) + goto out_found; /* * We have to be able to interrupt this wait * to bring down the daemons ... */ set_current_state(TASK_INTERRUPTIBLE); + smp_mb__before_atomic(); + clear_bit(SP_CONGESTED, &pool->sp_flags); clear_bit(RQ_BUSY, &rqstp->rq_flags); - smp_mb(); + smp_mb__after_atomic(); if (likely(rqst_should_sleep(rqstp))) time_left = schedule_timeout(timeout); @@ -761,13 +714,11 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) try_to_freeze(); - spin_lock_bh(&rqstp->rq_lock); set_bit(RQ_BUSY, &rqstp->rq_flags); - spin_unlock_bh(&rqstp->rq_lock); - - xprt = rqstp->rq_xprt; - if (xprt != NULL) - return xprt; + smp_mb__after_atomic(); + rqstp->rq_xprt = svc_xprt_dequeue(pool); + if (rqstp->rq_xprt) + goto out_found; if (!time_left) atomic_long_inc(&pool->sp_stats.threads_timedout); @@ -775,6 +726,15 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) if (signalled() || kthread_should_stop()) return ERR_PTR(-EINTR); return ERR_PTR(-EAGAIN); +out_found: + /* Normally we will wait up to 5 seconds for any required + * cache information to be provided. 
+ */ + if (!test_bit(SP_CONGESTED, &pool->sp_flags)) + rqstp->rq_chandle.thread_wait = 5*HZ; + else + rqstp->rq_chandle.thread_wait = 1*HZ; + return rqstp->rq_xprt; } static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) -- cgit v1.2.3 From 95a20ef6f7e54c6a982715a7d0da2fd81790db28 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 7 Nov 2017 13:48:11 +0100 Subject: PM / Domains: Allow genpd users to specify default active wakeup behavior It is quite common for PM Domains to require slave devices to be kept active during system suspend if they are to be used as wakeup sources. To enable this, currently each PM Domain or driver has to provide its own gpd_dev_ops.active_wakeup() callback. Introduce a new flag GENPD_FLAG_ACTIVE_WAKEUP to consolidate this. If specified, all slave devices configured as wakeup sources will be kept active during system suspend. PM Domains that need more fine-grained controls, based on the slave device, can still provide their own callbacks, as before. Signed-off-by: Geert Uytterhoeven Acked-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 3 +++ include/linux/pm_domain.h | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 7e01ae364d78..e343844357c8 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -124,6 +124,7 @@ static const struct genpd_lock_ops genpd_spin_ops = { #define genpd_status_on(genpd) (genpd->status == GPD_STATE_ACTIVE) #define genpd_is_irq_safe(genpd) (genpd->flags & GENPD_FLAG_IRQ_SAFE) #define genpd_is_always_on(genpd) (genpd->flags & GENPD_FLAG_ALWAYS_ON) +#define genpd_is_active_wakeup(genpd) (genpd->flags & GENPD_FLAG_ACTIVE_WAKEUP) static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev, const struct generic_pm_domain *genpd) @@ -868,6 +869,8 @@ static bool genpd_present(const struct generic_pm_domain *genpd) static bool genpd_dev_active_wakeup(const struct generic_pm_domain *genpd, struct device *dev) { + if (genpd_is_active_wakeup(genpd)) + return true; return GENPD_DEV_CALLBACK(genpd, bool, active_wakeup, dev); } diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 9af0356bd69c..28c24c58d479 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -18,9 +18,10 @@ #include /* Defines used for the flags field in the struct generic_pm_domain */ -#define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */ -#define GENPD_FLAG_IRQ_SAFE (1U << 1) /* PM domain operates in atomic */ -#define GENPD_FLAG_ALWAYS_ON (1U << 2) /* PM domain is always powered on */ +#define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */ +#define GENPD_FLAG_IRQ_SAFE (1U << 1) /* PM domain operates in atomic */ +#define GENPD_FLAG_ALWAYS_ON (1U << 2) /* PM domain is always powered on */ +#define GENPD_FLAG_ACTIVE_WAKEUP (1U << 3) /* Keep devices active if wakeup */ enum gpd_status { GPD_STATE_ACTIVE = 0, /* PM domain is active */ -- cgit v1.2.3 From d0af45f1f6528949e05385976eb61c5ebd31854e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 7 Nov 2017 13:48:15 +0100 Subject: PM / Domains: Remove gpd_dev_ops.active_wakeup() callback There are no more users left of the gpd_dev_ops.active_wakeup() callback. All have been converted to GENPD_FLAG_ACTIVE_WAKEUP. Hence remove the callback. 
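A provider that used to return true unconditionally from its
active_wakeup() callback now merely sets the flag; a sketch with
hypothetical names:

	static struct generic_pm_domain my_pd = {
		.name = "my-pd",
		.flags = GENPD_FLAG_ACTIVE_WAKEUP,
		/* .power_on, .power_off, etc. as before */
	};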
Signed-off-by: Geert Uytterhoeven Acked-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 14 +++----------- include/linux/pm_domain.h | 1 - 2 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index e343844357c8..65bb40c240fb 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -866,14 +866,6 @@ static bool genpd_present(const struct generic_pm_domain *genpd) #ifdef CONFIG_PM_SLEEP -static bool genpd_dev_active_wakeup(const struct generic_pm_domain *genpd, - struct device *dev) -{ - if (genpd_is_active_wakeup(genpd)) - return true; - return GENPD_DEV_CALLBACK(genpd, bool, active_wakeup, dev); -} - /** * genpd_sync_power_off - Synchronously power off a PM domain and its masters. * @genpd: PM domain to power off, if possible. @@ -978,7 +970,7 @@ static bool resume_needed(struct device *dev, if (!device_can_wakeup(dev)) return false; - active_wakeup = genpd_dev_active_wakeup(genpd, dev); + active_wakeup = genpd_is_active_wakeup(genpd); return device_may_wakeup(dev) ? active_wakeup : !active_wakeup; } @@ -1047,7 +1039,7 @@ static int genpd_finish_suspend(struct device *dev, bool poweroff) if (IS_ERR(genpd)) return -EINVAL; - if (dev->power.wakeup_path && genpd_dev_active_wakeup(genpd, dev)) + if (dev->power.wakeup_path && genpd_is_active_wakeup(genpd)) return 0; if (poweroff) @@ -1102,7 +1094,7 @@ static int genpd_resume_noirq(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - if (dev->power.wakeup_path && genpd_dev_active_wakeup(genpd, dev)) + if (dev->power.wakeup_path && genpd_is_active_wakeup(genpd)) return 0; genpd_lock(genpd); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 28c24c58d479..04dbef9847d3 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -36,7 +36,6 @@ struct dev_power_governor { struct gpd_dev_ops { int (*start)(struct device *dev); int (*stop)(struct device *dev); - bool (*active_wakeup)(struct device *dev); }; struct genpd_power_state { -- cgit v1.2.3 From 3928ee6485a316c8abde7e24c7f82033a1c8d3ae Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Thu, 2 Nov 2017 00:49:18 +0100 Subject: net: phy: leds: Add support for "link" trigger Currently, we create a LED trigger for any link speed known to a PHY. These triggers only fire when their exact link speed had been negotiated (they aren't cumulative, that is, they don't fire for "their or any higher" link speed). What we are missing, however, is a trigger which will fire on any link speed known to the PHY. Such trigger can then be used for implementing a poor man's substitute of the "link" LED on boards that lack it. Let's add it. Signed-off-by: Maciej S. Szmigiero Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 7 +++++-- drivers/net/phy/phy_led_triggers.c | 43 +++++++++++++++++++++++++++++++++++--- include/linux/phy.h | 2 ++ 3 files changed, 47 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 8125412c8814..bdfbabb86ee0 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -191,11 +191,14 @@ config LED_TRIGGER_PHY Adds support for a set of LED trigger events per-PHY. Link state change will trigger the events, for consumption by an LED class driver. 
There are triggers for each link speed currently - supported by the phy, and are of the form: + supported by the PHY and also one common "link" trigger as a + logical-or of all the link speed ones. + All these triggers are named according to the following pattern: <mii bus id>:<phy>:<speed> Where speed is in the form: - <Speed in megabits>Mbps or <Speed in gigabits>Gbps + <Speed in megabits>Mbps OR <Speed in gigabits>Gbps OR link + for any speed known to the PHY. comment "MII PHY device drivers" diff --git a/drivers/net/phy/phy_led_triggers.c b/drivers/net/phy/phy_led_triggers.c index c736f29b3b2a..39ecad25b201 100644 --- a/drivers/net/phy/phy_led_triggers.c +++ b/drivers/net/phy/phy_led_triggers.c @@ -31,6 +31,7 @@ static void phy_led_trigger_no_link(struct phy_device *phy) { if (phy->last_triggered) { led_trigger_event(&phy->last_triggered->trigger, LED_OFF); + led_trigger_event(&phy->led_link_trigger->trigger, LED_OFF); phy->last_triggered = NULL; } } @@ -54,6 +55,10 @@ void phy_led_trigger_change_speed(struct phy_device *phy) } if (plt != phy->last_triggered) { + if (!phy->last_triggered) + led_trigger_event(&phy->led_link_trigger->trigger, + LED_FULL); + led_trigger_event(&phy->last_triggered->trigger, LED_OFF); led_trigger_event(&plt->trigger, LED_FULL); phy->last_triggered = plt; @@ -61,6 +66,13 @@ void phy_led_trigger_change_speed(struct phy_device *phy) } EXPORT_SYMBOL_GPL(phy_led_trigger_change_speed); +static void phy_led_trigger_format_name(struct phy_device *phy, char *buf, + size_t size, char *suffix) +{ + snprintf(buf, size, PHY_ID_FMT ":%s", + phy->mdio.bus->id, phy->mdio.addr, suffix); +} + static int phy_led_trigger_register(struct phy_device *phy, struct phy_led_trigger *plt, unsigned int speed) @@ -77,8 +89,8 @@ static int phy_led_trigger_register(struct phy_device *phy, snprintf(name_suffix, sizeof(name_suffix), "%dGbps", DIV_ROUND_CLOSEST(speed, 1000)); - snprintf(plt->name, sizeof(plt->name), PHY_ID_FMT ":%s", - phy->mdio.bus->id, phy->mdio.addr, name_suffix); + phy_led_trigger_format_name(phy, plt->name, sizeof(plt->name), + name_suffix); plt->trigger.name = plt->name; return led_trigger_register(&plt->trigger); @@ -99,13 +111,30 @@ int phy_led_triggers_register(struct phy_device *phy) if (!phy->phy_num_led_triggers) return 0; + phy->led_link_trigger = devm_kzalloc(&phy->mdio.dev, + sizeof(*phy->led_link_trigger), + GFP_KERNEL); + if (!phy->led_link_trigger) { + err = -ENOMEM; + goto out_clear; + } + + phy_led_trigger_format_name(phy, phy->led_link_trigger->name, + sizeof(phy->led_link_trigger->name), + "link"); + phy->led_link_trigger->trigger.name = phy->led_link_trigger->name; + + err = led_trigger_register(&phy->led_link_trigger->trigger); + if (err) + goto out_free_link; + phy->phy_led_triggers = devm_kzalloc(&phy->mdio.dev, sizeof(struct phy_led_trigger) * phy->phy_num_led_triggers, GFP_KERNEL); if (!phy->phy_led_triggers) { err = -ENOMEM; - goto out_clear; + goto out_unreg_link; } for (i = 0; i < phy->phy_num_led_triggers; i++) { @@ -123,6 +152,11 @@ out_unreg: while (i--) phy_led_trigger_unregister(&phy->phy_led_triggers[i]); devm_kfree(&phy->mdio.dev, phy->phy_led_triggers); +out_unreg_link: + phy_led_trigger_unregister(phy->led_link_trigger); +out_free_link: + devm_kfree(&phy->mdio.dev, phy->led_link_trigger); + phy->led_link_trigger = NULL; out_clear: phy->phy_num_led_triggers = 0; return err; @@ -135,5 +169,8 @@ void phy_led_triggers_unregister(struct phy_device *phy) for (i = 0; i < phy->phy_num_led_triggers; i++) phy_led_trigger_unregister(&phy->phy_led_triggers[i]); + + if (phy->led_link_trigger) +
phy_led_trigger_unregister(phy->led_link_trigger); } EXPORT_SYMBOL_GPL(phy_led_triggers_unregister); diff --git a/include/linux/phy.h b/include/linux/phy.h index d78cd01ea513..dc82a07cb4fd 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -451,6 +451,8 @@ struct phy_device { struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; struct phy_led_trigger *last_triggered; + + struct phy_led_trigger *led_link_trigger; #endif /* -- cgit v1.2.3 From 4a1a57df97636b9323c4221cc75a35694b6d34c7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 31 Oct 2017 09:47:28 -0700 Subject: Input: st1232 - remove obsolete platform device support Commit 1fa59bda21c7fa36 ("ARM: shmobile: Remove legacy board code for Armadillo-800 EVA"), removed the last user of st1232_pdata and the "st1232-ts" platform device. All remaining users use DT. Signed-off-by: Geert Uytterhoeven Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/st1232.c | 16 +++------------- include/linux/platform_data/st1232_pdata.h | 14 -------------- 2 files changed, 3 insertions(+), 27 deletions(-) delete mode 100644 include/linux/platform_data/st1232_pdata.h (limited to 'include/linux') diff --git a/drivers/input/touchscreen/st1232.c b/drivers/input/touchscreen/st1232.c index be5615c6bf8f..d5dfa4053bbf 100644 --- a/drivers/input/touchscreen/st1232.c +++ b/drivers/input/touchscreen/st1232.c @@ -29,7 +29,6 @@ #include #include #include -#include #define ST1232_TS_NAME "st1232-ts" @@ -152,10 +151,9 @@ static void st1232_ts_power(struct st1232_ts_data *ts, bool poweron) } static int st1232_ts_probe(struct i2c_client *client, - const struct i2c_device_id *id) + const struct i2c_device_id *id) { struct st1232_ts_data *ts; - struct st1232_pdata *pdata = dev_get_platdata(&client->dev); struct input_dev *input_dev; int error; @@ -180,13 +178,7 @@ static int st1232_ts_probe(struct i2c_client *client, ts->client = client; ts->input_dev = input_dev; - if (pdata) - ts->reset_gpio = pdata->reset_gpio; - else if (client->dev.of_node) - ts->reset_gpio = of_get_gpio(client->dev.of_node, 0); - else - ts->reset_gpio = -ENODEV; - + ts->reset_gpio = of_get_gpio(client->dev.of_node, 0); if (gpio_is_valid(ts->reset_gpio)) { error = devm_gpio_request(&client->dev, ts->reset_gpio, NULL); if (error) { @@ -281,13 +273,11 @@ static const struct i2c_device_id st1232_ts_id[] = { }; MODULE_DEVICE_TABLE(i2c, st1232_ts_id); -#ifdef CONFIG_OF static const struct of_device_id st1232_ts_dt_ids[] = { { .compatible = "sitronix,st1232", }, { } }; MODULE_DEVICE_TABLE(of, st1232_ts_dt_ids); -#endif static struct i2c_driver st1232_ts_driver = { .probe = st1232_ts_probe, @@ -295,7 +285,7 @@ static struct i2c_driver st1232_ts_driver = { .id_table = st1232_ts_id, .driver = { .name = ST1232_TS_NAME, - .of_match_table = of_match_ptr(st1232_ts_dt_ids), + .of_match_table = st1232_ts_dt_ids, .pm = &st1232_ts_pm_ops, }, }; diff --git a/include/linux/platform_data/st1232_pdata.h b/include/linux/platform_data/st1232_pdata.h deleted file mode 100644 index 1dcd23bee24e..000000000000 --- a/include/linux/platform_data/st1232_pdata.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_ST1232_PDATA_H -#define _LINUX_ST1232_PDATA_H - -/* - * Optional platform data - * - * Use this if you want the driver to drive the reset pin. 
- */ -struct st1232_pdata { - int reset_gpio; -}; - -#endif -- cgit v1.2.3 From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:41 +0100 Subject: net_sch: red: Add offload ability to RED qdisc Add the ability to offload RED qdisc by using ndo_setup_tc. There are four commands for RED offloading: * TC_RED_REPLACE: handles set and change. * TC_RED_DESTROY: handles qdisc destroy. * TC_RED_STATS: updates the qdisc's counters (given by reference). * TC_RED_XSTATS: returns RED xstats. Whether RED is offloaded is determined every time the dump action is called, because a parent change of this qdisc could change its offload state without requiring any RED function to be called. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 30 ++++++++++++++++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_red.c | 79 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fda527ccb263..71968a2ca9f3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -777,6 +777,7 @@ enum tc_setup_type { TC_SETUP_CLSBPF, TC_SETUP_BLOCK, TC_SETUP_CBS, + TC_SETUP_QDISC_RED, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 98fef3221227..03c208d3c922 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -703,4 +703,34 @@ struct tc_cookie { u8 *data; u32 len; }; + +enum tc_red_command { + TC_RED_REPLACE, + TC_RED_DESTROY, + TC_RED_STATS, + TC_RED_XSTATS, +}; + +struct tc_red_qopt_offload_params { + u32 min; + u32 max; + u32 probability; + bool is_ecn; +}; +struct tc_red_qopt_offload_stats { + struct gnet_stats_basic_packed *bstats; + struct gnet_stats_queue *qstats; +}; + +struct tc_red_qopt_offload { + enum tc_red_command command; + u32 handle; + u32 parent; + union { + struct tc_red_qopt_offload_params set; + struct tc_red_qopt_offload_stats stats; + struct red_stats *xstats; + }; +}; + #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 5002562868cc..6a2c5ea7e9c4 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -256,6 +256,7 @@ struct tc_red_qopt { #define TC_RED_ECN 1 #define TC_RED_HARDDROP 2 #define TC_RED_ADAPTATIVE 4 +#define TC_RED_OFFLOADED 8 }; struct tc_red_xstats { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index fdfdb56aaae2..007dd8ef8aac 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -148,11 +149,37 @@ static void red_reset(struct Qdisc *sch) red_restart(&q->vars); } +static int red_offload(struct Qdisc *sch, bool enable) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_red_qopt_offload opt = { + .handle = sch->handle, + .parent = sch->parent, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + if (enable) { + opt.command = TC_RED_REPLACE; + opt.set.min = q->parms.qth_min >> q->parms.Wlog; + opt.set.max = q->parms.qth_max >> q->parms.Wlog; + opt.set.probability = q->parms.max_P; + opt.set.is_ecn = red_use_ecn(q); + } else { + opt.command = TC_RED_DESTROY; + } + + return dev->netdev_ops->ndo_setup_tc(dev,
TC_SETUP_QDISC_RED, &opt); +} + static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); del_timer_sync(&q->adapt_timer); + red_offload(sch, false); qdisc_destroy(q->qdisc); } @@ -219,6 +246,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); + red_offload(sch, true); return 0; } @@ -244,6 +272,33 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) return red_change(sch, opt); } +static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_red_qopt_offload hw_stats = { + .handle = sch->handle, + .parent = sch->parent, + .command = TC_RED_STATS, + .stats.bstats = &sch->bstats, + .stats.qstats = &sch->qstats, + }; + int err; + + opt->flags &= ~TC_RED_OFFLOADED; + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return 0; + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + &hw_stats); + if (err == -EOPNOTSUPP) + return 0; + + if (!err) + opt->flags |= TC_RED_OFFLOADED; + + return err; +} + static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); @@ -257,8 +312,13 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) .Plog = q->parms.Plog, .Scell_log = q->parms.Scell_log, }; + int err; sch->qstats.backlog = q->qdisc->qstats.backlog; + err = red_dump_offload(sch, &opt); + if (err) + goto nla_put_failure; + opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; @@ -275,6 +335,7 @@ nla_put_failure: static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct red_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); struct tc_red_xstats st = { .early = q->stats.prob_drop + q->stats.forced_drop, .pdrop = q->stats.pdrop, @@ -282,6 +343,24 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .marked = q->stats.prob_mark + q->stats.forced_mark, }; + if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { + struct red_stats hw_stats = {0}; + struct tc_red_qopt_offload hw_stats_request = { + .handle = sch->handle, + .parent = sch->parent, + .command = TC_RED_XSTATS, + .xstats = &hw_stats, + }; + if (!dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_RED, + &hw_stats_request)) { + st.early += hw_stats.prob_drop + hw_stats.forced_drop; + st.pdrop += hw_stats.pdrop; + st.other += hw_stats.other; + st.marked += hw_stats.prob_mark + hw_stats.forced_mark; + } + } + return gnet_stats_copy_app(d, &st, sizeof(st)); } -- cgit v1.2.3 From 575ed7d39e2fbe602a3894bc766a8cb49af83bd3 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:42 +0100 Subject: net_sch: mqprio: Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO to match the new convention. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/sfc/falcon/tx.c | 2 +- drivers/net/ethernet/sfc/tx.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- include/linux/netdevice.h | 2 +- net/sched/sch_mqprio.c | 5 +++-- 15 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 3d53153ce751..a74a8fbad53a 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2206,7 +2206,7 @@ static int xgbe_setup_tc(struct net_device *netdev, enum tc_setup_type type, struct tc_mqprio_qopt *mqprio = type_data; u8 tc; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 1216c1f1e052..4c739d5355d2 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -4289,7 +4289,7 @@ int __bnx2x_setup_tc(struct net_device *dev, enum tc_setup_type type, { struct tc_mqprio_qopt *mqprio = type_data; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 96416f5d97f3..e5472e5ae7b2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7388,7 +7388,7 @@ static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type, switch (type) { case TC_SETUP_BLOCK: return bnxt_setup_tc_block(dev, type_data); - case TC_SETUP_MQPRIO: { + case TC_SETUP_QDISC_MQPRIO: { struct tc_mqprio_qopt *mqprio = type_data; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index ebc55b6a6349..784dbf5a3e12 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -351,7 +351,7 @@ static int dpaa_setup_tc(struct net_device *net_dev, enum tc_setup_type type, u8 num_tc; int i; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 2a0af11c9b59..59415090ff0f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -1252,7 +1252,7 @@ out: static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; return hns3_setup_tc(dev, type_data); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 
81e4425f0529..adc62fb38c49 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -1389,7 +1389,7 @@ static int __fm10k_setup_tc(struct net_device *dev, enum tc_setup_type type, { struct tc_mqprio_qopt *mqprio = type_data; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 05b94d87a6c3..17e6f64299cf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7550,7 +7550,7 @@ static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type, void *type_data) { switch (type) { - case TC_SETUP_MQPRIO: + case TC_SETUP_QDISC_MQPRIO: return i40e_setup_tc(netdev, type_data); case TC_SETUP_BLOCK: return i40e_setup_tc_block(netdev, type_data); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index e5dcb25be398..6eaca8366ac8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9431,7 +9431,7 @@ static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type, switch (type) { case TC_SETUP_BLOCK: return ixgbe_setup_tc_block(dev, type_data); - case TC_SETUP_MQPRIO: + case TC_SETUP_QDISC_MQPRIO: return ixgbe_setup_tc_mqprio(dev, type_data); default: return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 736a6ccaf05e..99051a294fa6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -135,7 +135,7 @@ static int __mlx4_en_setup_tc(struct net_device *dev, enum tc_setup_type type, { struct tc_mqprio_qopt *mqprio = type_data; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; if (mqprio->num_tc && mqprio->num_tc != MLX4_EN_NUM_UP_HIGH) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f877f2f5f2a5..5d5d2e50e4bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3146,7 +3146,7 @@ int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, case TC_SETUP_BLOCK: return mlx5e_setup_tc_block(dev, type_data); #endif - case TC_SETUP_MQPRIO: + case TC_SETUP_QDISC_MQPRIO: return mlx5e_setup_tc_mqprio(dev, type_data); default: return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index 6a75f4140a4b..1b978d69e702 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -435,7 +435,7 @@ int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, unsigned tc, num_tc; int rc; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; num_tc = mqprio->num_tc; diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 32bf1fecf864..ea27b8a7f465 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -663,7 +663,7 @@ int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, unsigned tc, num_tc; int rc; - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; num_tc = mqprio->num_tc; diff --git a/drivers/net/ethernet/ti/netcp_core.c 
b/drivers/net/ethernet/ti/netcp_core.c index 437d36289786..15e2e3031d36 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1887,7 +1887,7 @@ static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type, /* setup tc must be called under rtnl lock */ ASSERT_RTNL(); - if (type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_QDISC_MQPRIO) return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 71968a2ca9f3..703885aed856 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -770,7 +770,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); enum tc_setup_type { - TC_SETUP_MQPRIO, + TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 4d5ed45123f0..b85885a9d8a1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -50,7 +50,8 @@ static void mqprio_destroy(struct Qdisc *sch) switch (priv->mode) { case TC_MQPRIO_MODE_DCB: case TC_MQPRIO_MODE_CHANNEL: - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_MQPRIO, &mqprio); break; default: @@ -265,7 +266,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) return -EINVAL; } err = dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_MQPRIO, + TC_SETUP_QDISC_MQPRIO, &mqprio); if (err) return err; -- cgit v1.2.3 From 8521db4c7e155d12fb280686c0552e47f77e9110 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:43 +0100 Subject: net_sch: cbs: Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS to match the new convention. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Signed-off-by: David S.
Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- include/linux/netdevice.h | 2 +- net/sched/sch_cbs.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index e22bce7cdacd..43cf39527660 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2488,7 +2488,7 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type, struct igb_adapter *adapter = netdev_priv(dev); switch (type) { - case TC_SETUP_CBS: + case TC_SETUP_QDISC_CBS: return igb_offload_cbs(adapter, type_data); default: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 703885aed856..30f0f2928808 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -776,7 +776,7 @@ enum tc_setup_type { TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, - TC_SETUP_CBS, + TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, }; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index bdb533b7fb8c..7a72980c1509 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -212,7 +212,7 @@ static void cbs_disable_offload(struct net_device *dev, cbs.queue = q->queue; cbs.enable = 0; - err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) pr_warn("Couldn't disable CBS offload for queue %d\n", cbs.queue); @@ -236,7 +236,7 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, cbs.idleslope = opt->idleslope; cbs.sendslope = opt->sendslope; - err = ops->ndo_setup_tc(dev, TC_SETUP_CBS, &cbs); + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) return err; -- cgit v1.2.3 From 620a5c860b774a81ce3f193eefb52bf4d128cca5 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Mon, 6 Nov 2017 12:42:01 +0100 Subject: net: dsa: lan9303: Correct register names in comments Two comments refer to registers, but lack the LAN9303_ prefix. Fix that. Signed-off-by: Egil Hjelmeland Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/dsa/lan9303.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index 05d8d136baab..f48a85c377de 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -13,8 +13,8 @@ struct lan9303_phy_ops { #define LAN9303_NUM_ALR_RECORDS 512 struct lan9303_alr_cache_entry { u8 mac_addr[ETH_ALEN]; - u8 port_map; /* Bitmap of ports. Zero if unused entry */ - u8 stp_override; /* non zero if set ALR_DAT1_AGE_OVERRID */ + u8 port_map; /* Bitmap of ports. Zero if unused entry */ + u8 stp_override; /* non zero if set LAN9303_ALR_DAT1_AGE_OVERRID */ }; struct lan9303 { @@ -28,7 +28,9 @@ struct lan9303 { struct mutex indirect_mutex; /* protect indexed register access */ const struct lan9303_phy_ops *ops; bool is_bridged; /* true if port 1 and 2 are bridged */ - u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ + + /* remember LAN9303_SWE_PORT_STATE while not bridged */ + u32 swe_port_state; /* LAN9303 do not offer reading specific ALR entry. 
Cache all * static entries in a flat table **/ -- cgit v1.2.3 From 96c623e51f1c40bf524decc48c6fac7ce5dd41f7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 6 Nov 2017 14:26:10 +0100 Subject: of: add of_property_read_variable_* dummy helpers Commit a67e9472da42 ("of: Add array read functions with min/max size limits") added a new interface for reading variable-length arrays from DT properties. One user was added in dsa recently and this causes a build error because that code can be built with CONFIG_OF disabled: net/dsa/dsa2.c: In function 'dsa_switch_parse_member_of': net/dsa/dsa2.c:678:7: error: implicit declaration of function 'of_property_read_variable_u32_array'; did you mean 'of_property_read_u32_array'? [-Werror=implicit-function-declaration] This adds dummy functions for of_property_read_variable_u32_array() and a few others that had been missing here. I decided to move of_property_read_string() and of_property_read_string_helper() in the process to make it easier to compare the two sets of function prototypes to make sure they match. Fixes: 975e6e32215e ("net: dsa: rework switch parsing") Signed-off-by: Arnd Bergmann Acked-by: Rob Herring Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/of.h | 62 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index b240ed69dc96..b32d418d011a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -675,12 +675,6 @@ static inline int of_property_count_elems_of_size(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_u32_index(const struct device_node *np, - const char *propname, u32 index, u32 *out_value) -{ - return -ENOSYS; -} - static inline int of_property_read_u8_array(const struct device_node *np, const char *propname, u8 *out_values, size_t sz) { @@ -707,16 +701,14 @@ static inline int of_property_read_u64_array(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_string(const struct device_node *np, - const char *propname, - const char **out_string) +static inline int of_property_read_u32_index(const struct device_node *np, + const char *propname, u32 index, u32 *out_value) { return -ENOSYS; } -static inline int of_property_read_string_helper(const struct device_node *np, - const char *propname, - const char **out_strs, size_t sz, int index) +static inline int of_property_read_u64_index(const struct device_node *np, + const char *propname, u32 index, u64 *out_value) { return -ENOSYS; } @@ -744,12 +736,51 @@ static inline int of_n_size_cells(struct device_node *np) return 0; } +static inline int of_property_read_variable_u8_array(const struct device_node *np, + const char *propname, u8 *out_values, + size_t sz_min, size_t sz_max) +{ + return -ENOSYS; +} + +static inline int of_property_read_variable_u16_array(const struct device_node *np, + const char *propname, u16 *out_values, + size_t sz_min, size_t sz_max) +{ + return -ENOSYS; +} + +static inline int of_property_read_variable_u32_array(const struct device_node *np, + const char *propname, + u32 *out_values, + size_t sz_min, + size_t sz_max) +{ + return -ENOSYS; +} + static inline int of_property_read_u64(const struct device_node *np, const char *propname, u64 *out_value) { return -ENOSYS; } +static inline int of_property_read_variable_u64_array(const struct device_node *np, + const char *propname, + u64 *out_values, + size_t sz_min, + size_t sz_max)
+{ + return -ENOSYS; +} + +static inline int of_property_read_string(const struct device_node *np, + const char *propname, + const char **out_string) +{ + return -ENOSYS; +} + static inline int of_property_match_string(const struct device_node *np, const char *propname, const char *string) @@ -757,6 +788,13 @@ static inline int of_property_match_string(const struct device_node *np, return -ENOSYS; } +static inline int of_property_read_string_helper(const struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index) +{ + return -ENOSYS; +} + static inline struct device_node *of_parse_phandle(const struct device_node *np, const char *phandle_name, int index) -- cgit v1.2.3 From f54bb2ec02c839f6bfe3e8d438cd93d30b4809dd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:17 +0100 Subject: locking/lockdep: Add IRQs disabled/enabled assertion APIs: lockdep_assert_irqs_enabled()/disabled() Checking whether IRQs are enabled or disabled is a very common sanity check, however not free of overhead especially on fastpath where such assertion is very common. Lockdep is a good host for such concurrency correctness check and it even already tracks down IRQs disablement state. Just reuse its machinery. This will allow us to get rid of the flags pop and check overhead from fast path when kernel is built for production. Suggested-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 02720769c159..a842551fe044 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -725,9 +725,24 @@ do { \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ } while (0) + +#define lockdep_assert_irqs_enabled() do { \ + WARN_ONCE(debug_locks && !current->lockdep_recursion && \ + !current->hardirqs_enabled, \ + "IRQs not enabled as expected\n"); \ + } while (0) + +#define lockdep_assert_irqs_disabled() do { \ + WARN_ONCE(debug_locks && !current->lockdep_recursion && \ + current->hardirqs_enabled, \ + "IRQs not disabled as expected\n"); \ + } while (0) + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) +# define lockdep_assert_irqs_enabled() do { } while (0) +# define lockdep_assert_irqs_disabled() do { } while (0) #endif #ifdef CONFIG_LOCKDEP -- cgit v1.2.3 From 0759e80b84e34a84e7e46e2b1adb528c83d84a47 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Nov 2017 11:33:49 +0100 Subject: PM / QoS: Fix device resume latency framework The special value of 0 for device resume latency PM QoS means "no restriction", but there are two problems with that. First, device resume latency PM QoS requests with 0 as the value are always put in front of requests with positive values in the priority lists used internally by the PM QoS framework, causing 0 to be chosen as an effective constraint value. However, that 0 is then interpreted as "no restriction" effectively overriding the other requests with specific restrictions which is incorrect. 
Second, the users of device resume latency PM QoS have no way to specify that *any* resume latency at all should be avoided, which is an artificial limitation in general. To address these issues, modify device resume latency PM QoS to use S32_MAX as the "no constraint" value and 0 as the "no latency at all" one and rework its users (the cpuidle menu governor, the genpd QoS governor and the runtime PM framework) to follow these changes. Also add a special "n/a" value to the corresponding user space I/F to allow user space to indicate that it cannot accept any resume latencies at all for the given device. Fixes: 85dc0b8a4019 (PM / QoS: Make it possible to expose PM QoS latency constraints) Link: https://bugzilla.kernel.org/show_bug.cgi?id=197323 Reported-by: Reinette Chatre Signed-off-by: Rafael J. Wysocki Tested-by: Reinette Chatre Tested-by: Geert Uytterhoeven Tested-by: Tero Kristo Reviewed-by: Ramesh Thomas --- Documentation/ABI/testing/sysfs-devices-power | 4 ++- drivers/base/cpu.c | 3 +- drivers/base/power/domain.c | 2 +- drivers/base/power/domain_governor.c | 40 +++++++++++---------------- drivers/base/power/qos.c | 5 +++- drivers/base/power/runtime.c | 2 +- drivers/base/power/sysfs.c | 25 ++++++++++++++--- drivers/cpuidle/governors/menu.c | 4 +-- include/linux/pm_qos.h | 26 +++++++++++------ 9 files changed, 68 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power index f4b24c327665..80a00f7b6667 100644 --- a/Documentation/ABI/testing/sysfs-devices-power +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -211,7 +211,9 @@ Description: device, after it has been suspended at run time, from a resume request to the moment the device will be ready to process I/O, in microseconds. If it is equal to 0, however, this means that - the PM QoS resume latency may be arbitrary. + the PM QoS resume latency may be arbitrary and the special value + "n/a" means that user space cannot accept any resume latency at + all for the given device. Not all drivers support this attribute. If it isn't supported, it is not present. diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 321cd7b4d817..227bac5f1191 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -377,7 +377,8 @@ int register_cpu(struct cpu *cpu, int num) per_cpu(cpu_sys_devices, num) = &cpu->dev; register_cpu_under_node(num, cpu_to_node(num)); - dev_pm_qos_expose_latency_limit(&cpu->dev, 0); + dev_pm_qos_expose_latency_limit(&cpu->dev, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); return 0; } diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 679c79545e42..24e39ce27bd8 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1326,7 +1326,7 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev, gpd_data->base.dev = dev; gpd_data->td.constraint_changed = true; - gpd_data->td.effective_constraint_ns = 0; + gpd_data->td.effective_constraint_ns = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS; gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier; spin_lock_irq(&dev->power.lock); diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c index e4cca8adab32..99896fbf18e4 100644 --- a/drivers/base/power/domain_governor.c +++ b/drivers/base/power/domain_governor.c @@ -33,15 +33,10 @@ static int dev_update_qos_constraint(struct device *dev, void *data) * known at this point anyway). 
*/ constraint_ns = dev_pm_qos_read_value(dev); - if (constraint_ns > 0) - constraint_ns *= NSEC_PER_USEC; + constraint_ns *= NSEC_PER_USEC; } - /* 0 means "no constraint" */ - if (constraint_ns == 0) - return 0; - - if (constraint_ns < *constraint_ns_p || *constraint_ns_p == 0) + if (constraint_ns < *constraint_ns_p) *constraint_ns_p = constraint_ns; return 0; @@ -69,12 +64,12 @@ static bool default_suspend_ok(struct device *dev) } td->constraint_changed = false; td->cached_suspend_ok = false; - td->effective_constraint_ns = -1; + td->effective_constraint_ns = 0; constraint_ns = __dev_pm_qos_read_value(dev); spin_unlock_irqrestore(&dev->power.lock, flags); - if (constraint_ns < 0) + if (constraint_ns == 0) return false; constraint_ns *= NSEC_PER_USEC; @@ -87,25 +82,25 @@ static bool default_suspend_ok(struct device *dev) device_for_each_child(dev, &constraint_ns, dev_update_qos_constraint); - if (constraint_ns == 0) { + if (constraint_ns == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS) { /* "No restriction", so the device is allowed to suspend. */ - td->effective_constraint_ns = 0; + td->effective_constraint_ns = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS; td->cached_suspend_ok = true; - } else if (constraint_ns < 0) { + } else if (constraint_ns == 0) { /* * This triggers if one of the children that don't belong to a - * domain has a negative PM QoS constraint and it's better not - * to suspend then. effective_constraint_ns is negative already - * and cached_suspend_ok is false, so bail out. + * domain has a zero PM QoS constraint and it's better not to + * suspend then. effective_constraint_ns is zero already and + * cached_suspend_ok is false, so bail out. */ return false; } else { constraint_ns -= td->suspend_latency_ns + td->resume_latency_ns; /* - * effective_constraint_ns is negative already and - * cached_suspend_ok is false, so if the computed value is not - * positive, return right away. + * effective_constraint_ns is zero already and cached_suspend_ok + * is false, so if the computed value is not positive, return + * right away. */ if (constraint_ns <= 0) return false; @@ -174,13 +169,10 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd, td = &to_gpd_data(pdd)->td; constraint_ns = td->effective_constraint_ns; /* - * Negative values mean "no suspend at all" and this runs only - * when all devices in the domain are suspended, so it must be - * 0 at least. - * - * 0 means "no constraint" + * Zero means "no suspend at all" and this runs only when all + * devices in the domain are suspended, so it must be positive. 
*/ - if (constraint_ns == 0) + if (constraint_ns == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS) continue; if (constraint_ns <= off_on_time_ns) diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index 277d43a83f53..3382542b39b7 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -139,6 +139,9 @@ static int apply_constraint(struct dev_pm_qos_request *req, switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: + if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) + value = 0; + ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; @@ -189,7 +192,7 @@ static int dev_pm_qos_constraints_allocate(struct device *dev) plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; - c->no_constraint_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; + c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 7bcf80fa9ada..13e015905543 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -253,7 +253,7 @@ static int rpm_check_suspend_allowed(struct device *dev) || (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME)) retval = -EAGAIN; - else if (__dev_pm_qos_read_value(dev) < 0) + else if (__dev_pm_qos_read_value(dev) == 0) retval = -EPERM; else if (dev->power.runtime_status == RPM_SUSPENDED) retval = 1; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 29bf28fef136..e153e28b1857 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -218,7 +218,14 @@ static ssize_t pm_qos_resume_latency_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_pm_qos_requested_resume_latency(dev)); + s32 value = dev_pm_qos_requested_resume_latency(dev); + + if (value == 0) + return sprintf(buf, "n/a\n"); + else if (value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) + value = 0; + + return sprintf(buf, "%d\n", value); } static ssize_t pm_qos_resume_latency_store(struct device *dev, @@ -228,11 +235,21 @@ static ssize_t pm_qos_resume_latency_store(struct device *dev, s32 value; int ret; - if (kstrtos32(buf, 0, &value)) - return -EINVAL; + if (!kstrtos32(buf, 0, &value)) { + /* + * Prevent users from writing negative or "no constraint" values + * directly. 
+ */ + if (value < 0 || value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) + return -EINVAL; - if (value < 0) + if (value == 0) + value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; + } else if (!strcmp(buf, "n/a") || !strcmp(buf, "n/a\n")) { + value = 0; + } else { return -EINVAL; + } ret = dev_pm_qos_update_request(dev->power.qos->resume_latency_req, value); diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 48eaf2879228..aa390404e85f 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -298,8 +298,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->needs_update = 0; } - /* resume_latency is 0 means no restriction */ - if (resume_latency && resume_latency < latency_req) + if (resume_latency < latency_req && + resume_latency != PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) latency_req = resume_latency; /* Special case when user has set very strict latency requirement */ diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 51f0d7e0b15f..2a3b36da61b1 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -27,16 +27,19 @@ enum pm_qos_flags_status { PM_QOS_FLAGS_ALL, }; -#define PM_QOS_DEFAULT_VALUE -1 +#define PM_QOS_DEFAULT_VALUE (-1) +#define PM_QOS_LATENCY_ANY S32_MAX +#define PM_QOS_LATENCY_ANY_NS ((s64)PM_QOS_LATENCY_ANY * NSEC_PER_USEC) #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 #define PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE 0 -#define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 +#define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE PM_QOS_LATENCY_ANY +#define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT PM_QOS_LATENCY_ANY +#define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS PM_QOS_LATENCY_ANY_NS #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) -#define PM_QOS_LATENCY_ANY ((s32)(~(__u32)0 >> 1)) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) @@ -173,7 +176,8 @@ static inline s32 dev_pm_qos_requested_flags(struct device *dev) static inline s32 dev_pm_qos_raw_read_value(struct device *dev) { return IS_ERR_OR_NULL(dev->power.qos) ? 
- 0 : pm_qos_read_value(&dev->power.qos->resume_latency); + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : + pm_qos_read_value(&dev->power.qos->resume_latency); } #else static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, @@ -183,9 +187,9 @@ static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline s32 __dev_pm_qos_read_value(struct device *dev) - { return 0; } + { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_read_value(struct device *dev) - { return 0; } + { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, @@ -231,9 +235,15 @@ static inline int dev_pm_qos_expose_latency_tolerance(struct device *dev) { return 0; } static inline void dev_pm_qos_hide_latency_tolerance(struct device *dev) {} -static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return 0; } +static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) +{ + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; +} static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; } -static inline s32 dev_pm_qos_raw_read_value(struct device *dev) { return 0; } +static inline s32 dev_pm_qos_raw_read_value(struct device *dev) +{ + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; +} #endif #endif -- cgit v1.2.3 From e0e1e39de490a2d9b8a173363ccf2415ddada871 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 28 Oct 2017 15:37:17 +0200 Subject: pinctrl: Add skew-delay pin config and bindings Some pin controllers (such as the Gemini) can control the expected clock skew and output delay on certain pins with a sub-nanosecond granularity. This is typically done by shunting in a number of double inverters in front of or behind the pin. Make it possible to configure this with a generic binding. Cc: devicetree@vger.kernel.org Acked-by: Rob Herring Acked-by: Hans Ulli Kroll Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt | 4 ++++ drivers/pinctrl/pinconf-generic.c | 2 ++ include/linux/pinctrl/pinconf-generic.h | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt index 4483cc31e531..ad9bbbba36e9 100644 --- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt +++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt @@ -271,6 +271,10 @@ output-high - set the pin to output mode with high level sleep-hardware-state - indicate this is sleep related state which will be programmed into the registers for the sleep state. slew-rate - set the slew rate +skew-delay - this affects the expected clock skew on input pins + and the delay before latching a value to an output + pin. Typically indicates how many double-inverters are + used to delay the signal. 
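For illustration, a sketch of the driver side of the new parameter: how a pin controller's pin_config_set() callback might decode PIN_CONFIG_SKEW_DELAY. The mypctl_* names and the assumption that the hardware takes a plain double-inverter count are invented for this sketch; the pinconf helpers are the existing generic ones:

#include <linux/pinctrl/pinconf.h>
#include <linux/pinctrl/pinconf-generic.h>

/* Invented hardware hook: a real driver would program its per-pin
 * skew/latch-delay register here.
 */
static void mypctl_set_skew(struct pinctrl_dev *pctldev, unsigned int pin,
			    u32 inverters)
{
}

static int mypctl_pin_config_set(struct pinctrl_dev *pctldev, unsigned int pin,
				 unsigned long *configs,
				 unsigned int num_configs)
{
	unsigned int i;

	for (i = 0; i < num_configs; i++) {
		switch (pinconf_to_config_param(configs[i])) {
		case PIN_CONFIG_SKEW_DELAY:
			/* The argument selects how many double inverters
			 * to shunt in front of the pin.
			 */
			mypctl_set_skew(pctldev, pin,
					pinconf_to_config_argument(configs[i]));
			break;
		default:
			return -ENOTSUPP;
		}
	}

	return 0;
}

A board device tree would then request it with something like skew-delay = <2>; in the relevant pin configuration node (the value 2 is arbitrary here).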
For example: diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 8eaa25c3384f..b4f7f8a458ea 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -49,6 +49,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_POWER_SOURCE, "pin power source", "selector", true), PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false), PCONFDUMP(PIN_CONFIG_SLEW_RATE, "slew rate", NULL, true), + PCONFDUMP(PIN_CONFIG_SKEW_DELAY, "skew delay", NULL, true), }; static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, @@ -181,6 +182,7 @@ static const struct pinconf_generic_params dt_params[] = { { "power-source", PIN_CONFIG_POWER_SOURCE, 0 }, { "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 }, { "slew-rate", PIN_CONFIG_SLEW_RATE, 0 }, + { "skew-delay", PIN_CONFIG_SKEW_DELAY, 0 }, }; /** diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 5d8bc7f21c2a..ec6dadcc1fde 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -90,6 +90,10 @@ * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to * this parameter (on a custom format) tells the driver which alternative * slew rate to use. + * @PIN_CONFIG_SKEW_DELAY: if the pin has programmable skew rate (on inputs) + * or latch delay (on outputs) this parameter (in a custom format) + * specifies the clock skew or latch delay. It typically controls how + * many double inverters are put in front of the line. * @PIN_CONFIG_END: this is the last enumerator for pin configurations, if * you need to pass in custom configurations to the pin controller, use * PIN_CONFIG_END+1 as the base offset. @@ -117,6 +121,7 @@ enum pin_config_param { PIN_CONFIG_POWER_SOURCE, PIN_CONFIG_SLEEP_HARDWARE_STATE, PIN_CONFIG_SLEW_RATE, + PIN_CONFIG_SKEW_DELAY, PIN_CONFIG_END = 0x7F, PIN_CONFIG_MAX = 0xFF, }; -- cgit v1.2.3 From c44eafd79be666e7c81d22e215c945b27f2785f7 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:45 +0100 Subject: gpio: Introduce struct gpio_irq_chip This new structure will be used to group all fields related to interrupt handling in a GPIO chip. Doing so will properly namespace these fields and make it easier to distinguish which fields are used for IRQ support. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index ed04fa2a00a8..36a065521fa0 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -19,6 +19,36 @@ struct module; #ifdef CONFIG_GPIOLIB +#ifdef CONFIG_GPIOLIB_IRQCHIP +/** + * struct gpio_irq_chip - GPIO interrupt controller + */ +struct gpio_irq_chip { + /** + * @domain_ops: + * + * Table of interrupt domain operations for this IRQ chip. + */ + const struct irq_domain_ops *domain_ops; + + /** + * @parent_handler: + * + * The interrupt handler for the GPIO chip's parent interrupts, may be + * NULL if the parent interrupts are nested rather than cascaded. + */ + irq_flow_handler_t parent_handler; + + /** + * @parent_handler_data: + * + * Data associated, and passed to, the handler for the parent + * interrupt. 
+ */ + void *parent_handler_data; +}; +#endif + /** * struct gpio_chip - abstract a GPIO controller * @label: a functional name for the GPIO device, such as a part @@ -176,6 +206,14 @@ struct gpio_chip { bool irq_need_valid_mask; unsigned long *irq_valid_mask; struct lock_class_key *lock_key; + + /** + * @irq: + * + * Integrates interrupt chip functionality with the GPIO chip. Can be + * used to handle IRQs for most practical cases. + */ + struct gpio_irq_chip irq; #endif #if defined(CONFIG_OF_GPIO) -- cgit v1.2.3 From da80ff81a8f54611b834d73149f8ac0d59151c87 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:46 +0100 Subject: gpio: Move irqchip into struct gpio_irq_chip In order to consolidate the multiple ways to associate an IRQ chip with a GPIO chip, move more fields into the new struct gpio_irq_chip. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 14 +++++++------- include/linux/gpio/driver.h | 14 ++++++++++++-- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 3827f0767101..d3d0b3134ba3 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1646,7 +1646,7 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, * category than their parents, so it won't report false recursion. */ irq_set_lockdep_class(irq, chip->lock_key); - irq_set_chip_and_handler(irq, chip->irqchip, chip->irq_handler); + irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq_handler); /* Chips that use nested thread handlers have them marked */ if (chip->irq_nested) irq_set_nested_thread(irq, 1); @@ -1739,10 +1739,10 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) irq_domain_remove(gpiochip->irqdomain); } - if (gpiochip->irqchip) { - gpiochip->irqchip->irq_request_resources = NULL; - gpiochip->irqchip->irq_release_resources = NULL; - gpiochip->irqchip = NULL; + if (gpiochip->irq.chip) { + gpiochip->irq.chip->irq_request_resources = NULL; + gpiochip->irq.chip->irq_release_resources = NULL; + gpiochip->irq.chip = NULL; } gpiochip_irqchip_free_valid_mask(gpiochip); @@ -1817,7 +1817,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, type = IRQ_TYPE_NONE; } - gpiochip->irqchip = irqchip; + gpiochip->irq.chip = irqchip; gpiochip->irq_handler = handler; gpiochip->irq_default_type = type; gpiochip->to_irq = gpiochip_to_irq; @@ -1826,7 +1826,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, gpiochip->ngpio, first_irq, &gpiochip_domain_ops, gpiochip); if (!gpiochip->irqdomain) { - gpiochip->irqchip = NULL; + gpiochip->irq.chip = NULL; return -EINVAL; } diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 36a065521fa0..a79b3b18fadd 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -24,6 +24,13 @@ struct module; * struct gpio_irq_chip - GPIO interrupt controller */ struct gpio_irq_chip { + /** + * @chip: + * + * GPIO IRQ chip implementation, provided by GPIO driver. + */ + struct irq_chip *chip; + /** * @domain_ops: * @@ -47,6 +54,11 @@ struct gpio_irq_chip { */ void *parent_handler_data; }; + +static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) +{ + return container_of(chip, struct gpio_irq_chip, chip); +} #endif /** @@ -112,7 +124,6 @@ struct gpio_irq_chip { * safely. * @bgpio_dir: shadowed direction register for generic GPIO to clear/set * direction safely. 
- * @irqchip: GPIO IRQ chip impl, provided by GPIO driver * @irqdomain: Interrupt translation domain; responsible for mapping * between GPIO hwirq number and linux irq number * @irq_handler: the irq handler to use (often a predefined irq core function) @@ -197,7 +208,6 @@ struct gpio_chip { * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib * to handle IRQs for most practical cases. */ - struct irq_chip *irqchip; struct irq_domain *irqdomain; irq_flow_handler_t irq_handler; unsigned int irq_default_type; -- cgit v1.2.3 From f0fbe7bce733561b76a5b55c5f4625888acd3792 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:47 +0100 Subject: gpio: Move irqdomain into struct gpio_irq_chip In order to consolidate the multiple ways to associate an IRQ chip with a GPIO chip, move more fields into the new struct gpio_irq_chip. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- Documentation/gpio/driver.txt | 2 +- drivers/bcma/driver_gpio.c | 2 +- drivers/gpio/gpio-104-dio-48e.c | 2 +- drivers/gpio/gpio-104-idi-48.c | 2 +- drivers/gpio/gpio-104-idio-16.c | 2 +- drivers/gpio/gpio-adnp.c | 2 +- drivers/gpio/gpio-altera.c | 4 ++-- drivers/gpio/gpio-aspeed.c | 2 +- drivers/gpio/gpio-ath79.c | 2 +- drivers/gpio/gpio-crystalcove.c | 2 +- drivers/gpio/gpio-dln2.c | 2 +- drivers/gpio/gpio-ftgpio010.c | 2 +- drivers/gpio/gpio-ingenic.c | 2 +- drivers/gpio/gpio-intel-mid.c | 2 +- drivers/gpio/gpio-lynxpoint.c | 2 +- drivers/gpio/gpio-max732x.c | 2 +- drivers/gpio/gpio-merrifield.c | 2 +- drivers/gpio/gpio-omap.c | 2 +- drivers/gpio/gpio-pca953x.c | 2 +- drivers/gpio/gpio-pcf857x.c | 2 +- drivers/gpio/gpio-pci-idio-16.c | 2 +- drivers/gpio/gpio-pl061.c | 2 +- drivers/gpio/gpio-rcar.c | 2 +- drivers/gpio/gpio-reg.c | 4 ++-- drivers/gpio/gpio-stmpe.c | 2 +- drivers/gpio/gpio-tc3589x.c | 2 +- drivers/gpio/gpio-vf610.c | 2 +- drivers/gpio/gpio-wcove.c | 2 +- drivers/gpio/gpio-ws16c48.c | 2 +- drivers/gpio/gpio-xgene-sb.c | 2 +- drivers/gpio/gpio-xlp.c | 2 +- drivers/gpio/gpio-zx.c | 2 +- drivers/gpio/gpio-zynq.c | 2 +- drivers/gpio/gpiolib.c | 22 ++++++++++++---------- drivers/pinctrl/bcm/pinctrl-bcm2835.c | 4 ++-- drivers/pinctrl/bcm/pinctrl-iproc-gpio.c | 2 +- drivers/pinctrl/intel/pinctrl-baytrail.c | 2 +- drivers/pinctrl/intel/pinctrl-cherryview.c | 2 +- drivers/pinctrl/intel/pinctrl-intel.c | 2 +- drivers/pinctrl/mvebu/pinctrl-armada-37xx.c | 2 +- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 4 ++-- drivers/pinctrl/pinctrl-amd.c | 2 +- drivers/pinctrl/pinctrl-at91.c | 2 +- drivers/pinctrl/pinctrl-coh901.c | 2 +- drivers/pinctrl/pinctrl-mcp23s08.c | 2 +- drivers/pinctrl/pinctrl-oxnas.c | 2 +- drivers/pinctrl/pinctrl-pic32.c | 2 +- drivers/pinctrl/pinctrl-pistachio.c | 2 +- drivers/pinctrl/pinctrl-st.c | 2 +- drivers/pinctrl/pinctrl-sx150x.c | 2 +- drivers/pinctrl/qcom/pinctrl-msm.c | 2 +- drivers/pinctrl/sirf/pinctrl-atlas7.c | 2 +- drivers/pinctrl/sirf/pinctrl-sirf.c | 2 +- drivers/pinctrl/spear/pinctrl-plgpio.c | 2 +- drivers/platform/x86/intel_int0002_vgpio.c | 2 +- include/linux/gpio/driver.h | 11 ++++++++--- 56 files changed, 78 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/Documentation/gpio/driver.txt b/Documentation/gpio/driver.txt index fc1d2f83564d..dcf6af1d9e56 100644 --- a/Documentation/gpio/driver.txt +++ b/Documentation/gpio/driver.txt @@ -254,7 +254,7 @@ GPIO irqchips usually fall in one of two categories: static irqreturn_t omap_gpio_irq_handler(int irq, void *gpiobank) unsigned long wa_lock_flags; 
raw_spin_lock_irqsave(&bank->wa_lock, wa_lock_flags); - generic_handle_irq(irq_find_mapping(bank->chip.irqdomain, bit)); + generic_handle_irq(irq_find_mapping(bank->chip.irq.domain, bit)); raw_spin_unlock_irqrestore(&bank->wa_lock, wa_lock_flags); * GENERIC CHAINED GPIO irqchips: these are the same as "CHAINED GPIO irqchips", diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c index 982d5781d3ce..2c0ffb77d738 100644 --- a/drivers/bcma/driver_gpio.c +++ b/drivers/bcma/driver_gpio.c @@ -113,7 +113,7 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id) return IRQ_NONE; for_each_set_bit(gpio, &irqs, gc->ngpio) - generic_handle_irq(irq_find_mapping(gc->irqdomain, gpio)); + generic_handle_irq(irq_find_mapping(gc->irq.domain, gpio)); bcma_chipco_gpio_polarity(cc, irqs, val & irqs); return IRQ_HANDLED; diff --git a/drivers/gpio/gpio-104-dio-48e.c b/drivers/gpio/gpio-104-dio-48e.c index 598e209efa2d..bab3b94c5cbc 100644 --- a/drivers/gpio/gpio-104-dio-48e.c +++ b/drivers/gpio/gpio-104-dio-48e.c @@ -326,7 +326,7 @@ static irqreturn_t dio48e_irq_handler(int irq, void *dev_id) unsigned long gpio; for_each_set_bit(gpio, &irq_mask, 2) - generic_handle_irq(irq_find_mapping(chip->irqdomain, + generic_handle_irq(irq_find_mapping(chip->irq.domain, 19 + gpio*24)); raw_spin_lock(&dio48egpio->lock); diff --git a/drivers/gpio/gpio-104-idi-48.c b/drivers/gpio/gpio-104-idi-48.c index 51f046e29ff7..add859d59766 100644 --- a/drivers/gpio/gpio-104-idi-48.c +++ b/drivers/gpio/gpio-104-idi-48.c @@ -209,7 +209,7 @@ static irqreturn_t idi_48_irq_handler(int irq, void *dev_id) for_each_set_bit(bit_num, &irq_mask, 8) { gpio = bit_num + boundary * 8; - generic_handle_irq(irq_find_mapping(chip->irqdomain, + generic_handle_irq(irq_find_mapping(chip->irq.domain, gpio)); } } diff --git a/drivers/gpio/gpio-104-idio-16.c b/drivers/gpio/gpio-104-idio-16.c index ec2ce34ff473..2f16638a0589 100644 --- a/drivers/gpio/gpio-104-idio-16.c +++ b/drivers/gpio/gpio-104-idio-16.c @@ -199,7 +199,7 @@ static irqreturn_t idio_16_irq_handler(int irq, void *dev_id) int gpio; for_each_set_bit(gpio, &idio16gpio->irq_mask, chip->ngpio) - generic_handle_irq(irq_find_mapping(chip->irqdomain, gpio)); + generic_handle_irq(irq_find_mapping(chip->irq.domain, gpio)); raw_spin_lock(&idio16gpio->lock); diff --git a/drivers/gpio/gpio-adnp.c b/drivers/gpio/gpio-adnp.c index 7f475eef3faa..44c09904daa6 100644 --- a/drivers/gpio/gpio-adnp.c +++ b/drivers/gpio/gpio-adnp.c @@ -320,7 +320,7 @@ static irqreturn_t adnp_irq(int irq, void *data) for_each_set_bit(bit, &pending, 8) { unsigned int child_irq; - child_irq = irq_find_mapping(adnp->gpio.irqdomain, + child_irq = irq_find_mapping(adnp->gpio.irq.domain, base + bit); handle_nested_irq(child_irq); } diff --git a/drivers/gpio/gpio-altera.c b/drivers/gpio/gpio-altera.c index ccc02ed65b3c..8e76d390e653 100644 --- a/drivers/gpio/gpio-altera.c +++ b/drivers/gpio/gpio-altera.c @@ -211,7 +211,7 @@ static void altera_gpio_irq_edge_handler(struct irq_desc *desc) altera_gc = gpiochip_get_data(irq_desc_get_handler_data(desc)); chip = irq_desc_get_chip(desc); mm_gc = &altera_gc->mmchip; - irqdomain = altera_gc->mmchip.gc.irqdomain; + irqdomain = altera_gc->mmchip.gc.irq.domain; chained_irq_enter(chip, desc); @@ -239,7 +239,7 @@ static void altera_gpio_irq_leveL_high_handler(struct irq_desc *desc) altera_gc = gpiochip_get_data(irq_desc_get_handler_data(desc)); chip = irq_desc_get_chip(desc); mm_gc = &altera_gc->mmchip; - irqdomain = altera_gc->mmchip.gc.irqdomain; + irqdomain = 
altera_gc->mmchip.gc.irq.domain; chained_irq_enter(chip, desc); diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c index 00dc1c020198..2bfce0ab7326 100644 --- a/drivers/gpio/gpio-aspeed.c +++ b/drivers/gpio/gpio-aspeed.c @@ -469,7 +469,7 @@ static void aspeed_gpio_irq_handler(struct irq_desc *desc) reg = ioread32(bank_irq_reg(data, bank, GPIO_IRQ_STATUS)); for_each_set_bit(p, ®, 32) { - girq = irq_find_mapping(gc->irqdomain, i * 32 + p); + girq = irq_find_mapping(gc->irq.domain, i * 32 + p); generic_handle_irq(girq); } diff --git a/drivers/gpio/gpio-ath79.c b/drivers/gpio/gpio-ath79.c index 02e56e0c793a..5fad89dfab7e 100644 --- a/drivers/gpio/gpio-ath79.c +++ b/drivers/gpio/gpio-ath79.c @@ -209,7 +209,7 @@ static void ath79_gpio_irq_handler(struct irq_desc *desc) if (pending) { for_each_set_bit(irq, &pending, gc->ngpio) generic_handle_irq( - irq_linear_revmap(gc->irqdomain, irq)); + irq_linear_revmap(gc->irq.domain, irq)); } chained_irq_exit(irqchip, desc); diff --git a/drivers/gpio/gpio-crystalcove.c b/drivers/gpio/gpio-crystalcove.c index e60156ec0c18..b6f0f729656c 100644 --- a/drivers/gpio/gpio-crystalcove.c +++ b/drivers/gpio/gpio-crystalcove.c @@ -295,7 +295,7 @@ static irqreturn_t crystalcove_gpio_irq_handler(int irq, void *data) for (gpio = 0; gpio < CRYSTALCOVE_GPIO_NUM; gpio++) { if (pending & BIT(gpio)) { - virq = irq_find_mapping(cg->chip.irqdomain, gpio); + virq = irq_find_mapping(cg->chip.irq.domain, gpio); handle_nested_irq(virq); } } diff --git a/drivers/gpio/gpio-dln2.c b/drivers/gpio/gpio-dln2.c index aecb847166f5..1dada68b9a27 100644 --- a/drivers/gpio/gpio-dln2.c +++ b/drivers/gpio/gpio-dln2.c @@ -420,7 +420,7 @@ static void dln2_gpio_event(struct platform_device *pdev, u16 echo, return; } - irq = irq_find_mapping(dln2->gpio.irqdomain, pin); + irq = irq_find_mapping(dln2->gpio.irq.domain, pin); if (!irq) { dev_err(dln2->gpio.parent, "pin %d not mapped to IRQ\n", pin); return; diff --git a/drivers/gpio/gpio-ftgpio010.c b/drivers/gpio/gpio-ftgpio010.c index e9386f8b67f5..b7896bae83ca 100644 --- a/drivers/gpio/gpio-ftgpio010.c +++ b/drivers/gpio/gpio-ftgpio010.c @@ -149,7 +149,7 @@ static void ftgpio_gpio_irq_handler(struct irq_desc *desc) stat = readl(g->base + GPIO_INT_STAT); if (stat) for_each_set_bit(offset, &stat, gc->ngpio) - generic_handle_irq(irq_find_mapping(gc->irqdomain, + generic_handle_irq(irq_find_mapping(gc->irq.domain, offset)); chained_irq_exit(irqchip, desc); diff --git a/drivers/gpio/gpio-ingenic.c b/drivers/gpio/gpio-ingenic.c index 254780730b95..15fb2bc796a8 100644 --- a/drivers/gpio/gpio-ingenic.c +++ b/drivers/gpio/gpio-ingenic.c @@ -242,7 +242,7 @@ static void ingenic_gpio_irq_handler(struct irq_desc *desc) flag = gpio_ingenic_read_reg(jzgc, JZ4740_GPIO_FLAG); for_each_set_bit(i, &flag, 32) - generic_handle_irq(irq_linear_revmap(gc->irqdomain, i)); + generic_handle_irq(irq_linear_revmap(gc->irq.domain, i)); chained_irq_exit(irq_chip, desc); } diff --git a/drivers/gpio/gpio-intel-mid.c b/drivers/gpio/gpio-intel-mid.c index b76ecee82c3f..629575ea46a0 100644 --- a/drivers/gpio/gpio-intel-mid.c +++ b/drivers/gpio/gpio-intel-mid.c @@ -295,7 +295,7 @@ static void intel_mid_irq_handler(struct irq_desc *desc) mask = BIT(gpio); /* Clear before handling so we can't lose an edge */ writel(mask, gedr); - generic_handle_irq(irq_find_mapping(gc->irqdomain, + generic_handle_irq(irq_find_mapping(gc->irq.domain, base + gpio)); } } diff --git a/drivers/gpio/gpio-lynxpoint.c b/drivers/gpio/gpio-lynxpoint.c index fbd393b46ce0..1e557b10d73e 100644 --- 
a/drivers/gpio/gpio-lynxpoint.c +++ b/drivers/gpio/gpio-lynxpoint.c @@ -255,7 +255,7 @@ static void lp_gpio_irq_handler(struct irq_desc *desc) mask = BIT(pin); /* Clear before handling so we don't lose an edge */ outl(mask, reg); - irq = irq_find_mapping(lg->chip.irqdomain, base + pin); + irq = irq_find_mapping(lg->chip.irq.domain, base + pin); generic_handle_irq(irq); } } diff --git a/drivers/gpio/gpio-max732x.c b/drivers/gpio/gpio-max732x.c index 7f4d26ce5f23..c04fae1ba32a 100644 --- a/drivers/gpio/gpio-max732x.c +++ b/drivers/gpio/gpio-max732x.c @@ -486,7 +486,7 @@ static irqreturn_t max732x_irq_handler(int irq, void *devid) do { level = __ffs(pending); - handle_nested_irq(irq_find_mapping(chip->gpio_chip.irqdomain, + handle_nested_irq(irq_find_mapping(chip->gpio_chip.irq.domain, level)); pending &= ~(1 << level); diff --git a/drivers/gpio/gpio-merrifield.c b/drivers/gpio/gpio-merrifield.c index ec8560298805..dd67a31ac337 100644 --- a/drivers/gpio/gpio-merrifield.c +++ b/drivers/gpio/gpio-merrifield.c @@ -357,7 +357,7 @@ static void mrfld_irq_handler(struct irq_desc *desc) for_each_set_bit(gpio, &pending, 32) { unsigned int irq; - irq = irq_find_mapping(gc->irqdomain, base + gpio); + irq = irq_find_mapping(gc->irq.domain, base + gpio); generic_handle_irq(irq); } } diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index dbf869fb63ce..ce27d6a586bf 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -733,7 +733,7 @@ static irqreturn_t omap_gpio_irq_handler(int irq, void *gpiobank) raw_spin_lock_irqsave(&bank->wa_lock, wa_lock_flags); - generic_handle_irq(irq_find_mapping(bank->chip.irqdomain, + generic_handle_irq(irq_find_mapping(bank->chip.irq.domain, bit)); raw_spin_unlock_irqrestore(&bank->wa_lock, diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index 1b9dbf691ae7..babb7bd2ba59 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -608,7 +608,7 @@ static irqreturn_t pca953x_irq_handler(int irq, void *devid) for (i = 0; i < NBANK(chip); i++) { while (pending[i]) { level = __ffs(pending[i]); - handle_nested_irq(irq_find_mapping(chip->gpio_chip.irqdomain, + handle_nested_irq(irq_find_mapping(chip->gpio_chip.irq.domain, level + (BANK_SZ * i))); pending[i] &= ~(1 << level); nhandled++; diff --git a/drivers/gpio/gpio-pcf857x.c b/drivers/gpio/gpio-pcf857x.c index a4fd78b9c0e4..38fbb420c6cd 100644 --- a/drivers/gpio/gpio-pcf857x.c +++ b/drivers/gpio/gpio-pcf857x.c @@ -196,7 +196,7 @@ static irqreturn_t pcf857x_irq(int irq, void *data) mutex_unlock(&gpio->lock); for_each_set_bit(i, &change, gpio->chip.ngpio) - handle_nested_irq(irq_find_mapping(gpio->chip.irqdomain, i)); + handle_nested_irq(irq_find_mapping(gpio->chip.irq.domain, i)); return IRQ_HANDLED; } diff --git a/drivers/gpio/gpio-pci-idio-16.c b/drivers/gpio/gpio-pci-idio-16.c index 7de4f6a2cb49..57d1b7fbf07b 100644 --- a/drivers/gpio/gpio-pci-idio-16.c +++ b/drivers/gpio/gpio-pci-idio-16.c @@ -240,7 +240,7 @@ static irqreturn_t idio_16_irq_handler(int irq, void *dev_id) return IRQ_NONE; for_each_set_bit(gpio, &idio16gpio->irq_mask, chip->ngpio) - generic_handle_irq(irq_find_mapping(chip->irqdomain, gpio)); + generic_handle_irq(irq_find_mapping(chip->irq.domain, gpio)); raw_spin_lock(&idio16gpio->lock); diff --git a/drivers/gpio/gpio-pl061.c b/drivers/gpio/gpio-pl061.c index 6aaaab79c205..b70974cb9ef1 100644 --- a/drivers/gpio/gpio-pl061.c +++ b/drivers/gpio/gpio-pl061.c @@ -221,7 +221,7 @@ static void pl061_irq_handler(struct irq_desc *desc) pending = 
readb(pl061->base + GPIOMIS); if (pending) { for_each_set_bit(offset, &pending, PL061_GPIO_NR) - generic_handle_irq(irq_find_mapping(gc->irqdomain, + generic_handle_irq(irq_find_mapping(gc->irq.domain, offset)); } diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c index ddcff4f543bc..0ea998a3e357 100644 --- a/drivers/gpio/gpio-rcar.c +++ b/drivers/gpio/gpio-rcar.c @@ -207,7 +207,7 @@ static irqreturn_t gpio_rcar_irq_handler(int irq, void *dev_id) gpio_rcar_read(p, INTMSK))) { offset = __ffs(pending); gpio_rcar_write(p, INTCLR, BIT(offset)); - generic_handle_irq(irq_find_mapping(p->gpio_chip.irqdomain, + generic_handle_irq(irq_find_mapping(p->gpio_chip.irq.domain, offset)); irqs_handled++; } diff --git a/drivers/gpio/gpio-reg.c b/drivers/gpio/gpio-reg.c index e85903eddc68..23e771dba4c1 100644 --- a/drivers/gpio/gpio-reg.c +++ b/drivers/gpio/gpio-reg.c @@ -103,8 +103,8 @@ static int gpio_reg_to_irq(struct gpio_chip *gc, unsigned offset) struct gpio_reg *r = to_gpio_reg(gc); int irq = r->irqs[offset]; - if (irq >= 0 && r->irqdomain) - irq = irq_find_mapping(r->irqdomain, irq); + if (irq >= 0 && r->irq.domain) + irq = irq_find_mapping(r->irq.domain, irq); return irq; } diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 001a89db5161..18d8bef76d85 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -397,7 +397,7 @@ static irqreturn_t stmpe_gpio_irq(int irq, void *dev) while (stat) { int bit = __ffs(stat); int line = bank * 8 + bit; - int child_irq = irq_find_mapping(stmpe_gpio->chip.irqdomain, + int child_irq = irq_find_mapping(stmpe_gpio->chip.irq.domain, line); handle_nested_irq(child_irq); diff --git a/drivers/gpio/gpio-tc3589x.c b/drivers/gpio/gpio-tc3589x.c index 433b45ef332e..91a8ef8e7f3f 100644 --- a/drivers/gpio/gpio-tc3589x.c +++ b/drivers/gpio/gpio-tc3589x.c @@ -268,7 +268,7 @@ static irqreturn_t tc3589x_gpio_irq(int irq, void *dev) while (stat) { int bit = __ffs(stat); int line = i * 8 + bit; - int irq = irq_find_mapping(tc3589x_gpio->chip.irqdomain, + int irq = irq_find_mapping(tc3589x_gpio->chip.irq.domain, line); handle_nested_irq(irq); diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index cbe9e06861de..4610cc2938ad 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -160,7 +160,7 @@ static void vf610_gpio_irq_handler(struct irq_desc *desc) for_each_set_bit(pin, &irq_isfr, VF610_GPIO_PER_PORT) { vf610_gpio_writel(BIT(pin), port->base + PORT_ISFR); - generic_handle_irq(irq_find_mapping(port->gc.irqdomain, pin)); + generic_handle_irq(irq_find_mapping(port->gc.irq.domain, pin)); } chained_irq_exit(chip, desc); diff --git a/drivers/gpio/gpio-wcove.c b/drivers/gpio/gpio-wcove.c index 85341eab795d..dde7c6aecbb5 100644 --- a/drivers/gpio/gpio-wcove.c +++ b/drivers/gpio/gpio-wcove.c @@ -350,7 +350,7 @@ static irqreturn_t wcove_gpio_irq_handler(int irq, void *data) offset = (gpio > GROUP0_NR_IRQS) ? 1 : 0; mask = (offset == 1) ? 
BIT(gpio - GROUP0_NR_IRQS) : BIT(gpio); - virq = irq_find_mapping(wg->chip.irqdomain, gpio); + virq = irq_find_mapping(wg->chip.irq.domain, gpio); handle_nested_irq(virq); regmap_update_bits(wg->regmap, IRQ_STATUS_BASE + offset, mask, mask); diff --git a/drivers/gpio/gpio-ws16c48.c b/drivers/gpio/gpio-ws16c48.c index 5037974ac063..746648244bf3 100644 --- a/drivers/gpio/gpio-ws16c48.c +++ b/drivers/gpio/gpio-ws16c48.c @@ -332,7 +332,7 @@ static irqreturn_t ws16c48_irq_handler(int irq, void *dev_id) int_id = inb(ws16c48gpio->base + 8 + port); for_each_set_bit(gpio, &int_id, 8) generic_handle_irq(irq_find_mapping( - chip->irqdomain, gpio + 8*port)); + chip->irq.domain, gpio + 8*port)); } int_pending = inb(ws16c48gpio->base + 6) & 0x7; diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index 82c3ee6da66a..4f2623c2393e 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -287,7 +287,7 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) if (!priv->irq_domain) return -ENODEV; - priv->gc.irqdomain = priv->irq_domain; + priv->gc.irq.domain = priv->irq_domain; ret = devm_gpiochip_add_data(&pdev->dev, &priv->gc, priv); if (ret) { diff --git a/drivers/gpio/gpio-xlp.c b/drivers/gpio/gpio-xlp.c index d857e1d8e731..e74bd43a6974 100644 --- a/drivers/gpio/gpio-xlp.c +++ b/drivers/gpio/gpio-xlp.c @@ -225,7 +225,7 @@ static void xlp_gpio_generic_handler(struct irq_desc *desc) if (gpio_stat & BIT(gpio % XLP_GPIO_REGSZ)) generic_handle_irq(irq_find_mapping( - priv->chip.irqdomain, gpio)); + priv->chip.irq.domain, gpio)); } chained_irq_exit(irqchip, desc); } diff --git a/drivers/gpio/gpio-zx.c b/drivers/gpio/gpio-zx.c index be3a87da8438..5eacad9b2692 100644 --- a/drivers/gpio/gpio-zx.c +++ b/drivers/gpio/gpio-zx.c @@ -170,7 +170,7 @@ static void zx_irq_handler(struct irq_desc *desc) writew_relaxed(pending, chip->base + ZX_GPIO_IC); if (pending) { for_each_set_bit(offset, &pending, ZX_GPIO_NR) - generic_handle_irq(irq_find_mapping(gc->irqdomain, + generic_handle_irq(irq_find_mapping(gc->irq.domain, offset)); } diff --git a/drivers/gpio/gpio-zynq.c b/drivers/gpio/gpio-zynq.c index b3cc948a2d8b..75ee877e5cd5 100644 --- a/drivers/gpio/gpio-zynq.c +++ b/drivers/gpio/gpio-zynq.c @@ -562,7 +562,7 @@ static void zynq_gpio_handle_bank_irq(struct zynq_gpio *gpio, unsigned long pending) { unsigned int bank_offset = gpio->p_data->bank_min[bank_num]; - struct irq_domain *irqdomain = gpio->chip.irqdomain; + struct irq_domain *irqdomain = gpio->chip.irq.domain; int offset; if (!pending) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index d3d0b3134ba3..9ee75a45ba37 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1550,7 +1550,7 @@ static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gpiochip, { unsigned int offset; - if (!gpiochip->irqdomain) { + if (!gpiochip->irq.domain) { chip_err(gpiochip, "called %s before setting up irqchip\n", __func__); return; @@ -1577,7 +1577,7 @@ static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gpiochip, for (offset = 0; offset < gpiochip->ngpio; offset++) { if (!gpiochip_irqchip_irq_valid(gpiochip, offset)) continue; - irq_set_parent(irq_find_mapping(gpiochip->irqdomain, offset), + irq_set_parent(irq_find_mapping(gpiochip->irq.domain, offset), parent_irq); } } @@ -1708,7 +1708,7 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset) { if (!gpiochip_irqchip_irq_valid(chip, offset)) return -ENXIO; - return irq_create_mapping(chip->irqdomain, offset); + return 
irq_create_mapping(chip->irq.domain, offset); } /** @@ -1719,7 +1719,7 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset) */ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) { - unsigned int offset; + unsigned int offset, irq; acpi_gpiochip_free_interrupts(gpiochip); @@ -1729,14 +1729,16 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) } /* Remove all IRQ mappings and delete the domain */ - if (gpiochip->irqdomain) { + if (gpiochip->irq.domain) { for (offset = 0; offset < gpiochip->ngpio; offset++) { if (!gpiochip_irqchip_irq_valid(gpiochip, offset)) continue; - irq_dispose_mapping( - irq_find_mapping(gpiochip->irqdomain, offset)); + + irq = irq_find_mapping(gpiochip->irq.domain, offset); + irq_dispose_mapping(irq); } - irq_domain_remove(gpiochip->irqdomain); + + irq_domain_remove(gpiochip->irq.domain); } if (gpiochip->irq.chip) { @@ -1822,10 +1824,10 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, gpiochip->irq_default_type = type; gpiochip->to_irq = gpiochip_to_irq; gpiochip->lock_key = lock_key; - gpiochip->irqdomain = irq_domain_add_simple(of_node, + gpiochip->irq.domain = irq_domain_add_simple(of_node, gpiochip->ngpio, first_irq, &gpiochip_domain_ops, gpiochip); - if (!gpiochip->irqdomain) { + if (!gpiochip->irq.domain) { gpiochip->irq.chip = NULL; return -EINVAL; } diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index 0944310225db..fb85ab3cf662 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -383,7 +383,7 @@ static void bcm2835_gpio_irq_handle_bank(struct bcm2835_pinctrl *pc, /* FIXME: no clue why the code looks up the type here */ type = pc->irq_type[gpio]; - generic_handle_irq(irq_linear_revmap(pc->gpio_chip.irqdomain, + generic_handle_irq(irq_linear_revmap(pc->gpio_chip.irq.domain, gpio)); } } @@ -665,7 +665,7 @@ static void bcm2835_pctl_pin_dbg_show(struct pinctrl_dev *pctldev, enum bcm2835_fsel fsel = bcm2835_pinctrl_fsel_get(pc, offset); const char *fname = bcm2835_functions[fsel]; int value = bcm2835_gpio_get_bit(pc, GPLEV0, offset); - int irq = irq_find_mapping(chip->irqdomain, offset); + int irq = irq_find_mapping(chip->irq.domain, offset); seq_printf(s, "function %s in %s; irq %d (%s)", fname, value ?
"hi" : "lo", diff --git a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c index 85a8c97d9dfe..b93f62dc8733 100644 --- a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c +++ b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c @@ -172,7 +172,7 @@ static void iproc_gpio_irq_handler(struct irq_desc *desc) for_each_set_bit(bit, &val, NGPIOS_PER_BANK) { unsigned pin = NGPIOS_PER_BANK * i + bit; - int child_irq = irq_find_mapping(gc->irqdomain, pin); + int child_irq = irq_find_mapping(gc->irq.domain, pin); /* * Clear the interrupt before invoking the diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c index 0f3a02495aeb..5897981e5ed3 100644 --- a/drivers/pinctrl/intel/pinctrl-baytrail.c +++ b/drivers/pinctrl/intel/pinctrl-baytrail.c @@ -1627,7 +1627,7 @@ static void byt_gpio_irq_handler(struct irq_desc *desc) pending = readl(reg); raw_spin_unlock(&vg->lock); for_each_set_bit(pin, &pending, 32) { - virq = irq_find_mapping(vg->chip.irqdomain, base + pin); + virq = irq_find_mapping(vg->chip.irq.domain, base + pin); generic_handle_irq(virq); } } diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c index 04e929fd0ffe..1cd7043edbc1 100644 --- a/drivers/pinctrl/intel/pinctrl-cherryview.c +++ b/drivers/pinctrl/intel/pinctrl-cherryview.c @@ -1523,7 +1523,7 @@ static void chv_gpio_irq_handler(struct irq_desc *desc) unsigned irq, offset; offset = pctrl->intr_lines[intr_line]; - irq = irq_find_mapping(gc->irqdomain, offset); + irq = irq_find_mapping(gc->irq.domain, offset); generic_handle_irq(irq); } diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index 71df0f70b61f..ffda27bfd133 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -1005,7 +1005,7 @@ static irqreturn_t intel_gpio_community_irq_handler(struct intel_pinctrl *pctrl, if (padno >= community->npins) break; - irq = irq_find_mapping(gc->irqdomain, + irq = irq_find_mapping(gc->irq.domain, community->pin_base + padno); generic_handle_irq(irq); diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index e800d55c340b..d754a9b10e19 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -592,7 +592,7 @@ static void armada_37xx_irq_handler(struct irq_desc *desc) struct gpio_chip *gc = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); struct armada_37xx_pinctrl *info = gpiochip_get_data(gc); - struct irq_domain *d = gc->irqdomain; + struct irq_domain *d = gc->irq.domain; int i; chained_irq_enter(chip, desc); diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index a53f1a9b1ed2..f0e7a8c114b2 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -413,7 +413,7 @@ nmk_gpio_disable_lazy_irq(struct nmk_gpio_chip *nmk_chip, unsigned offset) u32 falling = nmk_chip->fimsc & BIT(offset); u32 rising = nmk_chip->rimsc & BIT(offset); int gpio = nmk_chip->chip.base + offset; - int irq = irq_find_mapping(nmk_chip->chip.irqdomain, offset); + int irq = irq_find_mapping(nmk_chip->chip.irq.domain, offset); struct irq_data *d = irq_get_irq_data(irq); if (!rising && !falling) @@ -815,7 +815,7 @@ static void __nmk_gpio_irq_handler(struct irq_desc *desc, u32 status) while (status) { int bit = __ffs(status); - 
generic_handle_irq(irq_find_mapping(chip->irqdomain, bit)); + generic_handle_irq(irq_find_mapping(chip->irq.domain, bit)); status &= ~BIT(bit); } diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 3f6b34febbf1..06362bf84a5b 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -532,7 +532,7 @@ static irqreturn_t amd_gpio_irq_handler(int irq, void *dev_id) regval = readl(regs + i); if (!(regval & PIN_IRQ_PENDING)) continue; - irq = irq_find_mapping(gc->irqdomain, irqnr + i); + irq = irq_find_mapping(gc->irq.domain, irqnr + i); generic_handle_irq(irq); /* Clear interrupt */ writel(regval, regs + i); diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c index 569bc28cb909..03492e3c09fa 100644 --- a/drivers/pinctrl/pinctrl-at91.c +++ b/drivers/pinctrl/pinctrl-at91.c @@ -1603,7 +1603,7 @@ static void gpio_irq_handler(struct irq_desc *desc) for_each_set_bit(n, &isr, BITS_PER_LONG) { generic_handle_irq(irq_find_mapping( - gpio_chip->irqdomain, n)); + gpio_chip->irq.domain, n)); } } chained_irq_exit(chip, desc); diff --git a/drivers/pinctrl/pinctrl-coh901.c b/drivers/pinctrl/pinctrl-coh901.c index ac155e7d3412..7939b178c6ae 100644 --- a/drivers/pinctrl/pinctrl-coh901.c +++ b/drivers/pinctrl/pinctrl-coh901.c @@ -517,7 +517,7 @@ static void u300_gpio_irq_handler(struct irq_desc *desc) for_each_set_bit(irqoffset, &val, U300_GPIO_PINS_PER_PORT) { int offset = pinoffset + irqoffset; - int pin_irq = irq_find_mapping(chip->irqdomain, offset); + int pin_irq = irq_find_mapping(chip->irq.domain, offset); dev_dbg(gpio->dev, "GPIO IRQ %d on pin %d\n", pin_irq, offset); diff --git a/drivers/pinctrl/pinctrl-mcp23s08.c b/drivers/pinctrl/pinctrl-mcp23s08.c index 3e40d4245512..db19a2f2f575 100644 --- a/drivers/pinctrl/pinctrl-mcp23s08.c +++ b/drivers/pinctrl/pinctrl-mcp23s08.c @@ -537,7 +537,7 @@ static irqreturn_t mcp23s08_irq(int irq, void *data) ((gpio_bit_changed || intcap_changed) && (BIT(i) & mcp->irq_fall) && !gpio_set) || defval_changed) { - child_irq = irq_find_mapping(mcp->chip.irqdomain, i); + child_irq = irq_find_mapping(mcp->chip.irq.domain, i); handle_nested_irq(child_irq); } } diff --git a/drivers/pinctrl/pinctrl-oxnas.c b/drivers/pinctrl/pinctrl-oxnas.c index 494ec9a7573a..53ec22a51f5c 100644 --- a/drivers/pinctrl/pinctrl-oxnas.c +++ b/drivers/pinctrl/pinctrl-oxnas.c @@ -1064,7 +1064,7 @@ static void oxnas_gpio_irq_handler(struct irq_desc *desc) stat = readl(bank->reg_base + IRQ_PENDING); for_each_set_bit(pin, &stat, BITS_PER_LONG) - generic_handle_irq(irq_linear_revmap(gc->irqdomain, pin)); + generic_handle_irq(irq_linear_revmap(gc->irq.domain, pin)); chained_irq_exit(chip, desc); } diff --git a/drivers/pinctrl/pinctrl-pic32.c b/drivers/pinctrl/pinctrl-pic32.c index 31ceb958b3fe..96390228d388 100644 --- a/drivers/pinctrl/pinctrl-pic32.c +++ b/drivers/pinctrl/pinctrl-pic32.c @@ -2106,7 +2106,7 @@ static void pic32_gpio_irq_handler(struct irq_desc *desc) pending = pic32_gpio_get_pending(gc, stat); for_each_set_bit(pin, &pending, BITS_PER_LONG) - generic_handle_irq(irq_linear_revmap(gc->irqdomain, pin)); + generic_handle_irq(irq_linear_revmap(gc->irq.domain, pin)); chained_irq_exit(chip, desc); } diff --git a/drivers/pinctrl/pinctrl-pistachio.c b/drivers/pinctrl/pinctrl-pistachio.c index 55375b1b3cc8..302190d1558d 100644 --- a/drivers/pinctrl/pinctrl-pistachio.c +++ b/drivers/pinctrl/pinctrl-pistachio.c @@ -1307,7 +1307,7 @@ static void pistachio_gpio_irq_handler(struct irq_desc *desc) pending = gpio_readl(bank, 
GPIO_INTERRUPT_STATUS) & gpio_readl(bank, GPIO_INTERRUPT_EN); for_each_set_bit(pin, &pending, 16) - generic_handle_irq(irq_linear_revmap(gc->irqdomain, pin)); + generic_handle_irq(irq_linear_revmap(gc->irq.domain, pin)); chained_irq_exit(chip, desc); } diff --git a/drivers/pinctrl/pinctrl-st.c b/drivers/pinctrl/pinctrl-st.c index a5205b94b2e6..2081c67667a8 100644 --- a/drivers/pinctrl/pinctrl-st.c +++ b/drivers/pinctrl/pinctrl-st.c @@ -1408,7 +1408,7 @@ static void __gpio_irq_handler(struct st_gpio_bank *bank) continue; } - generic_handle_irq(irq_find_mapping(bank->gpio_chip.irqdomain, n)); + generic_handle_irq(irq_find_mapping(bank->gpio_chip.irq.domain, n)); } } } diff --git a/drivers/pinctrl/pinctrl-sx150x.c b/drivers/pinctrl/pinctrl-sx150x.c index 7450f5118445..7db4f6a6eb17 100644 --- a/drivers/pinctrl/pinctrl-sx150x.c +++ b/drivers/pinctrl/pinctrl-sx150x.c @@ -561,7 +561,7 @@ static irqreturn_t sx150x_irq_thread_fn(int irq, void *dev_id) status = val; for_each_set_bit(n, &status, pctl->data->ngpios) - handle_nested_irq(irq_find_mapping(pctl->gpio.irqdomain, n)); + handle_nested_irq(irq_find_mapping(pctl->gpio.irq.domain, n)); return IRQ_HANDLED; } diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index ff491da64dab..7a960590ecaa 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -795,7 +795,7 @@ static void msm_gpio_irq_handler(struct irq_desc *desc) g = &pctrl->soc->groups[i]; val = readl(pctrl->regs + g->intr_status_reg); if (val & BIT(g->intr_status_bit)) { - irq_pin = irq_find_mapping(gc->irqdomain, i); + irq_pin = irq_find_mapping(gc->irq.domain, i); generic_handle_irq(irq_pin); handled++; } diff --git a/drivers/pinctrl/sirf/pinctrl-atlas7.c b/drivers/pinctrl/sirf/pinctrl-atlas7.c index 4db9323251e3..f5cef6e5fa3e 100644 --- a/drivers/pinctrl/sirf/pinctrl-atlas7.c +++ b/drivers/pinctrl/sirf/pinctrl-atlas7.c @@ -5820,7 +5820,7 @@ static void atlas7_gpio_handle_irq(struct irq_desc *desc) __func__, gc->label, bank->gpio_offset + pin_in_bank); generic_handle_irq( - irq_find_mapping(gc->irqdomain, + irq_find_mapping(gc->irq.domain, bank->gpio_offset + pin_in_bank)); } diff --git a/drivers/pinctrl/sirf/pinctrl-sirf.c b/drivers/pinctrl/sirf/pinctrl-sirf.c index d3ef05973901..8b14a1f1e671 100644 --- a/drivers/pinctrl/sirf/pinctrl-sirf.c +++ b/drivers/pinctrl/sirf/pinctrl-sirf.c @@ -587,7 +587,7 @@ static void sirfsoc_gpio_handle_irq(struct irq_desc *desc) if ((status & 0x1) && (ctrl & SIRFSOC_GPIO_CTL_INTR_EN_MASK)) { pr_debug("%s: gpio id %d idx %d happens\n", __func__, bank->id, idx); - generic_handle_irq(irq_find_mapping(gc->irqdomain, idx + + generic_handle_irq(irq_find_mapping(gc->irq.domain, idx + bank->id * SIRFSOC_GPIO_BANK_SIZE)); } diff --git a/drivers/pinctrl/spear/pinctrl-plgpio.c b/drivers/pinctrl/spear/pinctrl-plgpio.c index cf6d68c7345b..72ae6bccee55 100644 --- a/drivers/pinctrl/spear/pinctrl-plgpio.c +++ b/drivers/pinctrl/spear/pinctrl-plgpio.c @@ -401,7 +401,7 @@ static void plgpio_irq_handler(struct irq_desc *desc) /* get correct irq line number */ pin = i * MAX_GPIO_PER_REG + pin; generic_handle_irq( - irq_find_mapping(gc->irqdomain, pin)); + irq_find_mapping(gc->irq.domain, pin)); } } chained_irq_exit(irqchip, desc); diff --git a/drivers/platform/x86/intel_int0002_vgpio.c b/drivers/platform/x86/intel_int0002_vgpio.c index 92dc230ef5b2..f6b3af73dea5 100644 --- a/drivers/platform/x86/intel_int0002_vgpio.c +++ b/drivers/platform/x86/intel_int0002_vgpio.c @@ -119,7 +119,7 @@ static irqreturn_t 
int0002_irq(int irq, void *data) if (!(gpe_sts_reg & GPE0A_PME_B0_STS_BIT)) return IRQ_NONE; - generic_handle_irq(irq_find_mapping(chip->irqdomain, + generic_handle_irq(irq_find_mapping(chip->irq.domain, GPE0A_PME_B0_VIRT_GPIO_PIN)); pm_system_wakeup(); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index a79b3b18fadd..c5dfa8c0b829 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -31,6 +31,14 @@ struct gpio_irq_chip { */ struct irq_chip *chip; + /** + * @domain: + * + * Interrupt translation domain; responsible for mapping between GPIO + * hwirq number and Linux IRQ number. + */ + struct irq_domain *domain; + /** * @domain_ops: * @@ -124,8 +132,6 @@ static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) * safely. * @bgpio_dir: shadowed direction register for generic GPIO to clear/set * direction safely. - * @irqdomain: Interrupt translation domain; responsible for mapping - * between GPIO hwirq number and linux irq number * @irq_handler: the irq handler to use (often a predefined irq core function) * for GPIO IRQs, provided by GPIO driver * @irq_default_type: default IRQ triggering type applied during GPIO driver @@ -208,7 +214,6 @@ struct gpio_chip { * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib * to handle IRQs for most practical cases. */ - struct irq_domain *irqdomain; irq_flow_handler_t irq_handler; unsigned int irq_default_type; unsigned int irq_chained_parent; -- cgit v1.2.3 From c7a0aa59524c5bb20bebaca360f7c5faaec6b806 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:48 +0100 Subject: gpio: Move irq_handler to struct gpio_irq_chip In order to consolidate the multiple ways to associate an IRQ chip with a GPIO chip, move more fields into the new struct gpio_irq_chip. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 4 ++-- include/linux/gpio/driver.h | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 9ee75a45ba37..dafbca12c4ca 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1646,7 +1646,7 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, * category than their parents, so it won't report false recursion. */ irq_set_lockdep_class(irq, chip->lock_key); - irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq_handler); + irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler); /* Chips that use nested thread handlers have them marked */ if (chip->irq_nested) irq_set_nested_thread(irq, 1); @@ -1820,7 +1820,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, } gpiochip->irq.chip = irqchip; - gpiochip->irq_handler = handler; + gpiochip->irq.handler = handler; gpiochip->irq_default_type = type; gpiochip->to_irq = gpiochip_to_irq; gpiochip->lock_key = lock_key; diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index c5dfa8c0b829..864f507e859b 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -46,6 +46,14 @@ struct gpio_irq_chip { */ const struct irq_domain_ops *domain_ops; + /** + * @handler: + * + * The IRQ handler to use (often a predefined IRQ core function) for + * GPIO IRQs, provided by GPIO driver. + */ + irq_flow_handler_t handler; + /** * @parent_handler: * @@ -132,8 +140,6 @@ static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) * safely. 
* @bgpio_dir: shadowed direction register for generic GPIO to clear/set * direction safely. - * @irq_handler: the irq handler to use (often a predefined irq core function) - * for GPIO IRQs, provided by GPIO driver * @irq_default_type: default IRQ triggering type applied during GPIO driver * initialization, provided by GPIO driver * @irq_chained_parent: GPIO IRQ chip parent/bank linux irq number, @@ -214,7 +220,6 @@ struct gpio_chip { * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib * to handle IRQs for most practical cases. */ - irq_flow_handler_t irq_handler; unsigned int irq_default_type; unsigned int irq_chained_parent; bool irq_nested; -- cgit v1.2.3 From 3634eeb0fe9176e453c99834749dce21ea1305c1 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:49 +0100 Subject: gpio: Move irq_default_type to struct gpio_irq_chip In order to consolidate the multiple ways to associate an IRQ chip with a GPIO chip, move more fields into the new struct gpio_irq_chip. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 6 +++--- include/linux/gpio/driver.h | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index dafbca12c4ca..d3608cf511de 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1656,8 +1656,8 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, * No set-up of the hardware will happen if IRQ_TYPE_NONE * is passed as default type. */ - if (chip->irq_default_type != IRQ_TYPE_NONE) - irq_set_irq_type(irq, chip->irq_default_type); + if (chip->irq.default_type != IRQ_TYPE_NONE) + irq_set_irq_type(irq, chip->irq.default_type); return 0; } @@ -1821,7 +1821,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, gpiochip->irq.chip = irqchip; gpiochip->irq.handler = handler; - gpiochip->irq_default_type = type; + gpiochip->irq.default_type = type; gpiochip->to_irq = gpiochip_to_irq; gpiochip->lock_key = lock_key; gpiochip->irq.domain = irq_domain_add_simple(of_node, diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 864f507e859b..1367fa94105f 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -54,6 +54,14 @@ struct gpio_irq_chip { */ irq_flow_handler_t handler; + /** + * @default_type: + * + * Default IRQ triggering type applied during GPIO driver + * initialization, provided by GPIO driver. + */ + unsigned int default_type; + /** * @parent_handler: * @@ -140,8 +148,6 @@ static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) * safely. * @bgpio_dir: shadowed direction register for generic GPIO to clear/set * direction safely. - * @irq_default_type: default IRQ triggering type applied during GPIO driver - * initialization, provided by GPIO driver * @irq_chained_parent: GPIO IRQ chip parent/bank linux irq number, * provided by GPIO driver for chained interrupt (not for nested * interrupts). @@ -220,7 +226,6 @@ struct gpio_chip { * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib * to handle IRQs for most practical cases. 
*/ - unsigned int irq_default_type; unsigned int irq_chained_parent; bool irq_nested; bool irq_need_valid_mask; -- cgit v1.2.3 From 39e5f0969514fbfd6c235ac52586c72ca77cee00 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:50 +0100 Subject: gpio: Move irq_chained_parent to struct gpio_irq_chip In order to consolidate the multiple ways to associate an IRQ chip with a GPIO chip, move more fields into the new struct gpio_irq_chip. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 17 ++++++++++++----- include/linux/gpio/driver.h | 19 +++++++++++++++---- 2 files changed, 27 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index d3608cf511de..8eadae73ff20 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1570,7 +1570,8 @@ static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gpiochip, irq_set_chained_handler_and_data(parent_irq, parent_handler, gpiochip); - gpiochip->irq_chained_parent = parent_irq; + gpiochip->irq.parents = &parent_irq; + gpiochip->irq.num_parents = 1; } /* Set the parent IRQ for all affected IRQs */ @@ -1719,17 +1720,23 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset) */ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) { - unsigned int offset, irq; + unsigned int offset; acpi_gpiochip_free_interrupts(gpiochip); - if (gpiochip->irq_chained_parent) { - irq_set_chained_handler_and_data( - gpiochip->irq_chained_parent, NULL, NULL); + if (gpiochip->irq.num_parents > 0) { + struct gpio_irq_chip *irq = &gpiochip->irq; + unsigned int i; + + for (i = 0; i < irq->num_parents; i++) + irq_set_chained_handler_and_data(irq->parents[i], + NULL, NULL); } /* Remove all IRQ mappings and delete the domain */ if (gpiochip->irq.domain) { + unsigned int irq; + for (offset = 0; offset < gpiochip->ngpio; offset++) { if (!gpiochip_irqchip_irq_valid(gpiochip, offset)) continue; diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 1367fa94105f..86f00d908e90 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -77,6 +77,21 @@ struct gpio_irq_chip { * interrupt. */ void *parent_handler_data; + + /** + * @num_parents: + * + * The number of interrupt parents of a GPIO chip. + */ + unsigned int num_parents; + + /** + * @parents: + * + * A list of interrupt parents of a GPIO chip. This is owned by the + * driver, so the core will only reference this list, not modify it. + */ + unsigned int *parents; }; static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) @@ -148,9 +163,6 @@ static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) * safely. * @bgpio_dir: shadowed direction register for generic GPIO to clear/set * direction safely. - * @irq_chained_parent: GPIO IRQ chip parent/bank linux irq number, - * provided by GPIO driver for chained interrupt (not for nested - * interrupts). * @irq_nested: True if set the interrupt handling is nested. * @irq_need_valid_mask: If set core allocates @irq_valid_mask with all * bits set to one @@ -226,7 +238,6 @@ struct gpio_chip { * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib * to handle IRQs for most practical cases. 
*/ - unsigned int irq_chained_parent; bool irq_nested; bool irq_need_valid_mask; unsigned long *irq_valid_mask; -- cgit v1.2.3 From dc6bafee86897419b0908e8d1e52ef46ca0ea487 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:51 +0100 Subject: gpio: Move irq_nested into struct gpio_irq_chip In order to consolidate the multiple ways to associate an IRQ chip with a GPIO chip, move more fields into the new struct gpio_irq_chip. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 8 ++++---- include/linux/gpio/driver.h | 9 +++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 8eadae73ff20..236a9f55a265 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1614,7 +1614,7 @@ void gpiochip_set_nested_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, unsigned int parent_irq) { - if (!gpiochip->irq_nested) { + if (!gpiochip->irq.nested) { chip_err(gpiochip, "tried to nest a chained gpiochip\n"); return; } @@ -1649,7 +1649,7 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, irq_set_lockdep_class(irq, chip->lock_key); irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler); /* Chips that use nested thread handlers have them marked */ - if (chip->irq_nested) + if (chip->irq.nested) irq_set_nested_thread(irq, 1); irq_set_noprobe(irq); @@ -1667,7 +1667,7 @@ static void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq) { struct gpio_chip *chip = d->host_data; - if (chip->irq_nested) + if (chip->irq.nested) irq_set_nested_thread(irq, 0); irq_set_chip_and_handler(irq, NULL, NULL); irq_set_chip_data(irq, NULL); @@ -1801,7 +1801,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, pr_err("missing gpiochip .dev parent pointer\n"); return -EINVAL; } - gpiochip->irq_nested = nested; + gpiochip->irq.nested = nested; of_node = gpiochip->parent->of_node; #ifdef CONFIG_OF_GPIO /* diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 86f00d908e90..1c3d06fe54b1 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -92,6 +92,13 @@ struct gpio_irq_chip { * driver, so the core will only reference this list, not modify it. */ unsigned int *parents; + + /** + * @nested: + * + * True if set the interrupt handling is nested. + */ + bool nested; }; static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) @@ -163,7 +170,6 @@ static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) * safely. * @bgpio_dir: shadowed direction register for generic GPIO to clear/set * direction safely. - * @irq_nested: True if set the interrupt handling is nested. * @irq_need_valid_mask: If set core allocates @irq_valid_mask with all * bits set to one * @irq_valid_mask: If not %NULL holds bitmask of GPIOs which are valid to @@ -238,7 +244,6 @@ struct gpio_chip { * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib * to handle IRQs for most practical cases. 
*/ - bool irq_nested; bool irq_need_valid_mask; unsigned long *irq_valid_mask; struct lock_class_key *lock_key; -- cgit v1.2.3 From dc7b0387ee894c115ef5ddcaaf794125d6d9058c Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:52 +0100 Subject: gpio: Move irq_valid_mask into struct gpio_irq_chip In order to consolidate the multiple ways to associate an IRQ chip with a GPIO chip, move more fields into the new struct gpio_irq_chip. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- Documentation/gpio/driver.txt | 4 ++-- drivers/gpio/gpio-aspeed.c | 4 ++-- drivers/gpio/gpio-stmpe.c | 4 ++-- drivers/gpio/gpiolib.c | 16 ++++++++-------- drivers/pinctrl/intel/pinctrl-baytrail.c | 4 ++-- drivers/pinctrl/intel/pinctrl-cherryview.c | 4 ++-- drivers/platform/x86/intel_int0002_vgpio.c | 4 ++-- include/linux/gpio/driver.h | 21 +++++++++++++++------ 8 files changed, 35 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/Documentation/gpio/driver.txt b/Documentation/gpio/driver.txt index dcf6af1d9e56..d8de1c7de85a 100644 --- a/Documentation/gpio/driver.txt +++ b/Documentation/gpio/driver.txt @@ -313,8 +313,8 @@ symbol: mark all the child IRQs as having the other IRQ as parent. If there is a need to exclude certain GPIOs from the IRQ domain, you can -set .irq_need_valid_mask of the gpiochip before gpiochip_add_data() is -called. This allocates an .irq_valid_mask with as many bits set as there +set .irq.need_valid_mask of the gpiochip before gpiochip_add_data() is +called. This allocates an .irq.valid_mask with as many bits set as there are GPIOs in the chip. Drivers can exclude GPIOs by clearing bits from this mask. The mask must be filled in before gpiochip_irqchip_add() or gpiochip_irqchip_add_nested() is called. 
diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c index 2bfce0ab7326..8781817d9003 100644 --- a/drivers/gpio/gpio-aspeed.c +++ b/drivers/gpio/gpio-aspeed.c @@ -501,7 +501,7 @@ static void set_irq_valid_mask(struct aspeed_gpio *gpio) if (i >= gpio->config->nr_gpios) break; - clear_bit(i, gpio->chip.irq_valid_mask); + clear_bit(i, gpio->chip.irq.valid_mask); } props++; @@ -856,7 +856,7 @@ static int __init aspeed_gpio_probe(struct platform_device *pdev) gpio->chip.set_config = aspeed_gpio_set_config; gpio->chip.label = dev_name(&pdev->dev); gpio->chip.base = -1; - gpio->chip.irq_need_valid_mask = true; + gpio->chip.irq.need_valid_mask = true; rc = devm_gpiochip_add_data(&pdev->dev, &gpio->chip, gpio); if (rc < 0) diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 18d8bef76d85..e6e5cca624a7 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -451,7 +451,7 @@ static int stmpe_gpio_probe(struct platform_device *pdev) of_property_read_u32(np, "st,norequest-mask", &stmpe_gpio->norequest_mask); if (stmpe_gpio->norequest_mask) - stmpe_gpio->chip.irq_need_valid_mask = true; + stmpe_gpio->chip.irq.need_valid_mask = true; if (irq < 0) dev_info(&pdev->dev, @@ -482,7 +482,7 @@ static int stmpe_gpio_probe(struct platform_device *pdev) /* Forbid unused lines to be mapped as IRQs */ for (i = 0; i < sizeof(u32); i++) if (stmpe_gpio->norequest_mask & BIT(i)) - clear_bit(i, stmpe_gpio->chip.irq_valid_mask); + clear_bit(i, stmpe_gpio->chip.irq.valid_mask); } ret = gpiochip_irqchip_add_nested(&stmpe_gpio->chip, &stmpe_gpio_irq_chip, diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 236a9f55a265..0bf844470693 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1504,33 +1504,33 @@ static struct gpio_chip *find_chip_by_name(const char *name) static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip) { - if (!gpiochip->irq_need_valid_mask) + if (!gpiochip->irq.need_valid_mask) return 0; - gpiochip->irq_valid_mask = kcalloc(BITS_TO_LONGS(gpiochip->ngpio), + gpiochip->irq.valid_mask = kcalloc(BITS_TO_LONGS(gpiochip->ngpio), sizeof(long), GFP_KERNEL); - if (!gpiochip->irq_valid_mask) + if (!gpiochip->irq.valid_mask) return -ENOMEM; /* Assume by default all GPIOs are valid */ - bitmap_fill(gpiochip->irq_valid_mask, gpiochip->ngpio); + bitmap_fill(gpiochip->irq.valid_mask, gpiochip->ngpio); return 0; } static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip) { - kfree(gpiochip->irq_valid_mask); - gpiochip->irq_valid_mask = NULL; + kfree(gpiochip->irq.valid_mask); + gpiochip->irq.valid_mask = NULL; } static bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gpiochip, unsigned int offset) { /* No mask means all valid */ - if (likely(!gpiochip->irq_valid_mask)) + if (likely(!gpiochip->irq.valid_mask)) return true; - return test_bit(offset, gpiochip->irq_valid_mask); + return test_bit(offset, gpiochip->irq.valid_mask); } /** diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c index 5897981e5ed3..9c1ca29c60b7 100644 --- a/drivers/pinctrl/intel/pinctrl-baytrail.c +++ b/drivers/pinctrl/intel/pinctrl-baytrail.c @@ -1660,7 +1660,7 @@ static void byt_gpio_irq_init_hw(struct byt_gpio *vg) value = readl(reg); if (value & BYT_DIRECT_IRQ_EN) { - clear_bit(i, gc->irq_valid_mask); + clear_bit(i, gc->irq.valid_mask); dev_dbg(dev, "excluding GPIO %d from IRQ domain\n", i); } else if ((value & BYT_PIN_MUX) == byt_get_gpio_mux(vg, i)) { byt_gpio_clear_triggering(vg, 
i); @@ -1703,7 +1703,7 @@ static int byt_gpio_probe(struct byt_gpio *vg) gc->can_sleep = false; gc->parent = &vg->pdev->dev; gc->ngpio = vg->soc_data->npins; - gc->irq_need_valid_mask = true; + gc->irq.need_valid_mask = true; #ifdef CONFIG_PM_SLEEP vg->saved_context = devm_kcalloc(&vg->pdev->dev, gc->ngpio, diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c index 1cd7043edbc1..e23def322de2 100644 --- a/drivers/pinctrl/intel/pinctrl-cherryview.c +++ b/drivers/pinctrl/intel/pinctrl-cherryview.c @@ -1584,7 +1584,7 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq) chip->label = dev_name(pctrl->dev); chip->parent = pctrl->dev; chip->base = -1; - chip->irq_need_valid_mask = need_valid_mask; + chip->irq.need_valid_mask = need_valid_mask; ret = devm_gpiochip_add_data(pctrl->dev, chip, pctrl); if (ret) { @@ -1616,7 +1616,7 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq) intsel >>= CHV_PADCTRL0_INTSEL_SHIFT; if (need_valid_mask && intsel >= pctrl->community->nirqs) - clear_bit(i, chip->irq_valid_mask); + clear_bit(i, chip->irq.valid_mask); } /* Clear all interrupts */ diff --git a/drivers/platform/x86/intel_int0002_vgpio.c b/drivers/platform/x86/intel_int0002_vgpio.c index f6b3af73dea5..f7b67e898abc 100644 --- a/drivers/platform/x86/intel_int0002_vgpio.c +++ b/drivers/platform/x86/intel_int0002_vgpio.c @@ -165,7 +165,7 @@ static int int0002_probe(struct platform_device *pdev) chip->direction_output = int0002_gpio_direction_output; chip->base = -1; chip->ngpio = GPE0A_PME_B0_VIRT_GPIO_PIN + 1; - chip->irq_need_valid_mask = true; + chip->irq.need_valid_mask = true; ret = devm_gpiochip_add_data(&pdev->dev, chip, NULL); if (ret) { @@ -173,7 +173,7 @@ static int int0002_probe(struct platform_device *pdev) return ret; } - bitmap_clear(chip->irq_valid_mask, 0, GPE0A_PME_B0_VIRT_GPIO_PIN); + bitmap_clear(chip->irq.valid_mask, 0, GPE0A_PME_B0_VIRT_GPIO_PIN); /* * We manually request the irq here instead of passing a flow-handler diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 1c3d06fe54b1..067efcd4f46d 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -99,6 +99,21 @@ struct gpio_irq_chip { * True if set the interrupt handling is nested. */ bool nested; + + /** + * @need_valid_mask: + * + * If set core allocates @valid_mask with all bits set to one. + */ + bool need_valid_mask; + + /** + * @valid_mask: + * + * If not %NULL holds bitmask of GPIOs which are valid to be included + * in IRQ domain of the chip. + */ + unsigned long *valid_mask; }; static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) @@ -170,10 +185,6 @@ static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) * safely. * @bgpio_dir: shadowed direction register for generic GPIO to clear/set * direction safely. - * @irq_need_valid_mask: If set core allocates @irq_valid_mask with all - * bits set to one - * @irq_valid_mask: If not %NULL holds bitmask of GPIOs which are valid to - * be included in IRQ domain of the chip * @lock_key: per GPIO IRQ chip lockdep class * * A gpio_chip can help platforms abstract various sources of GPIOs so @@ -244,8 +255,6 @@ struct gpio_chip { * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib * to handle IRQs for most practical cases. 
*/ - bool irq_need_valid_mask; - unsigned long *irq_valid_mask; struct lock_class_key *lock_key; /** -- cgit v1.2.3 From ca9df053fb2bb2fcc64f37a1668321c7e19edd04 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:53 +0100 Subject: gpio: Move lock_key into struct gpio_irq_chip In order to consolidate the multiple ways to associate an IRQ chip with a GPIO chip, move more fields into the new struct gpio_irq_chip. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 4 ++-- include/linux/gpio/driver.h | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 0bf844470693..685a05caf1ba 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1646,7 +1646,7 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, * This lock class tells lockdep that GPIO irqs are in a different * category than their parents, so it won't report false recursion. */ - irq_set_lockdep_class(irq, chip->lock_key); + irq_set_lockdep_class(irq, chip->irq.lock_key); irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler); /* Chips that use nested thread handlers have them marked */ if (chip->irq.nested) @@ -1830,7 +1830,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, gpiochip->irq.handler = handler; gpiochip->irq.default_type = type; gpiochip->to_irq = gpiochip_to_irq; - gpiochip->lock_key = lock_key; + gpiochip->irq.lock_key = lock_key; gpiochip->irq.domain = irq_domain_add_simple(of_node, gpiochip->ngpio, first_irq, &gpiochip_domain_ops, gpiochip); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 067efcd4f46d..c363ee198ff9 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -62,6 +62,13 @@ struct gpio_irq_chip { */ unsigned int default_type; + /** + * @lock_key: + * + * Per GPIO IRQ chip lockdep class. + */ + struct lock_class_key *lock_key; + /** * @parent_handler: * @@ -185,7 +192,6 @@ static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) * safely. * @bgpio_dir: shadowed direction register for generic GPIO to clear/set * direction safely. - * @lock_key: per GPIO IRQ chip lockdep class * * A gpio_chip can help platforms abstract various sources of GPIOs so * they can all be accessed through a common programing interface. @@ -255,7 +261,6 @@ struct gpio_chip { * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib * to handle IRQs for most practical cases. */ - struct lock_class_key *lock_key; /** * @irq: -- cgit v1.2.3 From e0d89728981393b7d694bd3419b7794b9882c92d Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:54 +0100 Subject: gpio: Implement tighter IRQ chip integration Currently GPIO drivers are required to add the GPIO chip and its corresponding IRQ chip separately, which can result in a lot of boilerplate. Use the newly introduced struct gpio_irq_chip, embedded in struct gpio_chip, that drivers can fill in if they want the GPIO core to automatically register the IRQ chip associated with a GPIO chip. 
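As a rough illustration of the new declarative path (a sketch only; the foo_* names, the parent_irq field, and the probe shape are hypothetical, not taken from an in-tree driver -- only the gpio_irq_chip fields come from this series), a driver might fill in the embedded structure before registration:

    static struct irq_chip foo_irq_chip = {
    	.name		= "foo-gpio",
    	.irq_mask	= foo_irq_mask,
    	.irq_unmask	= foo_irq_unmask,
    	.irq_set_type	= foo_irq_set_type,
    };

    static int foo_probe(struct platform_device *pdev)
    {
    	struct foo *foo = devm_kzalloc(&pdev->dev, sizeof(*foo), GFP_KERNEL);
    	struct gpio_chip *chip = &foo->chip;

    	/* ... fill in the usual gpio_chip fields ... */

    	/* Describe the IRQ chip; the core registers it from gpiochip_add_data() */
    	chip->irq.chip = &foo_irq_chip;
    	chip->irq.handler = handle_edge_irq;
    	chip->irq.default_type = IRQ_TYPE_NONE;
    	chip->irq.parent_handler = foo_irq_handler;	/* chained, not nested */
    	chip->irq.num_parents = 1;
    	chip->irq.parents = &foo->parent_irq;

    	return devm_gpiochip_add_data(&pdev->dev, chip, foo);
    }

Setting a parent_handler makes the core install it as a chained handler for each entry in parents[], mirroring what gpiochip_set_chained_irqchip() used to do by hand.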
Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 108 +++++++++++++++++++++++++++++++++++++++++++- include/linux/gpio/driver.h | 7 +++ 2 files changed, 114 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 685a05caf1ba..003d1bb85165 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -72,6 +72,7 @@ static LIST_HEAD(gpio_lookup_list); LIST_HEAD(gpio_devices); static void gpiochip_free_hogs(struct gpio_chip *chip); +static int gpiochip_add_irqchip(struct gpio_chip *gpiochip); static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip); static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip); static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip); @@ -1266,6 +1267,10 @@ int gpiochip_add_data(struct gpio_chip *chip, void *data) if (status) goto err_remove_from_list; + status = gpiochip_add_irqchip(chip); + if (status) + goto err_remove_chip; + status = of_gpiochip_add(chip); if (status) goto err_remove_chip; @@ -1637,6 +1642,7 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) { struct gpio_chip *chip = d->host_data; + int err = 0; if (!gpiochip_irqchip_irq_valid(chip, hwirq)) return -ENXIO; @@ -1653,6 +1659,14 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, irq_set_nested_thread(irq, 1); irq_set_noprobe(irq); + if (chip->irq.num_parents == 1) + err = irq_set_parent(irq, chip->irq.parents[0]); + else if (chip->irq.map) + err = irq_set_parent(irq, chip->irq.map[hwirq]); + + if (err < 0) + return err; + /* * No set-up of the hardware will happen if IRQ_TYPE_NONE * is passed as default type. @@ -1709,9 +1723,96 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset) { if (!gpiochip_irqchip_irq_valid(chip, offset)) return -ENXIO; + return irq_create_mapping(chip->irq.domain, offset); } +/** + * gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip + * @gpiochip: the GPIO chip to add the IRQ chip to + */ +static int gpiochip_add_irqchip(struct gpio_chip *gpiochip) +{ + struct irq_chip *irqchip = gpiochip->irq.chip; + const struct irq_domain_ops *ops; + struct device_node *np; + unsigned int type; + unsigned int i; + + if (!irqchip) + return 0; + + if (gpiochip->irq.parent_handler && gpiochip->can_sleep) { + chip_err(gpiochip, "you cannot have chained interrupts on a " + "chip that may sleep\n"); + return -EINVAL; + } + + np = gpiochip->gpiodev->dev.of_node; + type = gpiochip->irq.default_type; + + /* + * Specifying a default trigger is a terrible idea if DT or ACPI is + * used to configure the interrupts, as you may end up with + * conflicting triggers. Tell the user, and reset to NONE. 
+ */ + if (WARN(np && type != IRQ_TYPE_NONE, + "%s: Ignoring %u default trigger\n", np->full_name, type)) + type = IRQ_TYPE_NONE; + + if (has_acpi_companion(gpiochip->parent) && type != IRQ_TYPE_NONE) { + acpi_handle_warn(ACPI_HANDLE(gpiochip->parent), + "Ignoring %u default trigger\n", type); + type = IRQ_TYPE_NONE; + } + + gpiochip->to_irq = gpiochip_to_irq; + gpiochip->irq.default_type = type; + + if (gpiochip->irq.domain_ops) + ops = gpiochip->irq.domain_ops; + else + ops = &gpiochip_domain_ops; + + gpiochip->irq.domain = irq_domain_add_simple(np, gpiochip->ngpio, + 0, ops, gpiochip); + if (!gpiochip->irq.domain) + return -EINVAL; + + /* + * It is possible for a driver to override this, but only if the + * alternative functions are both implemented. + */ + if (!irqchip->irq_request_resources && + !irqchip->irq_release_resources) { + irqchip->irq_request_resources = gpiochip_irq_reqres; + irqchip->irq_release_resources = gpiochip_irq_relres; + } + + if (gpiochip->irq.parent_handler) { + void *data = gpiochip->irq.parent_handler_data ?: gpiochip; + + for (i = 0; i < gpiochip->irq.num_parents; i++) { + /* + * The parent IRQ chip is already using the chip_data + * for this IRQ chip, so our callbacks simply use the + * handler_data. + */ + irq_set_chained_handler_and_data(gpiochip->irq.parents[i], + gpiochip->irq.parent_handler, + data); + } + + gpiochip->irq.nested = false; + } else { + gpiochip->irq.nested = true; + } + + acpi_gpiochip_request_interrupts(gpiochip); + + return 0; +} + /** * gpiochip_irqchip_remove() - removes an irqchip added to a gpiochip * @gpiochip: the gpiochip to remove the irqchip from @@ -1724,7 +1825,7 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) acpi_gpiochip_free_interrupts(gpiochip); - if (gpiochip->irq.num_parents > 0) { + if (gpiochip->irq.chip && gpiochip->irq.parent_handler) { struct gpio_irq_chip *irq = &gpiochip->irq; unsigned int i; @@ -1857,6 +1958,11 @@ EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key); #else /* CONFIG_GPIOLIB_IRQCHIP */ +static inline int gpiochip_add_irqchip(struct gpio_chip *gpiochip) +{ + return 0; +} + static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) {} static inline int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip) { diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index c363ee198ff9..51fc7b023364 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -100,6 +100,13 @@ struct gpio_irq_chip { */ unsigned int *parents; + /** + * @map: + * + * A list of interrupt parents for each line of a GPIO chip. + */ + unsigned int *map; + /** * @nested: * -- cgit v1.2.3 From 1b95b4eb567aab1cafcdcb14c60a7dd9d56236a9 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:55 +0100 Subject: gpio: Export gpiochip_irq_{map,unmap}() Export these functions so that drivers can explicitly use these when setting up their IRQ domain. Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 8 +++++--- include/linux/gpio/driver.h | 4 ++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 003d1bb85165..7347ea1c699a 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1638,8 +1638,8 @@ EXPORT_SYMBOL_GPL(gpiochip_set_nested_irqchip); * gpiochip by assigning the gpiochip as chip data, and using the irqchip * stored inside the gpiochip. 
*/ -static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hwirq) +int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) { struct gpio_chip *chip = d->host_data; int err = 0; @@ -1676,8 +1676,9 @@ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, return 0; } +EXPORT_SYMBOL_GPL(gpiochip_irq_map); -static void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq) +void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq) { struct gpio_chip *chip = d->host_data; @@ -1686,6 +1687,7 @@ static void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq) irq_set_chip_and_handler(irq, NULL, NULL); irq_set_chip_data(irq, NULL); } +EXPORT_SYMBOL_GPL(gpiochip_irq_unmap); static const struct irq_domain_ops gpiochip_domain_ops = { .map = gpiochip_irq_map, diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 51fc7b023364..bbe5c647f29d 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -367,6 +367,10 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev, #ifdef CONFIG_GPIOLIB_IRQCHIP +int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq); +void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq); + void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, unsigned int parent_irq, -- cgit v1.2.3 From 60ed54cae8dc0f2d41cafbd477bbed6deb716615 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:57 +0100 Subject: gpio: Disambiguate struct gpio_irq_chip.nested The nested field in struct gpio_irq_chip currently has two meanings. On one hand it marks an IRQ chip as being nested (as opposed to chained), while on the other hand it also means that an IRQ chip uses nested thread handlers. However, nested IRQ chips can already be identified by the fact that they don't pass a parent handler (the driver would instead already have installed a nested handler using request_irq()). Therefore, the only use for the nested attribute is to inform gpiolib that an IRQ chip uses nested thread handlers (as opposed to regular, non-threaded handlers). To clarify its purpose, rename the field to "threaded". 
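To make the distinction concrete, a rough sketch of the two configurations after the rename (hypothetical driver fragments, not part of the patch):

	/* Chained irqchip, e.g. a memory-mapped controller: a parent
	 * handler demuxes in hard interrupt context, so ->threaded
	 * stays false. */
	chip->irq.parent_handler = foo_demux_handler;
	chip->irq.num_parents = 1;
	chip->irq.parents = &parent_irq;

	/* Nested irqchip, e.g. an I2C GPIO expander whose handlers must
	 * sleep: no parent_handler; the driver requests its own threaded
	 * parent IRQ, calls handle_nested_irq() from the thread, and
	 * marks the chip as using nested thread handlers. */
	chip->irq.threaded = true;
	ret = devm_request_threaded_irq(dev, parent_irq, NULL,
					foo_expander_thread, IRQF_ONESHOT,
					"foo-expander", foo);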
Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 24 ++++++++++-------------- include/linux/gpio/driver.h | 8 ++++---- 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 7347ea1c699a..389257f97e45 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1603,6 +1603,11 @@ void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip, unsigned int parent_irq, irq_flow_handler_t parent_handler) { + if (gpiochip->irq.threaded) { + chip_err(gpiochip, "tried to chain a threaded gpiochip\n"); + return; + } + gpiochip_set_cascaded_irqchip(gpiochip, irqchip, parent_irq, parent_handler); } @@ -1619,10 +1624,6 @@ void gpiochip_set_nested_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, unsigned int parent_irq) { - if (!gpiochip->irq.nested) { - chip_err(gpiochip, "tried to nest a chained gpiochip\n"); - return; - } gpiochip_set_cascaded_irqchip(gpiochip, irqchip, parent_irq, NULL); } @@ -1655,7 +1656,7 @@ int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, irq_set_lockdep_class(irq, chip->irq.lock_key); irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler); /* Chips that use nested thread handlers have them marked */ - if (chip->irq.nested) + if (chip->irq.threaded) irq_set_nested_thread(irq, 1); irq_set_noprobe(irq); @@ -1682,7 +1683,7 @@ void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq) { struct gpio_chip *chip = d->host_data; - if (chip->irq.nested) + if (chip->irq.threaded) irq_set_nested_thread(irq, 0); irq_set_chip_and_handler(irq, NULL, NULL); irq_set_chip_data(irq, NULL); @@ -1804,10 +1805,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip) gpiochip->irq.parent_handler, data); } - - gpiochip->irq.nested = false; - } else { - gpiochip->irq.nested = true; } acpi_gpiochip_request_interrupts(gpiochip); @@ -1869,8 +1866,7 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) * @handler: the irq handler to use (often a predefined irq core function) * @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE * to have the core avoid setting up any default type in the hardware. - * @nested: whether this is a nested irqchip calling handle_nested_irq() - * in its IRQ handler + * @threaded: whether this irqchip uses a nested thread handler * @lock_key: lockdep class * * This function closely associates a certain irqchip with a certain @@ -1892,7 +1888,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, unsigned int first_irq, irq_flow_handler_t handler, unsigned int type, - bool nested, + bool threaded, struct lock_class_key *lock_key) { struct device_node *of_node; @@ -1904,7 +1900,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, pr_err("missing gpiochip .dev parent pointer\n"); return -EINVAL; } - gpiochip->irq.nested = nested; + gpiochip->irq.threaded = threaded; of_node = gpiochip->parent->of_node; #ifdef CONFIG_OF_GPIO /* diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index bbe5c647f29d..0a3fdd4d9d8d 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -108,11 +108,11 @@ struct gpio_irq_chip { unsigned int *map; /** - * @nested: + * @threaded: * - * True if set the interrupt handling is nested. + * True if set the interrupt handling uses nested threads. 
*/ - bool nested; + bool threaded; /** * @need_valid_mask: @@ -385,7 +385,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, unsigned int first_irq, irq_flow_handler_t handler, unsigned int type, - bool nested, + bool threaded, struct lock_class_key *lock_key); #ifdef CONFIG_LOCKDEP -- cgit v1.2.3 From 8302cf585288f75fd253f6b9a094d51ae371a3f3 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:58 +0100 Subject: gpio: Introduce struct gpio_irq_chip.first Some GPIO chips cannot support sparse IRQ numbering and therefore need to manually allocate their interrupt descriptors statically. For these cases, a driver can pass the first allocated IRQ via the struct gpio_irq_chip's "first" field and thereby cause the IRQ domain to map all IRQs during initialization. Suggested-by: Grygorii Strashko Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 3 ++- include/linux/gpio/driver.h | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 389257f97e45..6d5c366a1378 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1778,7 +1778,8 @@ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip) ops = &gpiochip_domain_ops; gpiochip->irq.domain = irq_domain_add_simple(np, gpiochip->ngpio, - 0, ops, gpiochip); + gpiochip->irq.first, + ops, gpiochip); if (!gpiochip->irq.domain) return -EINVAL; diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 0a3fdd4d9d8d..a77d4ada060c 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -128,6 +128,14 @@ struct gpio_irq_chip { * in IRQ domain of the chip. */ unsigned long *valid_mask; + + /** + * @first: + * + * Required for static IRQ allocation. If set, irq_domain_add_simple() + * will allocate and map all IRQs during initialization. + */ + unsigned int first; }; static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip) -- cgit v1.2.3 From 959bc7b22bd25a3a907fbb9b26a1d0cbdf98ef40 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Nov 2017 19:15:59 +0100 Subject: gpio: Automatically add lockdep keys In order to avoid lockdep boilerplate in individual drivers, turn the gpiochip_add_data() function into a macro that creates a unique class key for each driver. Note that this has the slight disadvantage of adding a key for each driver registered with the system. However, these keys are 8 bytes in size, which is negligible and a small price to pay for generic infrastructure. 
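The mechanism relied upon here is a GNU C statement expression containing a static local: each macro expansion site mints its own object. A stand-alone user-space analogue (illustrative only, not kernel code) shows the effect:

	#include <stdio.h>

	/* one static int per expansion site, just as the kernel macro
	 * creates one lock_class_key per registering driver */
	#define unique_key() ({ static int key; &key; })

	int main(void)
	{
		void *a = unique_key();
		void *b = unique_key();

		printf("%p %p -> %s\n", a, b, a == b ? "same" : "distinct");
		return 0;
	}

Compiled with gcc or clang, this prints two distinct addresses: each use of the macro carries its own static object, which is what lets lockdep tell GPIO drivers apart without per-driver boilerplate.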
Suggested-by: Grygorii Strashko Signed-off-by: Thierry Reding Acked-by: Grygorii Strashko [rename __gpiochip_add_data() to gpiochip_add_data_with_key] Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 41 ++++++++++++----------------------------- include/linux/gpio/driver.h | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 6d5c366a1378..961b5da38bb9 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -72,7 +72,8 @@ static LIST_HEAD(gpio_lookup_list); LIST_HEAD(gpio_devices); static void gpiochip_free_hogs(struct gpio_chip *chip); -static int gpiochip_add_irqchip(struct gpio_chip *gpiochip); +static int gpiochip_add_irqchip(struct gpio_chip *gpiochip, + struct lock_class_key *key); static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip); static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip); static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip); @@ -1098,30 +1099,8 @@ static void gpiochip_setup_devs(void) } } -/** - * gpiochip_add_data() - register a gpio_chip - * @chip: the chip to register, with chip->base initialized - * @data: driver-private data associated with this chip - * - * Context: potentially before irqs will work - * - * When gpiochip_add_data() is called very early during boot, so that GPIOs - * can be freely used, the chip->parent device must be registered before - * the gpio framework's arch_initcall(). Otherwise sysfs initialization - * for GPIOs will fail rudely. - * - * gpiochip_add_data() must only be called after gpiolib initialization, - * ie after core_initcall(). - * - * If chip->base is negative, this requests dynamic assignment of - * a range of valid GPIOs. - * - * Returns: - * A negative errno if the chip can't be registered, such as because the - * chip->base is invalid or already associated with a different chip. - * Otherwise it returns zero as a success code. 
- */
-int gpiochip_add_data(struct gpio_chip *chip, void *data) +int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, + struct lock_class_key *key) { unsigned long flags; int status = 0; @@ -1267,7 +1246,7 @@ int gpiochip_add_data(struct gpio_chip *chip, void *data) if (status) goto err_remove_from_list; - status = gpiochip_add_irqchip(chip); + status = gpiochip_add_irqchip(chip, key); if (status) goto err_remove_chip; @@ -1314,7 +1293,7 @@ err_free_gdev: kfree(gdev); return status; } -EXPORT_SYMBOL_GPL(gpiochip_add_data); +EXPORT_SYMBOL_GPL(gpiochip_add_data_with_key); /** * gpiochip_get_data() - get per-subdriver data for the chip @@ -1733,8 +1712,10 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset) /** * gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip * @gpiochip: the GPIO chip to add the IRQ chip to + * @lock_key: lockdep class */ -static int gpiochip_add_irqchip(struct gpio_chip *gpiochip) +static int gpiochip_add_irqchip(struct gpio_chip *gpiochip, + struct lock_class_key *lock_key) { struct irq_chip *irqchip = gpiochip->irq.chip; const struct irq_domain_ops *ops; @@ -1771,6 +1752,7 @@ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip) gpiochip->to_irq = gpiochip_to_irq; gpiochip->irq.default_type = type; + gpiochip->irq.lock_key = lock_key; if (gpiochip->irq.domain_ops) ops = gpiochip->irq.domain_ops; @@ -1957,7 +1939,8 @@ EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key); #else /* CONFIG_GPIOLIB_IRQCHIP */ -static inline int gpiochip_add_irqchip(struct gpio_chip *gpiochip) +static inline int gpiochip_add_irqchip(struct gpio_chip *gpiochip, + struct lock_class_key *lock_key) { return 0; } diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index a77d4ada060c..8dd282c5167a 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -321,7 +321,41 @@ extern const char *gpiochip_is_requested(struct gpio_chip *chip, unsigned offset); /* add/remove chips */ -extern int gpiochip_add_data(struct gpio_chip *chip, void *data); +extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, + struct lock_class_key *lock_key); + +/** + * gpiochip_add_data() - register a gpio_chip + * @chip: the chip to register, with chip->base initialized + * @data: driver-private data associated with this chip + * + * Context: potentially before irqs will work + * + * When gpiochip_add_data() is called very early during boot, so that GPIOs + * can be freely used, the chip->parent device must be registered before + * the gpio framework's arch_initcall(). Otherwise sysfs initialization + * for GPIOs will fail rudely. + * + * gpiochip_add_data() must only be called after gpiolib initialization, + * ie after core_initcall(). + * + * If chip->base is negative, this requests dynamic assignment of + * a range of valid GPIOs. + * + * Returns: + * A negative errno if the chip can't be registered, such as because the + * chip->base is invalid or already associated with a different chip. + * Otherwise it returns zero as a success code. 
+ */
+#ifdef CONFIG_LOCKDEP
+#define gpiochip_add_data(chip, data) ({ \
+ static struct lock_class_key key; \
+ gpiochip_add_data_with_key(chip, data, &key); \
+ })
+#else
+#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL)
+#endif
+ static inline int gpiochip_add(struct gpio_chip *chip) { return gpiochip_add_data(chip, NULL); -- cgit v1.2.3 From a7d3d0392a325d630225b7dbccf2558f944114e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 10 Sep 2017 09:49:45 +0200 Subject: integrity: use kernel_read_file_from_path() to read x509 certs The CONFIG_IMA_LOAD_X509 and CONFIG_EVM_LOAD_X509 options permit loading x509 signed certificates onto the trusted keyrings without verifying the x509 certificate file's signature. This patch replaces the call to the integrity_read_file() specific function with the common kernel_read_file_from_path() function. To avoid verifying the file signature, this patch defines READING_X509_CERTIFICATE. Signed-off-by: Christoph Hellwig Signed-off-by: Mimi Zohar --- include/linux/fs.h | 1 + security/integrity/digsig.c | 14 +++++++---- security/integrity/iint.c | 49 --------------------------------------- security/integrity/ima/ima_main.c | 4 ++++ security/integrity/integrity.h | 2 -- 5 files changed, 14 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 339e73742e73..456325084f1d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2792,6 +2792,7 @@ extern int do_pipe_flags(int *, int); id(KEXEC_IMAGE, kexec-image) \ id(KEXEC_INITRAMFS, kexec-initramfs) \ id(POLICY, security-policy) \ + id(X509_CERTIFICATE, x509-certificate) \ id(MAX_ID, ) #define __fid_enumify(ENUM, dummy) READING_ ## ENUM, diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 06554c448dce..6f9e4ce568cd 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -112,21 +112,25 @@ int __init integrity_init_keyring(const unsigned int id) int __init integrity_load_x509(const unsigned int id, const char *path) { key_ref_t key; - char *data; + void *data; + loff_t size; int rc; if (!keyring[id]) return -EINVAL; - rc = integrity_read_file(path, &data); - if (rc < 0) + rc = kernel_read_file_from_path(path, &data, &size, 0, + READING_X509_CERTIFICATE); + if (rc < 0) { + pr_err("Unable to open file: %s (%d)", path, rc); return rc; + } key = key_create_or_update(make_key_ref(keyring[id], 1), "asymmetric", NULL, data, - rc, + size, ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA); @@ -139,6 +143,6 @@ int __init integrity_load_x509(const unsigned int id, const char *path) key_ref_to_ptr(key)->description, path); key_ref_put(key); } - kfree(data); + vfree(data); return 0; } diff --git a/security/integrity/iint.c b/security/integrity/iint.c index 6fc888ca468e..c84e05866052 100644 --- a/security/integrity/iint.c +++ b/security/integrity/iint.c @@ -199,55 +199,6 @@ int integrity_kernel_read(struct file *file, loff_t offset, return ret; } -/* - * integrity_read_file - read entire file content into the buffer - * - * This is function opens a file, allocates the buffer of required - * size, read entire file content to the buffer and closes the file - * - * It is used only by init code. 
- *
- */
-int __init integrity_read_file(const char *path, char **data) -{ - struct file *file; - loff_t size; - char *buf; - int rc = -EINVAL; - - if (!path || !*path) - return -EINVAL; - - file = filp_open(path, O_RDONLY, 0); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - pr_err("Unable to open file: %s (%d)", path, rc); - return rc; - } - - size = i_size_read(file_inode(file)); - if (size <= 0) - goto out; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } - - rc = integrity_kernel_read(file, 0, buf, size); - if (rc == size) { - *data = buf; - } else { - kfree(buf); - if (rc >= 0) - rc = -EIO; - } -out: - fput(file); - return rc; -} - /* * integrity_load_keys - load integrity keys hook * diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 12738c8f39c2..d47f92e97f80 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -405,6 +405,10 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size, if (!file && read_id == READING_MODULE) /* MODULE_SIG_FORCE enabled */ return 0; + /* permit signed certs */ + if (!file && read_id == READING_X509_CERTIFICATE) + return 0; + if (!file || !buf || size == 0) { /* should never happen */ if (ima_appraise & IMA_APPRAISE_ENFORCE) return -EACCES; diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index a53e7e4ab06c..e1bf040fb110 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -120,8 +120,6 @@ struct integrity_iint_cache *integrity_iint_find(struct inode *inode); int integrity_kernel_read(struct file *file, loff_t offset, void *addr, unsigned long count); -int __init integrity_read_file(const char *path, char **data); - #define INTEGRITY_KEYRING_EVM 0 #define INTEGRITY_KEYRING_IMA 1 #define INTEGRITY_KEYRING_MODULE 2 -- cgit v1.2.3 From fda784e50aace694ec2e4e16e2de07b91a938563 Mon Sep 17 00:00:00 2001 From: "Bruno E. O. Meneguele" Date: Tue, 24 Oct 2017 15:37:00 -0200 Subject: module: export module signature enforcement status A static variable, sig_enforce, indicates the effective value of CONFIG_MODULE_SIG_FORCE: once that option is set, the variable holds true; if the option is not set, the variable holds whatever value is present in the module.sig_enforce kernel cmdline param: true when =1 and false when =0 or not present. Since the cmdline param takes precedence over the CONFIG value when the latter is not set, other places in the kernel could misbehave, as they would have only the CONFIG_MODULE_SIG_FORCE value to rely on. Exporting this status variable lets the rest of the kernel rely on the effective value of module signature enforcement, whether it comes from the CONFIG value or from the cmdline param. Signed-off-by: Bruno E. O. Meneguele Signed-off-by: Mimi Zohar --- include/linux/module.h | 7 +++++++ kernel/module.c | 10 ++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index fe5aa3736707..c69b49abe877 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -639,6 +639,8 @@ static inline bool is_livepatch_module(struct module *mod) } #endif /* CONFIG_LIVEPATCH */ +bool is_module_sig_enforced(void); + #else /* !CONFIG_MODULES... 
*/ static inline struct module *__module_address(unsigned long addr) @@ -753,6 +755,11 @@ static inline bool module_requested_async_probing(struct module *module) return false; } +static inline bool is_module_sig_enforced(void) +{ + return false; +} + #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..d1c194b057a2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); #endif /* !CONFIG_MODULE_SIG_FORCE */ +/* + * Export sig_enforce kernel cmdline parameter to allow other subsystems rely + * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. + */ +bool is_module_sig_enforced(void) +{ + return sig_enforce; +} +EXPORT_SYMBOL(is_module_sig_enforced); + /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); -- cgit v1.2.3 From 2dd9789c76ffde05d5f4c56f45c3cb71b3936694 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Sun, 5 Nov 2017 03:27:32 +0530 Subject: freezer: Fix typo in freezable_schedule_timeout() comment Signed-off-by: Himanshu Jha Acked-by: Luis R. Rodriguez Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- include/linux/freezer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index dd03e837ebb7..5b2cf48b2a7c 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -181,7 +181,7 @@ static inline void freezable_schedule_unsafe(void) } /* - * Like freezable_schedule_timeout(), but should not block the freezer. Do not + * Like schedule_timeout(), but should not block the freezer. Do not * call this with locks held. */ static inline long freezable_schedule_timeout(long timeout) -- cgit v1.2.3 From cf89a31ca55272e1dfb9527b5a61eee4d417747a Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 19 Sep 2017 12:39:11 +0300 Subject: device property: Make fwnode_handle_get() return the fwnode The fwnode_handle_get() function is used to obtain a reference to an fwnode. A common usage pattern for the OF equivalent of the function is: mynode = of_node_get(node); Similarly make fwnode_handle_get() return the fwnode to which the reference was obtained. Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 9 +++++++-- drivers/of/property.c | 4 ++-- include/linux/fwnode.h | 2 +- include/linux/property.h | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index 7ed99c1b2a8b..851b1b6596a4 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1044,10 +1044,15 @@ EXPORT_SYMBOL_GPL(device_get_named_child_node); /** * fwnode_handle_get - Obtain a reference to a device node * @fwnode: Pointer to the device node to obtain the reference to. + * + * Returns the fwnode handle. 
*/ -void fwnode_handle_get(struct fwnode_handle *fwnode) +struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode) { - fwnode_call_void_op(fwnode, get); + if (!fwnode_has_op(fwnode, get)) + return fwnode; + + return fwnode_call_ptr_op(fwnode, get); } EXPORT_SYMBOL_GPL(fwnode_handle_get); diff --git a/drivers/of/property.c b/drivers/of/property.c index 264c355ba1ff..8ad33a44a7b8 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -817,9 +817,9 @@ struct device_node *of_graph_get_remote_node(const struct device_node *node, } EXPORT_SYMBOL(of_graph_get_remote_node); -static void of_fwnode_get(struct fwnode_handle *fwnode) +static struct fwnode_handle *of_fwnode_get(struct fwnode_handle *fwnode) { - of_node_get(to_of_node(fwnode)); + return of_fwnode_handle(of_node_get(to_of_node(fwnode))); } static void of_fwnode_put(struct fwnode_handle *fwnode) diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 0c35b6caf0f6..411a84c6c400 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -68,7 +68,7 @@ struct fwnode_reference_args { * @graph_parse_endpoint: Parse endpoint for port and endpoint id. */ struct fwnode_operations { - void (*get)(struct fwnode_handle *fwnode); + struct fwnode_handle *(*get)(struct fwnode_handle *fwnode); void (*put)(struct fwnode_handle *fwnode); bool (*device_is_available)(const struct fwnode_handle *fwnode); bool (*property_present)(const struct fwnode_handle *fwnode, diff --git a/include/linux/property.h b/include/linux/property.h index 6bebee13c5e0..42069c20caee 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -100,7 +100,7 @@ struct fwnode_handle *fwnode_get_named_child_node( struct fwnode_handle *device_get_named_child_node(struct device *dev, const char *childname); -void fwnode_handle_get(struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode); void fwnode_handle_put(struct fwnode_handle *fwnode); unsigned int device_get_child_node_count(struct device *dev); -- cgit v1.2.3 From 5e4b1b707b7ed795585b8ee28c44ce692aa8dfef Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 19 Sep 2017 12:39:12 +0300 Subject: device property: Add a macro for interating over graph endpoints Add a convenience macro for iterating over graph endpoints. Iterating over graph endpoints using fwnode_graph_get_next_endpoint() is a recurring pattern, and this macro allows calling that function in a slightly more convenient way. For instance, for (child = NULL; (child = fwnode_graph_get_next_endpoint(fwnode, child)); ) becomes fwnode_graph_for_each_endpoint(fwnode, child) Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. 
Wysocki --- include/linux/property.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 42069c20caee..f6189a3ac63c 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -293,6 +293,10 @@ struct fwnode_handle * fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port, u32 endpoint); +#define fwnode_graph_for_each_endpoint(fwnode, child) \ + for (child = NULL; \ + (child = fwnode_graph_get_next_endpoint(fwnode, child)); ) + int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); -- cgit v1.2.3 From 3653bc95bcc7daa938c0fdcd64ff199ed8f7f208 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Oct 2017 21:52:09 -0700 Subject: timer: Prepare to change all DEFINE_TIMER() callbacks Before we can globally change the function prototype of all timer callbacks, we have to change those set up by DEFINE_TIMER(). Prepare for this by casting the callbacks until the prototype changes globally. Cc: Thomas Gleixner Signed-off-by: Kees Cook --- include/linux/timer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index a1af92bac0d5..9f8895decb82 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -63,6 +63,9 @@ struct timer_list { #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + #define __TIMER_INITIALIZER(_function, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ @@ -74,7 +77,7 @@ struct timer_list { #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ - __TIMER_INITIALIZER(_function, 0, 0) + __TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); @@ -147,9 +150,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) -#define TIMER_DATA_TYPE unsigned long -#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) - #ifndef CONFIG_LOCKDEP static inline void timer_setup(struct timer_list *timer, void (*callback)(struct timer_list *), -- cgit v1.2.3 From 375ef2b1f0d0b43b0d36ffdd521637ff59b0c13c Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 17 Sep 2017 13:43:58 +0300 Subject: net: Introduce netdev_*_once functions Extend the net device error logging with netdev_*_once macros. netdev_*_once are the equivalents of the dev_*_once macros which are useful for messages that should only be logged once. Also add netdev_WARN_ONCE, which is the "once" extension for the already existing netdev_WARN macro. 
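Usage mirrors the existing dev_*_once() helpers; a hypothetical driver fragment:

	/* log the first oversized frame, then stay silent */
	if (unlikely(skb->len > max_frame))
		netdev_warn_once(netdev,
				 "frame of %u bytes exceeds %u, dropping\n",
				 skb->len, max_frame);

As with dev_*_once(), the static __print_once flag lives inside the macro expansion, so the suppression is per call site rather than per net_device.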
Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 30f0f2928808..79518ede3170 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4336,6 +4336,31 @@ void netdev_notice(const struct net_device *dev, const char *format, ...); __printf(2, 3) void netdev_info(const struct net_device *dev, const char *format, ...); +#define netdev_level_once(level, dev, fmt, ...) \
+do { \
+ static bool __print_once __read_mostly; \
+ \
+ if (!__print_once) { \
+ __print_once = true; \
+ netdev_printk(level, dev, fmt, ##__VA_ARGS__); \
+ } \
+} while (0)
+
+#define netdev_emerg_once(dev, fmt, ...) \
+ netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__)
+#define netdev_alert_once(dev, fmt, ...) \
+ netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__)
+#define netdev_crit_once(dev, fmt, ...) \
+ netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__)
+#define netdev_err_once(dev, fmt, ...) \
+ netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__)
+#define netdev_warn_once(dev, fmt, ...) \
+ netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__)
+#define netdev_notice_once(dev, fmt, ...) \
+ netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__)
+#define netdev_info_once(dev, fmt, ...) \
+ netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)
+ #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) @@ -4376,6 +4401,10 @@ do { \ WARN(1, "netdevice: %s%s\n" format, netdev_name(dev), \ netdev_reg_state(dev), ##args) +#define netdev_WARN_ONCE(dev, condition, format, arg...) \
+ WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev), \
+ netdev_reg_state(dev), ##arg)
+ /* netif printk helpers, similar to netdev_printk */ #define netif_printk(priv, type, level, dev, fmt, args...) \ -- cgit v1.2.3 From 4382c7b92a1db397874ca62c73aa8b023af6dba8 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 10 Sep 2017 13:22:51 +0300 Subject: net/mlx5e: Add 802.1ad VLAN insertion support Report VLAN insertion support for S-tagged packets and add support by choosing the correct VLAN type in the WQE. 
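For context, a sketch of how a frame reaches this new branch: the core 802.1ad VLAN code hands the tag to the driver via the usual offload helper,

	/* core VLAN xmit path (simplified, standard helper) */
	__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_id);

so in the driver's xmit routine skb_vlan_tag_present() is true and skb->vlan_proto carries ETH_P_8021AD, which is what selects MLX5_ETH_WQE_SVLAN in the hunk below.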
Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 ++ include/linux/mlx5/qp.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 59b8a2d62b8d..c408b7efa42e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4194,6 +4194,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX; netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; + netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; if (mlx5e_vxlan_allowed(mdev) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) { netdev->hw_features |= NETIF_F_GSO_PARTIAL; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index de651de35c9b..c62305b214cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -369,6 +369,8 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), MLX5_SEND_WQE_DS); } else if (skb_vlan_tag_present(skb)) { eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN); + if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD)) + eseg->insert.type |= cpu_to_be16(MLX5_ETH_WQE_SVLAN); eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb)); } diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 66d19b611fe4..62af7512dabb 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -221,6 +221,7 @@ enum { }; enum { + MLX5_ETH_WQE_SVLAN = 1 << 0, MLX5_ETH_WQE_INSERT_VLAN = 1 << 15, }; -- cgit v1.2.3 From f7a6509fe002e3909cb41c09e807b7f3ca4a361b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 1 Sep 2017 17:11:43 +0200 Subject: KVM: s390: vsie: use common code functions for pinning We will not see -ENOMEM (gfn_to_hva() will return KVM_ERR_PTR_BAD_PAGE for all errors). So we can also get rid of special handling in the callers of pin_guest_page() and always assume that it is a g2 error. As also kvm_s390_inject_program_int() should never fail, we can simplify pin_scb(), too. 
Signed-off-by: David Hildenbrand Message-Id: <20170901151143.22714-1-david@redhat.com> Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 50 +++++++++++++++++------------------------------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 4 ++-- 3 files changed, 21 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b18b5652e5c5..a311938b63b3 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -443,22 +443,14 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * * Returns: - 0 on success * - -EINVAL if the gpa is not valid guest storage - * - -ENOMEM if out of memory */ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) { struct page *page; - hva_t hva; - int rc; - hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - if (kvm_is_error_hva(hva)) + page = gfn_to_page(kvm, gpa_to_gfn(gpa)); + if (is_error_page(page)) return -EINVAL; - rc = get_user_pages_fast(hva, 1, 1, &page); - if (rc < 0) - return rc; - else if (rc != 1) - return -ENOMEM; *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); return 0; } @@ -466,11 +458,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) /* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) { - struct page *page; - - page = virt_to_page(hpa); - set_page_dirty_lock(page); - put_page(page); + kvm_release_pfn_dirty(hpa >> PAGE_SHIFT); /* mark the page always as dirty for migration */ mark_page_dirty(kvm, gpa_to_gfn(gpa)); } @@ -557,7 +545,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = set_validity_icpt(scb_s, 0x003bU); if (!rc) { rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) rc = set_validity_icpt(scb_s, 0x0034U); } if (rc) @@ -574,10 +562,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* 256 bytes cannot cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x0080U); - if (rc) goto unpin; + } scb_s->itdba = hpa; } @@ -592,10 +580,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * if this block gets bigger, we have to shadow it. */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x1310U); - if (rc) goto unpin; + } scb_s->gvrd = hpa; } @@ -607,11 +595,11 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* 64 bytes cannot cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x0043U); - /* Validity 0x0044 will be checked by SIE */ - if (rc) goto unpin; + } + /* Validity 0x0044 will be checked by SIE */ scb_s->riccbd = hpa; } if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { @@ -635,10 +623,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x10b0U); - if (rc) goto unpin; + } scb_s->sdnxo = hpa | sdnxc; } return 0; @@ -663,7 +651,6 @@ static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, * * Returns: - 0 if the scb was pinned. 
* - > 0 if control has to be given to guest 2 - * - -ENOMEM if out of memory */ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, gpa_t gpa) @@ -672,14 +659,13 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, int rc; rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) { + if (rc) { rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (!rc) - rc = 1; + WARN_ON_ONCE(rc); + return 1; } - if (!rc) - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; - return rc; + vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + return 0; } /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6882538eda32..2e754b7c282c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -667,6 +667,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable); void kvm_release_pfn_clean(kvm_pfn_t pfn); +void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); void kvm_get_pfn(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9deb5a245b83..37731f661be5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -122,7 +122,6 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static void kvm_release_pfn_dirty(kvm_pfn_t pfn); static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); __visible bool kvm_rebooting; @@ -1679,11 +1678,12 @@ void kvm_release_page_dirty(struct page *page) } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); -static void kvm_release_pfn_dirty(kvm_pfn_t pfn) +void kvm_release_pfn_dirty(kvm_pfn_t pfn) { kvm_set_pfn_dirty(pfn); kvm_release_pfn_clean(pfn); } +EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); void kvm_set_pfn_dirty(kvm_pfn_t pfn) { -- cgit v1.2.3 From f121aadede37bcbb77ea552d195a09c734486fe7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 9 Nov 2017 10:23:28 +0100 Subject: vfs: add path_put_init() This one initializes the struct path to all zeroes so that multiple path_put_init() on the path is safe. Signed-off-by: Miklos Szeredi --- include/linux/path.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/path.h b/include/linux/path.h index 81e65a5be7ce..475225a03d0d 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -18,4 +18,10 @@ static inline int path_equal(const struct path *path1, const struct path *path2) return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } +static inline void path_put_init(struct path *path) +{ + path_put(path); + *path = (struct path) { }; +} + #endif /* _LINUX_PATH_H */ -- cgit v1.2.3 From 54985120a1c461b74f9510e5d730971f2a2383b1 Mon Sep 17 00:00:00 2001 From: Girish Moodalbail Date: Tue, 7 Nov 2017 11:32:11 -0800 Subject: net: fix incorrect comment with regard to VLAN packet handling The commit bcc6d4790361 ("net: vlan: make non-hw-accel rx path similar to hw-accel") unified accel and non-accel path for VLAN RX. With that fix we do not register any packet_type handler for VLANs anymore, so fix the incorrect comment. Signed-off-by: Girish Moodalbail Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 79518ede3170..6b274bfe489f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4479,15 +4479,7 @@ do { \ * Why 16. 
Because with 16 the only overlap we get on a hash of the
low nibble of the protocol value is RARP/SNAP/X.25. * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversely affected. - * --BLG - * * 0800 IP - * 8100 802.1Q VLAN * 0001 802.3 * 0002 AX.25 * 0004 802.2 -- cgit v1.2.3 From 4f8413a3a799c958f7a10a6310a451e6b8aef5ad Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 9 Nov 2017 14:17:59 +0000 Subject: genirq: Track whether the trigger type has been set When requesting a shared interrupt, we assume that the firmware support code (DT or ACPI) has called irqd_set_trigger_type already, so that we can retrieve it and check that the requester is being reasonable. Unfortunately, we still have non-DT, non-ACPI systems around, and these guys won't call irqd_set_trigger_type before requesting the interrupt. The consequence is that we fail the request that would have worked before. We can either chase all these use cases (boring), or address it in core code (easier). Let's have a per-irq_desc flag that indicates whether irqd_set_trigger_type has been called, and let's just check it when checking for a shared interrupt. If it hasn't been set, just take whatever the interrupt requester asks. Fixes: 382bd4de6182 ("genirq: Use irqd_get_trigger_type to compare the trigger type for shared IRQs") Cc: stable@vger.kernel.org Reported-and-tested-by: Petr Cvek Signed-off-by: Marc Zyngier --- include/linux/irq.h | 11 ++++++++++- kernel/irq/manage.c | 13 ++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index fda8da7c45e7..73f61eeb152e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -210,6 +210,7 @@ struct irq_data { * IRQD_MANAGED_SHUTDOWN - Interrupt was shutdown due to empty affinity * mask. Applies only to affinity managed irqs. * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target + * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -230,6 +231,7 @@ enum { IRQD_IRQ_STARTED = (1 << 22), IRQD_MANAGED_SHUTDOWN = (1 << 23), IRQD_SINGLE_TARGET = (1 << 24), + IRQD_DEFAULT_TRIGGER_SET = (1 << 25), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -259,18 +261,25 @@ static inline void irqd_mark_affinity_was_set(struct irq_data *d) __irqd_to_state(d) |= IRQD_AFFINITY_SET; } +static inline bool irqd_trigger_type_was_set(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_DEFAULT_TRIGGER_SET; +} + static inline u32 irqd_get_trigger_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_TRIGGER_MASK; } /* - * Must only be called inside irq_chip.irq_set_type() functions. + * Must only be called inside irq_chip.irq_set_type() functions or + * from the DT/ACPI setup code. 
*/ static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) { __irqd_to_state(d) &= ~IRQD_TRIGGER_MASK; __irqd_to_state(d) |= type & IRQD_TRIGGER_MASK; + __irqd_to_state(d) |= IRQD_DEFAULT_TRIGGER_SET; } static inline bool irqd_is_level_type(struct irq_data *d) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e667912d0e9c..21e04e780be4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1228,7 +1228,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * set the trigger type must match. Also all must * agree on ONESHOT. */ - unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data); + unsigned int oldtype; + + /* + * If nobody did set the configuration before, inherit + * the one provided by the requester. + */ + if (irqd_trigger_type_was_set(&desc->irq_data)) { + oldtype = irqd_get_trigger_type(&desc->irq_data); + } else { + oldtype = new->flags & IRQF_TRIGGER_MASK; + irqd_set_trigger_type(&desc->irq_data, oldtype); + } if (!((old->flags & new->flags) & IRQF_SHARED) || (oldtype != (new->flags & IRQF_TRIGGER_MASK)) || -- cgit v1.2.3 From 125b1192bd48544d9429e240c8f8e64d1533b2c1 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Fri, 10 Nov 2017 17:22:44 +0530 Subject: regulator: tps65218: remove unused tps_info structure remove unused tps_info structure. Signed-off-by: Keerthy Signed-off-by: Mark Brown --- include/linux/mfd/tps65218.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tps65218.h b/include/linux/mfd/tps65218.h index bccd2d68b1e3..f069c518c0ed 100644 --- a/include/linux/mfd/tps65218.h +++ b/include/linux/mfd/tps65218.h @@ -245,24 +245,6 @@ enum tps65218_irqs { TPS65218_INVALID4_IRQ, }; -/** - * struct tps_info - packages regulator constraints - * @id: Id of the regulator - * @name: Voltage regulator name - * @min_uV: minimum micro volts - * @max_uV: minimum micro volts - * @strobe: sequencing strobe value for the regulator - * - * This data is used to check the regualtor voltage limits while setting. - */ -struct tps_info { - int id; - const char *name; - int min_uV; - int max_uV; - int strobe; -}; - /** * struct tps65218 - tps65218 sub-driver chip access routines * @@ -280,7 +262,6 @@ struct tps65218 { u32 irq_mask; struct regmap_irq_chip_data *irq_data; struct regulator_desc desc[TPS65218_NUM_REGULATOR]; - struct tps_info *info[TPS65218_NUM_REGULATOR]; struct regmap *regmap; u8 *strobes; }; -- cgit v1.2.3 From f7b53637c090bd8ce2dc74ad0f3aa1898aff2524 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 24 Oct 2017 18:52:31 -0700 Subject: Audit: remove unused audit_log_secctx function The function audit_log_secctx() is unused in the upstream kernel. All it does is wrap another function that doesn't need wrapping. It claims to give you the SELinux context, but that is not true if you are using a different security module. 
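Any future caller can open-code the same few lines; a sketch equivalent to the removed wrapper, using the same security hooks but leaving error handling to the caller instead of audit_panic():

	char *ctx;
	u32 len;

	if (!security_secid_to_secctx(secid, &ctx, &len)) {
		audit_log_format(ab, " obj=%s", ctx);
		security_release_secctx(ctx, len);
	}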
Signed-off-by: Casey Schaufler Reviewed-by: James Morris Signed-off-by: Paul Moore --- include/linux/audit.h | 8 -------- kernel/audit.c | 26 -------------------------- 2 files changed, 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 2150bdccfbab..fa1b068d911d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -149,12 +149,6 @@ extern void audit_log_key(struct audit_buffer *ab, extern void audit_log_link_denied(const char *operation, const struct path *link); extern void audit_log_lost(const char *message); -#ifdef CONFIG_SECURITY -extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); -#else -static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ } -#endif extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab, @@ -203,8 +197,6 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) static inline void audit_log_link_denied(const char *string, const struct path *link) { } -static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; diff --git a/kernel/audit.c b/kernel/audit.c index 64e1d0ec19de..227db99b0f19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2345,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, } } -#ifdef CONFIG_SECURITY -/** - * audit_log_secctx - Converts and logs SELinux context - * @ab: audit_buffer - * @secid: security number - * - * This is a helper function that calls security_secid_to_secctx to convert - * secid to secctx and then adds the (converted) SELinux context to the audit - * log by calling audit_log_format, thus also preventing leak of internal secid - * to userspace. If secid cannot be converted audit_panic is called. - */ -void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ - u32 len; - char *secctx; - - if (security_secid_to_secctx(secid, &secctx, &len)) { - audit_panic("Cannot convert secid to context"); - } else { - audit_log_format(ab, " obj=%s", secctx); - security_release_secctx(secctx, len); - } -} -EXPORT_SYMBOL(audit_log_secctx); -#endif - EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); -- cgit v1.2.3 From 84fef62d135b6e47b52f4e9280b5dbc5bb0050ba Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:32 -0700 Subject: nvme: check admin passthru command effects The NVMe standard provides a command effects log page so the host may be aware of special requirements it may need to honor for a particular command. For example, the command may need to run with IO quiesced to prevent timeouts or undefined behavior, or it may change the logical block formats that determine how the host needs to construct future commands. This patch saves the nvme command effects log page if the controller supports it, and performs appropriate actions before and after an admin passthrough command is completed. If the controller does not support the command effects log page, the driver will define the effects for known opcodes. The nvme format and sanitize are the only commands in this patch with known effects. 
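To see the new handling from the other side, here is a hypothetical user-space sketch that issues a Format NVM admin passthru, one of the two commands given known effects above (the opcode and ioctl come from the existing UAPI; everything else is illustrative):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/nvme_ioctl.h>

	static int format_ns1(const char *ctrl_path)
	{
		struct nvme_admin_cmd cmd = {
			.opcode = 0x80,	/* nvme_admin_format_nvm */
			.nsid	= 1,
			.cdw10	= 0,	/* LBA format 0, no secure erase */
		};
		int fd, ret;

		fd = open(ctrl_path, O_RDWR);
		if (fd < 0)
			return -1;
		/* the kernel now freezes I/O queues around this command and
		 * revalidates all namespaces afterwards (LBCC | CSE) */
		ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
		close(fd);
		return ret;
	}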
Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 18 ++++++++ 3 files changed, 126 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 90b6375a9da5..65fd2fc1ae3c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -72,6 +72,9 @@ static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_chr_devt; static struct class *nvme_class; +static void nvme_ns_remove(struct nvme_ns *ns); +static int nvme_revalidate_disk(struct gendisk *disk); + static __le32 nvme_get_log_dw10(u8 lid, size_t size) { return cpu_to_le32((((size / 4) - 1) << 16) | lid); @@ -992,12 +995,87 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, io.slba, NULL, 0); } +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (effects & ~NVME_CMD_EFFECTS_CSUPP) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + else + effects = nvme_known_admin_effects(opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->disk && nvme_revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. + */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) + nvme_unfreeze(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) + nvme_queue_scan(ctrl); +} + static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; unsigned timeout = 0; + u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1023,10 +1101,13 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); + effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len, 0, &cmd.result, timeout); + nvme_passthru_end(ctrl, effects); + if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) return -EFAULT; @@ -1759,6 +1840,25 @@ static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } +static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
+{
+ int ret;
+
+ if (!ctrl->effects)
+ ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
+
+ if (!ctrl->effects)
+ return 0;
+
+ ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
+ sizeof(*ctrl->effects));
+ if (ret) {
+ kfree(ctrl->effects);
+ ctrl->effects = NULL;
+ }
+ return ret;
+}
+ /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1794,6 +1894,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { + ret = nvme_get_effects_log(ctrl); + if (ret < 0) + return ret; + } + nvme_init_subnqn(ctrl, id); if (!ctrl->identified) { @@ -2713,6 +2819,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); + kfree(ctrl->effects); ctrl->ops->free_ctrl(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 35d9cee515f1..f7c21cea9485 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -174,6 +174,7 @@ struct nvme_ctrl { bool subsystem; unsigned long quirks; struct nvme_id_power_state psd[32]; + struct nvme_effects_log *effects; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9310ce77d8e1..fd1d4508a612 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -267,6 +267,7 @@ enum { NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, }; struct nvme_lbaf { @@ -395,6 +396,21 @@ struct nvme_fw_slot_info_log { __u8 rsvd64[448]; }; +enum {
+ NVME_CMD_EFFECTS_CSUPP = 1 << 0,
+ NVME_CMD_EFFECTS_LBCC = 1 << 1,
+ NVME_CMD_EFFECTS_NCC = 1 << 2,
+ NVME_CMD_EFFECTS_NIC = 1 << 3,
+ NVME_CMD_EFFECTS_CCC = 1 << 4,
+ NVME_CMD_EFFECTS_CSE_MASK = 3 << 16,
+};
+
+struct nvme_effects_log {
+ __le32 acs[256];
+ __le32 iocs[256];
+ __u8 resv[2048];
+};
+ enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -681,6 +697,7 @@ enum nvme_admin_opcode { nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, }; enum { @@ -712,6 +729,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CMD_EFFECTS = 0x05, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), -- cgit v1.2.3 From 83f5f7ed72f316eda4c4c3833beebfe6578926f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 11:15:37 -0700 Subject: block: kill bio_kmap/kunmap_irq() There are no users of them anymore. 
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 ++-- include/linux/bio.h | 11 ----------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 9490f2845f06..01c0a03407cc 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -216,7 +216,7 @@ may need to abort DMA operations and revert to PIO for the transfer, in which case a virtual mapping of the page is required. For SCSI it is also done in some scenarios where the low level driver cannot be trusted to handle a single sg entry correctly. The driver is expected to perform the -kmaps as needed on such occasions using the __bio_kmap_atomic and bio_kmap_irq +kmaps as needed on such occasions using the bio_kmap_irq and friends routines as appropriate. A driver could also use the blk_queue_bounce() routine on its own to bounce highmem i/o to low memory for specific requests if so desired. @@ -1137,7 +1137,7 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic and bio_kmap_irq to +(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic or similar to temporarily map a bio into the virtual address space. diff --git a/include/linux/bio.h b/include/linux/bio.h index 9c75f58f6a50..1d7e63d7505f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -573,17 +573,6 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) } #endif -static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter, - unsigned long *flags) -{ - return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags); -} -#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) - -#define bio_kmap_irq(bio, flags) \ - __bio_kmap_irq((bio), (bio)->bi_iter, (flags)) -#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * -- cgit v1.2.3 From d004a5e7d4dd6335ce6e2044af42f5e0fbebb51d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Nov 2017 19:13:48 +0100 Subject: block: remove __bio_kmap_atomic This helper doesn't buy us much over calling kmap_atomic directly. In fact in the only caller it does a bit of useless work as the caller already has the bvec at hand, and said caller would even be buggy for a multi-segment bio due to the use of this helper. So just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 11 +++++------ arch/xtensa/platforms/iss/simdisk.c | 4 ++-- block/blk-settings.c | 2 +- include/linux/bio.h | 12 ------------ 4 files changed, 8 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 01c0a03407cc..86927029a52d 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -216,10 +216,9 @@ may need to abort DMA operations and revert to PIO for the transfer, in which case a virtual mapping of the page is required. For SCSI it is also done in some scenarios where the low level driver cannot be trusted to handle a single sg entry correctly.
The driver is expected to perform the -kmaps as needed on such occasions using the bio_kmap_irq and friends -routines as appropriate. A driver could also use the blk_queue_bounce() -routine on its own to bounce highmem i/o to low memory for specific requests -if so desired. +kmaps as needed on such occasions as appropriate. A driver could also use +the blk_queue_bounce() routine on its own to bounce highmem i/o to low +memory for specific requests if so desired. iii. The i/o scheduler algorithm itself can be replaced/set as appropriate @@ -1137,8 +1136,8 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic or similar to -temporarily map a bio into the virtual address space. +(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map +a bio into the virtual address space. 8. Prior/Related/Impacted patches diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index c45b90bb9339..eacf1e433518 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -110,13 +110,13 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; bio_for_each_segment(bvec, bio, iter) { - char *buffer = __bio_kmap_atomic(bio, iter); + char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset; unsigned len = bvec.bv_len >> SECTOR_SHIFT; simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - __bio_kunmap_atomic(buffer); + kunmap_atomic(buffer); } bio_endio(bio); diff --git a/block/blk-settings.c b/block/blk-settings.c index 8559e9563c52..48ebe6be07b7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits); * Caveat: * The driver that does this *must* be able to deal appropriately * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling + * kmap_atomic() to get a temporary kernel mapping, or by calling * blk_queue_bounce() to create a buffer in normal memory. **/ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) diff --git a/include/linux/bio.h b/include/linux/bio.h index 1d7e63d7505f..d4eec19a6d3c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -128,18 +128,6 @@ static inline void *bio_data(struct bio *bio) */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) -/* - * queues that have highmem support enabled may still need to revert to - * PIO transfers occasionally and thus map high pages temporarily. For - * permanent PIO fall back, user is probably better off disabling highmem - * I/O completely on that queue (see ide-dma for example) - */ -#define __bio_kmap_atomic(bio, iter) \ - (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \ - bio_iter_iovec((bio), (iter)).bv_offset) - -#define __bio_kunmap_atomic(addr) kunmap_atomic(addr) - /* * merge helpers etc */ -- cgit v1.2.3 From f00c4d80ffdac9e3a64947ebd57489f3232d5a74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 5 Nov 2017 10:36:31 +0300 Subject: block: pass full fmode_t to blk_verify_command Use the obvious calling convention.
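As a hedged before/after sketch (the caller below is invented for illustration), the new convention means call sites forward the whole open mode and blk_verify_command() performs the FMODE_WRITE check itself:

	#include <linux/blkdev.h>
	#include <linux/fs.h>

	static int example_allow_command(struct file *filp, unsigned char *cmd)
	{
		/* old: blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); */
		return blk_verify_command(cmd, filp->f_mode);
	}

Keeping the permission check in one place avoids each caller pre-computing its own boolean, which is what the hunks below remove.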
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bsg.c | 18 ++++++++---------- block/scsi_ioctl.c | 8 ++++---- drivers/scsi/sg.c | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/block/bsg.c b/block/bsg.c index ee1335c68de7..452f94f1c5d4 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, - fmode_t has_write_perm) + fmode_t mode) { struct scsi_request *req = scsi_req(rq); @@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(req->cmd, has_write_perm)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op) * map sg_io_v4 to a request. */ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; @@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) if (IS_ERR(rq)) return rq; - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode); if (ret) goto out; @@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) } static int __bsg_write(struct bsg_device *bd, const char __user *buf, - size_t count, ssize_t *bytes_written, - fmode_t has_write_perm) + size_t count, ssize_t *bytes_written, fmode_t mode) { struct bsg_command *bc; struct request *rq; @@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); + rq = bsg_map_hdr(bd, &bc->hdr, mode); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) bsg_set_block(bd, file); bytes_written = 0; - ret = __bsg_write(bd, buf, count, &bytes_written, - file->f_mode & FMODE_WRITE); + ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); *ppos = bytes_written; @@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); + rq = bsg_map_hdr(bd, &hdr, file->f_mode); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 7440de44dd85..edcfff974527 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); } -int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) +int blk_verify_command(unsigned char *cmd, fmode_t mode) { struct blk_cmd_filter *filter = &blk_default_cmd_filter; @@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) return 0; /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && has_write_perm) + if 
(test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) return 0; return -EPERM; @@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; /* @@ -469,7 +469,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(req->cmd, mode & FMODE_WRITE); + err = blk_verify_command(req->cmd, mode); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0419c2298eab..92fd870e1315 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd) if (sfp->parentdp->device->type == TYPE_SCANNER) return 0; - return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); + return blk_verify_command(cmd, filp->f_mode); } static int diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 225617dd0a3f..1437ef4d8037 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1360,7 +1360,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); +extern int blk_verify_command(unsigned char *cmd, fmode_t mode); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, -- cgit v1.2.3 From 38dabe210fbab4e7e8a03670ab3ba42f247ea08f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:10 -0700 Subject: nvme: centralize AEN defines All the transports were unnecessarily duplicating the AEN request accounting. This patch defines everything in one place. Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 33 ++++++++++++--------------------- drivers/nvme/host/nvme.h | 1 - drivers/nvme/host/pci.c | 16 +++------------- drivers/nvme/host/rdma.c | 14 +++----------- drivers/nvme/target/loop.c | 14 +++----------- include/linux/nvme.h | 8 ++++++++ 7 files changed, 30 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c79b73fe2c3..315087dfcd8d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2779,7 +2779,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { - ctrl->event_limit = NVME_NR_AERS; + ctrl->event_limit = NVME_NR_AEN_COMMANDS; queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index da9296a1eb26..f3bfad47a95f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -30,15 +30,6 @@ /* *************************** Data Structures/Defines ****************** */ -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them.
- */ -#define NVME_FC_NR_AEN_COMMANDS 1 -#define NVME_FC_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) -#define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1) - enum nvme_fc_queue_flags { NVME_FC_Q_CONNECTED = (1 << 0), }; @@ -170,7 +161,7 @@ struct nvme_fc_ctrl { u32 iocnt; wait_queue_head_t ioabort_wait; - struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; + struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS]; struct nvme_ctrl ctrl; }; @@ -1546,7 +1537,7 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) unsigned long flags; int i, ret; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) continue; @@ -1816,7 +1807,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) int i, ret; aen_op = ctrl->aen_ops; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, GFP_KERNEL); if (!private) @@ -1826,7 +1817,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) sqe = &cmdiu->sqe; ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], aen_op, (struct request *)NULL, - (AEN_CMDID_BASE + i)); + (NVME_AQ_BLK_MQ_DEPTH + i)); if (ret) { kfree(private); return ret; @@ -1839,7 +1830,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) memset(sqe, 0, sizeof(*sqe)); sqe->common.opcode = nvme_admin_async_event; /* Note: core layer may overwrite the sqe.command_id value */ - sqe->common.command_id = AEN_CMDID_BASE + i; + sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i; } return 0; } @@ -1851,7 +1842,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) int i; aen_op = ctrl->aen_ops; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (!aen_op->fcp_req.private) continue; @@ -2402,7 +2393,7 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) bool terminating = false; blk_status_t ret; - if (aer_idx > NVME_FC_NR_AEN_COMMANDS) + if (aer_idx > NVME_NR_AEN_COMMANDS) return; spin_lock_irqsave(&ctrl->lock, flags); @@ -2722,16 +2713,16 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH); + nvme_fc_init_queue(ctrl, 0, NVME_AQ_BLK_MQ_DEPTH); ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, - NVME_FC_AQ_BLKMQ_DEPTH); + NVME_AQ_BLK_MQ_DEPTH); if (ret) goto out_free_queue; ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], - NVME_FC_AQ_BLKMQ_DEPTH, - (NVME_FC_AQ_BLKMQ_DEPTH / 4)); + NVME_AQ_BLK_MQ_DEPTH, + (NVME_AQ_BLK_MQ_DEPTH / 4)); if (ret) goto out_delete_hw_queue; @@ -3145,7 +3136,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f7c21cea9485..a6d750cfa6b2 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -313,7 +313,6 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send); -#define 
NVME_NR_AERS 1 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); void nvme_queue_async_events(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 32c413ec818c..c3dfd84feef7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -35,12 +35,6 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) - #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) static int use_threaded_interrupts; @@ -956,7 +950,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, * for them but rather special case them here. */ if (unlikely(nvmeq->qid == 0 && - cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status, &cqe->result); return; @@ -1057,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH + aer_idx; spin_lock_irq(&nvmeq->q_lock); __nvme_submit_cmd(nvmeq, &c); @@ -1524,11 +1518,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - /* - * Subtract one to leave an empty queue entry for 'Full Queue' - * condition. See NVM-Express 1.2 specification, section 4.1.2. - */ - dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; + dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 32e21ab1ae52..008791bbdfb3 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -41,14 +41,6 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_RDMA_NR_AEN_COMMANDS 1 -#define NVME_RDMA_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) - struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; @@ -690,7 +682,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set = &ctrl->admin_tag_set; memset(set, 0, sizeof(*set)); set->ops = &nvme_rdma_admin_mq_ops; - set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ set->numa_node = NUMA_NO_NODE; set->cmd_size = sizeof(struct nvme_rdma_request) + @@ -1318,7 +1310,7 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) memset(cmd, 0, sizeof(*cmd)); cmd->common.opcode = nvme_admin_async_event; - cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); @@ -1380,7 +1372,7 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) * for them but rather special case them here. 
*/ if (unlikely(nvme_rdma_queue_idx(queue) == 0 && - cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index bc95c6ed531a..7258b796f209 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -23,14 +23,6 @@ #define NVME_LOOP_MAX_SEGMENTS 256 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_LOOP_NR_AEN_COMMANDS 1 -#define NVME_LOOP_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) - struct nvme_loop_iod { struct nvme_request nvme_req; struct nvme_command cmd; @@ -112,7 +104,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req) * for them but rather special case them here. */ if (unlikely(nvme_loop_queue_idx(queue) == 0 && - cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); } else { @@ -200,7 +192,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) memset(&iod->cmd, 0, sizeof(iod->cmd)); iod->cmd.common.opcode = nvme_admin_async_event; - iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; + iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, @@ -356,7 +348,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fd1d4508a612..89ffa7eed2fd 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -90,6 +90,14 @@ enum { }; #define NVME_AQ_DEPTH 32 +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) + +/* + * Subtract one to leave an empty queue entry for 'Full Queue' condition. See + * NVM-Express 1.2 specification, section 4.1.2. + */ +#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ -- cgit v1.2.3 From e3d7874dcf175cca2dca7795d6453f637ad8ba9b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:14 -0700 Subject: nvme: send uevent for some asynchronous events This will give udev a chance to observe and handle asynchronous event notifications and clear the log to unmask future events of the same type. 
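A usage sketch, not part of the patch: once applied, these notifications can be observed from userspace with standard udev tooling, for example

	udevadm monitor --environment --subsystem-match=nvme

which should print KOBJ_CHANGE events whose environment contains an NVME_AEN=0x... entry holding the raw AEN result dword, as formatted by the kasprintf() call below.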
The driver will create a change uevent with the asynchronous event result before submitting the next AEN request to the device if a completed AEN event is of type error, smart, command set or vendor specific. Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 ++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 4 ++++ 3 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c135d0aeebd7..683d890d73fa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2665,11 +2665,28 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static void nvme_aen_uevent(struct nvme_ctrl *ctrl) +{ + char *envp[2] = { NULL, NULL }; + u32 aen_result = ctrl->aen_result; + + ctrl->aen_result = 0; + if (!aen_result) + return; + + envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); + if (!envp[0]) + return; + kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); + kfree(envp[0]); +} + static void nvme_async_event_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, async_event_work); + nvme_aen_uevent(ctrl); ctrl->ops->submit_async_event(ctrl); } @@ -2741,6 +2758,17 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return; + switch (result & 0x7) { + case NVME_AER_ERROR: + case NVME_AER_SMART: + case NVME_AER_CSS: + case NVME_AER_VS: + ctrl->aen_result = result; + break; + default: + break; + } + switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(ctrl->device, "rescanning\n"); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 151062573ece..7b9cc7d616b7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -168,6 +168,7 @@ struct nvme_ctrl { u16 kas; u8 npss; u8 apsta; + u32 aen_result; unsigned int shutdown_timeout; unsigned int kato; bool subsystem; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 89ffa7eed2fd..aea87f0d917b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -428,6 +428,10 @@ enum { }; enum { + NVME_AER_ERROR = 0, + NVME_AER_SMART = 1, + NVME_AER_CSS = 6, + NVME_AER_VS = 7, NVME_AER_NOTICE_NS_CHANGED = 0x0002, NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, }; -- cgit v1.2.3 From eb619fdb2d4cb8b3d3419e9113921e87e7daf557 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Nov 2017 08:32:43 -0700 Subject: blk-mq: fix issue with shared tag queue re-running This patch attempts to make the case of hctx re-running on driver tag failure more robust. Without this patch, it's pretty easy to trigger a stall condition with shared tags. An example is using null_blk like this: modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 which sets up 4 devices, sharing the same tag set with a depth of 1. Running a fio job ala: [global] bs=4k rw=randread norandommap direct=1 ioengine=libaio iodepth=4 [nullb0] filename=/dev/nullb0 [nullb1] filename=/dev/nullb1 [nullb2] filename=/dev/nullb2 [nullb3] filename=/dev/nullb3 will inevitably end with one or more threads being stuck waiting for a scheduler tag. That IO is then stuck forever, until someone else triggers a run of the queue.
Ensure that we always re-run the hardware queue, if the driver tag we were waiting for got freed before we added our leftover request entries back on the dispatch list. Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 85 ++++++++++++++++++++++++++++---------------------- include/linux/blk-mq.h | 5 ++- 3 files changed, 50 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f4a1ba532af..bb7f08415203 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), - HCTX_STATE_NAME(TAG_WAITING), HCTX_STATE_NAME(START_ON_RUN), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d759bb8a5bb..fed8165973a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,49 +998,64 @@ done: return rq->tag != -1; } -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, - void *key) +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, + int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - list_del(&wait->entry); - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); + list_del_init(&wait->entry); blk_mq_run_hw_queue(hctx, true); return 1; } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, + struct request *rq) { + struct blk_mq_hw_ctx *this_hctx = *hctx; + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; struct sbq_wait_state *ws; + if (!list_empty_careful(&wait->entry)) + return false; + + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } + + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. - * The thread which wins the race to grab this bit adds the hardware - * queue to the wait queue. + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. */ - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_get_driver_tag(rq, hctx, false)) { + spin_unlock(&this_hctx->lock); return false; - - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + } /* - * As soon as this returns, it's no longer safe to fiddle with - * hctx->dispatch_wait, since a completion can wake up the wait queue - * and unlock the bit. + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. 
*/ - add_wait_queue(&ws->wait, &hctx->dispatch_wait); + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq, *nxt; + bool no_tag = false; int errors, queued; if (list_empty(list)) @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!blk_mq_get_driver_tag(rq, &hctx, false)) { /* * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(hctx)) { - if (got_budget) - blk_mq_put_dispatch_budget(hctx); - break; - } - - /* - * It's possible that a tag was freed in the window - * between the allocation failure and adding the - * hardware queue to the wait queue. - */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { if (got_budget) blk_mq_put_dispatch_budget(hctx); + no_tag = true; break; } } @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * - * If TAG_WAITING is set that means that an I/O scheduler has - * been configured and another thread is waiting for a driver - * tag. To guarantee fairness, do not rerun this hardware queue - * but let the other thread grab the driver tag. + * If 'no_tag' is set, that means that we failed getting + * a driver tag with an I/O scheduler attached. If our dispatch + * waitqueue is no longer active, ensure that we run the queue + * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests @@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. 
*/ - if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_sched_needs_restart(hctx) || + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); } @@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); + if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da7..4ae987c2352c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_entry_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; @@ -181,8 +181,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, - BLK_MQ_S_TAG_WAITING = 3, - BLK_MQ_S_START_ON_RUN = 4, + BLK_MQ_S_START_ON_RUN = 3, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.3 From 6a15674d1e90917f1723a814e2e8c949000440f7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:54 -0800 Subject: block: Introduce blk_get_request_flags() A side effect of this patch is that the GFP mask that is passed to several allocation functions in the legacy block layer is changed from GFP_KERNEL to __GFP_DIRECT_RECLAIM. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 50 +++++++++++++++++++++++++++++++++++--------------- include/linux/blkdev.h | 3 +++ 2 files changed, 38 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index a4362849059a..0f7093dfc010 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1160,7 +1160,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * @rl: request list to allocate from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLK_MQ_REQ_* flags * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. @@ -1170,7 +1170,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1179,6 +1179,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, struct io_cq *icq = NULL; const bool is_sync = op_is_sync(op); int may_queue; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; req_flags_t rq_flags = RQF_ALLOCED; lockdep_assert_held(q->queue_lock); @@ -1339,7 +1341,7 @@ rq_starved: * @q: request_queue to allocate request from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLK_MQ_REQ_* flags. * * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, * this function keeps retrying under memory pressure and fails iff @q is dead.
@@ -1349,7 +1351,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1361,7 +1363,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, bio, gfp_mask); + rq = __get_request(rl, op, bio, flags); if (!IS_ERR(rq)) return rq; @@ -1370,7 +1372,7 @@ retry: return ERR_PTR(-EAGAIN); } - if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { + if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1397,10 +1399,13 @@ retry: goto retry; } +/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, gfp_t gfp_mask) + unsigned int op, unsigned int flags) { struct request *rq; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; int ret = 0; WARN_ON_ONCE(q->mq_ops); @@ -1413,7 +1418,7 @@ static struct request *blk_old_get_request(struct request_queue *q, if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, gfp_mask); + rq = get_request(q, op, NULL, flags); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); blk_queue_exit(q); @@ -1427,25 +1432,40 @@ static struct request *blk_old_get_request(struct request_queue *q, return rq; } -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) +/** + * blk_get_request_flags - allocate a request + * @q: request queue to allocate a request for + * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. + * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. + */ +struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, + unsigned int flags) { struct request *req; + WARN_ON_ONCE(op & REQ_NOWAIT); + WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + if (q->mq_ops) { - req = blk_mq_alloc_request(q, op, - (gfp_mask & __GFP_DIRECT_RECLAIM) ? - 0 : BLK_MQ_REQ_NOWAIT); + req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { - req = blk_old_get_request(q, op, gfp_mask); + req = blk_old_get_request(q, op, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } +EXPORT_SYMBOL(blk_get_request_flags); + +struct request *blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask) +{ + return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ? + 0 : BLK_MQ_REQ_NOWAIT); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1871,7 +1891,7 @@ get_rq: * Returns with the queue unlocked. 
*/ blk_queue_enter_live(q); - req = get_request(q, bio->bi_opf, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, 0); if (IS_ERR(req)) { blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1437ef4d8037..1af5ddd0f631 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -927,6 +927,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); +extern struct request *blk_get_request_flags(struct request_queue *, + unsigned int op, + unsigned int flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); -- cgit v1.2.3 From 1b6d65a0bfb5df2a6029c1430e99fcc5d96bb59a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:55 -0800 Subject: block: Introduce BLK_MQ_REQ_PREEMPT Set RQF_PREEMPT if BLK_MQ_REQ_PREEMPT is passed to blk_get_request_flags(). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- block/blk-mq.c | 2 ++ include/linux/blk-mq.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 0f7093dfc010..17eed16a6e04 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1263,6 +1263,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, blk_rq_set_rl(rq, rl); rq->cmd_flags = op; rq->rq_flags = rq_flags; + if (flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; /* init elvpriv */ if (rq_flags & RQF_ELVPRIV) { @@ -1444,7 +1446,7 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); if (q->mq_ops) { req = blk_mq_alloc_request(q, op, flags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7173d4bd64af..e21876778cec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -291,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->q = data->q; rq->mq_ctx = data->ctx; rq->cmd_flags = op; + if (data->flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4ae987c2352c..82b56609736a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -212,6 +212,7 @@ enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ + BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, -- cgit v1.2.3 From c9254f2ddb19387ea9714a57ea48463c20333b92 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:57 -0800 Subject: block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag This flag will be used in the next patch to 
let the block layer core know whether or not a SCSI request queue has been quiesced. That is, a quiesced SCSI queue only processes RQF_PREEMPT requests. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 30 ++++++++++++++++++++++++++++++ block/blk-mq-debugfs.c | 1 + include/linux/blkdev.h | 6 ++++++ 3 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 17eed16a6e04..edc276899116 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -348,6 +348,36 @@ void blk_sync_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_sync_queue); +/** + * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY + * @q: request queue pointer + * + * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not + * set and 1 if the flag was already set. + */ +int blk_set_preempt_only(struct request_queue *q) +{ + unsigned long flags; + int res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +EXPORT_SYMBOL_GPL(blk_set_preempt_only); + +void blk_clear_preempt_only(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(blk_clear_preempt_only); + /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index bb7f08415203..e4f2bb936e66 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -74,6 +74,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), + QUEUE_FLAG_NAME(PREEMPT_ONLY), }; #undef QUEUE_FLAG_NAME diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1af5ddd0f631..2147e2381a22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -632,6 +632,7 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ +#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -732,6 +733,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) +#define blk_queue_preempt_only(q) \ + test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) + +extern int blk_set_preempt_only(struct request_queue *q); +extern void blk_clear_preempt_only(struct request_queue *q); static inline bool blk_account_rq(struct request *rq) { -- cgit v1.2.3 From 3a0a529971ec4e2d933e9c7798db101dfb6b1aec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:58 -0800 Subject: block, scsi: Make SCSI quiesce and resume work reliably The contexts from which a SCSI device can be quiesced or resumed are: * Writing into /sys/class/scsi_device/*/device/state. * SCSI parallel (SPI) domain validation.
* The SCSI device power management methods. See also scsi_bus_pm_ops. It is essential during suspend and resume that neither the filesystem state nor the filesystem metadata in RAM changes. This is why, while the hibernation image is being written or restored, SCSI devices are quiesced. The SCSI core quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the SDEV_QUIESCE state, execution of non-preempt requests is deferred. This is realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI devices. Avoid that a full queue prevents power management requests from being submitted by deferring allocation of non-preempt requests for devices in the quiesced state. This patch has been tested by running the following commands and by verifying that after each resume the fio job was still running: for ((i=0; i<10; i++)); do ( cd /sys/block/md0/md && while true; do [ "$(<sync_action)" = "idle" ] && echo check > sync_action sleep 1 done ) & pids=($!) for d in /sys/class/block/sd*[a-z]; do bdev=${d#/sys/class/block/} hcil=$(readlink "$d/device") hcil=${hcil#../../../} echo 4 > "$d/queue/nr_requests" echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth" fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \ --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \ --iodepth_batch=1 --thread --loops=$((2**31)) & pids+=($!) done sleep 1 echo "$(date) Hibernating ..." >>hibernate-test-log.txt systemctl hibernate sleep 10 kill "${pids[@]}" echo idle > /sys/block/md0/md/sync_action wait echo "$(date) Done." >>hibernate-test-log.txt done Reported-by: Oleksandr Natalenko References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Martin K. Petersen Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++++++-------- block/blk-mq.c | 4 ++-- drivers/scsi/scsi_lib.c | 42 ++++++++++++++++++++++++++++++------------ fs/block_dev.c | 4 ++-- include/linux/blkdev.h | 2 +- include/scsi/scsi_device.h | 1 + 6 files changed, 70 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index edc276899116..29b08428ae45 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -374,6 +374,7 @@ void blk_clear_preempt_only(struct request_queue *q) spin_lock_irqsave(q->queue_lock, flags); queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + wake_up_all(&q->mq_freeze_wq); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_clear_preempt_only); @@ -795,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, bool nowait) +/** + * blk_queue_enter() - try to increase q->q_usage_counter + * @q: request queue pointer + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + */ +int blk_queue_enter(struct request_queue *q, unsigned int flags) { + const bool preempt = flags & BLK_MQ_REQ_PREEMPT; + while (true) { + bool success = false; int ret; - if (percpu_ref_tryget_live(&q->q_usage_counter)) + rcu_read_lock_sched(); + if (percpu_ref_tryget_live(&q->q_usage_counter)) { + /* + * The code that sets the PREEMPT_ONLY flag is + * responsible for ensuring that that flag is globally + * visible before the queue is unfrozen.
+ */ + if (preempt || !blk_queue_preempt_only(q)) { + success = true; + } else { + percpu_ref_put(&q->q_usage_counter); + } + } + rcu_read_unlock_sched(); + + if (success) return 0; - if (nowait) + if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* @@ -816,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait) smp_rmb(); ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || + (atomic_read(&q->mq_freeze_depth) == 0 && + (preempt || !blk_queue_preempt_only(q))) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; @@ -1445,8 +1470,7 @@ static struct request *blk_old_get_request(struct request_queue *q, /* create ioc upfront */ create_io_context(gfp_mask, q->node); - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || - (op & REQ_NOWAIT)); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); @@ -2267,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; + unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + BLK_MQ_REQ_NOWAIT : 0; - if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { + if (likely(blk_queue_enter(q, flags) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2327,7 +2353,7 @@ blk_qc_t direct_make_request(struct bio *bio) if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; - if (unlikely(blk_queue_enter(q, nowait))) { + if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) bio->bi_status = BLK_STS_AGAIN; else diff --git a/block/blk-mq.c b/block/blk-mq.c index e21876778cec..211bc8a3e2cc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -389,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, struct request *rq; int ret; - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); @@ -428,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); - ret = blk_queue_enter(q, true); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eb129dfc2ebe..f907e2f8c1dd 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev) int scsi_device_quiesce(struct scsi_device *sdev) { + struct request_queue *q = sdev->request_queue; int err; + /* + * It is allowed to call scsi_device_quiesce() multiple times from + * the same context but concurrent scsi_device_quiesce() calls are + * not allowed. + */ + WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); + + blk_set_preempt_only(q); + + blk_mq_freeze_queue(q); + /* + * Ensure that the effect of blk_set_preempt_only() will be visible + * for percpu_ref_tryget() callers that occur after the queue + * unfreeze even if the queue was already frozen before this function + * was called. See also https://lwn.net/Articles/573497/. 
+ */ + synchronize_rcu(); + blk_mq_unfreeze_queue(q); + mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); + if (err == 0) + sdev->quiesced_by = current; + else + blk_clear_preempt_only(q); mutex_unlock(&sdev->state_mutex); - if (err) - return err; - - scsi_run_queue(sdev->request_queue); - while (atomic_read(&sdev->device_busy)) { - msleep_interruptible(200); - scsi_run_queue(sdev->request_queue); - } - return 0; + return err; } EXPORT_SYMBOL(scsi_device_quiesce); @@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); - if (sdev->sdev_state == SDEV_QUIESCE && - scsi_device_set_state(sdev, SDEV_RUNNING) == 0) - scsi_run_queue(sdev->request_queue); + WARN_ON_ONCE(!sdev->quiesced_by); + sdev->quiesced_by = NULL; + blk_clear_preempt_only(sdev->request_queue); + if (sdev->sdev_state == SDEV_QUIESCE) + scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); diff --git a/fs/block_dev.c b/fs/block_dev.c index 4afa4d5ff969..04973f484422 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -662,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); @@ -698,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2147e2381a22..402c9d536ae1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, bool nowait); +extern int blk_queue_enter(struct request_queue *q, unsigned int flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 82e93ee94708..6f0f1e242e23 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -219,6 +219,7 @@ struct scsi_device { unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; + struct task_struct *quiesced_by; unsigned long sdev_data[0]; } __attribute__((aligned(sizeof(unsigned long)))); -- cgit v1.2.3 From 9a95e4ef709533efac4aafcb8bddf73f96db50ed Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:59 -0800 Subject: block, nvme: Introduce blk_mq_req_flags_t Several block layer and NVMe core functions accept a combination of BLK_MQ_REQ_* flags through the 'flags' argument but there is no verification at compile time whether the right type of block layer flags is passed. Make it possible for sparse to verify this. This patch does not change any functionality. 
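For context, a minimal sketch of the sparse pattern this patch relies on (illustrative only; the actual typedef is the one added to include/linux/blk_types.h below, and sparse_demo plus EXAMPLE_REQ_NOWAIT are made-up names):

	#include <linux/types.h>

	typedef __u32 __bitwise blk_mq_req_flags_t;
	#define EXAMPLE_REQ_NOWAIT ((__force blk_mq_req_flags_t)(1 << 0))

	static void sparse_demo(void)
	{
		blk_mq_req_flags_t flags = EXAMPLE_REQ_NOWAIT; /* ok */
		unsigned int plain = 1;

		flags = plain;                /* sparse: incorrect type warning */
		flags = (__force blk_mq_req_flags_t)plain; /* explicit override */
	}

Running sparse (make C=1) then flags any call that passes a gfp_t or a plain unsigned int where a blk_mq_req_flags_t is expected.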
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: linux-nvme@lists.infradead.org Cc: Christoph Hellwig Cc: Johannes Thumshirn Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++------ block/blk-mq.c | 4 ++-- block/blk-mq.h | 2 +- drivers/nvme/host/core.c | 5 +++-- drivers/nvme/host/nvme.h | 5 +++-- include/linux/blk-mq.h | 17 +++++++++++------ include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 4 ++-- 8 files changed, 30 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 29b08428ae45..7c54c195e79e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -801,7 +801,7 @@ EXPORT_SYMBOL(blk_alloc_queue); * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT */ -int blk_queue_enter(struct request_queue *q, unsigned int flags) +int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool preempt = flags & BLK_MQ_REQ_PREEMPT; @@ -1225,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1408,7 +1408,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1458,7 +1458,7 @@ retry: /* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, unsigned int flags) + unsigned int op, blk_mq_req_flags_t flags) { struct request *rq; gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : @@ -1495,7 +1495,7 @@ static struct request *blk_old_get_request(struct request_queue *q, * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct request *req; @@ -2291,7 +2291,7 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; - unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? 
BLK_MQ_REQ_NOWAIT : 0; if (likely(blk_queue_enter(q, flags) == 0)) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 211bc8a3e2cc..bfe24a5b62a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -383,7 +383,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; @@ -409,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx) + unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; diff --git a/block/blk-mq.h b/block/blk-mq.h index 2502f40ccdc0..99a19c5523e2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -111,7 +111,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - unsigned int flags; + blk_mq_req_flags_t flags; unsigned int shallow_depth; /* input & output parameter */ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 683d890d73fa..878d5c09d15b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -305,7 +305,7 @@ static void nvme_put_ns(struct nvme_ns *ns) } struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid) + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; struct request *req; @@ -573,7 +573,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags) + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) { struct request *req; int ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7b9cc7d616b7..23b8504ace7f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -327,14 +327,15 @@ int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid); + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags); + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 82b56609736a..b326208277ee 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -209,16 +209,21 @@ void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { - BLK_MQ_REQ_NOWAIT = (1 
<< 0), /* return when out of requests */ - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ - BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ + /* return when out of requests */ + BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), + /* allocate from reserved pool */ + BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), + /* allocate internal/sched tag */ + BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), + /* set RQF_PREEMPT */ + BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx); + unsigned int op, blk_mq_req_flags_t flags, + unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b04085255fb..13ccfc9b210a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,8 @@ struct bio { */ #define BIO_RESET_BITS BVEC_POOL_OFFSET +typedef __u32 __bitwise blk_mq_req_flags_t; + /* * Operations and flags common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 402c9d536ae1..e80ea1d31343 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -935,7 +935,7 @@ extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request_flags(struct request_queue *, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, unsigned int flags); +extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); -- cgit v1.2.3 From 67f2519fe2903c4041c0e94394d14d372fe51399 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Tue, 24 Oct 2017 11:21:48 -0600 Subject: fs: guard_bio_eod() needs to consider partitions guard_bio_eod() needs to look at the partition capacity, not just the capacity of the whole device, when determining if truncation is necessary. 
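In other words, the capacity check must resolve the bio's partition first and only fall back to the whole-disk capacity when no partition is associated. A simplified sketch of that lookup (the helper name bio_max_sector is hypothetical; __disk_get_part() and part_nr_sects_read() are the helpers the fix relies on, and the full hunk is in the diff below):

	static sector_t bio_max_sector(struct bio *bio)
	{
		struct hd_struct *part;
		sector_t maxsector;

		rcu_read_lock();
		part = __disk_get_part(bio->bi_disk, bio->bi_partno);
		if (part)		/* bio targets a partition */
			maxsector = part_nr_sects_read(part);
		else			/* whole-device bio */
			maxsector = get_capacity(bio->bi_disk);
		rcu_read_unlock();

		return maxsector;
	}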
[ 60.268688] attempt to access beyond end of device [ 60.268690] unknown-block(9,1): rw=0, want=67103509, limit=67103506 [ 60.268693] buffer_io_error: 2 callbacks suppressed [ 60.268696] Buffer I/O error on dev md1p7, logical block 4524305, async page read Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Cc: stable@vger.kernel.org # v4.13 Reviewed-by: Christoph Hellwig Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- fs/buffer.c | 10 +++++++++- include/linux/genhd.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index bff571dc7bc3..bcabb69e7462 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3015,8 +3015,16 @@ void guard_bio_eod(int op, struct bio *bio) sector_t maxsector; struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); - maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 93aae3476f58..ca10cc292187 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -243,6 +243,7 @@ static inline dev_t part_devt(struct hd_struct *part) return part_to_dev(part)->devt; } +extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); static inline void disk_put_part(struct hd_struct *part) -- cgit v1.2.3 From 79f720a751cad613620d0237e3b44f89f4a69181 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Nov 2017 09:13:21 -0700 Subject: blk-mq: only run the hardware queue if IO is pending Currently we are inconsistent in when we decide to run the queue. Using blk_mq_run_hw_queues() we check if the hctx has pending IO before running it, but we don't do that from the individual queue run function, blk_mq_run_hw_queue(). This potentially results in a lot of extra and pointless queue runs on flush requests and, much worse, in tag starvation situations. This is observable just by looking at top output, with lots of kworkers active. For the !async runs, it just adds to the CPU overhead of blk-mq. Move the has-pending check into the run function instead of having callers do it.
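The resulting shape, in sketch form (mirroring the blk-mq hunks below; the new bool return value lets the scheduler restart path know whether a run was actually issued):

	bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
	{
		/* the run function itself now skips idle hardware queues */
		if (blk_mq_hctx_has_pending(hctx)) {
			__blk_mq_delay_run_hw_queue(hctx, async, 0);
			return true;
		}
		return false;
	}

Callers such as blk_mq_sched_restart_hctx() and blk_mq_run_hw_queues() then drop their own has-pending checks, as the diff shows.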
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 7 +------ block/blk-mq.c | 18 +++++++++++------- block/blk-mq.h | 2 -- include/linux/blk-mq.h | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 6f4bdb8209f7..c117bd8fd1f6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,12 +81,7 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) } else clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) { - blk_mq_run_hw_queue(hctx, true); - return true; - } - - return false; + return blk_mq_run_hw_queue(hctx, true); } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index bfe24a5b62a3..a2a4271f5ab8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -61,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) /* * Check if any of the ctx's have pending work in this hardware queue */ -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map) || - !list_empty_careful(&hctx->dispatch) || + return !list_empty_careful(&hctx->dispatch) || + sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } @@ -1253,9 +1253,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - __blk_mq_delay_run_hw_queue(hctx, async, 0); + if (blk_mq_hctx_has_pending(hctx)) { + __blk_mq_delay_run_hw_queue(hctx, async, 0); + return true; + } + + return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -1265,8 +1270,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (!blk_mq_hctx_has_pending(hctx) || - blk_mq_hctx_stopped(hctx)) + if (blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); diff --git a/block/blk-mq.h b/block/blk-mq.h index 99a19c5523e2..dcf379a892dd 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -26,14 +26,12 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b326208277ee..eb1e2cdffb31 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -266,7 +266,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool 
async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.3 From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Nov 2017 15:28:42 -0500 Subject: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with its kprobe functionality. We can make sure errors are only triggered in the specific call chains and situations that we care about. Accomplish this with the bpf_override_function helper. This will modify the probed function's return value (as seen by its caller) to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Signed-off-by: Josef Bacik Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/include/asm/kprobes.h | 4 ++++ arch/x86/include/asm/ptrace.h | 5 +++++ arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++++++++ include/linux/filter.h | 3 ++- include/linux/trace_events.h | 1 + include/uapi/linux/bpf.h | 7 ++++++- kernel/bpf/core.c | 3 +++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 +++++++ kernel/trace/Kconfig | 11 +++++++++++ kernel/trace/bpf_trace.c | 35 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 40 +++++++++++++++++++++++++++++++++------- kernel/trace/trace_probe.h | 6 ++++++ 15 files changed, 133 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 057370a0ac4e..6e8520f09bc1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,6 +196,9 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_KPROBE_OVERRIDE + bool + config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2fdb23313dd5..51458c1a0b4a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -153,6 +153,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE + select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 6cf65437b5e5..c6c3b1f4306a 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); +#ifdef CONFIG_KPROBES_ON_FTRACE +extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); +#endif + /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index c0e3c45cf6ab..2370bb0149cc 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->ax = rc; +} + /* * user_mode(regs) determines whether a register set came from user * mode.
On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 041f7b6dfa0f..3c455bf490cb 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } + +asmlinkage void override_func(void); +asm( + ".type override_func, @function\n" + "override_func:\n" + " ret\n" + ".size override_func, .-override_func\n" +); + +void arch_ftrace_kprobe_override_function(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&override_func; +} +NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index 0cd02ff4ae30..eaec066f99e8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -459,7 +459,8 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + kprobe_override:1; /* Do we override a kprobe? */ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 84014ecfa67f..17e5e820a84c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -523,6 +523,7 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); +DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e880ae6434ee..adb66f78b674 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,6 +677,10 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code + * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -736,7 +740,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8a6c37762330..271daad31f37 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1326,6 +1326,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4a942e2e753d..bc464b8ec91e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4357,6 +4357,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index 42d24bd64ea4..ac240d31b5bf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..9dc0deeaad2b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,6 +518,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 506efe6e8ed9..1865b0d4cdeb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} +#else +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + return -EINVAL; +} +#endif + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -551,6 +578,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_override_return: + pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", + current->comm, task_pid_nr(current)); + return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -766,6 +797,10 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* Kprobe override only works for ftrace based kprobes. 
*/ + if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index abf92e478cfb..8e3c9ec1faf7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1170,7 +1177,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1186,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1217,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1300,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1308,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 903273c93e61..adbb3f7d1fb5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,6 +253,7 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const 
char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -278,6 +279,11 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit v1.2.3 From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 7 Nov 2017 21:52:09 -0800 Subject: net: ipv6: sysctl to specify IPv6 ND traffic class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti Signed-off-by: Erik Kline Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 11 +++++++++++ net/ipv6/ndisc.c | 9 ++++++++- 5 files changed, 30 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 54410a1d4065..d8676dda7fa6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1732,6 +1732,15 @@ ndisc_notify - BOOLEAN 1 - Generate unsolicited neighbour advertisements when device is brought up or hardware address changes. +ndisc_tclass - INTEGER + The IPv6 Traffic Class to use by default when sending IPv6 Neighbor + Discovery (Router Solicitation, Router Advertisement, Neighbor + Solicitation, Neighbor Advertisement, Redirect) messages. + These 8 bits can be interpreted as 6 high order bits holding the DSCP + value and 2 low order bits representing ECN (which you probably want + to leave cleared). + 0 - (default) + mldv1_unsolicited_report_interval - INTEGER The interval in milliseconds in which the next unsolicited MLDv1 report retransmit will take place. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ea04ca024f0d..cb18c6290ca8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -73,6 +73,7 @@ struct ipv6_devconf { __u32 enhanced_dad; __u32 addr_gen_mode; __s32 disable_policy; + __s32 ndisc_tclass; struct ctl_table_header *sysctl_header; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index b22a9c4e1b12..9c0f4a92bcff 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -186,6 +186,7 @@ enum { DEVCONF_ADDR_GEN_MODE, DEVCONF_DISABLE_POLICY, DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, + DEVCONF_NDISC_TCLASS, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6233e06fa35c..a6dffd65eb9d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5059,6 +5059,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; + array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; } static inline size_t inet6_ifla6_size(void) @@ -5986,6 +5987,7 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, } static int minus_one = -1; +static const int zero = 0; static const int one = 1; static const int two_five_five = 255; @@ -6356,6 +6358,15 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = addrconf_sysctl_disable_policy, }, + { + .procname = "ndisc_tclass", + .data = &ipv6_devconf.ndisc_tclass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&two_five_five, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f9c3ffe04382..b3cea200c85e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -427,12 +427,19 @@ static void ip6_nd_hdr(struct sk_buff *skb, int hop_limit, int len) { struct ipv6hdr *hdr; + struct inet6_dev *idev; + unsigned tclass; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + tclass = idev ? 
idev->cnf.ndisc_tclass : 0; + rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, 0, 0); + ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; -- cgit v1.2.3 From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: bpf: Revert bpf_overrid_function() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. Miller --- arch/Kconfig | 3 --- arch/x86/Kconfig | 1 - arch/x86/include/asm/kprobes.h | 4 ---- arch/x86/include/asm/ptrace.h | 5 ---- arch/x86/kernel/kprobes/ftrace.c | 14 ----------- include/linux/filter.h | 3 +-- include/linux/trace_events.h | 1 - include/uapi/linux/bpf.h | 7 +----- kernel/bpf/core.c | 3 --- kernel/bpf/verifier.c | 2 -- kernel/events/core.c | 7 ------ kernel/trace/Kconfig | 11 --------- kernel/trace/bpf_trace.c | 35 --------------------------- kernel/trace/trace_kprobe.c | 40 ++++++------------------------- kernel/trace/trace_probe.h | 6 ----- samples/bpf/Makefile | 4 ---- samples/bpf/test_override_return.sh | 15 ------------ samples/bpf/tracex7_kern.c | 16 ------------- samples/bpf/tracex7_user.c | 28 ---------------------- tools/include/uapi/linux/bpf.h | 7 +----- tools/testing/selftests/bpf/bpf_helpers.h | 3 +-- 21 files changed, 11 insertions(+), 204 deletions(-) delete mode 100755 samples/bpf/test_override_return.sh delete mode 100644 samples/bpf/tracex7_kern.c delete mode 100644 samples/bpf/tracex7_user.c (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 6e8520f09bc1..057370a0ac4e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,9 +196,6 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool -config HAVE_KPROBE_OVERRIDE - bool - config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 51458c1a0b4a..2fdb23313dd5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -153,7 +153,6 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE - select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index c6c3b1f4306a..6cf65437b5e5 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,10 +67,6 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); -#ifdef CONFIG_KPROBES_ON_FTRACE -extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); -#endif - /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2370bb0149cc..c0e3c45cf6ab 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,11 +109,6 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } -static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) -{ - regs->ax = rc; -} - /* * user_mode(regs) determines whether a register set came from user * mode. 
On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 3c455bf490cb..041f7b6dfa0f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } - -asmlinkage void override_func(void); -asm( - ".type override_func, @function\n" - "override_func:\n" - " ret\n" - ".size override_func, .-override_func\n" -); - -void arch_ftrace_kprobe_override_function(struct pt_regs *regs) -{ - regs->ip = (unsigned long)&override_func; -} -NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index eaec066f99e8..0cd02ff4ae30 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -459,8 +459,7 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - kprobe_override:1; /* Do we override a kprobe? */ + dst_needed:1; /* Do we need dst entry? */ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 17e5e820a84c..84014ecfa67f 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -523,7 +523,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index adb66f78b674..e880ae6434ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,10 +677,6 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -740,8 +736,7 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 271daad31f37..8a6c37762330 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1326,9 +1326,6 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { - if (fp->kprobe_override) - return false; - if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc464b8ec91e..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4357,8 +4357,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_override_return) - prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index ac240d31b5bf..42d24bd64ea4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,13 +8171,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } - /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); - return -EINVAL; - } - if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9dc0deeaad2b..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,17 +518,6 @@ config FUNCTION_PROFILER If in doubt, say N. -config BPF_KPROBE_OVERRIDE - bool "Enable BPF programs to override a kprobed function" - depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE - depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS - default n - help - Allows BPF to override the execution of a probed function and - set a different return value. This is used for error injection. - config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1865b0d4cdeb..506efe6e8ed9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,10 +13,6 @@ #include #include #include -#include -#include - -#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -80,29 +76,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - __this_cpu_write(bpf_kprobe_override, 1); - regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); - return 0; -} -#else -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - return -EINVAL; -} -#endif - -static const struct bpf_func_proto bpf_override_return_proto = { - .func = bpf_override_return, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, -}; - BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -578,10 +551,6 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_override_return: - pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", - current->comm, task_pid_nr(current)); - return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -797,10 +766,6 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; - /* Kprobe override only works for ftrace based kprobes. 
*/ - if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) - return -EINVAL; - mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8e3c9ec1faf7..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,7 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -88,12 +87,6 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) -{ - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(&tk->rp.kp); -} - static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1177,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static int +static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1186,29 +1179,12 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call)) { - int ret; - - ret = trace_call_bpf(call, regs); - - /* - * We need to check and see if we modified the pc of the - * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the instruction skipping. Also reset our state so - * we are clean the next pass through. - */ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); - reset_current_kprobe(); - return 1; - } - if (!ret) - return 0; - } + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) + return; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return 0; + return; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1217,14 +1193,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return 0; + return; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); - return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1300,7 +1275,6 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1308,9 +1282,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - ret = kprobe_perf_func(tk, regs); + kprobe_perf_func(tk, regs); #endif - return ret; + return 0; /* We don't tweek kernel, so just return 0 */ } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index adbb3f7d1fb5..903273c93e61 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,7 +253,6 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const 
char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -279,11 +278,6 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } - -static inline int trace_kprobe_ftrace(struct trace_event_call *call) -{ - return 0; -} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 87db0f9a4c15..3b4945c1eab0 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -15,7 +15,6 @@ hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 hostprogs-y += tracex6 -hostprogs-y += tracex7 hostprogs-y += test_probe_write_user hostprogs-y += trace_output hostprogs-y += lathist @@ -62,7 +61,6 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o -tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o @@ -106,7 +104,6 @@ always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o always += tracex6_kern.o -always += tracex7_kern.o always += sock_flags_kern.o always += test_probe_write_user_kern.o always += trace_output_kern.o @@ -161,7 +158,6 @@ HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf -HOSTLOADLIBES_tracex7 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf HOSTLOADLIBES_load_sock_ops += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh deleted file mode 100755 index e68b9ee6814b..000000000000 --- a/samples/bpf/test_override_return.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -rm -f testfile.img -dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 -DEVICE=$(losetup --show -f testfile.img) -mkfs.btrfs -f $DEVICE -mkdir tmpmnt -./tracex7 $DEVICE -if [ $? -eq 0 ] -then - echo "SUCCESS!" -else - echo "FAILED!" -fi -losetup -d $DEVICE diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c deleted file mode 100644 index 1ab308a43e0f..000000000000 --- a/samples/bpf/tracex7_kern.c +++ /dev/null @@ -1,16 +0,0 @@ -#include -#include -#include -#include "bpf_helpers.h" - -SEC("kprobe/open_ctree") -int bpf_prog1(struct pt_regs *ctx) -{ - unsigned long rc = -12; - - bpf_override_return(ctx, rc); - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c deleted file mode 100644 index 8a52ac492e8b..000000000000 --- a/samples/bpf/tracex7_user.c +++ /dev/null @@ -1,28 +0,0 @@ -#define _GNU_SOURCE - -#include -#include -#include -#include "libbpf.h" -#include "bpf_load.h" - -int main(int argc, char **argv) -{ - FILE *f; - char filename[256]; - char command[256]; - int ret; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - - snprintf(command, 256, "mount %s tmpmnt/", argv[1]); - f = popen(command, "r"); - ret = pclose(f); - - return ret ? 
0 : 1; -} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index adb66f78b674..e880ae6434ee 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -677,10 +677,6 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -740,8 +736,7 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 33cb00e46c49..fd9a17fa8a8b 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -82,8 +82,7 @@ static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, unsigned int buf_size) = (void *) BPF_FUNC_perf_prog_read_value; -static int (*bpf_override_return)(void *ctx, unsigned long rc) = - (void *) BPF_FUNC_override_return; + /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3 From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:26 -0800 Subject: tcp: retire FACK loss detection FACK loss detection has been disabled by default and the successor RACK subsumed FACK and can handle reordering better. This patch removes FACK to simplify TCP loss recovery. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 3 +- include/linux/tcp.h | 1 - include/net/tcp.h | 14 +-------- include/uapi/linux/snmp.h | 1 - net/ipv4/proc.c | 1 - net/ipv4/tcp.c | 2 -- net/ipv4/tcp_input.c | 53 +++++----------------------------- net/ipv4/tcp_metrics.c | 4 +-- net/ipv4/tcp_minisocks.c | 5 +--- net/ipv4/tcp_output.c | 5 +--- 10 files changed, 12 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d8676dda7fa6..46c7e1085efc 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -289,8 +289,7 @@ tcp_ecn_fallback - BOOLEAN Default: 1 (fallback enabled) tcp_fack - BOOLEAN - Enable FACK congestion avoidance and fast retransmission. - The value is not used, if tcp_sack is not enabled. + This is a legacy option, it has no effect anymore. 
tcp_fin_timeout - INTEGER The length of time an orphaned (no longer referenced by any diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 22f40c96a15b..9574936fe041 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -85,7 +85,6 @@ struct tcp_sack_block { /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ -#define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2f2c69ad31b2..ed71511e67a6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -384,7 +384,6 @@ void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); -void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); @@ -776,7 +775,7 @@ struct tcp_skb_cb { }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ - __u8 sacked; /* State flags for SACK/FACK. */ + __u8 sacked; /* State flags for SACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ #define TCPCB_LOST 0x04 /* SKB is lost */ @@ -1066,7 +1065,6 @@ void tcp_rate_check_app_limited(struct sock *sk); * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK - * tcp_is_fack - FACK enabled, implies SACK enabled */ static inline int tcp_is_sack(const struct tcp_sock *tp) { @@ -1078,16 +1076,6 @@ static inline bool tcp_is_reno(const struct tcp_sock *tp) return !tcp_is_sack(tp); } -static inline bool tcp_is_fack(const struct tcp_sock *tp) -{ - return tp->rx_opt.sack_ok & TCP_FACK_ENABLED; -} - -static inline void tcp_enable_fack(struct tcp_sock *tp) -{ - tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 0d941cdd8e8c..33a70ece462f 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -191,7 +191,6 @@ enum LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ - LINUX_MIB_TCPFACKREORDER, /* TCPFACKReorder */ LINUX_MIB_TCPSACKREORDER, /* TCPSACKReorder */ LINUX_MIB_TCPRENOREORDER, /* TCPRenoReorder */ LINUX_MIB_TCPTSREORDER, /* TCPTSReorder */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 127153f1ed8a..9f37c4727861 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -212,7 +212,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), - SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc71a27d5ad9..337555076043 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2509,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if 
(sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9ceaa1fdc3ab..487e181cff86 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -842,18 +842,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* - * Packet counting of FACK is based on in-order assumptions, therefore TCP - * disables it when reordering is detected - */ -void tcp_disable_fack(struct tcp_sock *tp) -{ - /* RFC3517 uses different metric in lost marker => reset on change */ - if (tcp_is_fack(tp)) - tp->lost_skb_hint = NULL; - tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; -} - /* Take a notice that peer is sending D-SACKs */ static void tcp_dsack_seen(struct tcp_sock *tp) { @@ -881,7 +869,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - tcp_disable_fack(tp); } tp->rack.reord = 1; @@ -891,8 +878,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, mib_idx = LINUX_MIB_TCPTSREORDER; else if (tcp_is_reno(tp)) mib_idx = LINUX_MIB_TCPRENOREORDER; - else if (tcp_is_fack(tp)) - mib_idx = LINUX_MIB_TCPFACKREORDER; else mib_idx = LINUX_MIB_TCPSACKREORDER; @@ -970,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modification, head until snd.fack is lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. @@ -1248,7 +1232,7 @@ static u8 tcp_sacktag_one(struct sock *sk, fack_count += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (!tcp_is_fack(tp) && tp->lost_skb_hint && + if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; @@ -2051,10 +2035,6 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). * - * Instead, with FACK TCP uses fackets_out that includes both SACKed - * segments up to the highest received SACK block so far and holes in - * between them. - * * With reordering, holes may still be in flight, so RFC3517 recovery * uses pure sacked_out (total number of SACKed segments) even though * it violates the RFC that uses duplicate ACKs, often these are equal @@ -2064,10 +2044,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) */ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) { - return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; + return tp->sacked_out + 1; } -/* Linux NewReno/SACK/FACK/ECN state machine. +/* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. @@ -2132,16 +2112,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * - * FACK (Disabled by default. Subsumbed by RACK): - * It is the simplest heuristics. As soon as we decided - * that something is lost, we decide that _all_ not SACKed - * packets until the most forward SACK are lost. I.e. - * lost_out = fackets_out - sacked_out and left_out = fackets_out. 
- * It is absolutely correct estimate, if network does not reorder - * packets. And it loses any connection to reality when reordering - * takes place. We use FACK by default until reordering - * is suspected on the path to this destination. - * * If the receiver does not support SACK: * * NewReno (RFC6582): in Recovery we assume that one segment @@ -2190,7 +2160,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * For non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. @@ -2226,12 +2196,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; oldcnt = cnt; - if (tcp_is_fack(tp) || tcp_is_reno(tp) || + if (tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); if (cnt > packets) { - if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + if (tcp_is_sack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; @@ -2262,11 +2232,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) if (tcp_is_reno(tp)) { tcp_mark_head_lost(sk, 1, 1); - } else if (tcp_is_fack(tp)) { - int lost = tp->fackets_out - tp->reordering; - if (lost <= 0) - lost = 1; - tcp_mark_head_lost(sk, lost, 0); } else { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto >= 0) @@ -3199,8 +3164,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); - delta = tcp_is_fack(tp) ? 
pkts_acked : - prior_sacked - tp->sacked_out; + delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } @@ -5708,9 +5672,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(tp); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9d5ddebfd831..7097f92d16e5 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -470,10 +470,8 @@ void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); - if (val && tp->reordering != val) { - tcp_disable_fack(tp); + if (val && tp->reordering != val) tp->reordering = val; - } crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4bb86580decd..326c9282bf94 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -509,10 +509,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sock_net(sk)->ipv4.sysctl_tcp_fack) - tcp_enable_fack(newtp); - } + newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9b98d35aa0d8..094c429b4401 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1257,7 +1257,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); @@ -2961,9 +2961,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. */ void tcp_xmit_retransmit_queue(struct sock *sk) { -- cgit v1.2.3 From 737ff314563ca27f044f9a3a041e9d42491ef7ce Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:27 -0800 Subject: tcp: use sequence distance to detect reordering Replace the reordering distance measurement in packet units with a sequence-based approach. Previously it tracked the number of "packets" toward the forward ACK (i.e. the highest sacked sequence) in a state variable "fackets_out". Precisely measuring the reordering degree in packet distance has little benefit, as the degree constantly changes by factors like path, load, and congestion window. It is also complicated and prone to arcane bugs. This patch replaces it with a sequence-based approach that is much simpler. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S.
Miller --- include/linux/tcp.h | 1 - net/ipv4/tcp.c | 1 - net/ipv4/tcp_input.c | 155 +++++++++++++++++++++-------------------------- net/ipv4/tcp_minisocks.c | 1 - net/ipv4/tcp_output.c | 17 ------ 5 files changed, 68 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9574936fe041..df5d97a85e1a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -293,7 +293,6 @@ struct tcp_sock { u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ - u32 fackets_out; /* FACK'd packets */ struct hrtimer pacing_timer; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 337555076043..bf97317e6c97 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2977,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; - info->tcpi_fackets = tp->fackets_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 487e181cff86..94d729be42a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -849,39 +849,39 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rack.dsack_seen = 1; } -static void tcp_update_reordering(struct sock *sk, const int metric, - const int ts) +/* It's reordering when higher sequence was delivered (i.e. sacked) before + * some lower never-retransmitted sequence ("low_seq"). The maximum reordering + * distance is approximated in full-mss packet distance ("reordering"). + */ +static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, + const int ts) { struct tcp_sock *tp = tcp_sk(sk); - int mib_idx; + const u32 mss = tp->mss_cache; + u32 fack, metric; - if (WARN_ON_ONCE(metric < 0)) + fack = tcp_highest_sack_seq(tp); + if (!before(low_seq, fack)) return; - if (metric > tp->reordering) { - tp->reordering = min(sock_net(sk)->ipv4.sysctl_tcp_max_reordering, metric); - + metric = fack - low_seq; + if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, - tp->fackets_out, + 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif + tp->reordering = min_t(u32, (metric + mss - 1) / mss, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } tp->rack.reord = 1; - /* This exciting event is worth to be remembered. 8) */ - if (ts) - mib_idx = LINUX_MIB_TCPTSREORDER; - else if (tcp_is_reno(tp)) - mib_idx = LINUX_MIB_TCPRENOREORDER; - else - mib_idx = LINUX_MIB_TCPSACKREORDER; - - NET_INC_STATS(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), + ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out is incremented */ @@ -1097,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; + u32 reord; /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. @@ -1174,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); - int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. 
*/ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; - if (sacked & TCPCB_SACKED_ACKED) - state->reord = min(fack_count, state->reord); + if ((sacked & TCPCB_SACKED_ACKED) && + before(start_seq, state->reord)) + state->reord = start_seq; } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ @@ -1208,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk, * which was in hole. It is reordering. */ if (before(start_seq, - tcp_highest_sack_seq(tp))) - state->reord = min(fack_count, - state->reord); + tcp_highest_sack_seq(tp)) && + before(start_seq, state->reord)) + state->reord = start_seq; + if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) @@ -1229,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; tp->delivered += pcount; /* Out-of-order packets delivered */ - fack_count += pcount; - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; - - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1484,7 +1479,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } out: - state->fack_count += pcount; return prev; noop: @@ -1563,8 +1557,6 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } - - state->fack_count += tcp_skb_pcount(skb); } return skb; } @@ -1575,7 +1567,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, { struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; struct sk_buff *skb; - int unack_bytes; while (*p) { parent = *p; @@ -1588,12 +1579,6 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, p = &parent->rb_right; continue; } - - state->fack_count = 0; - unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; - if (state->mss_now && unack_bytes > 0) - state->fack_count = unack_bytes / state->mss_now; - return skb; } return NULL; @@ -1651,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int first_sack_index; state->flag = 0; - state->reord = tp->packets_out; + state->reord = tp->snd_nxt; - if (!tp->sacked_out) { - if (WARN_ON(tp->fackets_out)) - tp->fackets_out = 0; + if (!tp->sacked_out) tcp_highest_sack_reset(sk); - } found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); @@ -1729,7 +1711,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } state->mss_now = tcp_current_mss(sk); - state->fack_count = 0; skb = NULL; i = 0; @@ -1787,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; cache++; goto walk; } @@ -1802,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; } skb = tcp_sacktag_skip(skb, sk, state, start_seq); @@ -1822,9 +1801,8 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state->reord < tp->fackets_out) && - ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); + if (inet_csk(sk)->icsk_ca_state != 
TCP_CA_Loss || tp->undo_marker) + tcp_check_sack_reordering(sk, state->reord, 0); tcp_verify_left_out(tp); out: @@ -1862,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_limit_reno_sacked(tp)) - tcp_update_reordering(sk, tp->packets_out + addend, 0); + + if (!tcp_limit_reno_sacked(tp)) + return; + + tp->reordering = min_t(u32, tp->packets_out + addend, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -1909,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = -1; - tp->fackets_out = 0; tp->sacked_out = 0; } @@ -1959,7 +1941,6 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; - tp->fackets_out = 0; } tcp_clear_all_retrans_hints(tp); @@ -2026,11 +2007,6 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) return false; } -static inline int tcp_fackets_out(const struct tcp_sock *tp) -{ - return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; -} - /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). @@ -2701,15 +2677,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && tcp_packet_delayed(tp)) { /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. + * packet, rather than with a retransmit. Check reordering. */ - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + tcp_check_sack_reordering(sk, prior_snd_una, 1); /* We are getting evidence that the reordering degree is higher * than we realized. If there are no retransmits out then we @@ -2745,6 +2721,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) } } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2757,19 +2741,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, const int acked, +static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - (tcp_fackets_out(tp) > tp->reordering)); + tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (!tp->sacked_out && tp->fackets_out) - tp->fackets_out = 0; /* Now state machine starts. * A. 
ECE, hence prohibit cwnd undoing, the reduction is required. */ @@ -2816,11 +2798,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked)) + if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || - tcp_fackets_out(tp) > tp->reordering; + tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); @@ -3030,15 +3012,15 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, int *acked, +static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, + u32 prior_snd_una, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; - u32 reord = tp->packets_out; + u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; @@ -3053,6 +3035,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + const u32 start_seq = scb->seq; u8 sacked = scb->sacked; u32 acked_pcount; @@ -3083,7 +3066,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; - reord = min(pkts_acked, reord); + if (before(start_seq, reord)) + reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; } @@ -3161,15 +3145,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? 
That's reordering */ - if (reord < prior_fackets && reord <= tp->fackets_out) - tcp_update_reordering(sk, tp->fackets_out - reord, 0); + if (before(reord, prior_fack)) + tcp_check_sack_reordering(sk, reord, 0); delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } - - tp->fackets_out -= min(pkts_acked, tp->fackets_out); - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3210,7 +3191,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif - *acked = pkts_acked; return flag; } @@ -3519,12 +3499,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_fackets; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; - int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; @@ -3556,7 +3535,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; } - prior_fackets = tp->fackets_out; + prior_fack = tcp_highest_sack_seq(tp); rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet @@ -3612,8 +3591,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); tcp_rack_update_reo_wnd(sk, &rs); @@ -3625,7 +3603,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) @@ -3641,7 +3620,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. 
@@ -3663,7 +3643,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); tcp_xmit_recovery(sk, rexmit); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 326c9282bf94..e36eff0403f4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -475,7 +475,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->packets_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; - newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 094c429b4401..0256f7a41041 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1218,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) } } -/* When a modification to fackets out becomes necessary, we need to check - * skb is counted to fackets_out or not. - */ -static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, - int decr) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->sacked_out || tcp_is_reno(tp)) - return; - - if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) - tp->fackets_out -= decr; -} - /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ @@ -1253,8 +1238,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - tcp_adjust_fackets_out(sk, skb, decr); - if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) -- cgit v1.2.3 From 39b175211053c7a6a4d794c42e225994f1c069c2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 10 Nov 2017 14:03:51 -0800 Subject: net: Remove unused skb_shared_info member ip6_frag_id was only used by UFO, which has been removed. ipv6_proxy_select_ident() only existed to set ip6_frag_id and has no in-tree callers. Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 - include/net/ipv6.h | 1 - net/ipv6/output_core.c | 31 ------------------------------- 3 files changed, 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 57d712671081..54fe91183a8e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -500,7 +500,6 @@ struct skb_shared_info { struct skb_shared_hwtstamps hwtstamps; unsigned int gso_type; u32 tskey; - __be32 ip6_frag_id; /* * Warning : all fields before dataref are cleared in __alloc_skb() diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fb6d67012de6..ec14f0d5a3a1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -767,7 +767,6 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index a338bbc33cf3..4a7e5ffa5108 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,37 +31,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } -/* This function exists only for tap drivers that must support broken - * clients requesting UFO without specifying an IPv6 fragment ID. - * - * This is similar to ipv6_select_ident() but we use an independent hash - * seed to limit information leakage. - * - * The network header must be set before calling this. - */ -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) -{ - static u32 ip6_proxy_idents_hashrnd __read_mostly; - struct in6_addr buf[2]; - struct in6_addr *addrs; - u32 id; - - addrs = skb_header_pointer(skb, - skb_network_offset(skb) + - offsetof(struct ipv6hdr, saddr), - sizeof(buf), buf); - if (!addrs) - return; - - net_get_random_once(&ip6_proxy_idents_hashrnd, - sizeof(ip6_proxy_idents_hashrnd)); - - id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, - &addrs[1], &addrs[0]); - skb_shinfo(skb)->ip6_frag_id = htonl(id); -} -EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); - __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) -- cgit v1.2.3 From b24591e2fcf852ad7ad2ccf745c8220bf378d312 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 9 Nov 2017 12:35:07 +0000 Subject: timers: Add a function to start/reduce a timer Add a function, similar to mod_timer(), that will start a timer if it isn't running and will modify it if it is running and has an expiry time longer than the new time. If the timer is running with an expiry time that's the same or sooner, no change is made. The function looks like: int timer_reduce(struct timer_list *timer, unsigned long expires); This can be used by code such as networking code to make it easier to share a timer for multiple timeouts. For instance, in upcoming AF_RXRPC code, the rxrpc_call struct will maintain a number of timeouts: unsigned long ack_at; unsigned long resend_at; unsigned long ping_at; unsigned long expect_rx_by; unsigned long expect_req_by; unsigned long expect_term_by; each of which is set independently of the others. With timer reduction available, when the code needs to set one of the timeouts, it only needs to look at that timeout and then call timer_reduce() to modify the timer, starting it or bringing it forward if necessary. 
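For illustration, a minimal sketch of those semantics (hypothetical caller code, not from this patch; "deadline" is an assumed jiffies value):

	/*
	 * Make the shared timer fire no later than "deadline":
	 *  - timer not pending: it is started to expire at deadline
	 *  - timer pending with a later expiry: it is brought forward
	 *  - timer pending at or before deadline: it is left alone
	 */
	timer_reduce(&call->timer, deadline);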
There is no need to refer to the other timeouts to see which is earliest and no need to take any lock other than, potentially, the timer lock inside timer_reduce(). Note, that this does not protect against concurrent invocations of any of the timer functions. As an example, the expect_rx_by timeout above, which terminates a call if we don't get a packet from the server within a certain time window, would be set something like this: unsigned long now = jiffies; unsigned long expect_rx_by = now + packet_receive_timeout; WRITE_ONCE(call->expect_rx_by, expect_rx_by); timer_reduce(&call->timer, expect_rx_by); The timer service code (which might, say, be in a work function) would then check all the timeouts to see which, if any, had triggered, deal with those: t = READ_ONCE(call->ack_at); if (time_after_eq(now, t)) { cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET); set_bit(RXRPC_CALL_EV_ACK, &call->events); } and then restart the timer if necessary by finding the soonest timeout that hasn't yet passed and then calling timer_reduce(). The disadvantage of doing things this way rather than comparing the timers each time and calling mod_timer() is that you *will* take timer events unless you can finish what you're doing and delete the timer in time. The advantage of doing things this way is that you don't need to use a lock to work out when the next timer should be set, other than the timer's own lock - which you might not have to take. [ tglx: Fixed weird formatting and adopted it to pending changes ] Signed-off-by: David Howells Signed-off-by: Thomas Gleixner Cc: keyrings@vger.kernel.org Cc: linux-afs@lists.infradead.org Link: https://lkml.kernel.org/r/151023090769.23050.1801643667223880753.stgit@warthog.procyon.org.uk --- include/linux/timer.h | 1 + kernel/time/timer.c | 45 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 9f8895decb82..37b5e2f74d21 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -203,6 +203,7 @@ extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); +extern int timer_reduce(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer diff --git a/kernel/time/timer.c b/kernel/time/timer.c index fbb1f85327bf..af0b8bae4502 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -929,8 +929,11 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, } } +#define MOD_TIMER_PENDING_ONLY 0x01 +#define MOD_TIMER_REDUCE 0x02 + static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) +__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; @@ -950,7 +953,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * larger granularity than you would get from adding a new * timer with this expiry. 
*/ - if (timer->expires == expires) + long diff = timer->expires - expires; + + if (!diff) + return 1; + if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* @@ -962,6 +969,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) base = lock_timer_base(timer, &flags); forward_timer_base(base); + if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && + time_before_eq(timer->expires, expires)) { + ret = 1; + goto out_unlock; + } + clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -971,7 +984,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { - timer->expires = expires; + if (!(options & MOD_TIMER_REDUCE)) + timer->expires = expires; + else if (time_after(timer->expires, expires)) + timer->expires = expires; ret = 1; goto out_unlock; } @@ -981,7 +997,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } ret = detach_if_pending(timer, base, false); - if (!ret && pending_only) + if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; debug_activate(timer, expires); @@ -1042,7 +1058,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true); + return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); @@ -1068,10 +1084,25 @@ EXPORT_SYMBOL(mod_timer_pending); */ int mod_timer(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, false); + return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); +/** + * timer_reduce - Modify a timer's timeout if it would reduce the timeout + * @timer: The timer to be modified + * @expires: New timeout in jiffies + * + * timer_reduce() is very similar to mod_timer(), except that it will only + * modify a running timer if that would reduce the expiration time (it will + * start a timer that isn't running). + */ +int timer_reduce(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, MOD_TIMER_REDUCE); +} +EXPORT_SYMBOL(timer_reduce); + /** * add_timer - start a timer * @timer: the timer to be added @@ -1754,7 +1785,7 @@ signed long __sched schedule_timeout(signed long timeout) timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, false); + __mod_timer(&timer.timer, expire, 0); schedule(); del_singleshot_timer_sync(&timer.timer); -- cgit v1.2.3 From 516fb7f2e73dcc303fb97fc3593209fcacf2d982 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Nov 2017 18:44:23 -0800 Subject: /proc/module: use the same logic as /proc/kallsyms for address exposure The (alleged) users of the module addresses are the same: kernel profiling. So just expose the same helper and format macros, and unify the logic. Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 8 ++++++++ kernel/kallsyms.c | 8 +------- kernel/module.c | 20 ++++++++++++++++++-- 3 files changed, 27 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 11dd93e42580..0a777c5216b1 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -14,6 +14,14 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) +/* How and when do we show kallsyms values? 
*/ +extern int kallsyms_show_value(void); +#ifndef CONFIG_64BIT +# define KALLSYM_FMT "%08lx" +#else +# define KALLSYM_FMT "%016lx" +#endif + struct module; #ifdef CONFIG_KALLSYMS diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 51b49ed452e4..1e6ae66c6244 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -581,12 +581,6 @@ static void s_stop(struct seq_file *m, void *p) { } -#ifndef CONFIG_64BIT -# define KALLSYM_FMT "%08lx" -#else -# define KALLSYM_FMT "%016lx" -#endif - static int s_show(struct seq_file *m, void *p) { unsigned long value; @@ -640,7 +634,7 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ -static int kallsyms_show_value(void) +int kallsyms_show_value(void) { switch (kptr_restrict) { case 0: diff --git a/kernel/module.c b/kernel/module.c index fdb3a6aca363..0122747ba150 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4147,6 +4147,7 @@ static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); char buf[MODULE_FLAGS_BUF_SIZE]; + unsigned long value; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) @@ -4162,7 +4163,8 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->core_layout.base); + value = m->private ? 0 : (unsigned long)mod->core_layout.base; + seq_printf(m, " 0x" KALLSYM_FMT, value); /* Taints info */ if (mod->taints) @@ -4184,9 +4186,23 @@ static const struct seq_operations modules_op = { .show = m_show }; +/* + * This also sets the "private" pointer to non-NULL if the + * kernel pointers should be hidden (so you can just test + * "m->private" to see if you should keep the values private). + * + * We use the same logic as for /proc/kallsyms. + */ static int modules_open(struct inode *inode, struct file *file) { - return seq_open(file, &modules_op); + int err = seq_open(file, &modules_op); + + if (!err) { + struct seq_file *m = file->private_data; + m->private = kallsyms_show_value() ? NULL : (void *)8ul; + } + + return 0; } static const struct file_operations proc_modules_operations = { -- cgit v1.2.3 From fcdfafcb73be8fa45909327bbddca46fb362a675 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 13 Nov 2017 07:59:10 +0100 Subject: kprobes: Don't spam the build log with deprecation warnings The jprobes APIs are deprecated - but are still in occasional use for code that few people seem to care about, so stop generating deprecation warnings. 
Cc: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 56b2e698dbad..9440a2fc8893 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -468,18 +468,18 @@ static inline int enable_kprobe(struct kprobe *kp) return -ENOSYS; } #endif /* CONFIG_KPROBES */ -static inline int __deprecated register_jprobe(struct jprobe *p) +static inline int register_jprobe(struct jprobe *p) { return -ENOSYS; } -static inline int __deprecated register_jprobes(struct jprobe **jps, int num) +static inline int register_jprobes(struct jprobe **jps, int num) { return -ENOSYS; } -static inline void __deprecated unregister_jprobe(struct jprobe *p) +static inline void unregister_jprobe(struct jprobe *p) { } -static inline void __deprecated unregister_jprobes(struct jprobe **jps, int num) +static inline void unregister_jprobes(struct jprobe **jps, int num) { } static inline int disable_kretprobe(struct kretprobe *rp) @@ -490,11 +490,11 @@ static inline int enable_kretprobe(struct kretprobe *rp) { return enable_kprobe(&rp->kp); } -static inline int __deprecated disable_jprobe(struct jprobe *jp) +static inline int disable_jprobe(struct jprobe *jp) { return -ENOSYS; } -static inline int __deprecated enable_jprobe(struct jprobe *jp) +static inline int enable_jprobe(struct jprobe *jp) { return -ENOSYS; } -- cgit v1.2.3 From f4c09f87adfe31587aa4b2aea2cb2dbde2150f54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Nov 2017 09:39:01 +0100 Subject: cpu/hotplug: Get rid of CPU hotplug notifier leftovers The CPU hotplug notifiers are history. Remove the last reminders. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- .../fault-injection/notifier-error-inject.txt | 30 ---------------------- Documentation/power/suspend-and-cpuhotplug.txt | 9 +++---- include/linux/cpu.h | 27 ++++++++----------- 3 files changed, 14 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/Documentation/fault-injection/notifier-error-inject.txt b/Documentation/fault-injection/notifier-error-inject.txt index 83d3f4e43e91..e861d761de24 100644 --- a/Documentation/fault-injection/notifier-error-inject.txt +++ b/Documentation/fault-injection/notifier-error-inject.txt @@ -6,41 +6,11 @@ specified notifier chain callbacks. It is useful to test the error handling of notifier call chain failures which is rarely executed. There are kernel modules that can be used to test the following notifiers. - * CPU notifier * PM notifier * Memory hotplug notifier * powerpc pSeries reconfig notifier * Netdevice notifier -CPU notifier error injection module ------------------------------------ -This feature can be used to test the error handling of the CPU notifiers by -injecting artificial errors to CPU notifier chain callbacks. 
- -If the notifier call chain should be failed with some events notified, write -the error code to debugfs interface -/sys/kernel/debug/notifier-error-inject/cpu/actions//error - -Possible CPU notifier events to be failed are: - - * CPU_UP_PREPARE - * CPU_UP_PREPARE_FROZEN - * CPU_DOWN_PREPARE - * CPU_DOWN_PREPARE_FROZEN - -Example1: Inject CPU offline error (-1 == -EPERM) - - # cd /sys/kernel/debug/notifier-error-inject/cpu - # echo -1 > actions/CPU_DOWN_PREPARE/error - # echo 0 > /sys/devices/system/cpu/cpu1/online - bash: echo: write error: Operation not permitted - -Example2: inject CPU online error (-2 == -ENOENT) - - # echo -2 > actions/CPU_UP_PREPARE/error - # echo 1 > /sys/devices/system/cpu/cpu1/online - bash: echo: write error: No such file or directory - PM notifier error injection module ---------------------------------- This feature is controlled through debugfs interface diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt index 2fc909502db5..31abd04b9572 100644 --- a/Documentation/power/suspend-and-cpuhotplug.txt +++ b/Documentation/power/suspend-and-cpuhotplug.txt @@ -232,7 +232,7 @@ d. Handling microcode update during suspend/hibernate: hibernate/restore cycle.] In the current design of the kernel however, during a CPU offline operation - as part of the suspend/hibernate cycle (the CPU_DEAD_FROZEN notification), + as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set), the existing copy of microcode image in the kernel is not freed up. And during the CPU online operations (during resume/restore), since the kernel finds that it already has copies of the microcode images for all the @@ -252,10 +252,9 @@ Yes, they are listed below: the _cpu_down() and _cpu_up() functions is *always* 0. This might not reflect the true current state of the system, since the tasks could have been frozen by an out-of-band event such as a suspend - operation in progress. Hence, it will lead to wrong notifications being - sent during the cpu online/offline events (eg, CPU_ONLINE notification - instead of CPU_ONLINE_FROZEN) which in turn will lead to execution of - inappropriate code by the callbacks registered for such CPU hotplug events. + operation in progress. Hence, the cpuhp_tasks_frozen variable will not + reflect the frozen state and the CPU hotplug callbacks which evaluate + that variable might execute the wrong code path. 2. If a regular CPU hotplug stress test happens to race with the freezer due to a suspend operation in progress at the same time, then we could hit the diff --git a/include/linux/cpu.h b/include/linux/cpu.h index cd4771b772c0..b6e4a598b2cd 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -55,24 +55,17 @@ extern void unregister_cpu(struct cpu *cpu); extern ssize_t arch_cpu_probe(const char *, size_t); extern ssize_t arch_cpu_release(const char *, size_t); #endif -struct notifier_block; - -#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ -#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ -#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ -#define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug - * lock is dropped */ -#define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly, - * perhaps due to preemption. 
*/ - -/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend - * operation in progress - */ -#define CPU_TASKS_FROZEN 0x0010 -#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN) -#define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN) -#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) +/* + * These states are not related to the core CPU hotplug mechanism. They are + * used by various (sub)architectures to track internal state + */ +#define CPU_ONLINE 0x0002 /* CPU is up */ +#define CPU_UP_PREPARE 0x0003 /* CPU coming up */ +#define CPU_DEAD 0x0007 /* CPU dead */ +#define CPU_DEAD_FROZEN 0x0008 /* CPU timed out on unplug */ +#define CPU_POST_DEAD 0x0009 /* CPU successfully unplugged */ +#define CPU_BROKEN 0x000B /* CPU did not die properly */ #ifdef CONFIG_SMP extern bool cpuhp_tasks_frozen; -- cgit v1.2.3 From 5e4def20381678ba3ce0a4e117f97e378ecd81bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Nov 2017 15:27:44 +0000 Subject: Pass mode to wait_on_atomic_t() action funcs and provide default actions Make wait_on_atomic_t() pass the TASK_* mode onto its action function as an extra argument and make it 'unsigned int' throughout. Also, consolidate a bunch of identical action functions into a default function that can do the appropriate thing for the mode. Also, change the argument name in the bit_wait*() function declarations to reflect the fact that it's the mode and not the bit number. [Peter Z gives this a grudging ACK, but thinks that the whole atomic_t wait should be done differently, though he's not immediately sure as to how] Signed-off-by: David Howells Acked-by: Peter Zijlstra cc: Ingo Molnar --- arch/mips/kernel/traps.c | 14 +---------- drivers/gpu/drm/drm_dp_aux_dev.c | 8 +------ drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c | 10 ++------ drivers/media/platform/qcom/venus/hfi.c | 8 +------ fs/afs/rxrpc.c | 8 +------ fs/btrfs/extent-tree.c | 27 +++------------------- fs/fscache/cookie.c | 2 +- fs/fscache/internal.h | 2 -- fs/fscache/main.c | 9 -------- fs/nfs/inode.c | 4 ++-- fs/nfs/internal.h | 2 +- fs/ocfs2/filecheck.c | 8 +------ include/linux/wait_bit.h | 15 +++++++----- kernel/sched/wait_bit.c | 18 +++++++++++---- 14 files changed, 37 insertions(+), 98 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 5669d3b8bd38..5d19ed07e99d 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1233,18 +1233,6 @@ static int default_cu2_call(struct notifier_block *nfb, unsigned long action, return NOTIFY_OK; } -static int wait_on_fp_mode_switch(atomic_t *p) -{ - /* - * The FP mode for this task is currently being switched. That may - * involve modifications to the format of this tasks FP context which - * make it unsafe to proceed with execution for the moment. Instead, - * schedule some other task. - */ - schedule(); - return 0; -} - static int enable_restore_fp_context(int msa) { int err, was_fpu_owner, prior_msa; @@ -1254,7 +1242,7 @@ static int enable_restore_fp_context(int msa) * complete before proceeding. */ wait_on_atomic_t(&current->mm->context.fp_mode_switching, - wait_on_fp_mode_switch, TASK_KILLABLE); + atomic_t_wait, TASK_KILLABLE); if (!used_math()) { /* First time FP context user.
*/ diff --git a/drivers/gpu/drm/drm_dp_aux_dev.c b/drivers/gpu/drm/drm_dp_aux_dev.c index d34e5096887a..053044201e31 100644 --- a/drivers/gpu/drm/drm_dp_aux_dev.c +++ b/drivers/gpu/drm/drm_dp_aux_dev.c @@ -263,12 +263,6 @@ static struct drm_dp_aux_dev *drm_dp_aux_dev_get_by_aux(struct drm_dp_aux *aux) return aux_dev; } -static int auxdev_wait_atomic_t(atomic_t *p) -{ - schedule(); - return 0; -} - void drm_dp_aux_unregister_devnode(struct drm_dp_aux *aux) { struct drm_dp_aux_dev *aux_dev; @@ -283,7 +277,7 @@ void drm_dp_aux_unregister_devnode(struct drm_dp_aux *aux) mutex_unlock(&aux_idr_mutex); atomic_dec(&aux_dev->usecount); - wait_on_atomic_t(&aux_dev->usecount, auxdev_wait_atomic_t, + wait_on_atomic_t(&aux_dev->usecount, atomic_t_wait, TASK_UNINTERRUPTIBLE); minor = aux_dev->index; diff --git a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c index 828904b7d468..54fc571b1102 100644 --- a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c @@ -271,13 +271,7 @@ struct igt_wakeup { u32 seqno; }; -static int wait_atomic(atomic_t *p) -{ - schedule(); - return 0; -} - -static int wait_atomic_timeout(atomic_t *p) +static int wait_atomic_timeout(atomic_t *p, unsigned int mode) { return schedule_timeout(10 * HZ) ? 0 : -ETIMEDOUT; } @@ -348,7 +342,7 @@ static void igt_wake_all_sync(atomic_t *ready, atomic_set(ready, 0); wake_up_all(wq); - wait_on_atomic_t(set, wait_atomic, TASK_UNINTERRUPTIBLE); + wait_on_atomic_t(set, atomic_t_wait, TASK_UNINTERRUPTIBLE); atomic_set(ready, count); atomic_set(done, count); } diff --git a/drivers/media/platform/qcom/venus/hfi.c b/drivers/media/platform/qcom/venus/hfi.c index c09490876516..e374c7d1a618 100644 --- a/drivers/media/platform/qcom/venus/hfi.c +++ b/drivers/media/platform/qcom/venus/hfi.c @@ -88,12 +88,6 @@ unlock: return ret; } -static int core_deinit_wait_atomic_t(atomic_t *p) -{ - schedule(); - return 0; -} - int hfi_core_deinit(struct venus_core *core, bool blocking) { int ret = 0, empty; @@ -112,7 +106,7 @@ int hfi_core_deinit(struct venus_core *core, bool blocking) if (!empty) { mutex_unlock(&core->lock); - wait_on_atomic_t(&core->insts_count, core_deinit_wait_atomic_t, + wait_on_atomic_t(&core->insts_count, atomic_t_wait, TASK_UNINTERRUPTIBLE); mutex_lock(&core->lock); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index bb1e2caa1720..77f5420a1a24 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -41,12 +41,6 @@ static void afs_charge_preallocation(struct work_struct *); static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); -static int afs_wait_atomic_t(atomic_t *p) -{ - schedule(); - return 0; -} - /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT @@ -121,7 +115,7 @@ void afs_close_socket(void) } _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); - wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, + wait_on_atomic_t(&afs_outstanding_calls, atomic_t_wait, TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e2d7e86b51d1..24cefde30e30 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4016,16 +4016,9 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) btrfs_put_block_group(bg); } -static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void 
btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) { - wait_on_atomic_t(&bg->nocow_writers, - btrfs_wait_nocow_writers_atomic_t, + wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait, TASK_UNINTERRUPTIBLE); } @@ -6595,12 +6588,6 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, btrfs_put_block_group(bg); } -static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) { struct btrfs_space_info *space_info = bg->space_info; @@ -6623,8 +6610,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) down_write(&space_info->groups_sem); up_write(&space_info->groups_sem); - wait_on_atomic_t(&bg->reservations, - btrfs_wait_bg_reservations_atomic_t, + wait_on_atomic_t(&bg->reservations, atomic_t_wait, TASK_UNINTERRUPTIBLE); } @@ -11106,12 +11092,6 @@ int btrfs_start_write_no_snapshotting(struct btrfs_root *root) return 1; } -static int wait_snapshotting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) { while (true) { @@ -11120,8 +11100,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) ret = btrfs_start_write_no_snapshotting(root); if (ret) break; - wait_on_atomic_t(&root->will_be_snapshotted, - wait_snapshotting_atomic_t, + wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait, TASK_UNINTERRUPTIBLE); } } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 40d61077bead..ff84258132bb 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -558,7 +558,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) * have completed. */ if (!atomic_dec_and_test(&cookie->n_active)) - wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, + wait_on_atomic_t(&cookie->n_active, atomic_t_wait, TASK_UNINTERRUPTIBLE); /* Make sure any pending writes are cancelled. 
*/ diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 97ec45110957..0ff4b49a0037 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void) return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); } -extern int fscache_wait_atomic_t(atomic_t *); - /* * object.c */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c index b39d487ccfb0..249968dcbf5c 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -195,12 +195,3 @@ static void __exit fscache_exit(void) } module_exit(fscache_exit); - -/* - * wait_on_atomic_t() sleep function for uninterruptible waiting - */ -int fscache_wait_atomic_t(atomic_t *p) -{ - schedule(); - return 0; -} diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 134d9f560240..1629056aa2c9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -85,9 +85,9 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); -int nfs_wait_atomic_killable(atomic_t *p) +int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode) { - return nfs_wait_killable(TASK_KILLABLE); + return nfs_wait_killable(mode); } /** diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f9a4a5524bd5..5ab17fd4700a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -388,7 +388,7 @@ extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); -extern int nfs_wait_atomic_killable(atomic_t *p); +extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 2cabbcf2f28e..e87279e49ba3 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -129,19 +129,13 @@ static struct kobj_attribute ocfs2_attr_filecheck_set = ocfs2_filecheck_show, ocfs2_filecheck_store); -static int ocfs2_filecheck_sysfs_wait(atomic_t *p) -{ - schedule(); - return 0; -} - static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; if (!atomic_dec_and_test(&entry->fs_count)) - wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, + wait_on_atomic_t(&entry->fs_count, atomic_t_wait, TASK_UNINTERRUPTIBLE); spin_lock(&entry->fs_fcheck->fc_lock); diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index af0d495430d7..61b39eaf7cad 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -26,6 +26,8 @@ struct wait_bit_queue_entry { { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, } typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); +typedef int wait_atomic_t_action_f(atomic_t *counter, unsigned int mode); + void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); @@ -34,7 +36,7 @@ void wake_up_atomic_t(atomic_t *p); int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, 
unsigned int mode); -int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode); +int out_of_line_wait_on_atomic_t(atomic_t *p, wait_atomic_t_action_f action, unsigned int mode); struct wait_queue_head *bit_waitqueue(void *word, int bit); extern void __init wait_bit_init(void); @@ -51,10 +53,11 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync }, \ } -extern int bit_wait(struct wait_bit_key *key, int bit); -extern int bit_wait_io(struct wait_bit_key *key, int bit); -extern int bit_wait_timeout(struct wait_bit_key *key, int bit); -extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit); +extern int bit_wait(struct wait_bit_key *key, int mode); +extern int bit_wait_io(struct wait_bit_key *key, int mode); +extern int bit_wait_timeout(struct wait_bit_key *key, int mode); +extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode); +extern int atomic_t_wait(atomic_t *counter, unsigned int mode); /** * wait_on_bit - wait for a bit to be cleared @@ -251,7 +254,7 @@ wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, * outside of the target 'word'. */ static inline -int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) +int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode) { might_sleep(); if (atomic_read(val) == 0) diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index f8159698aa4d..84cb3acd9260 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo */ static __sched int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - int (*action)(atomic_t *), unsigned mode) + wait_atomic_t_action_f action, unsigned int mode) { atomic_t *val; int ret = 0; @@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en val = wbq_entry->key.flags; if (atomic_read(val) == 0) break; - ret = (*action)(val); + ret = (*action)(val, mode); } while (!ret && atomic_read(val) != 0); finish_wait(wq_head, &wbq_entry->wq_entry); return ret; @@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en }, \ } -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, + wait_atomic_t_action_f action, + unsigned int mode) { struct wait_queue_head *wq_head = atomic_t_waitqueue(p); DEFINE_WAIT_ATOMIC_T(wq_entry, p); @@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), } EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); +__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) +{ + schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(atomic_t_wait); + /** * wake_up_atomic_t - Wake up a waiter on a atomic_t * @p: The atomic_t being waited on, a kernel virtual address -- cgit v1.2.3 From 47f9d0bf526058b3fdc077698fcb19748d5700e4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Nov 2017 16:21:33 +0000 Subject: irqchip/gic-v4: Add forward definition of struct irq_domain_ops In some randconfig scenarios, including arm-gic-v4.h results in a spurious warning about the $SUBJECT structure not being defined. Adding a forward definition keeps it quiet.
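As a sketch of why this is sufficient (illustration, not part of the patch): the header only refers to the structure through a pointer, so an incomplete type is all the compiler needs:

	struct irq_domain_ops;	/* forward declaration, layout not required */

	/* OK: only a pointer to the incomplete type appears here. */
	int its_init_v4(struct irq_domain *domain, const struct irq_domain_ops *ops);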
Reported-by: Arnd Bergmann Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 447da8ca2156..fa683ea5c769 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -109,6 +109,7 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map); int its_unmap_vlpi(int irq); int its_prop_update_vlpi(int irq, u8 config, bool inv); +struct irq_domain_ops; int its_init_v4(struct irq_domain *domain, const struct irq_domain_ops *ops); #endif -- cgit v1.2.3 From 8a7a8e1eab929eb3a5b735a788a23b9731139046 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Mon, 13 Nov 2017 13:49:04 +0800 Subject: timekeeping: Eliminate the stale declaration of ktime_get_raw_and_real_ts64() Commit ba26621e63ce got rid of ktime_get_raw_and_real_ts64(), but left its declaration behind. Remove it. Fixes: ba26621e63ce ("time: Remove duplicated code in ktime_get_raw_and_real()") Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: Christopher S. Hall Cc: joelaf@google.com Cc: arnd@arndb.de Cc: gregkh@linuxfoundation.org Cc: john.stultz@linaro.org Cc: deepa.kernel@gmail.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1510552144-20831-1-git-send-email-douly.fnst@cn.fujitsu.com --- include/linux/timekeeping.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 0021575fe871..51293e1aa4da 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -272,12 +272,6 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(struct timespec64 *delta); -/* - * PPS accessor - */ -extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, - struct timespec64 *ts_real); - /* * struct system_time_snapshot - simultaneous raw/real time capture with * counter value -- cgit v1.2.3 From 01c313dded34a16ef69e3972ceca687ba8a7cdf2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Nov 2017 17:50:59 +0100 Subject: kallsyms: fix building without printk Building kallsyms fails without CONFIG_PRINTK due to a missing declaration: kernel/kallsyms.c: In function 'kallsyms_show_value': kernel/kallsyms.c:670:10: error: 'kptr_restrict' undeclared (first use in this function); did you mean 'keyring_restrict'? This moves the declaration outside of the #ifdef guard, the definition is already available without CONFIG_PRINTK. 
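The failure mode is easy to reproduce in miniature; here is a hedged sketch (simplified, not the kernel's actual headers):

    /* sketch of the breakage, for illustration only */
    #ifdef CONFIG_PRINTK
    extern int kptr_restrict;    /* declaration hidden behind the option */
    #endif

    int kallsyms_show_value(void)
    {
        return kptr_restrict == 0;    /* 'kptr_restrict' undeclared when
                                       * CONFIG_PRINTK is unset */
    }

Hoisting the extern declaration out of the #ifdef block, as the patch below does, fixes the build without changing any generated code, precisely because the variable's definition is compiled in unconditionally.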
Fixes: c0f3ea158939 ("stop using '%pK' for /proc/kallsyms pointer values") Signed-off-by: Arnd Bergmann [ I clearly need to start doing "allnoconfig" builds too, or just have a test branch for the 0day robot - Linus ] Signed-off-by: Linus Torvalds --- include/linux/printk.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 335926039adc..905bba92f015 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -189,7 +189,6 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, extern int printk_delay_msec; extern int dmesg_restrict; -extern int kptr_restrict; extern int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, @@ -280,6 +279,8 @@ static inline void printk_safe_flush_on_panic(void) } #endif +extern int kptr_restrict; + extern asmlinkage void dump_stack(void) __cold; #ifndef pr_fmt -- cgit v1.2.3 From e4a8ca3baa5557fa54557d42b5910ed0d3316922 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Nov 2017 17:51:00 +0100 Subject: /proc/module: fix building without kallsyms As reported by kernelci and other build bots, we now get a link failure without CONFIG_KALLSYMS: module.c:(.text+0xf2c): undefined reference to `kallsyms_show_value' This adds a dummy helper with the same name that can be used for compilation. It's not entirely clear to me what this should return for !CONFIG_KALLSYMS; I picked an unconditional 'false', which leads to the module address being unavailable to user space. Link: https://kernelci.org/build/mainline/branch/master/kernel/v4.14-5-g516fb7f2e73d/ Fixes: 516fb7f2e73d ("/proc/module: use the same logic as /proc/kallsyms for address exposure") Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 0a777c5216b1..708f337d780b 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -14,8 +14,6 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) -/* How and when do we show kallsyms values? */ -extern int kallsyms_show_value(void); #ifndef CONFIG_64BIT # define KALLSYM_FMT "%08lx" #else @@ -54,6 +52,9 @@ extern void __print_symbol(const char *fmt, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); +/* How and when do we show kallsyms values? */ +extern int kallsyms_show_value(void); + #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) @@ -112,6 +113,11 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u return -ERANGE; } +static inline int kallsyms_show_value(void) +{ + return false; +} + /* Stupid that this does nothing, but I didn't create this mess. */ #define __print_symbol(fmt, addr) #endif /*CONFIG_KALLSYMS*/ -- cgit v1.2.3 From 9a5941080ef29f1a0347ac2766e4d93312123b21 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 7 Nov 2017 22:29:48 +0100 Subject: mtd: remove the get_unmapped_area method It is now unused.
Signed-off-by: Nicolas Pitre Reviewed-by: Richard Weinberger Acked-by: Boris Brezillon Tested-by: Chris Brandt Signed-off-by: Richard Weinberger --- drivers/mtd/chips/map_ram.c | 18 ------------------ drivers/mtd/chips/map_rom.c | 17 ----------------- drivers/mtd/devices/mtdram.c | 14 -------------- drivers/mtd/mtdconcat.c | 27 --------------------------- drivers/mtd/mtdpart.c | 14 -------------- include/linux/mtd/mtd.h | 4 ---- 6 files changed, 94 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/chips/map_ram.c b/drivers/mtd/chips/map_ram.c index c3939dd230c6..1cd0fff0e940 100644 --- a/drivers/mtd/chips/map_ram.c +++ b/drivers/mtd/chips/map_ram.c @@ -20,8 +20,6 @@ static int mapram_write (struct mtd_info *, loff_t, size_t, size_t *, const u_ch static int mapram_erase (struct mtd_info *, struct erase_info *); static void mapram_nop (struct mtd_info *); static struct mtd_info *map_ram_probe(struct map_info *map); -static unsigned long mapram_unmapped_area(struct mtd_info *, unsigned long, - unsigned long, unsigned long); static int mapram_point (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys); static int mapram_unpoint(struct mtd_info *mtd, loff_t from, size_t len); @@ -68,7 +66,6 @@ static struct mtd_info *map_ram_probe(struct map_info *map) mtd->type = MTD_RAM; mtd->size = map->size; mtd->_erase = mapram_erase; - mtd->_get_unmapped_area = mapram_unmapped_area; mtd->_read = mapram_read; mtd->_write = mapram_write; mtd->_panic_write = mapram_write; @@ -86,21 +83,6 @@ static struct mtd_info *map_ram_probe(struct map_info *map) return mtd; } - -/* - * Allow NOMMU mmap() to directly map the device (if not NULL) - * - return the address to which the offset maps - * - return -ENOSYS to indicate refusal to do the mapping - */ -static unsigned long mapram_unmapped_area(struct mtd_info *mtd, - unsigned long len, - unsigned long offset, - unsigned long flags) -{ - struct map_info *map = mtd->priv; - return (unsigned long) map->virt + offset; -} - static int mapram_point(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys) { diff --git a/drivers/mtd/chips/map_rom.c b/drivers/mtd/chips/map_rom.c index 72934c1fbbe0..20e3604b4d71 100644 --- a/drivers/mtd/chips/map_rom.c +++ b/drivers/mtd/chips/map_rom.c @@ -20,8 +20,6 @@ static int maprom_write (struct mtd_info *, loff_t, size_t, size_t *, const u_ch static void maprom_nop (struct mtd_info *); static struct mtd_info *map_rom_probe(struct map_info *map); static int maprom_erase (struct mtd_info *mtd, struct erase_info *info); -static unsigned long maprom_unmapped_area(struct mtd_info *, unsigned long, - unsigned long, unsigned long); static int maprom_point (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys); static int maprom_unpoint(struct mtd_info *mtd, loff_t from, size_t len); @@ -55,7 +53,6 @@ static struct mtd_info *map_rom_probe(struct map_info *map) mtd->name = map->name; mtd->type = MTD_ROM; mtd->size = map->size; - mtd->_get_unmapped_area = maprom_unmapped_area; mtd->_point = maprom_point; mtd->_unpoint = maprom_unpoint; mtd->_read = maprom_read; @@ -72,20 +69,6 @@ static struct mtd_info *map_rom_probe(struct map_info *map) } -/* - * Allow NOMMU mmap() to directly map the device (if not NULL) - * - return the address to which the offset maps - * - return -ENOSYS to indicate refusal to do the mapping - */ -static unsigned long maprom_unmapped_area(struct mtd_info *mtd, - unsigned long 
len, - unsigned long offset, - unsigned long flags) -{ - struct map_info *map = mtd->priv; - return (unsigned long) map->virt + offset; -} - static int maprom_point(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys) { diff --git a/drivers/mtd/devices/mtdram.c b/drivers/mtd/devices/mtdram.c index 4418629e8dc0..0bf4aeaf0cb8 100644 --- a/drivers/mtd/devices/mtdram.c +++ b/drivers/mtd/devices/mtdram.c @@ -99,19 +99,6 @@ static int ram_unpoint(struct mtd_info *mtd, loff_t from, size_t len) return 0; } -/* - * Allow NOMMU mmap() to directly map the device (if not NULL) - * - return the address to which the offset maps - * - return -ENOSYS to indicate refusal to do the mapping - */ -static unsigned long ram_get_unmapped_area(struct mtd_info *mtd, - unsigned long len, - unsigned long offset, - unsigned long flags) -{ - return (unsigned long) mtd->priv + offset; -} - static int ram_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { @@ -156,7 +143,6 @@ int mtdram_init_device(struct mtd_info *mtd, void *mapped_address, mtd->_erase = ram_erase; mtd->_point = ram_point; mtd->_unpoint = ram_unpoint; - mtd->_get_unmapped_area = ram_get_unmapped_area; mtd->_read = ram_read; mtd->_write = ram_write; diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index d573606b91c2..60bf53df5454 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -643,32 +643,6 @@ static int concat_block_markbad(struct mtd_info *mtd, loff_t ofs) return err; } -/* - * try to support NOMMU mmaps on concatenated devices - * - we don't support subdev spanning as we can't guarantee it'll work - */ -static unsigned long concat_get_unmapped_area(struct mtd_info *mtd, - unsigned long len, - unsigned long offset, - unsigned long flags) -{ - struct mtd_concat *concat = CONCAT(mtd); - int i; - - for (i = 0; i < concat->num_subdev; i++) { - struct mtd_info *subdev = concat->subdev[i]; - - if (offset >= subdev->size) { - offset -= subdev->size; - continue; - } - - return mtd_get_unmapped_area(subdev, len, offset, flags); - } - - return (unsigned long) -ENOSYS; -} - /* * This function constructs a virtual MTD device by concatenating * num_devs MTD devices. 
A pointer to the new device object is @@ -790,7 +764,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd._unlock = concat_unlock; concat->mtd._suspend = concat_suspend; concat->mtd._resume = concat_resume; - concat->mtd._get_unmapped_area = concat_get_unmapped_area; /* * Combine the erase block size info of the subdevices: diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index a308e707392d..be088bccd593 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -101,18 +101,6 @@ static int part_unpoint(struct mtd_info *mtd, loff_t from, size_t len) return part->parent->_unpoint(part->parent, from + part->offset, len); } -static unsigned long part_get_unmapped_area(struct mtd_info *mtd, - unsigned long len, - unsigned long offset, - unsigned long flags) -{ - struct mtd_part *part = mtd_to_part(mtd); - - offset += part->offset; - return part->parent->_get_unmapped_area(part->parent, len, offset, - flags); -} - static int part_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { @@ -458,8 +446,6 @@ static struct mtd_part *allocate_partition(struct mtd_info *parent, slave->mtd._unpoint = part_unpoint; } - if (parent->_get_unmapped_area) - slave->mtd._get_unmapped_area = part_get_unmapped_area; if (parent->_read_oob) slave->mtd._read_oob = part_read_oob; if (parent->_write_oob) diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 849543f1f233..cd55bf14ad51 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -297,10 +297,6 @@ struct mtd_info { int (*_point) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys); int (*_unpoint) (struct mtd_info *mtd, loff_t from, size_t len); - unsigned long (*_get_unmapped_area) (struct mtd_info *mtd, - unsigned long len, - unsigned long offset, - unsigned long flags); int (*_read) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf); int (*_write) (struct mtd_info *mtd, loff_t to, size_t len, -- cgit v1.2.3 From 34d9a270e74a412f041b528c33b75e3e6bc7a242 Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Mon, 13 Nov 2017 10:51:14 +0200 Subject: IB/mlx4: Exposing modify CQ callback to uverbs layer Exposed mlx4_ib_modify_cq to be called from ib device verb list. 
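For context, a provider driver opts in to an extended uverbs command by setting the matching bit in uverbs_ex_cmd_mask; a rough sketch of the pattern (assuming, as the message implies, that the modify_cq handler itself is already wired up on the ib_device):

    /* advertise the extended MODIFY_CQ verb to the uverbs layer */
    ibdev->ib_dev.uverbs_ex_cmd_mask |=
        (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ);

With the bit clear, the uverbs dispatcher rejects the command before the driver callback is reached, so the verb stays invisible to userspace even if the driver implements it.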
Signed-off-by: Yonatan Cohen Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 3 +++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 + include/linux/mlx4/cq.h | 3 +++ 3 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d8f0b94f1dc4..28c10ffc19e2 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2750,6 +2750,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str; ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext; + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ); + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && ((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) || diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 719dae354066..e14919c15b06 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -47,6 +47,7 @@ #include #include #include +#include #define MLX4_IB_DRV_NAME "mlx4_ib" diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h index 09cebe528488..508e8cc5ee86 100644 --- a/include/linux/mlx4/cq.h +++ b/include/linux/mlx4/cq.h @@ -136,6 +136,9 @@ enum { MLX4_CQE_BAD_FCS = 1 << 4, }; +#define MLX4_MAX_CQ_PERIOD (BIT(16) - 1) +#define MLX4_MAX_CQ_COUNT (BIT(16) - 1) + static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd, void __iomem *uar_page, spinlock_t *doorbell_lock) -- cgit v1.2.3 From b0e9df6da25890448ebd134b7f647f16bced9abc Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Mon, 13 Nov 2017 10:51:15 +0200 Subject: IB/mlx5: Exposing modify CQ callback to uverbs layer Exposed mlx5_ib_modify_cq to be called from ib device verb list. 
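The mlx5 variant also bounds-checks the requested moderation period against the width of the hardware field, using the common idiom that an n-bit field can hold at most BIT(n) - 1. A hedged sketch with a hypothetical helper (the patch itself derives n from the cqc layout via __mlx5_bit_sz):

    /* N_BIT_MAX is a hypothetical helper, for illustration only */
    #define N_BIT_MAX(n)    (BIT(n) - 1)    /* e.g. n == 16 -> 0xffff */

    if (cq_period > N_BIT_MAX(16))    /* would overflow the 16-bit field */
        return -EINVAL;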
Signed-off-by: Yonatan Cohen Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/cq.c | 3 +++ drivers/infiniband/hw/mlx5/main.c | 3 ++- include/linux/mlx5/cq.h | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 01b218a3c277..18705cbcdc8c 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1149,6 +1149,9 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) if (!MLX5_CAP_GEN(dev->mdev, cq_moderation)) return -ENOSYS; + if (cq_period > MLX5_MAX_CQ_PERIOD) + return -EINVAL; + err = mlx5_core_modify_cq_moderation(dev->mdev, &mcq->mcq, cq_period, cq_count); if (err) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d2aadb13637b..5a0ed6a499f4 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4022,7 +4022,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP) | - (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP); + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index cc718e245b1e..6be357b219ec 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -128,6 +128,9 @@ enum { CQE_SIZE_128_PAD = 2, }; +#define MLX5_MAX_CQ_PERIOD (BIT(__mlx5_bit_sz(cqc, cq_period)) - 1) +#define MLX5_MAX_CQ_COUNT (BIT(__mlx5_bit_sz(cqc, cq_max_count)) - 1) + static inline int cqe_sz_to_mlx_sz(u8 size, int padding_128_en) { return padding_128_en ? CQE_SIZE_128_PAD : -- cgit v1.2.3 From aea3706cfc4d952ed6d32b6d5845b5ecd99ed7f5 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Mon, 13 Nov 2017 14:51:31 -0800 Subject: timekeeping: Remove CONFIG_GENERIC_TIME_VSYSCALL_OLD As of d4d1fc61eb38f (ia64: Update fsyscall gettime to use modern vsyscall_update), the last user of CONFIG_GENERIC_TIME_VSYSCALL_OLD has been updated, so the legacy support for old-style vsyscall implementations can be removed from the timekeeping code. (Thanks again to Tony Luck for helping remove the last user!)
[jstultz: Commit message rework] Signed-off-by: Miroslav Lichvar Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Tony Luck Cc: Richard Cochran Cc: Stephen Boyd Link: https://lkml.kernel.org/r/1510613491-16695-1-git-send-email-john.stultz@linaro.org --- include/linux/timekeeper_internal.h | 7 ------ kernel/time/Kconfig | 4 ---- kernel/time/timekeeping.c | 45 ------------------------------------- 3 files changed, 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 7e9011101cb0..d315c3d6725c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -136,13 +136,6 @@ struct timekeeper { extern void update_vsyscall(struct timekeeper *tk); extern void update_vsyscall_tz(void); -#elif defined(CONFIG_GENERIC_TIME_VSYSCALL_OLD) - -extern void update_vsyscall_old(struct timespec *ts, struct timespec *wtm, - struct clocksource *c, u32 mult, - u64 cycle_last); -extern void update_vsyscall_tz(void); - #else static inline void update_vsyscall(struct timekeeper *tk) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d689a9557e17..e776fc8cc1df 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -21,10 +21,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE config GENERIC_TIME_VSYSCALL bool -# Timekeeping vsyscall support -config GENERIC_TIME_VSYSCALL_OLD - bool - # Old style timekeeping config ARCH_USES_GETTIMEOFFSET bool diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 198afa78bf69..cd03317e7b57 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -557,45 +557,6 @@ static void halt_fast_timekeeper(struct timekeeper *tk) update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } -#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD -#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon. - -static inline void update_vsyscall(struct timekeeper *tk) -{ - struct timespec xt, wm; - - xt = timespec64_to_timespec(tk_xtime(tk)); - wm = timespec64_to_timespec(tk->wall_to_monotonic); - update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, - tk->tkr_mono.cycle_last); -} - -static inline void old_vsyscall_fixup(struct timekeeper *tk) -{ - s64 remainder; - - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD - * users are removed, this can be killed. - */ - remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); - if (remainder != 0) { - tk->tkr_mono.xtime_nsec -= remainder; - tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; - } -} -#else -#define old_vsyscall_fixup(tk) -#endif - static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) @@ -2163,12 +2124,6 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ timekeeping_adjust(tk, offset); - /* - * XXX This can be killed once everyone converts - * to the new update_vsyscall. 
- */ - old_vsyscall_fixup(tk); - /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC -- cgit v1.2.3 From 096d1dd0f03211fb42d6c2457f248827604b7f0e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Nov 2017 16:19:46 +0100 Subject: netlink: remove unused NETLINK SKB flags These flags are unused, remove them to be less confusing. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/netlink.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6ddb4a5da371..49b4257ce1ea 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -17,9 +17,6 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) } enum netlink_skb_flags { - NETLINK_SKB_MMAPED = 0x1, /* Packet data is mmaped */ - NETLINK_SKB_TX = 0x2, /* Packet was sent by userspace */ - NETLINK_SKB_DELIVERED = 0x4, /* Packet was delivered */ NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; -- cgit v1.2.3 From c7cdff0e864713a089d7cb3a2b1136ba9a54881a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 13 Oct 2017 16:11:48 +0300 Subject: virtio_balloon: fix deadlock on OOM fill_balloon doing memory allocations under balloon_lock can cause a deadlock when leak_balloon is called from virtballoon_oom_notify and tries to take the same lock. To fix this, split page allocation from enqueueing and do the allocations outside the lock. Here's a detailed analysis of the deadlock by Tetsuo Handa: In leak_balloon(), mutex_lock(&vb->balloon_lock) is called in order to serialize against fill_balloon(). But in fill_balloon(), alloc_page(GFP_HIGHUSER[_MOVABLE] | __GFP_NOMEMALLOC | __GFP_NORETRY) is called with vb->balloon_lock mutex held. Since GFP_HIGHUSER[_MOVABLE] implies __GFP_DIRECT_RECLAIM | __GFP_IO | __GFP_FS, even though __GFP_NORETRY is specified, this allocation attempt might indirectly depend on somebody else's __GFP_DIRECT_RECLAIM memory allocation. And such an indirect __GFP_DIRECT_RECLAIM memory allocation might call leak_balloon() via virtballoon_oom_notify() via the blocking_notifier_call_chain() callback via out_of_memory() once it has reached __alloc_pages_may_oom() and holds the oom_lock mutex. Since the vb->balloon_lock mutex is already held by fill_balloon(), it will cause an OOM lockup.

  Thread1                                     Thread2
  fill_balloon()
    takes a balloon_lock
    balloon_page_enqueue()
      alloc_page(GFP_HIGHUSER_MOVABLE)
        direct reclaim (__GFP_FS context)     takes a fs lock
          waits for that fs lock              alloc_page(GFP_NOFS)
                                                __alloc_pages_may_oom()
                                                  takes the oom_lock
                                                    out_of_memory()
                                                      blocking_notifier_call_chain()
                                                        leak_balloon()
                                                          tries to take that balloon_lock
                                                          and deadlocks

Reported-by: Tetsuo Handa Cc: Michal Hocko Cc: Wei Wang Signed-off-by: Michael S.
Tsirkin --- drivers/virtio/virtio_balloon.c | 24 +++++++++++++++++++----- include/linux/balloon_compaction.h | 35 ++++++++++++++++++++++++++++++++++- mm/balloon_compaction.c | 28 +++++++++++++++++++++------- 3 files changed, 74 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index f0b3a0b9d42f..7960746f7597 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -143,16 +143,17 @@ static void set_page_pfns(struct virtio_balloon *vb, static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) { - struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; unsigned num_allocated_pages; + unsigned num_pfns; + struct page *page; + LIST_HEAD(pages); /* We can only do one array worth at a time. */ num = min(num, ARRAY_SIZE(vb->pfns)); - mutex_lock(&vb->balloon_lock); - for (vb->num_pfns = 0; vb->num_pfns < num; - vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { - struct page *page = balloon_page_enqueue(vb_dev_info); + for (num_pfns = 0; num_pfns < num; + num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { + struct page *page = balloon_page_alloc(); if (!page) { dev_info_ratelimited(&vb->vdev->dev, @@ -162,6 +163,19 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) msleep(200); break; } + + balloon_page_push(&pages, page); + } + + mutex_lock(&vb->balloon_lock); + + vb->num_pfns = 0; + + while ((page = balloon_page_pop(&pages))) { + balloon_page_enqueue(&vb->vb_dev_info, page); + + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; if (!virtio_has_feature(vb->vdev, diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index fbbe6da40fed..53051f3d8f25 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -50,6 +50,7 @@ #include #include #include +#include /* * Balloon device information descriptor. @@ -67,7 +68,9 @@ struct balloon_dev_info { struct inode *inode; }; -extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); +extern struct page *balloon_page_alloc(void); +extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page); extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) @@ -193,4 +196,34 @@ static inline gfp_t balloon_mapping_gfp_mask(void) } #endif /* CONFIG_BALLOON_COMPACTION */ + +/* + * balloon_page_push - insert a page into a page list. + * @head : pointer to list + * @page : page to be added + * + * Caller must ensure the page is private and protect the list. + */ +static inline void balloon_page_push(struct list_head *pages, struct page *page) +{ + list_add(&page->lru, pages); +} + +/* + * balloon_page_pop - remove a page from a page list. + * @head : pointer to list + * @page : page to be added + * + * Caller must ensure the page is private and protect the list. 
+ */ +static inline struct page *balloon_page_pop(struct list_head *pages) +{ + struct page *page = list_first_entry_or_null(pages, struct page, lru); + + if (!page) + return NULL; + + list_del(&page->lru); + return page; +} #endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 68d28924ba79..ef858d547e2d 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -10,23 +10,38 @@ #include #include +/* + * balloon_page_alloc - allocates a new page for insertion into the balloon + * page list. + * + * Driver must call it to properly allocate a new enlisted balloon page. + * Driver must call balloon_page_enqueue before definitively removing it from + * the guest system. This function returns the page address for the recently + * allocated page or NULL in the case we fail to allocate a new page this turn. + */ +struct page *balloon_page_alloc(void) +{ + struct page *page = alloc_page(balloon_mapping_gfp_mask() | + __GFP_NOMEMALLOC | __GFP_NORETRY); + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_alloc); + /* * balloon_page_enqueue - allocates a new page and inserts it into the balloon * page list. * @b_dev_info: balloon device descriptor where we will insert a new page to + * @page: new page to enqueue - allocated using balloon_page_alloc. * - * Driver must call it to properly allocate a new enlisted balloon page + * Driver must call it to properly enqueue a new allocated balloon page * before definitively removing it from the guest system. * This function returns the page address for the recently enqueued page or * NULL in the case we fail to allocate a new page this turn. */ -struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page) { unsigned long flags; - struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!page) - return NULL; /* * Block others from accessing the 'page' when we get around to @@ -39,7 +54,6 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) __count_vm_event(BALLOON_INFLATE); spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); - return page; } EXPORT_SYMBOL_GPL(balloon_page_enqueue); -- cgit v1.2.3 From 074d58989569b39f04294c90ef36dd82b8c2cc1a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Nov 2017 16:38:45 +0000 Subject: security: keys: Replace time_t/timespec with time64_t The 'struct key' uses 'time_t', which we are trying to remove from the kernel, since 'time_t' is not year 2038 safe on 32-bit systems. Also, 'struct keyring_search_context' uses the 'timespec' type to record the current time, which is likewise not year 2038 safe on 32-bit systems. Thus this patch replaces 'time_t' with 'time64_t', which is year 2038 safe, for 'struct key', and replaces 'timespec' with 'time64_t' for 'struct keyring_search_context', since we only look at the seconds part of the 'timespec' variable. Moreover, we also convert the code that uses 'time_t' and 'timespec': the current time can be obtained with ktime_get_real_seconds() instead of current_kernel_time(), and the 'TIME64_MAX' macro is used to initialize 'time64_t' variables. In proc.c in particular, the 'unsigned long' and 'timespec' types holding the timeout value have been replaced with 'u64' and 'time64_t', which means the user will get a 'u64' timeout value from proc_keys_show().
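The conversion repeated throughout this patch is compact; here is a sketch of the recurring pattern, modeled on the key_set_timeout() hunk below:

    /* compute an expiry in 64-bit seconds; y2038-safe on 32-bit systems */
    time64_t expiry = 0;

    if (timeout > 0)
        expiry = ktime_get_real_seconds() + timeout;

    key->expiry = expiry;
    key_schedule_gc(key->expiry + key_gc_delay);

ktime_get_real_seconds() returns time64_t on every architecture, so the sum cannot wrap in 2038 the way a 32-bit time_t addition could.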
Signed-off-by: Baolin Wang Reviewed-by: Arnd Bergmann Signed-off-by: David Howells Reviewed-by: James Morris --- include/linux/key.h | 7 ++++--- security/keys/gc.c | 20 ++++++++++---------- security/keys/internal.h | 8 ++++---- security/keys/key.c | 19 ++++++------------- security/keys/keyring.c | 20 ++++++++++---------- security/keys/permission.c | 5 ++--- security/keys/proc.c | 21 ++++++++++----------- security/keys/process_keys.c | 2 +- 8 files changed, 47 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 8a15cabe928d..e58ee10f6e58 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include @@ -162,10 +163,10 @@ struct key { struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ union { - time_t expiry; /* time at which key expires (or 0) */ - time_t revoked_at; /* time at which key was revoked */ + time64_t expiry; /* time at which key expires (or 0) */ + time64_t revoked_at; /* time at which key was revoked */ }; - time_t last_used_at; /* last time used for LRU keyring discard */ + time64_t last_used_at; /* last time used for LRU keyring discard */ kuid_t uid; kgid_t gid; key_perm_t perm; /* access permissions */ diff --git a/security/keys/gc.c b/security/keys/gc.c index afb3a9175d76..6713fee893fb 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -32,7 +32,7 @@ DECLARE_WORK(key_gc_work, key_garbage_collector); static void key_gc_timer_func(unsigned long); static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); -static time_t key_gc_next_run = LONG_MAX; +static time64_t key_gc_next_run = TIME64_MAX; static struct key_type *key_gc_dead_keytype; static unsigned long key_gc_flags; @@ -53,12 +53,12 @@ struct key_type key_type_dead = { * Schedule a garbage collection run. 
* - time precision isn't particularly important */ -void key_schedule_gc(time_t gc_at) +void key_schedule_gc(time64_t gc_at) { unsigned long expires; - time_t now = current_kernel_time().tv_sec; + time64_t now = ktime_get_real_seconds(); - kenter("%ld", gc_at - now); + kenter("%lld", gc_at - now); if (gc_at <= now || test_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) { kdebug("IMMEDIATE"); @@ -87,7 +87,7 @@ void key_schedule_gc_links(void) static void key_gc_timer_func(unsigned long data) { kenter(""); - key_gc_next_run = LONG_MAX; + key_gc_next_run = TIME64_MAX; key_schedule_gc_links(); } @@ -184,11 +184,11 @@ static void key_garbage_collector(struct work_struct *work) struct rb_node *cursor; struct key *key; - time_t new_timer, limit; + time64_t new_timer, limit; kenter("[%lx,%x]", key_gc_flags, gc_state); - limit = current_kernel_time().tv_sec; + limit = ktime_get_real_seconds(); if (limit > key_gc_delay) limit -= key_gc_delay; else @@ -204,7 +204,7 @@ static void key_garbage_collector(struct work_struct *work) gc_state |= KEY_GC_REAPING_DEAD_1; kdebug("new pass %x", gc_state); - new_timer = LONG_MAX; + new_timer = TIME64_MAX; /* As only this function is permitted to remove things from the key * serial tree, if cursor is non-NULL then it will always point to a @@ -235,7 +235,7 @@ continue_scanning: if (gc_state & KEY_GC_SET_TIMER) { if (key->expiry > limit && key->expiry < new_timer) { - kdebug("will expire %x in %ld", + kdebug("will expire %x in %lld", key_serial(key), key->expiry - limit); new_timer = key->expiry; } @@ -276,7 +276,7 @@ maybe_resched: */ kdebug("pass complete"); - if (gc_state & KEY_GC_SET_TIMER && new_timer != (time_t)LONG_MAX) { + if (gc_state & KEY_GC_SET_TIMER && new_timer != (time64_t)TIME64_MAX) { new_timer += key_gc_delay; key_schedule_gc(new_timer); } diff --git a/security/keys/internal.h b/security/keys/internal.h index 503adbae7b0d..9f8208dc0e55 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -130,7 +130,7 @@ struct keyring_search_context { int skipped_ret; bool possessed; key_ref_t result; - struct timespec now; + time64_t now; }; extern bool key_default_cmp(const struct key *key, @@ -169,10 +169,10 @@ extern void key_change_session_keyring(struct callback_head *twork); extern struct work_struct key_gc_work; extern unsigned key_gc_delay; -extern void keyring_gc(struct key *keyring, time_t limit); +extern void keyring_gc(struct key *keyring, time64_t limit); extern void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type); -extern void key_schedule_gc(time_t gc_at); +extern void key_schedule_gc(time64_t gc_at); extern void key_schedule_gc_links(void); extern void key_gc_keytype(struct key_type *ktype); @@ -211,7 +211,7 @@ extern struct key *key_get_instantiation_authkey(key_serial_t target_id); /* * Determine whether a key is dead. 
*/ -static inline bool key_is_dead(const struct key *key, time_t limit) +static inline bool key_is_dead(const struct key *key, time64_t limit) { return key->flags & ((1 << KEY_FLAG_DEAD) | diff --git a/security/keys/key.c b/security/keys/key.c index 83bf4b4afd49..32fd33d45959 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -570,7 +570,6 @@ int key_reject_and_link(struct key *key, struct key *authkey) { struct assoc_array_edit *edit; - struct timespec now; int ret, awaken, link_ret = 0; key_check(key); @@ -593,8 +592,7 @@ int key_reject_and_link(struct key *key, /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, -error); - now = current_kernel_time(); - key->expiry = now.tv_sec + timeout; + key->expiry = ktime_get_real_seconds() + timeout; key_schedule_gc(key->expiry + key_gc_delay); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) @@ -710,16 +708,13 @@ found_kernel_type: void key_set_timeout(struct key *key, unsigned timeout) { - struct timespec now; - time_t expiry = 0; + time64_t expiry = 0; /* make the changes with the locks held to prevent races */ down_write(&key->sem); - if (timeout > 0) { - now = current_kernel_time(); - expiry = now.tv_sec + timeout; - } + if (timeout > 0) + expiry = ktime_get_real_seconds() + timeout; key->expiry = expiry; key_schedule_gc(key->expiry + key_gc_delay); @@ -1028,8 +1023,7 @@ EXPORT_SYMBOL(key_update); */ void key_revoke(struct key *key) { - struct timespec now; - time_t time; + time64_t time; key_check(key); @@ -1044,8 +1038,7 @@ void key_revoke(struct key *key) key->type->revoke(key); /* set the death time to no more than the expiry time */ - now = current_kernel_time(); - time = now.tv_sec; + time = ktime_get_real_seconds(); if (key->revoked_at == 0 || key->revoked_at > time) { key->revoked_at = time; key_schedule_gc(key->revoked_at + key_gc_delay); diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 36f842ec87f0..d0bccebbd3b5 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -565,7 +565,7 @@ static int keyring_search_iterator(const void *object, void *iterator_data) /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { - time_t expiry = READ_ONCE(key->expiry); + time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { @@ -574,7 +574,7 @@ static int keyring_search_iterator(const void *object, void *iterator_data) goto skipped; } - if (expiry && ctx->now.tv_sec >= expiry) { + if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); @@ -834,10 +834,10 @@ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { - key->last_used_at = ctx->now.tv_sec; - keyring->last_used_at = ctx->now.tv_sec; + key->last_used_at = ctx->now; + keyring->last_used_at = ctx->now; while (sp > 0) - stack[--sp].keyring->last_used_at = ctx->now.tv_sec; + stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; @@ -898,7 +898,7 @@ key_ref_t keyring_search_aux(key_ref_t keyring_ref, } rcu_read_lock(); - ctx->now = current_kernel_time(); + ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); rcu_read_unlock(); @@ -1149,7 +1149,7 @@ struct key *find_keyring_by_name(const char *name, bool uid_keyring) * (ie. 
it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; - keyring->last_used_at = current_kernel_time().tv_sec; + keyring->last_used_at = ktime_get_real_seconds(); goto out; } } @@ -1489,7 +1489,7 @@ static void keyring_revoke(struct key *keyring) static bool keyring_gc_select_iterator(void *object, void *iterator_data) { struct key *key = keyring_ptr_to_key(object); - time_t *limit = iterator_data; + time64_t *limit = iterator_data; if (key_is_dead(key, *limit)) return false; @@ -1500,7 +1500,7 @@ static bool keyring_gc_select_iterator(void *object, void *iterator_data) static int keyring_gc_check_iterator(const void *object, void *iterator_data) { const struct key *key = keyring_ptr_to_key(object); - time_t *limit = iterator_data; + time64_t *limit = iterator_data; key_check(key); return key_is_dead(key, *limit); @@ -1512,7 +1512,7 @@ static int keyring_gc_check_iterator(const void *object, void *iterator_data) * Not called with any locks held. The keyring's key struct will not be * deallocated under us as only our caller may deallocate it. */ -void keyring_gc(struct key *keyring, time_t limit) +void keyring_gc(struct key *keyring, time64_t limit) { int result; diff --git a/security/keys/permission.c b/security/keys/permission.c index a72b4dd70c8a..f68dc04d614e 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL(key_task_permission); int key_validate(const struct key *key) { unsigned long flags = READ_ONCE(key->flags); - time_t expiry = READ_ONCE(key->expiry); + time64_t expiry = READ_ONCE(key->expiry); if (flags & (1 << KEY_FLAG_INVALIDATED)) return -ENOKEY; @@ -101,8 +101,7 @@ int key_validate(const struct key *key) /* check it hasn't expired */ if (expiry) { - struct timespec now = current_kernel_time(); - if (now.tv_sec >= expiry) + if (ktime_get_real_seconds() >= expiry) return -EKEYEXPIRED; } diff --git a/security/keys/proc.c b/security/keys/proc.c index 6d1fcbba1e09..fbc4af5c6c9f 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -178,13 +178,12 @@ static int proc_keys_show(struct seq_file *m, void *v) { struct rb_node *_p = v; struct key *key = rb_entry(_p, struct key, serial_node); - struct timespec now; - time_t expiry; - unsigned long timo; unsigned long flags; key_ref_t key_ref, skey_ref; + time64_t now, expiry; char xbuf[16]; short state; + u64 timo; int rc; struct keyring_search_context ctx = { @@ -215,7 +214,7 @@ static int proc_keys_show(struct seq_file *m, void *v) if (rc < 0) return 0; - now = current_kernel_time(); + now = ktime_get_real_seconds(); rcu_read_lock(); @@ -223,21 +222,21 @@ static int proc_keys_show(struct seq_file *m, void *v) expiry = READ_ONCE(key->expiry); if (expiry == 0) { memcpy(xbuf, "perm", 5); - } else if (now.tv_sec >= expiry) { + } else if (now >= expiry) { memcpy(xbuf, "expd", 5); } else { - timo = expiry - now.tv_sec; + timo = expiry - now; if (timo < 60) - sprintf(xbuf, "%lus", timo); + sprintf(xbuf, "%llus", timo); else if (timo < 60*60) - sprintf(xbuf, "%lum", timo / 60); + sprintf(xbuf, "%llum", div_u64(timo, 60)); else if (timo < 60*60*24) - sprintf(xbuf, "%luh", timo / (60*60)); + sprintf(xbuf, "%lluh", div_u64(timo, 60 * 60)); else if (timo < 60*60*24*7) - sprintf(xbuf, "%lud", timo / (60*60*24)); + sprintf(xbuf, "%llud", div_u64(timo, 60 * 60 * 24)); else - sprintf(xbuf, "%luw", timo / (60*60*24*7)); + sprintf(xbuf, "%lluw", div_u64(timo, 60 * 60 * 24 * 7)); } state = key_read_state(key); diff --git a/security/keys/process_keys.c 
b/security/keys/process_keys.c index 740affd65ee9..d5b25e535d3a 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -738,7 +738,7 @@ try_again: if (ret < 0) goto invalid_key; - key->last_used_at = current_kernel_time().tv_sec; + key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); -- cgit v1.2.3 From 0a9dd0e0711e58aa8d19ae4446cb3fe2906a8514 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Nov 2017 16:38:45 +0000 Subject: security: keys: Replace time_t with time64_t for struct key_preparsed_payload The 'struct key_preparsed_payload' uses 'time_t', which we will try to remove from the kernel, since 'time_t' is not year 2038 safe on 32-bit systems. Thus this patch replaces 'time_t' with 'time64_t', which is year 2038 safe on 32-bit systems, for 'struct key_preparsed_payload'; moreover, we use the 'TIME64_MAX' macro to initialize the 'time64_t' type variable. Signed-off-by: Baolin Wang Reviewed-by: Arnd Bergmann Signed-off-by: David Howells Reviewed-by: James Morris --- include/linux/key-type.h | 2 +- security/keys/key.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 9520fc3c3b9a..05d8fb5a06c4 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -44,7 +44,7 @@ struct key_preparsed_payload { const void *data; /* Raw data */ size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ - time_t expiry; /* Expiry time of key */ + time64_t expiry; /* Expiry time of key */ } __randomize_layout; typedef int (*request_key_actor_t)(struct key_construction *key, diff --git a/security/keys/key.c b/security/keys/key.c index 32fd33d45959..66049183ad89 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -460,7 +460,7 @@ static int __key_instantiate_and_link(struct key *key, if (authkey) key_revoke(authkey); - if (prep->expiry != TIME_T_MAX) { + if (prep->expiry != TIME64_MAX) { key->expiry = prep->expiry; key_schedule_gc(prep->expiry + key_gc_delay); } @@ -506,7 +506,7 @@ int key_instantiate_and_link(struct key *key, prep.data = data; prep.datalen = datalen; prep.quotalen = key->type->def_datalen; - prep.expiry = TIME_T_MAX; + prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) @@ -845,7 +845,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, prep.data = payload; prep.datalen = plen; prep.quotalen = index_key.type->def_datalen; - prep.expiry = TIME_T_MAX; + prep.expiry = TIME64_MAX; if (index_key.type->preparse) { ret = index_key.type->preparse(&prep); if (ret < 0) { @@ -989,7 +989,7 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) prep.data = payload; prep.datalen = plen; prep.quotalen = key->type->def_datalen; - prep.expiry = TIME_T_MAX; + prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) -- cgit v1.2.3 From 7d5905dc14a87805a59f3c5bf70173aac2bb18f8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Nov 2017 02:13:40 +0100 Subject: x86 / CPU: Always show current CPU frequency in /proc/cpuinfo After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo on x86 can be either the nominal CPU frequency (which is constant) or the frequency most recently requested by a scaling governor in cpufreq, depending on the cpufreq configuration.
That is somewhat inconsistent and is different from what it was before 4.13, so in order to restore the previous behavior, make it report the current CPU frequency like the scaling_cur_freq sysfs file in cpufreq. To that end, modify the /proc/cpuinfo implementation on x86 to use aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback registers, if available, and use their values to compute the CPU frequency to be reported as "cpu MHz". However, do that carefully enough to avoid accumulating delays that lead to unacceptable access times for /proc/cpuinfo on systems with many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs asynchronously at the /proc/cpuinfo open time, add a single delay upfront (if necessary) at that point and simply compute the current frequency while running show_cpuinfo() for each individual CPU. Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce the default delay between consecutive APERF and MPERF reads to 10 ms, which should be sufficient to get large enough numbers for the frequency computation in all cases. Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"") Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Tested-by: Thomas Gleixner Acked-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/aperfmperf.c | 74 ++++++++++++++++++++++++++++------------ arch/x86/kernel/cpu/cpu.h | 3 ++ arch/x86/kernel/cpu/proc.c | 6 +++- fs/proc/cpuinfo.c | 6 ++++ include/linux/cpufreq.h | 1 + 6 files changed, 68 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 90cb82dbba57..570e8bb1f386 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -22,7 +22,7 @@ obj-y += common.o obj-y += rdrand.o obj-y += match.o obj-y += bugs.o -obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 957813e0180d..7eba34df54c3 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,6 +14,8 @@ #include #include +#include "cpu.h" + struct aperfmperf_sample { unsigned int khz; ktime_t time; @@ -24,7 +26,7 @@ struct aperfmperf_sample { static DEFINE_PER_CPU(struct aperfmperf_sample, samples); #define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 20 +#define APERFMPERF_REFRESH_DELAY_MS 10 #define APERFMPERF_STALE_THRESHOLD_MS 1000 /* @@ -38,8 +40,6 @@ static void aperfmperf_snapshot_khz(void *dummy) u64 aperf, aperf_delta; u64 mperf, mperf_delta; struct aperfmperf_sample *s = this_cpu_ptr(&samples); - ktime_t now = ktime_get(); - s64 time_delta = ktime_ms_delta(now, s->time); unsigned long flags; local_irq_save(flags); @@ -57,38 +57,68 @@ static void aperfmperf_snapshot_khz(void *dummy) if (mperf_delta == 0) return; - s->time = now; + s->time = ktime_get(); s->aperf = aperf; s->mperf = mperf; - - /* If the previous iteration was too long ago, discard it. 
*/ - if (time_delta > APERFMPERF_STALE_THRESHOLD_MS) - s->khz = 0; - else - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); } -unsigned int arch_freq_get_on_cpu(int cpu) +static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) { - s64 time_delta; - unsigned int khz; + s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); + + /* Don't bother re-computing within the cache threshold time. */ + if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) + return true; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + + /* Return false if the previous iteration was too long ago. */ + return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; +} +unsigned int aperfmperf_get_khz(int cpu) +{ if (!cpu_khz) return 0; if (!static_cpu_has(X86_FEATURE_APERFMPERF)) return 0; - /* Don't bother re-computing within the cache threshold time. */ - time_delta = ktime_ms_delta(ktime_get(), per_cpu(samples.time, cpu)); - khz = per_cpu(samples.khz, cpu); - if (khz && time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return khz; + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); + return per_cpu(samples.khz, cpu); +} - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); - khz = per_cpu(samples.khz, cpu); - if (khz) - return khz; +void arch_freq_prepare_all(void) +{ + ktime_t now = ktime_get(); + bool wait = false; + int cpu; + + if (!cpu_khz) + return; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + for_each_online_cpu(cpu) + if (!aperfmperf_snapshot_cpu(cpu, now, false)) + wait = true; + + if (wait) + msleep(APERFMPERF_REFRESH_DELAY_MS); +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) + return per_cpu(samples.khz, cpu); msleep(APERFMPERF_REFRESH_DELAY_MS); smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f52a370b6c00..e806b11a99af 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -47,4 +47,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); + +unsigned int aperfmperf_get_khz(int cpu); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 6b7e17bf0b71..e7ecedafa1c8 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -5,6 +5,8 @@ #include #include +#include "cpu.h" + /* * Get CPU information for use by the procfs. 
*/ @@ -78,8 +80,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get(cpu); + unsigned int freq = aperfmperf_get_khz(cpu); + if (!freq) + freq = cpufreq_quick_get(cpu); if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index e0f867cd8553..96f1087e372c 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -1,12 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include #include +__weak void arch_freq_prepare_all(void) +{ +} + extern const struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { + arch_freq_prepare_all(); return seq_open(file, &cpuinfo_op); } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 28734ee185a7..065f3a8eb486 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -917,6 +917,7 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) } #endif +extern void arch_freq_prepare_all(void); extern unsigned int arch_freq_get_on_cpu(int cpu); extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, -- cgit v1.2.3 From d50112edde1d0c621520e53747044009f11c656b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:18 -0800 Subject: slab, slub, slob: add slab_flags_t Add sparse-checked slab_flags_t for struct kmem_cache::flags (SLAB_POISON, etc). SLAB is bloated temporarily by switching to "unsigned long", but only temporarily. Link: http://lkml.kernel.org/r/20171021100225.GA22428@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/main.c | 2 +- fs/xfs/kmem.h | 2 +- include/linux/kasan.h | 4 ++-- include/linux/kmemleak.h | 8 +++---- include/linux/slab.h | 60 +++++++++++++++++++++++++++++------------------- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/linux/types.h | 1 + include/net/sock.h | 2 +- mm/kasan/kasan.c | 2 +- mm/slab.c | 23 +++++++++---------- mm/slab.h | 26 ++++++++++----------- mm/slab_common.c | 16 ++++++------- mm/slob.c | 2 +- mm/slub.c | 26 +++++++++++---------- 15 files changed, 97 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 6b801186baa5..25aeaa7328ba 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -660,7 +660,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; - unsigned long flags; + slab_flags_t flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 758f37ac5ad3..4b87472f35bc 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -104,7 +104,7 @@ kmem_zone_init(int size, char *zone_name) } static inline kmem_zone_t * -kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, +kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags, void (*construct)(void *)) { return kmem_cache_create(zone_name, size, 0, flags, construct); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5017269e3f04..e3eb834c9a35 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -46,7 +46,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); void 
kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags); + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -95,7 +95,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) {} + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 590343f6c1b1..5ac416e2d339 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -48,14 +48,14 @@ extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_alloc(ptr, size, min_count, gfp); } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_free(ptr); @@ -76,7 +76,7 @@ static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count, { } static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { } @@ -94,7 +94,7 @@ static inline void kmemleak_free(const void *ptr) static inline void kmemleak_free_part(const void *ptr, size_t size) { } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { } static inline void kmemleak_free_percpu(const void __percpu *ptr) diff --git a/include/linux/slab.h b/include/linux/slab.h index af5aa65c7c18..0c4c579f52ed 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -21,13 +21,20 @@ * Flags to pass to kmem_cache_create(). * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ -#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ -#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ -#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ -#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */ -#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ +/* DEBUG: Perform (expensive) checks on alloc/free */ +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100UL) +/* DEBUG: Red zone objs in a cache */ +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400UL) +/* DEBUG: Poison objects */ +#define SLAB_POISON ((slab_flags_t __force)0x00000800UL) +/* Align objs on cache lines */ +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000UL) +/* Use GFP_DMA memory */ +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000UL) +/* DEBUG: Store the last owner for bug hunting */ +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000UL) +/* Panic if kmem_cache_create() fails */ +#define SLAB_PANIC ((slab_flags_t __force)0x00040000UL) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! 
* @@ -65,44 +72,51 @@ * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ -#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ -#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ +/* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000UL) +/* Spread some memory over cpuset */ +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000UL) +/* Trace allocations and frees */ +#define SLAB_TRACE ((slab_flags_t __force)0x00200000UL) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS 0x00400000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000UL) #else -# define SLAB_DEBUG_OBJECTS 0x00000000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00000000UL) #endif -#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ +/* Avoid kmemleak tracing */ +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000UL) /* Don't track use of uninitialized memory */ #ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK 0x01000000UL +# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000UL) #else -# define SLAB_NOTRACK 0x00000000UL +# define SLAB_NOTRACK ((slab_flags_t __force)0x00000000UL) #endif +/* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */ +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000UL) #else -# define SLAB_FAILSLAB 0x00000000UL +# define SLAB_FAILSLAB ((slab_flags_t __force)0x00000000UL) #endif +/* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000UL) #else -# define SLAB_ACCOUNT 0x00000000UL +# define SLAB_ACCOUNT ((slab_flags_t __force)0x00000000UL) #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN 0x08000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x08000000UL) #else -#define SLAB_KASAN 0x00000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x00000000UL) #endif /* The following flags affect the page allocator grouping pages by mobility */ -#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ +/* Objects are reclaimable */ +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000UL) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. 
@@ -128,7 +142,7 @@ void __init kmem_cache_init(void); bool slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, - unsigned long, + slab_flags_t, void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8f7d2b1656d2..072e46e9e1d5 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -20,7 +20,7 @@ struct kmem_cache { struct reciprocal_value reciprocal_buffer_size; /* 2) touched by every alloc & free from the backend */ - unsigned int flags; /* constant flags */ + slab_flags_t flags; /* constant flags */ unsigned int num; /* # of objs per slab */ /* 3) cache_grow/shrink */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 39fa09bcde23..0adae162dc8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -82,7 +82,7 @@ struct kmem_cache_order_objects { struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retriving partial slabs etc */ - unsigned long flags; + slab_flags_t flags; unsigned long min_partial; int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ diff --git a/include/linux/types.h b/include/linux/types.h index 34fce54e4f1b..732b52c2eae4 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,6 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; +typedef unsigned long __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d1a6df..c577286dbffb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1105,7 +1105,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; - int slab_flags; + slab_flags_t slab_flags; struct percpu_counter *orphan_count; diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6f319fb81718..405bba487df5 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -337,7 +337,7 @@ static size_t optimal_redzone(size_t object_size) } void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) + slab_flags_t *flags) { int redzone_adjust; int orig_size = *size; diff --git a/mm/slab.c b/mm/slab.c index 0c6468c07b01..19b1b9f99819 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -252,8 +252,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) -#define CFLGS_OFF_SLAB (0x80000000UL) +#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000UL) +#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000UL) #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) @@ -441,7 +441,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) * Calculate the number of objects and left-over bytes for a given buffer size. */ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, - unsigned long flags, size_t *left_over) + slab_flags_t flags, size_t *left_over) { unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; @@ -1759,7 +1759,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. 
*/ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left_over = 0; int gfporder; @@ -1886,8 +1886,8 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) return 0; } -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -1895,7 +1895,7 @@ unsigned long kmem_cache_flags(unsigned long object_size, struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *cachep; @@ -1913,7 +1913,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, } static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -1936,7 +1936,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, } static bool set_off_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -1970,7 +1970,7 @@ static bool set_off_slab_cache(struct kmem_cache *cachep, } static bool set_on_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -2006,8 +2006,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -int -__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) { size_t ralign = BYTES_PER_WORD; gfp_t gfp; diff --git a/mm/slab.h b/mm/slab.h index 45c586cefc11..e19255638cb6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -21,7 +21,7 @@ struct kmem_cache { unsigned int object_size;/* The original size of the object */ unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ - unsigned long flags; /* Active flags on the slab */ + slab_flags_t flags; /* Active flags on the slab */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ @@ -79,13 +79,13 @@ extern const struct kmalloc_info_struct { unsigned long size; } kmalloc_info[]; -unsigned long calculate_alignment(unsigned long flags, +unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size); #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); -void create_kmalloc_caches(unsigned long); +void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); @@ -93,32 +93,32 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); /* Functions provided by the slab allocators */ -extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, - unsigned long flags); + slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, - size_t size, unsigned long flags); + size_t size, slab_flags_t flags); int slab_unmergeable(struct kmem_cache *s); struct kmem_cache 
*find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)); + slab_flags_t flags, const char *name, void (*ctor)(void *)); #ifndef CONFIG_SLOB struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)); + slab_flags_t flags, void (*ctor)(void *)); -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)); #else static inline struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { return NULL; } -static inline unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +static inline slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; diff --git a/mm/slab_common.c b/mm/slab_common.c index 8f7f9f75d7ea..175e86637afd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -291,7 +291,7 @@ int slab_unmergeable(struct kmem_cache *s) } struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)) + slab_flags_t flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -341,7 +341,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. */ -unsigned long calculate_alignment(unsigned long flags, +unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size) { /* @@ -366,7 +366,7 @@ unsigned long calculate_alignment(unsigned long flags, static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *), + slab_flags_t flags, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; @@ -431,7 +431,7 @@ out_free_cache: */ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; const char *cache_name; @@ -879,7 +879,7 @@ bool slab_is_available(void) #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, - unsigned long flags) + slab_flags_t flags) { int err; @@ -899,7 +899,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz } struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, - unsigned long flags) + slab_flags_t flags) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); @@ -1057,7 +1057,7 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, unsigned long flags) +static void __init new_kmalloc_cache(int idx, slab_flags_t flags) { kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, kmalloc_info[idx].size, flags); @@ -1068,7 +1068,7 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags) * may already have been created because they were needed to * enable allocations for slab creation. 
*/ -void __init create_kmalloc_caches(unsigned long flags) +void __init create_kmalloc_caches(slab_flags_t flags) { int i; diff --git a/mm/slob.c b/mm/slob.c index 3451ecad8e35..623e8a5c46ce 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -524,7 +524,7 @@ size_t ksize(const void *block) } EXPORT_SYMBOL(ksize); -int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ diff --git a/mm/slub.c b/mm/slub.c index 025bbb540f3d..482d1daa9088 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -193,8 +193,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000UL /* Poison object */ -#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ +/* Poison object */ +#define __OBJECT_POISON ((slab_flags_t __force)0x80000000UL) +/* Use cmpxchg_double */ +#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000UL) /* * Tracking user of a slab. @@ -485,9 +487,9 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) * Debug settings: */ #if defined(CONFIG_SLUB_DEBUG_ON) -static int slub_debug = DEBUG_DEFAULT_FLAGS; +static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; #else -static int slub_debug; +static slab_flags_t slub_debug; #endif static char *slub_debug_slabs; @@ -1289,8 +1291,8 @@ out: __setup("slub_debug", setup_slub_debug); -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { /* @@ -1322,8 +1324,8 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -3477,7 +3479,7 @@ static void set_cpu_partial(struct kmem_cache *s) */ static int calculate_sizes(struct kmem_cache *s, int forced_order) { - unsigned long flags = s->flags; + slab_flags_t flags = s->flags; size_t size = s->object_size; int order; @@ -3593,7 +3595,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) +static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; @@ -4245,7 +4247,7 @@ void __init kmem_cache_init_late(void) struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s, *c; @@ -4275,7 +4277,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return s; } -int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { int err; -- cgit v1.2.3 From 4fd0b46e898791009b03b2fdd6510044fa8730a6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:21 -0800 Subject: slab, slub, slob: convert slab_flags_t to 32-bit struct 
kmem_cache::flags is "unsigned long" which is unnecessary on 64-bit as no flags are defined in the higher bits. Switch the field to 32-bit and save some space on x86_64 until such flags appear: add/remove: 0/0 grow/shrink: 0/107 up/down: 0/-657 (-657) function old new delta sysfs_slab_add 720 719 -1 ... check_object 699 676 -23 [akpm@linux-foundation.org: fix printk warning] Link: http://lkml.kernel.org/r/20171021100635.GA8287@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 44 ++++++++++++++++++++++---------------------- include/linux/types.h | 2 +- mm/slab.c | 4 ++-- mm/slub.c | 6 +++--- 4 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0c4c579f52ed..f37cb93768ab 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -22,19 +22,19 @@ * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100UL) +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) /* DEBUG: Red zone objs in a cache */ -#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400UL) +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ -#define SLAB_POISON ((slab_flags_t __force)0x00000800UL) +#define SLAB_POISON ((slab_flags_t __force)0x00000800U) /* Align objs on cache lines */ -#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000UL) +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ -#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000UL) +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U) /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000UL) +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U) /* Panic if kmem_cache_create() fails */ -#define SLAB_PANIC ((slab_flags_t __force)0x00040000UL) +#define SLAB_PANIC ((slab_flags_t __force)0x00040000U) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -73,50 +73,50 @@ * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. 
*/ /* Defer freeing slabs to RCU */ -#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000UL) +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U) /* Spread some memory over cpuset */ -#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000UL) +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U) /* Trace allocations and frees */ -#define SLAB_TRACE ((slab_flags_t __force)0x00200000UL) +#define SLAB_TRACE ((slab_flags_t __force)0x00200000U) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000UL) +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U) #else -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00000000UL) +# define SLAB_DEBUG_OBJECTS 0 #endif /* Avoid kmemleak tracing */ -#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000UL) +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) /* Don't track use of uninitialized memory */ #ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000UL) +# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000U) #else -# define SLAB_NOTRACK ((slab_flags_t __force)0x00000000UL) +# define SLAB_NOTRACK 0 #endif /* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000UL) +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) #else -# define SLAB_FAILSLAB ((slab_flags_t __force)0x00000000UL) +# define SLAB_FAILSLAB 0 #endif /* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000UL) +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) #else -# define SLAB_ACCOUNT ((slab_flags_t __force)0x00000000UL) +# define SLAB_ACCOUNT 0 #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN ((slab_flags_t __force)0x08000000UL) +#define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else -#define SLAB_KASAN ((slab_flags_t __force)0x00000000UL) +#define SLAB_KASAN 0 #endif /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ -#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000UL) +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. 
diff --git a/include/linux/types.h b/include/linux/types.h index 732b52c2eae4..c94d59ef96cc 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,7 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; -typedef unsigned long __bitwise slab_flags_t; +typedef unsigned __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT diff --git a/mm/slab.c b/mm/slab.c index 19b1b9f99819..7a5e0888a401 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -252,8 +252,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000UL) -#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000UL) +#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) +#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) diff --git a/mm/slub.c b/mm/slub.c index 482d1daa9088..33957fd376ae 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -194,9 +194,9 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Internal SLUB flags */ /* Poison object */ -#define __OBJECT_POISON ((slab_flags_t __force)0x80000000UL) +#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) /* Use cmpxchg_double */ -#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000UL) +#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) /* * Tracking user of a slab. @@ -3657,7 +3657,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } -- cgit v1.2.3 From 5799b255c491d853b73fb9e0e1760210315d06cd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:29 -0800 Subject: include/linux/slab.h: add kmalloc_array_node() and kcalloc_node() Patch series "Add kmalloc_array_node() and kcalloc_node()". Our current memory allocation routines suffer from an API imbalance: on one hand we have kmalloc_array() and kcalloc(), which check for overflows in size multiplication, and on the other we have kmalloc_node() and kzalloc_node(), which allow for memory allocation on a certain NUMA node but don't check for eventual overflows. This patch (of 6): We have kmalloc_array() and kcalloc() wrappers on top of kmalloc() which give us overflow-free multiplication for the size of a memory allocation, but these implementations are not NUMA-aware. Likewise we have kmalloc_node(), which is a NUMA-aware version of kmalloc(), but the implementation is not aware of any possible overflows in eventual size calculations. Introduce a combination of the two above cases to have a NUMA-node aware version of kmalloc_array() and kcalloc(). Link: http://lkml.kernel.org/r/20170927082038.3782-2-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Damien Le Moal Cc: David Rientjes Cc: "David S.
Miller" Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jens Axboe Cc: Joonsoo Kim Cc: Mike Marciniszyn Cc: Pekka Enberg Cc: Santosh Shilimkar Cc: Sean Hefty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index f37cb93768ab..ebddfca9e24b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -650,6 +650,22 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) +static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + if (__builtin_constant_p(n) && __builtin_constant_p(size)) + return kmalloc_node(n * size, flags, node); + return __kmalloc_node(n * size, flags, node); +} + +static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +{ + return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); +} + + #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ -- cgit v1.2.3 From 41710443f790b5f7f5305eba99dacce88e259f4c Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 15 Nov 2017 17:32:53 -0800 Subject: mm: update comments for struct page.mapping struct page.mapping can be NULL or points to one object of type address_space, anon_vma or KSM private structure. Link: http://lkml.kernel.org/r/1506485067-15954-1-git-send-email-changbin.du@intel.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c85f11dafd56..d1b8e8f97fc2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -48,8 +48,10 @@ struct page { * inode address_space, or NULL. * If page mapped as anonymous * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. + * it points to anon_vma object + * or KSM private structure. See + * PAGE_MAPPING_ANON and + * PAGE_MAPPING_KSM. */ void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ -- cgit v1.2.3 From 23c47d2ada9f96731492a67b28c0072715075baa Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:00 -0800 Subject: bdi: introduce BDI_CAP_SYNCHRONOUS_IO As discussed at https://lkml.kernel.org/r/<20170728165604.10455-1-ross.zwisler@linux.intel.com> someday we will remove rw_page(). If so, we need something to detect such super-fast storage on which synchronous IO operations like the current rw_page are always a win. Introduces BDI_CAP_SYNCHRONOUS_IO to indicate such devices. With it, we could use various optimization techniques. 
Link: http://lkml.kernel.org/r/1505886205-9671-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Christoph Hellwig Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/brd.c | 2 ++ drivers/block/zram/zram_drv.c | 2 +- drivers/nvdimm/btt.c | 3 +++ drivers/nvdimm/pmem.c | 2 ++ include/linux/backing-dev.h | 8 ++++++++ 5 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c1cf87718c2e..588360d79fca 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef CONFIG_BLK_DEV_RAM_DAX #include #include @@ -448,6 +449,7 @@ static struct brd_device *brd_alloc(int i) disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); + disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; #ifdef CONFIG_BLK_DEV_RAM_DAX queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d95bb8ce5092..88564222f473 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1558,7 +1558,7 @@ static int zram_add(void) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); zram->disk->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; + (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO); add_disk(zram->disk); ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index d5612bd1cc81..e949e3302af4 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "btt.h" #include "nd.h" @@ -1402,6 +1403,8 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->private_data = btt; btt->btt_disk->queue = btt->btt_queue; btt->btt_disk->flags = GENHD_FL_EXT_DEVT; + btt->btt_disk->queue->backing_dev_info->capabilities |= + BDI_CAP_SYNCHRONOUS_IO; blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 39dfd7affa31..7fbc5c5dc8e1 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "pmem.h" #include "pfn.h" #include "nd.h" @@ -394,6 +395,7 @@ static int pmem_attach_disk(struct device *dev, disk->fops = &pmem_fops; disk->queue = q; disk->flags = GENHD_FL_EXT_DEVT; + disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; nvdimm_namespace_disk_name(ndns, disk->disk_name); set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) / 512); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f41ca8486e02..7360e79e9a2f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -122,6 +122,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. * * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. + * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be + * inefficient. 
*/ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -129,6 +131,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 +#define BDI_CAP_SYNCHRONOUS_IO 0x00000040 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -174,6 +177,11 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); +static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO; +} + static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { return bdi->capabilities & BDI_CAP_STABLE_WRITES; -- cgit v1.2.3 From 539a6fea7fdcade532bd3e77be2862a683f8f0c9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:04 -0800 Subject: mm, swap: introduce SWP_SYNCHRONOUS_IO If rw-page based fast storage is used for swap devices, we need to detect it to enhance swap IO operations. This patch is preparation for optimizing of swap-in operation with next patch. Link: http://lkml.kernel.org/r/1505886205-9671-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Dan Williams Cc: Ilya Dryomov Cc: Jens Axboe Cc: Ross Zwisler Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- mm/swapfile.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f02fb5db8914..933d7c0c3542 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -171,8 +171,9 @@ enum { SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/swapfile.c b/mm/swapfile.c index e47a21e64764..cb08fa65819f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3169,6 +3169,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) p->flags |= SWP_STABLE_WRITES; + if (bdi_cap_synchronous_io(inode_to_bdi(inode))) + p->flags |= SWP_SYNCHRONOUS_IO; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; unsigned long ci, nr_cluster; -- cgit v1.2.3 From 0bcac06f27d7528591c27ac2b093ccd71c5d0168 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:07 -0800 Subject: mm, swap: skip swapcache for swapin of synchronous device With fast swap storage, the platforms want to use swap more aggressively and swap-in is crucial to application latency. The rw_page() based synchronous devices like zram, pmem and btt are such fast storage. When I profile swapin performance with zram lz4 decompress test, S/W overhead is more than 70%. Maybe, it would be bigger in nvdimm. 
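A minimal sketch of the decision this series adds to the swap-in path follows; swapin() and the two helpers are invented stand-ins for illustration, and the flag handling is simplified. The real change is the do_swap_page() hunk further below:

	#include <stdio.h>

	#define SWP_SYNCHRONOUS_IO (1 << 11)

	/* Fast path: read the page synchronously, bypassing the swap cache. */
	static void *alloc_page_and_read_sync(void)
	{
		puts("skip swapcache: allocate page, swap_readpage(page, true)");
		return "page";
	}

	/* Slow path: go through readahead and the swap cache as before. */
	static void *swapin_via_swapcache(void)
	{
		puts("slow path: readahead through the swap cache");
		return "page";
	}

	static void *swapin(unsigned int swap_flags)
	{
		if (swap_flags & SWP_SYNCHRONOUS_IO)
			return alloc_page_and_read_sync();
		return swapin_via_swapcache();
	}

	int main(void)
	{
		swapin(SWP_SYNCHRONOUS_IO);	/* zram/pmem/btt-like device */
		swapin(0);			/* ordinary rotating disk */
		return 0;
	}
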
This patch aims to reduce swap-in latency by skipping the swapcache if the swap device is a synchronous device, like an rw_page() based device. It improves my swap-in test by 45% (5G sequential swap-in, no readahead: from 2.41 sec to 1.64 sec). Link: http://lkml.kernel.org/r/1505886205-9671-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 +++++++++++ mm/memory.c | 52 ++++++++++++++++++++++++++++++++++++---------------- mm/page_io.c | 6 +++--- mm/swapfile.c | 11 +++++++---- 4 files changed, 57 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 933d7c0c3542..32c06f028c7b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -466,6 +466,7 @@ extern int page_swapcount(struct page *); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); +extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -474,6 +475,16 @@ extern void exit_swap_address_space(unsigned int type); #else /* CONFIG_SWAP */ +static inline int swap_readpage(struct page *page, bool do_poll) +{ + return 0; +} + +static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return NULL; +} + #define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L diff --git a/mm/memory.c b/mm/memory.c index cae514e7dcfc..f75bff2cf662 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2842,7 +2842,7 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *swapcache; + struct page *page = NULL, *swapcache = NULL; struct mem_cgroup *memcg; struct vma_swap_readahead swap_ra; swp_entry_t entry; @@ -2881,17 +2881,35 @@ int do_swap_page(struct vm_fault *vmf) } goto out; } + + delayacct_set_flag(DELAYACCT_PF_SWAPIN); if (!page) page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, vmf->address); if (!page) { - if (vma_readahead) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); + struct swap_info_struct *si = swp_swap_info(entry); + + if (!(si->flags & SWP_SYNCHRONOUS_IO)) { + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); + swapcache = page; + } else { + /* skip swapcache */ + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + set_page_private(page, entry.val); + lru_cache_add_anon(page); + swap_readpage(page, true); + } + } + if (!page) { /* * Back out if somebody else faulted in this pte @@ -2920,7 +2938,6 @@ int do_swap_page(struct vm_fault *vmf) goto out_release; } - swapcache = page; locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); @@ -2935,7 +2952,8 @@ int do_swap_page(struct vm_fault *vmf) * test below, are not enough to exclude that.
Even if it is still * swapcache, we need to check that the page's swap has not changed. */ - if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) + if (unlikely((!PageSwapCache(page) || + page_private(page) != entry.val)) && swapcache) goto out_page; page = ksm_might_need_to_copy(page, vma, vmf->address); @@ -2988,14 +3006,16 @@ int do_swap_page(struct vm_fault *vmf) pte = pte_mksoft_dirty(pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); vmf->orig_pte = pte; - if (page == swapcache) { - do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); - activate_page(page); - } else { /* ksm created a completely new copy */ + + /* ksm created a completely new copy */ + if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); + } else { + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); + mem_cgroup_commit_charge(page, memcg, true, false); + activate_page(page); } swap_free(entry); @@ -3003,7 +3023,7 @@ int do_swap_page(struct vm_fault *vmf) (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); - if (page != swapcache) { + if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -3036,7 +3056,7 @@ out_page: unlock_page(page); out_release: put_page(page); - if (page != swapcache) { + if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); } diff --git a/mm/page_io.c b/mm/page_io.c index cd52b9cc169b..e93f1a4cacd7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -347,7 +347,7 @@ out: return ret; } -int swap_readpage(struct page *page, bool do_poll) +int swap_readpage(struct page *page, bool synchronous) { struct bio *bio; int ret = 0; @@ -355,7 +355,7 @@ int swap_readpage(struct page *page, bool do_poll) blk_qc_t qc; struct gendisk *disk; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -403,7 +403,7 @@ int swap_readpage(struct page *page, bool do_poll) count_vm_event(PSWPIN); bio_get(bio); qc = submit_bio(bio); - while (do_poll) { + while (synchronous) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio->bi_private)) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index cb08fa65819f..85aff8a42801 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3455,10 +3455,15 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return swap_info[swp_type(entry)]; +} + struct swap_info_struct *page_swap_info(struct page *page) { - swp_entry_t swap = { .val = page_private(page) }; - return swap_info[swp_type(swap)]; + swp_entry_t entry = { .val = page_private(page) }; + return swp_swap_info(entry); } /* @@ -3466,7 +3471,6 @@ struct swap_info_struct *page_swap_info(struct page *page) */ struct address_space *__page_file_mapping(struct page *page) { - VM_BUG_ON_PAGE(!PageSwapCache(page), page); return page_swap_info(page)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(__page_file_mapping); @@ -3474,7 +3478,6 @@ EXPORT_SYMBOL_GPL(__page_file_mapping); pgoff_t __page_file_index(struct page *page) { swp_entry_t swap = { .val = 
page_private(page) }; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); -- cgit v1.2.3 From aa8d22a11da933dbf880b4933b58931f4aefe91c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:11 -0800 Subject: mm: swap: SWP_SYNCHRONOUS_IO: skip swapcache only if swapped page has no other reference When SWP_SYNCHRONOUS_IO swapped-in pages are shared by several processes, skipping the swap cache can cause unnecessary memory wastage: with a swap-in fault caused by a read, the processes could share a single page if it were in the swap cache, avoiding the allocation of new pages with identical content. This patch makes the swapcache skipping work only if the swap pte is non-sharable. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1507620825-5537-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ mm/memory.c | 19 ++++++++++--------- mm/swapfile.c | 7 +++++++ 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 32c06f028c7b..8b8a6f965785 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -463,6 +463,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -589,6 +590,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + return 0; +} + static inline int __swp_swapcount(swp_entry_t entry) { return 0; } diff --git a/mm/memory.c b/mm/memory.c index f75bff2cf662..a4518aedf4dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2890,15 +2890,8 @@ int do_swap_page(struct vm_fault *vmf) if (!page) { struct swap_info_struct *si = swp_swap_info(entry); - if (!(si->flags & SWP_SYNCHRONOUS_IO)) { - if (vma_readahead) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); - swapcache = page; - } else { + if (si->flags & SWP_SYNCHRONOUS_IO && + __swap_count(si, entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { @@ -2908,6 +2901,14 @@ int do_swap_page(struct vm_fault *vmf) lru_cache_add_anon(page); swap_readpage(page, true); } + } else { + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); + swapcache = page; } if (!page) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 85aff8a42801..3074b02eaa09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1328,6 +1328,13 @@ int page_swapcount(struct page *page) return count; } +int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + pgoff_t offset = swp_offset(entry); + + return swap_count(si->swap_map[offset]); +} + static int swap_swapcount(struct swap_info_struct
*si, swp_entry_t entry) { int count = 0; -- cgit v1.2.3 From 4da2ce250f986060750fcc5b29112914e31803ba Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:33:26 -0800 Subject: mm: distinguish CMA and MOVABLE isolation in has_unmovable_pages() Joonsoo has noticed that "mm: drop migrate type checks from has_unmovable_pages" would break CMA allocator because it relies on has_unmovable_pages returning false even for CMA pageblocks which in fact don't have to be movable: alloc_contig_range start_isolate_page_range set_migratetype_isolate has_unmovable_pages This is a result of the code sharing between CMA and memory hotplug while each one has a different idea of what has_unmovable_pages should return. This is unfortunate but fixing it properly would require a lot of code duplication. Fix the issue by introducing the requested migrate type argument and special case MIGRATE_CMA case where CMA page blocks are handled properly. This will work for memory hotplug because it requires MIGRATE_MOVABLE. Link: http://lkml.kernel.org/r/20171019122118.y6cndierwl2vnguj@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Joonsoo Kim Tested-by: Stefan Wahren Tested-by: Ran Wang Cc: Michael Ellerman Cc: Vlastimil Babka Cc: Igor Mammedov Cc: KAMEZAWA Hiroyuki Cc: Reza Arbab Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 2 +- mm/page_alloc.c | 12 +++++++++++- mm/page_isolation.c | 10 +++++----- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 05a04e603686..cdad58bbfd8b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ static inline bool is_migrate_isolate(int migratetype) #endif bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - bool skip_hwpoisoned_pages); + int migratetype, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e6d4234e0d1a..755f35f4bc8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7353,6 +7353,7 @@ void *__init alloc_large_system_hash(const char *tablename, * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + int migratetype, bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; @@ -7364,6 +7365,15 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (zone_idx(zone) == ZONE_MOVABLE) return false; + /* + * CMA allocations (alloc_contig_range) really need to mark isolate + * CMA pageblocks even when they are not movable in fact so consider + * them movable here. 
+ */ + if (is_migrate_cma(migratetype) && + is_migrate_cma(get_pageblock_migratetype(page))) + return false; + pfn = page_to_pfn(page); for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; @@ -7446,7 +7456,7 @@ bool is_pageblock_removable_nolock(struct page *page) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, true); + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); } #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 44f213935bf6..165ed8117bd1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,7 +15,7 @@ #define CREATE_TRACE_POINTS #include -static int set_migratetype_isolate(struct page *page, +static int set_migratetype_isolate(struct page *page, int migratetype, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -52,7 +52,7 @@ static int set_migratetype_isolate(struct page *page, * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found, + if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, skip_hwpoisoned_pages)) ret = 0; @@ -64,14 +64,14 @@ static int set_migratetype_isolate(struct page *page, out: if (!ret) { unsigned long nr_pages; - int migratetype = get_pageblock_migratetype(page); + int mt = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, NULL); - __mod_zone_freepage_state(zone, -nr_pages, migratetype); + __mod_zone_freepage_state(zone, -nr_pages, mt); } spin_unlock_irqrestore(&zone->lock, flags); @@ -183,7 +183,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); if (page && - set_migratetype_isolate(page, skip_hwpoisoned_pages)) { + set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { undo_pfn = pfn; goto undo; } -- cgit v1.2.3 From 66e8b438bd5c75498cfe915c4219049eaebcb869 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Wed, 15 Nov 2017 17:33:42 -0800 Subject: mm/memblock.c: make the index explicit argument of for_each_memblock_type The for_each_memblock_type macro relies on an idx variable defined in the caller's context. Silent macro arguments are almost always the wrong thing to do. They make code harder to read and easier to get wrong. Let's use an explicit iterator parameter for for_each_memblock_type and make the code more obvious. This patch is a mere cleanup and shouldn't introduce any functional change.
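The hazard can be shown with a compilable toy example; the structures and macro names below are invented for illustration and only mimic the shape of the memblock iterator, not memblock itself:

	#include <stdio.h>

	struct region { int base; };
	struct rtype { int cnt; struct region regions[4]; };

	/* Old style: silently requires the caller to declare "idx". */
	#define for_each_region_implicit(t, r) \
		for (idx = 0, r = &(t)->regions[0]; idx < (t)->cnt; \
		     idx++, r = &(t)->regions[idx])

	/* New style: the iterator is an explicit argument. */
	#define for_each_region(i, t, r) \
		for (i = 0, r = &(t)->regions[0]; i < (t)->cnt; \
		     i++, r = &(t)->regions[i])

	int main(void)
	{
		struct rtype t = { 3, { { 10 }, { 20 }, { 30 }, { 0 } } };
		struct region *r;
		int i;

		for_each_region(i, &t, r)
			printf("region %d: base %d\n", i, r->base);
		return 0;
	}

With the implicit form, a call site without a variable literally named idx fails to compile, and one that happens to have an unrelated idx is silently clobbered; the explicit form makes the dependency visible at every call.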
Link: http://lkml.kernel.org/r/20170913133029.28911-1-gi-oh.kim@profitbricks.com Signed-off-by: Gioh Kim Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 8 ++++---- mm/memblock.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index bae11c7e7bf3..ce0e5634c2f9 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -389,10 +389,10 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ region++) -#define for_each_memblock_type(memblock_type, rgn) \ - for (idx = 0, rgn = &memblock_type->regions[0]; \ - idx < memblock_type->cnt; \ - idx++, rgn = &memblock_type->regions[idx]) +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); diff --git a/mm/memblock.c b/mm/memblock.c index 91205780e6b1..18dbb69086bc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -533,7 +533,7 @@ repeat: base = obase; nr_new = 0; - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -637,7 +637,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, if (memblock_double_array(type, base, size) < 0) return -ENOMEM; - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -1715,7 +1715,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { char nid_buf[32] = ""; base = rgn->base; @@ -1739,7 +1739,7 @@ memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) unsigned long size = 0; int idx; - for_each_memblock_type((&memblock.reserved), rgn) { + for_each_memblock_type(idx, (&memblock.reserved), rgn) { phys_addr_t start, end; if (rgn->base + rgn->size < start_addr) -- cgit v1.2.3 From 0bea803e9e6bf3836d5368a6426c30a8c0e5eab5 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 15 Nov 2017 17:34:00 -0800 Subject: mm/hmm: constify hmm_devmem_page_get_drvdata() parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constify the pointer parameter to avoid issues when the helper is used from code that only has a const struct page pointer in the first place.
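A minimal sketch of the const-propagation problem the one-line change fixes; struct page is reduced to two fields here, and get_drvdata()/inspect() are hypothetical helpers, not the HMM API:

	#include <stdio.h>

	struct page { unsigned long pgmap; unsigned long drvdata; };

	/* After the fix: the read-only getter accepts a const pointer. */
	static unsigned long get_drvdata(const struct page *page)
	{
		return page->drvdata;
	}

	static void inspect(const struct page *page)
	{
		/* With a non-const get_drvdata() this call would not compile
		 * without a cast that discards the const qualifier. */
		printf("drvdata: %lu\n", get_drvdata(page));
	}

	int main(void)
	{
		struct page p = { 0, 42 };
		inspect(&p);
		return 0;
	}
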
Link: http://lkml.kernel.org/r/1506972774-10191-1-git-send-email-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 96e69979f84d..325017ad9311 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -471,9 +471,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, * @page: pointer to struct page * Return: driver data value */ -static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) +static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; + const unsigned long *drvdata = (const unsigned long *)&page->pgmap; return drvdata[1]; } -- cgit v1.2.3 From 0f10851ea475e08896ee5d9a2036d1bb46a8f3a4 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Wed, 15 Nov 2017 17:34:07 -0800 Subject: mm/mmu_notifier: avoid double notification when it is useless MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch only affects users of the mmu_notifier->invalidate_range callback, which are device drivers related to ATS/PASID, CAPI, IOMMUv2, SVM, etc., and it is an optimization for those users. Everyone else is unaffected by it. When clearing a pte/pmd we are given a choice to notify the event under the page table lock (the notify version of the *_clear_flush helpers does call mmu_notifier_invalidate_range). But that notification is not necessary in all cases. This patch removes almost all cases where it is useless to have a call to mmu_notifier_invalidate_range before mmu_notifier_invalidate_range_end. It also adds documentation in all those cases explaining why. Below is a more in-depth analysis of why it is fine to do this: For a secondary TLB (non-CPU TLB) like an IOMMU TLB or a device TLB (when a device uses something like ATS/PASID to get the IOMMU to walk the CPU page table to access a process virtual address space), there are only two cases when you need to notify the secondary TLB while holding the page table lock when clearing a pte/pmd: A) the page backing the address is freed before mmu_notifier_invalidate_range_end B) a page table entry is updated to point to a new page (COW, write fault on zero page, __replace_page(), ...) Case A is obvious: you do not want to take the risk of the device writing to a page that might now be used by something completely different. Case B is more subtle. For correctness it requires the following sequence to happen: - take page table lock - clear page table entry and notify (pmd/pte_huge_clear_flush_notify()) - set page table entry to point to new page If clearing the page table entry is not followed by a notify before setting the new pte/pmd value, then you can break a memory model like C11 or C++11 for the device. Consider the following scenario (a device using a feature similar to ATS/PASID): Take two addresses, addrA and addrB, such that |addrA - addrB| >= PAGE_SIZE; we assume they are write protected for COW (the other cases of B apply too).
[Time N] ----------------------------------------------------------------- CPU-thread-0 {try to write to addrA} CPU-thread-1 {try to write to addrB} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {read addrA and populate device TLB} DEV-thread-2 {read addrB and populate device TLB} [Time N+1] --------------------------------------------------------------- CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+2] --------------------------------------------------------------- CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+3] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {preempted} CPU-thread-2 {write to addrA which is a write to new page} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+4] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {preempted} CPU-thread-2 {} CPU-thread-3 {write to addrB which is a write to new page} DEV-thread-0 {} DEV-thread-2 {} [Time N+5] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+6] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {read addrA from old page} DEV-thread-2 {read addrB from new page} So here, because at time N+2 the cleared page table entry was not paired with a notification to invalidate the secondary TLB, the device sees the new value for addrB before seeing the new value for addrA. This breaks total memory ordering for the device. When changing a pte to write protect, or to point to a new write-protected page with the same content (KSM), it is OK to delay the invalidate_range callback to mmu_notifier_invalidate_range_end() outside the page table lock. This is true even if the thread doing the page table update is preempted right after releasing the page table lock, before calling mmu_notifier_invalidate_range_end(). Thanks to Andrea for thinking of a problematic scenario for COW.
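The required ordering for case B can be condensed into a sketch with stand-in helpers; it shows the sequence the analysis above mandates for a single COW fault, under invented names, and is not the kernel implementation:

	#include <stdio.h>

	/* Stand-ins for the real kernel primitives. */
	static void lock_pt(void)            { puts("take page table lock"); }
	static void unlock_pt(void)          { puts("drop page table lock"); }
	static void clear_flush_notify(void) { puts("clear PTE, flush CPU TLB, invalidate device TLB"); }
	static void set_pte_new_page(void)   { puts("set PTE to point to the new page"); }

	/* COW write fault replacing the backing page (case B). */
	static void replace_page(void)
	{
		lock_pt();
		clear_flush_notify();	/* the notify MUST precede publishing the new value */
		set_pte_new_page();
		unlock_pt();
		/* mmu_notifier_invalidate_range_end() runs later, outside the
		 * lock; for write-protect-only changes with no new page, that
		 * late notification alone suffices, which is the optimization
		 * this patch implements. */
	}

	int main(void) { replace_page(); return 0; }
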
[jglisse@redhat.com: v2] Link: http://lkml.kernel.org/r/20171017031003.7481-2-jglisse@redhat.com Link: http://lkml.kernel.org/r/20170901173011.10745-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: David Woodhouse Cc: Alistair Popple Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Andrew Donnellan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/mmu_notifier.txt | 93 +++++++++++++++++++++++++++++++++++++++ fs/dax.c | 9 +++- include/linux/mmu_notifier.h | 3 +- mm/huge_memory.c | 20 +++++++-- mm/hugetlb.c | 16 +++++-- mm/ksm.c | 15 ++++++- mm/rmap.c | 59 ++++++++++++++++++++++--- 7 files changed, 198 insertions(+), 17 deletions(-) create mode 100644 Documentation/vm/mmu_notifier.txt (limited to 'include/linux') diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt new file mode 100644 index 000000000000..23b462566bb7 --- /dev/null +++ b/Documentation/vm/mmu_notifier.txt @@ -0,0 +1,93 @@ +When do you need to notify inside page table lock ? + +When clearing a pte/pmd we are given a choice to notify the event through +(notify version of *_clear_flush call mmu_notifier_invalidate_range) under +the page table lock. But that notification is not necessary in all cases. + +For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use +thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a +process virtual address space). There is only 2 cases when you need to notify +those secondary TLB while holding page table lock when clearing a pte/pmd: + + A) page backing address is free before mmu_notifier_invalidate_range_end() + B) a page table entry is updated to point to a new page (COW, write fault + on zero page, __replace_page(), ...) + +Case A is obvious you do not want to take the risk for the device to write to +a page that might now be used by some completely different task. + +Case B is more subtle. For correctness it requires the following sequence to +happen: + - take page table lock + - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) + - set page table entry to point to new page + +If clearing the page table entry is not followed by a notify before setting +the new pte/pmd value then you can break memory model like C11 or C++11 for +the device. + +Consider the following scenario (device use a feature similar to ATS/PASID): + +Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we assume +they are write protected for COW (other case of B apply too). 
+ +[Time N] -------------------------------------------------------------------- +CPU-thread-0 {try to write to addrA} +CPU-thread-1 {try to write to addrB} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {read addrA and populate device TLB} +DEV-thread-2 {read addrB and populate device TLB} +[Time N+1] ------------------------------------------------------------------ +CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} +CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+2] ------------------------------------------------------------------ +CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} +CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+3] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {preempted} +CPU-thread-2 {write to addrA which is a write to new page} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+3] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {preempted} +CPU-thread-2 {} +CPU-thread-3 {write to addrB which is a write to new page} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+4] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+5] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {read addrA from old page} +DEV-thread-2 {read addrB from new page} + +So here because at time N+2 the clear page table entry was not pair with a +notification to invalidate the secondary TLB, the device see the new value for +addrB before seing the new value for addrA. This break total memory ordering +for the device. + +When changing a pte to write protect or to point to a new write protected page +with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range +call to mmu_notifier_invalidate_range_end() outside the page table lock. This +is true even if the thread doing the page table update is preempted right after +releasing page table lock but before call mmu_notifier_invalidate_range_end(). diff --git a/fs/dax.c b/fs/dax.c index f3a44a7c14b3..9ec797424e4f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. 
+ * + * See Documentation/vm/mmu_notifier.txt + */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD pmd_t pmd; @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -643,7 +649,6 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2cf1c3c807f6..130831718e95 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -156,7 +156,8 @@ struct mmu_notifier_ops { * shared page-tables, it not necessary to implement the * invalidate_range_start()/end() notifiers, as * invalidate_range() alread catches the points in time when an - * external TLB range needs to be flushed. + * external TLB range needs to be flushed. For more in depth + * discussion on this see Documentation/vm/mmu_notifier.txt * * The invalidate_range() function is called under the ptl * spin-lock and not allowed to sleep. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 003f7bcd0952..07ae73f4ef91 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); + /* + * Leave pmd empty until pte is filled note we must notify here as + * concurrent CPU thread might write to new page before the call to + * mmu_notifier_invalidate_range_end() happens which can lead to a + * device seeing memory write in different order than CPU. + * + * See Documentation/vm/mmu_notifier.txt + */ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); @@ -2029,8 +2036,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - /* leave pmd empty until pte is filled */ - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * Leave pmd empty until pte is filled note that it is fine to delay + * notification until mmu_notifier_invalidate_range_end() as we are + * replacing a zero pmd write protected page with a zero pte write + * protected page. + * + * See Documentation/vm/mmu_notifier.txt + */ + pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2d2ff5e8bf2b..681b300185c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { + /* + * No need to notify as we are downgrading page + * table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ huge_ptep_set_wrprotect(src, addr, src_pte); - mmu_notifier_invalidate_range(src, mmun_start, - mmun_end); } entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); @@ -4318,7 +4323,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * and that page table be reused and filled with junk. 
*/ flush_hugetlb_tlb_range(vma, start, end); - mmu_notifier_invalidate_range(mm, start, end); + /* + * No need to call mmu_notifier_invalidate_range() we are downgrading + * page table protection not changing it to point to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); diff --git a/mm/ksm.c b/mm/ksm.c index 6cb60f46cce5..be8f4576f842 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. + * + * No need to notify as we are downgrading page table to read + * only not changing it to point to a new page. + * + * See Documentation/vm/mmu_notifier.txt */ - entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); + entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page @@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); + /* + * No need to notify as we are replacing a read only page with another + * read only page with the same content. + * + * See Documentation/vm/mmu_notifier.txt + */ + ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); page_remove_rmap(page, false); diff --git a/mm/rmap.c b/mm/rmap.c index b874c4761e84..7dfc0975de4b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -939,10 +939,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, #endif } - if (ret) { - mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ + if (ret) (*cleaned)++; - } } mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); @@ -1426,6 +1431,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + */ goto discard; } @@ -1483,6 +1492,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * will take care of the rest. */ dec_mm_counter(mm, mm_counter(page)); + /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { swp_entry_t entry; @@ -1498,6 +1510,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. 
+                         */
                } else if (PageAnon(page)) {
                        swp_entry_t entry = { .val = page_private(subpage) };
                        pte_t swp_pte;
@@ -1509,6 +1525,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                WARN_ON_ONCE(1);
                                ret = false;
                                /* We have to invalidate as we cleared the pte */
+                               mmu_notifier_invalidate_range(mm, address,
+                                                       address + PAGE_SIZE);
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
@@ -1516,6 +1534,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        /* MADV_FREE page check */
                        if (!PageSwapBacked(page)) {
                                if (!PageDirty(page)) {
+                                       /* Invalidate as we cleared the pte */
+                                       mmu_notifier_invalidate_range(mm,
+                                               address, address + PAGE_SIZE);
                                        dec_mm_counter(mm, MM_ANONPAGES);
                                        goto discard;
                                }
@@ -1549,13 +1570,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
-               } else
+                       /* Invalidate as we cleared the pte */
+                       mmu_notifier_invalidate_range(mm, address,
+                                                     address + PAGE_SIZE);
+               } else {
+                       /*
+                        * We should not need to notify here as we reach this
+                        * case only from freeze_page() itself only call from
+                        * split_huge_page_to_list() so everything below must
+                        * be true:
+                        * - page is not anonymous
+                        * - page is locked
+                        *
+                        * So as it is a locked file back page thus it can not
+                        * be remove from the page cache and replace by a new
+                        * page before mmu_notifier_invalidate_range_end so no
+                        * concurrent thread might update its page table to
+                        * point at new page while a device still is using this
+                        * page.
+                        *
+                        * See Documentation/vm/mmu_notifier.txt
+                        */
                        dec_mm_counter(mm, mm_counter_file(page));
+               }
 discard:
+               /*
+                * No need to call mmu_notifier_invalidate_range() it has be
+                * done above for all cases requiring it to happen under page
+                * table lock before mmu_notifier_invalidate_range_end()
+                *
+                * See Documentation/vm/mmu_notifier.txt
+                */
                page_remove_rmap(subpage, PageHuge(page));
                put_page(page);
-               mmu_notifier_invalidate_range(mm, address,
-                                             address + PAGE_SIZE);
        }

        mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
--
cgit v1.2.3


From 4645b9fe84bf4878f04c7959a75df7c3c2d1bbb9 Mon Sep 17 00:00:00 2001
From: Jérôme Glisse
Date: Wed, 15 Nov 2017 17:34:11 -0800
Subject: mm/mmu_notifier: avoid call to invalidate_range() in range_end()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is an optimization patch that only affects mmu_notifier users
which rely on the invalidate_range() callback. This patch avoids
calling that callback twice in a row from inside
__mmu_notifier_invalidate_range_end().

Existing pattern (before this patch):
  mmu_notifier_invalidate_range_start()
    pte/pmd/pud_clear_flush_notify()
      mmu_notifier_invalidate_range()
  mmu_notifier_invalidate_range_end()
    mmu_notifier_invalidate_range()

New pattern (after this patch):
  mmu_notifier_invalidate_range_start()
    pte/pmd/pud_clear_flush_notify()
      mmu_notifier_invalidate_range()
  mmu_notifier_invalidate_range_only_end()

We call the invalidate_range callback after clearing the page table
under the page table lock, and we skip the call to invalidate_range
inside the __mmu_notifier_invalidate_range_end() function.
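Concretely, a converted call site has roughly the following shape (a
sketch against a huge-pmd path; the vma, haddr, pmd and range bounds
are illustrative placeholders, not a hunk from this patch):

        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        ptl = pmd_lock(mm, pmd);
        /* notify variant: calls ->invalidate_range() under the ptl */
        pmdp_huge_clear_flush_notify(vma, haddr, pmd);
        /* ... install the new entry ... */
        spin_unlock(ptl);
        /*
         * Runs ->invalidate_range_end() but skips the second
         * ->invalidate_range() that plain _end() would have issued.
         */
        mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);

Call sites that did not go through a *_clear_flush_notify() helper must
keep using mmu_notifier_invalidate_range_end(), otherwise the device
would never be notified at all.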
Idea from Andrea Arcangeli Link: http://lkml.kernel.org/r/20171017031003.7481-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: David Woodhouse Cc: Alistair Popple Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Andrew Donnellan Cc: Nadav Amit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 17 ++++++++++++++-- mm/huge_memory.c | 46 ++++++++++++++++++++++++++++++++++++++++---- mm/memory.c | 6 +++++- mm/migrate.c | 15 ++++++++++++--- mm/mmu_notifier.c | 11 +++++++++-- 5 files changed, 83 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 130831718e95..b25dc9db19fc 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -214,7 +214,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); @@ -268,7 +269,14 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_end(mm, start, end); + __mmu_notifier_invalidate_range_end(mm, start, end, false); +} + +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_invalidate_range_end(mm, start, end, true); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -439,6 +447,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, { } +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 07ae73f4ef91..cc65fb87c9db 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1223,7 +1223,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, page_remove_rmap(page, true); spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -1372,7 +1377,12 @@ alloc: } spin_unlock(vmf->ptl); out_mn: - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. 
+ */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); out: return ret; out_unlock: @@ -2024,7 +2034,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pudp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PUD_SIZE); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2099,6 +2114,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { + /* + * FIXME: Do we want to invalidate secondary mmu by calling + * mmu_notifier_invalidate_range() see comments below inside + * __split_huge_pmd() ? + * + * We are going from a zero huge page write protected to zero + * small page also write protected so it does not seems useful + * to invalidate secondary mmu at this time. + */ return __split_huge_zero_page_pmd(vma, haddr, pmd); } @@ -2234,7 +2258,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback. + * They are 3 cases to consider inside __split_huge_pmd_locked(): + * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious + * 2) __split_huge_zero_page_pmd() read only zero page and any write + * fault will trigger a flush_notify before pointing to a new page + * (it is fine if the secondary mmu keeps pointing to the old zero + * page in the meantime) + * 3) Split a huge pmd into pte pointing to the same page. No need + * to invalidate secondary tlb entry they are all still valid. + * any further changes to individual pte will notify. So no need + * to call mmu_notifier->invalidate_range() + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PMD_SIZE); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index a4518aedf4dd..42fb30300bb5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2554,7 +2554,11 @@ static int wp_page_copy(struct vm_fault *vmf) put_page(new_page); pte_unmap_unlock(vmf->pte, vmf->ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/migrate.c b/mm/migrate.c index 1236449b4777..4d0be47a322a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2089,7 +2089,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. 
*/ get_page(new_page); @@ -2805,9 +2809,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. + */ if (notified) - mmu_notifier_invalidate_range_end(mm, mmu_start, - migrate->end); + mmu_notifier_invalidate_range_only_end(mm, mmu_start, + migrate->end); } /* diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 314285284e6e..96edb33fd09a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -190,7 +190,9 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, + unsigned long end, + bool only_end) { struct mmu_notifier *mn; int id; @@ -204,8 +206,13 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, * subsystem registers either invalidate_range_start()/end() or * invalidate_range(), so this will be no additional overhead * (besides the pointer check). + * + * We skip call to invalidate_range() if we know it is safe ie + * call site use mmu_notifier_invalidate_range_only_end() which + * is safe to do when we know that a call to invalidate_range() + * already happen under page table lock. */ - if (mn->ops->invalidate_range) + if (!only_end && mn->ops->invalidate_range) mn->ops->invalidate_range(mn, mm, start, end); if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); -- cgit v1.2.3 From 3a50d14d0df5776e002a8683a290c87eeac93a21 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Nov 2017 17:34:15 -0800 Subject: mm: remove unused pgdat->inactive_ratio Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") 'pgdat->inactive_ratio' is not used, except for printing "node_inactive_ratio: 0" in /proc/zoneinfo output. Remove it. Link: http://lkml.kernel.org/r/20171003152611.27483-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ------ mm/vmscan.c | 2 +- mm/vmstat.c | 6 ++---- 3 files changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a507f43ad221..39ccaa9c428b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -712,12 +712,6 @@ typedef struct pglist_data { /* Fields commonly accessed by the page reclaim scanner */ struct lruvec lruvec; - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this node's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - unsigned long flags; ZONE_PADDING(_pad2_) diff --git a/mm/vmscan.c b/mm/vmscan.c index 15b483ef6440..2852b8c5a917 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2082,7 +2082,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages - * on this LRU, maintained by the pageout code. A zone->inactive_ratio + * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. 
 *
 * total     target    max
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4bb13e72ac97..7d11554861e4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1564,11 +1564,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
        }
        seq_printf(m, "\n  node_unreclaimable: %u"
-                  "\n  start_pfn: %lu"
-                  "\n  node_inactive_ratio: %u",
+                  "\n  start_pfn: %lu",
                   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
-                  zone->zone_start_pfn,
-                  zone->zone_pgdat->inactive_ratio);
+                  zone->zone_start_pfn);
        seq_putc(m, '\n');
 }
--
cgit v1.2.3


From 8745808fda84c638e45cc860c8fb600bf4b0a2a6 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Wed, 15 Nov 2017 17:34:22 -0800
Subject: mm, arch: remove empty_bad_page*

empty_bad_page() and empty_bad_pte_table() seem to be relics from the
old days which have not been used by any code for a long time. I have
tried to find out when exactly, but this is not really all that
straightforward due to many code movements - the traces disappear
around the 2.4 kernels. Anyway, no code references either
empty_bad_page or empty_bad_pte_table anymore. We only allocate the
storage, which is not used by anybody, so remove them.

Link: http://lkml.kernel.org/r/20171004150045.30755-1-mhocko@kernel.org
Signed-off-by: Michal Hocko
Acked-by: Ralf Baechle
Acked-by: Ingo Molnar
Cc: Yoshinori Sato
Cc: David Howells
Cc: Rich Felker
Cc: Jeff Dike
Cc: Richard Weinberger
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/frv/mm/init.c                 | 14 --------------
 arch/h8300/mm/init.c               | 13 -------------
 arch/mips/include/asm/pgtable-64.h |  8 +-------
 arch/mn10300/kernel/head.S         |  8 --------
 arch/sh/kernel/head_64.S           |  8 --------
 arch/um/kernel/mem.c               |  3 ---
 include/linux/page-flags.h         |  2 +-
 7 files changed, 2 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index 328f0a292316..cf464100e838 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -42,21 +42,9 @@
 #undef DEBUG

 /*
- * BAD_PAGE is the page that is used for page faults when linux
- * is out-of-memory. Older versions of linux just did a
- * do_exit(), but using this instead means there is less risk
- * for a process dying in kernel mode, possibly leaving a inode
- * unused etc..
- *
- * BAD_PAGETABLE is the accompanying page-table: it is initialized
- * to point to BAD_PAGE entries.
- *
  * ZERO_PAGE is a special page that is used for zero-initialized
  * data and COW.
  */
-static unsigned long empty_bad_page_table;
-static unsigned long empty_bad_page;
-
 unsigned long empty_zero_page;
 EXPORT_SYMBOL(empty_zero_page);

@@ -72,8 +60,6 @@ void __init paging_init(void)
        unsigned long zones_size[MAX_NR_ZONES] = {0, };

        /* allocate some pages for kernel housekeeping tasks */
-       empty_bad_page_table = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-       empty_bad_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
        empty_zero_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);

        memset((void *) empty_zero_page, 0, PAGE_SIZE);
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index eeead51bed2d..015287ac8ce8 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -40,20 +40,9 @@
 #include

 /*
- * BAD_PAGE is the page that is used for page faults when linux
- * is out-of-memory. Older versions of linux just did a
- * do_exit(), but using this instead means there is less risk
- * for a process dying in kernel mode, possibly leaving a inode
- * unused etc..
- *
- * BAD_PAGETABLE is the accompanying page-table: it is initialized
- * to point to BAD_PAGE entries.
- * * ZERO_PAGE is a special page that is used for zero-initialized * data and COW. */ -static unsigned long empty_bad_page_table; -static unsigned long empty_bad_page; unsigned long empty_zero_page; /* @@ -78,8 +67,6 @@ void __init paging_init(void) * Initialize the bad page table and bad page to point * to a couple of allocated pages. */ - empty_bad_page_table = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); - empty_bad_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); empty_zero_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); memset((void *)empty_zero_page, 0, PAGE_SIZE); diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 67fe6dc5211c..0036ea0c7173 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -31,12 +31,7 @@ * tables. Each page table is also a single 4K page, giving 512 (== * PTRS_PER_PTE) 8 byte ptes. Each pud entry is initialized to point to * invalid_pmd_table, each pmd entry is initialized to point to - * invalid_pte_table, each pte is initialized to 0. When memory is low, - * and a pmd table or a page table allocation fails, empty_bad_pmd_table - * and empty_bad_page_table is returned back to higher layer code, so - * that the failure is recognized later on. Linux does not seem to - * handle these failures very well though. The empty_bad_page_table has - * invalid pte entries in it, to force page faults. + * invalid_pte_table, each pte is initialized to 0. * * Kernel mappings: kernel mappings are held in the swapper_pg_table. * The layout is identical to userspace except it's indexed with the @@ -175,7 +170,6 @@ printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) extern pte_t invalid_pte_table[PTRS_PER_PTE]; -extern pte_t empty_bad_page_table[PTRS_PER_PTE]; #ifndef __PAGETABLE_PUD_FOLDED /* diff --git a/arch/mn10300/kernel/head.S b/arch/mn10300/kernel/head.S index 73e00fc78072..0b15f759e0d2 100644 --- a/arch/mn10300/kernel/head.S +++ b/arch/mn10300/kernel/head.S @@ -433,14 +433,6 @@ ENTRY(swapper_pg_dir) ENTRY(empty_zero_page) .space PAGE_SIZE - .balign PAGE_SIZE -ENTRY(empty_bad_page) - .space PAGE_SIZE - - .balign PAGE_SIZE -ENTRY(empty_bad_pte_table) - .space PAGE_SIZE - .balign PAGE_SIZE ENTRY(large_page_table) .space PAGE_SIZE diff --git a/arch/sh/kernel/head_64.S b/arch/sh/kernel/head_64.S index defd851abefa..cca491397a28 100644 --- a/arch/sh/kernel/head_64.S +++ b/arch/sh/kernel/head_64.S @@ -101,14 +101,6 @@ empty_zero_page: mmu_pdtp_cache: .space PAGE_SIZE, 0 - .global empty_bad_page -empty_bad_page: - .space PAGE_SIZE, 0 - - .global empty_bad_pte_table -empty_bad_pte_table: - .space PAGE_SIZE, 0 - .global fpu_in_use fpu_in_use: .quad 0 diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index e7437ec62710..3c0e470ea646 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -22,8 +22,6 @@ /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ unsigned long *empty_zero_page = NULL; EXPORT_SYMBOL(empty_zero_page); -/* allocated in paging_init and unchanged thereafter */ -static unsigned long *empty_bad_page = NULL; /* * Initialized during boot, and readonly for initializing page tables @@ -146,7 +144,6 @@ void __init paging_init(void) int i; empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); - empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(zones_size); i++) zones_size[i] = 0; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 
584b14c774c1..3ec44e27aa9d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -18,7 +18,7 @@ * Various page->flags bits: * * PG_reserved is set for special pages, which can never be swapped out. Some - * of them might not even exist (eg empty_bad_page)... + * of them might not even exist... * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by -- cgit v1.2.3 From 72b045aecdd856b083521f2a963705b4c2e59680 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:33 -0800 Subject: mm: implement find_get_pages_range_tag() Patch series "Ranged pagevec tagged lookup", v3. In this series I provide a ranged variant of pagevec_lookup_tag() and use it in places where it makes sense. This series removes some common code and it also has a potential for speeding up some operations similarly as for pagevec_lookup_range() (but for now I can think of only artificial cases where this happens). This patch (of 16): Implement a variant of find_get_pages_tag() that stops iterating at given index. Lots of users of this function (through pagevec_lookup()) actually want a range lookup and all of them are currently open-coding this. Also create corresponding pagevec_lookup_range_tag() function. Link: http://lkml.kernel.org/r/20171009151359.31984-2-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Cc: Chao Yu Cc: David Howells Cc: David Sterba Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Ryusuke Konishi Cc: Steve French Cc: "Theodore Ts'o" Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++-- include/linux/pagevec.h | 11 +++++++++-- mm/filemap.c | 33 ++++++++++++++++++++++++--------- mm/swap.c | 9 +++++---- 4 files changed, 48 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e08b5339023c..956d6a80e126 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -366,8 +366,16 @@ static inline unsigned find_get_pages(struct address_space *mapping, } unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages_tag(struct address_space *mapping, + pgoff_t *index, int tag, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, + nr_pages, pages); +} unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, int tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 2636c0c0f279..afc718f586f8 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -38,9 +38,16 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); } -unsigned pagevec_lookup_tag(struct pagevec *pvec, +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages); +static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct 
address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages); + unsigned nr_pages) +{ + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, + nr_pages); +} static inline void pagevec_init(struct pagevec *pvec, int cold) { diff --git a/mm/filemap.c b/mm/filemap.c index 594d73fef8b4..cf74d0dacc6a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1754,9 +1754,10 @@ repeat: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_tag - find and return pages that match @tag + * find_get_pages_range_tag - find and return pages in given range matching @tag * @mapping: the address_space to search * @index: the starting page index + * @end: The final page index (inclusive) * @tag: the tag index * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed @@ -1764,8 +1765,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. */ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1778,6 +1780,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1819,18 +1824,28 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *index = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when we got at @end. We take care to not overflow the + * index @index as it confuses some of the callers. This breaks the + * iteration when there is page at index -1 but that is already broken + * anyway. + */ + if (end == (pgoff_t)-1) + *index = (pgoff_t)-1; + else + *index = end + 1; +out: rcu_read_unlock(); - if (ret) - *index = pages[ret - 1]->index + 1; - return ret; } -EXPORT_SYMBOL(find_get_pages_tag); +EXPORT_SYMBOL(find_get_pages_range_tag); /** * find_get_entries_tag - find and return entries that match @tag diff --git a/mm/swap.c b/mm/swap.c index a77d68f2c1b6..e1c74eb8a775 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -986,14 +986,15 @@ unsigned pagevec_lookup_range(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range); -unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *index, int tag, unsigned nr_pages) +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages) { - pvec->nr = find_get_pages_tag(mapping, index, tag, + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, nr_pages, pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup_tag); +EXPORT_SYMBOL(pagevec_lookup_range_tag); /* * Perform any setup for the swap system -- cgit v1.2.3 From 93d3b7140ad379885849ad2674b4290c9e8273da Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:12 -0800 Subject: mm: add variant of pagevec_lookup_range_tag() taking number of pages Currently pagevec_lookup_range_tag() takes number of pages to look up but most users don't need this. 
Create a new function pagevec_lookup_range_nr_tag() that takes a
maximum number of pages to look up, for Ceph which wants this
functionality, so that we can drop the nr_pages argument from
pagevec_lookup_range_tag().

Link: http://lkml.kernel.org/r/20171009151359.31984-13-jack@suse.cz
Signed-off-by: Jan Kara
Reviewed-by: Daniel Jordan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/pagevec.h | 3 +++
 mm/swap.c               | 9 +++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index afc718f586f8..87a2dfd62f5e 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -41,6 +41,9 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec,
 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                int tag, unsigned nr_pages);
+unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
+               struct address_space *mapping, pgoff_t *index, pgoff_t end,
+               int tag, unsigned max_pages);
 static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, int tag,
                unsigned nr_pages)
diff --git a/mm/swap.c b/mm/swap.c
index e1c74eb8a775..6c50fec2da92 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -996,6 +996,15 @@ unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
 }
 EXPORT_SYMBOL(pagevec_lookup_range_tag);

+unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
+               struct address_space *mapping, pgoff_t *index, pgoff_t end,
+               int tag, unsigned max_pages)
+{
+       pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+               min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
+       return pagevec_count(pvec);
+}
+EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);

 /*
  * Perform any setup for the swap system
  */
--
cgit v1.2.3


From 67fd707f468142d0f689a6240044bb45c1913003 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 15 Nov 2017 17:35:19 -0800
Subject: mm: remove nr_pages argument from pagevec_lookup_{,range}_tag()

All users of pagevec_lookup_tag() and pagevec_lookup_range_tag() now
pass PAGEVEC_SIZE as the desired number of pages. Just drop the
argument.
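For reference, a typical tagged lookup loop over a range then reads as
in the sketch below (mapping, start and end are placeholder variables,
and the per-page work is elided):

        struct pagevec pvec;
        pgoff_t index = start;
        unsigned i, nr_pages;

        pagevec_init(&pvec, 0);
        while (index <= end &&
               (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
                                        end, PAGECACHE_TAG_DIRTY))) {
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];

                        /* ... write the page back ... */
                }
                pagevec_release(&pvec);
                cond_resched();
        }

Each call fills the pagevec with at most PAGEVEC_SIZE pages and
advances *index past the last page returned, so the loop terminates
once the range is exhausted.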
Link: http://lkml.kernel.org/r/20171009151359.31984-15-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 6 +++--- fs/ceph/addr.c | 3 +-- fs/ext4/inode.c | 2 +- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 8 ++++---- fs/gfs2/aops.c | 2 +- fs/nilfs2/btree.c | 4 ++-- fs/nilfs2/page.c | 7 +++---- fs/nilfs2/segment.c | 6 +++--- include/linux/pagevec.h | 8 +++----- mm/filemap.c | 2 +- mm/page-writeback.c | 2 +- mm/swap.c | 4 ++-- 14 files changed, 27 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c18cf5d59521..93fdaac81212 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3815,7 +3815,7 @@ retry: tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE))) { + tag))) { unsigned i; scanned = 1; @@ -3956,8 +3956,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, + &index, end, tag))) { unsigned i; scanned = 1; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 0fc2987cb62d..26c682db94ee 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1162,8 +1162,7 @@ release_pvec_pages: index = 0; while ((index <= end) && (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - PAGEVEC_SIZE))) { + PAGECACHE_TAG_WRITEBACK))) { for (i = 0; i < nr; i++) { page = pvec.pages[i]; if (page_snap_context(page) != snapc) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3d0708c91c5a..b239a41dbeeb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2621,7 +2621,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) goto out; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fb74ec4ed9d2..6124f8710dc3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -319,7 +319,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, blk_start_plug(&plug); while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 17d2c2997ddd..687703755824 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1670,7 +1670,7 @@ retry: int i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ca020e7d7465..d6e4df0bb622 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1286,7 +1286,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1440,7 +1440,7 @@ retry: index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ 
-1553,7 +1553,7 @@ next_step: index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1651,7 +1651,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) pagevec_init(&pvec, 0); while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE))) { + PAGECACHE_TAG_WRITEBACK))) { int i; for (i = 0; i < nr_pages; i++) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index d0848d9623fb..3fea3d7780b0 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -398,7 +398,7 @@ retry: done_index = index; while (!done && (index <= end)) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 06ffa135dfa6..35989c7bb065 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2158,8 +2158,8 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, btcache, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 8616c46d33da..1c16726915c1 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -257,8 +257,7 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -376,8 +375,8 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 68e5769cef3b..19366ab20bea 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -712,7 +712,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, repeat: if (unlikely(index > last) || !pagevec_lookup_range_tag(&pvec, mapping, &index, last, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) + PAGECACHE_TAG_DIRTY)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -755,8 +755,8 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87a2dfd62f5e..4f56e0ad9d00 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -40,16 +40,14 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages); + int tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned 
max_pages);
 static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
-               struct address_space *mapping, pgoff_t *index, int tag,
-               unsigned nr_pages)
+               struct address_space *mapping, pgoff_t *index, int tag)
 {
-       return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag,
-                       nr_pages);
+       return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
 }

 static inline void pagevec_init(struct pagevec *pvec, int cold)
diff --git a/mm/filemap.c b/mm/filemap.c
index 229481d258bc..6eb4e32d99c8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -424,7 +424,7 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
                unsigned i;

                nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
-                               end, PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE);
+                               end, PAGECACHE_TAG_WRITEBACK);
                if (!nr_pages)
                        break;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 460fc022cbc8..231651a1486d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2197,7 +2197,7 @@ retry:
                int i;

                nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
-                               tag, PAGEVEC_SIZE);
+                               tag);
                if (nr_pages == 0)
                        break;
diff --git a/mm/swap.c b/mm/swap.c
index 6c50fec2da92..4edac536fe24 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -988,10 +988,10 @@ EXPORT_SYMBOL(pagevec_lookup_range);

 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
-               int tag, unsigned nr_pages)
+               int tag)
 {
        pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
-                                       nr_pages, pvec->pages);
+                                       PAGEVEC_SIZE, pvec->pages);
        return pagevec_count(pvec);
 }
 EXPORT_SYMBOL(pagevec_lookup_range_tag);
--
cgit v1.2.3


From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Wed, 15 Nov 2017 17:35:33 -0800
Subject: mm: account pud page tables

On a machine with 5-level paging support a process can allocate a
significant amount of memory and stay unnoticed by the oom-killer and
the memory cgroup. The trick is to allocate a lot of PUD page tables.
We don't account PUD page tables, only PMD and PTE ones.

We already addressed the same issue for PMD page tables, see commit
dc6c9a35b66b ("mm: account pmd page tables to the process"). The
introduction of 5-level paging brings the same issue for PUD page
tables.

This patch expands the accounting to the PUD level.

[kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/]
  Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com
[heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting]
  Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A.
Shutemov Signed-off-by: Heiko Carstens Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 8 ++++---- arch/powerpc/mm/hugetlbpage.c | 1 + arch/s390/include/asm/mmu_context.h | 4 +++- arch/sparc/mm/hugetlbpage.c | 1 + fs/proc/task_mmu.c | 5 ++++- include/linux/mm.h | 36 +++++++++++++++++++++++++++++++++--- include/linux/mm_types.h | 3 +++ kernel/fork.c | 4 ++++ mm/debug.c | 6 ++++-- mm/memory.c | 15 +++++++++------ mm/oom_kill.c | 8 +++++--- 11 files changed, 71 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 30fd16b14196..2729e8db9492 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -629,10 +629,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj -score, and name. This is helpful to determine why the OOM killer was -invoked, to identify the rogue task that caused it, and to determine why -the OOM killer chose the task it did to kill. +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, +oom_score_adj score, and name. This is helpful to determine why the OOM +killer was invoked, to identify the rogue task that caused it, and to +determine why the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1571a498a33f..a9b9083c5e49 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } /* diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 43607bb12cc2..cf4c1cb17dcd 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce_limit = STACK_TOP_MAX; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + /* pgd_alloc() did not account this pud */ + mm_inc_nr_puds(mm); break; case -PAGE_SIZE: /* forked 5-level task, set new asce with new_mm->pgd */ @@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk, /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - /* pgd_alloc() did not increase mm->nr_pmds */ + /* pgd_alloc() did not account this pmd */ mm_inc_nr_pmds(mm); } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5078b7f68890..01f63b4ee2b4 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } void hugetlb_free_pgd_range(struct mmu_gather *tlb, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6744bd706ecf..a05ce6186f99 
100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -52,6 +52,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) swap = get_mm_counter(mm, MM_SWAPENTS); ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); + puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -68,6 +69,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" "VmPMD:\t%8lu kB\n" + "VmPUD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -82,6 +84,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, + puds >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 91b46f99b4d2..9af86f39d928 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PUD_FOLDED +#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_nr_puds_init(struct mm_struct *mm) {} +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} + #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); + +static inline void mm_nr_puds_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_puds, 0); +} + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_puds); +} + +static inline void mm_inc_nr_puds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_puds); +} + +static inline void mm_dec_nr_puds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_puds); +} #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) @@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, static inline void mm_nr_pmds_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return 0; } @@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm) atomic_long_set(&mm->nr_pmds, 0); } -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return atomic_long_read(&mm->nr_pmds); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d1b8e8f97fc2..e9e561e02d22 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -404,6 +404,9 @@ struct mm_struct { atomic_long_t nr_ptes; /* PTE page table pages */ #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ +#endif +#if CONFIG_PGTABLE_LEVELS > 3 + atomic_long_t nr_puds; /* PUD page table pages */ #endif int map_count; /* number of VMAs 
*/ diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..a4eb6f289365 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); mm_nr_pmds_init(mm); + mm_nr_puds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm) if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); + if (mm_nr_puds(mm)) + pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", + mm_nr_puds(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); diff --git a/mm/debug.c b/mm/debug.c index 6726bec731c9..a12d826bb774 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d\n" + "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), - mm_nr_pmds((struct mm_struct *)mm), + mm_nr_pmds(mm), + mm_nr_puds(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/memory.c b/mm/memory.c index 42fb30300bb5..6bbd4078ec98 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -4149,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_5LEVEL_HACK - if (p4d_present(*p4d)) /* Another has populated it */ - pud_free(mm, new); - else + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); -#else - if (pgd_present(*p4d)) /* Another has populated it */ + } else /* Another has populated it */ pud_free(mm, new); - else +#else + if (!pgd_present(*p4d)) { + mm_inc_nr_puds(mm); pgd_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); #endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3023919970f7..f642a45b7f14 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,7 +221,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. 
*/ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + + mm_nr_puds(p->mm); task_unlock(p); /* @@ -397,7 +398,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) continue; @@ -413,11 +414,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), atomic_long_read(&task->mm->nr_ptes), mm_nr_pmds(task->mm), + mm_nr_puds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:37 -0800 Subject: mm: introduce wrappers to access mm->nr_ptes Let's add wrappers for ->nr_ptes with the same interface as for nr_pmds and nr_puds. The patch also makes nr_ptes accounting dependent on CONFIG_MMU. Page table accounting doesn't make sense if you don't have page tables. It's preparation for consolidation of page-table counters in mm_struct. Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A.
Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/unicore32/mm/pgd.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ kernel/fork.c | 6 +++--- mm/debug.c | 2 +- mm/huge_memory.c | 10 +++++----- mm/khugepaged.c | 2 +- mm/memory.c | 8 ++++---- mm/oom_kill.c | 5 ++--- 12 files changed, 54 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index c1c1a5c67da1..61e281cb29fb 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); no_pmd: pud_clear(pud); pmd_free(mm, pmd); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 01f63b4ee2b4..0112d6942288 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index c572a28c76c9..a830a300aaa1 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c @@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); free: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a05ce6186f99..9bd2a0294ac1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -50,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm); pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, diff --git a/include/linux/mm.h b/include/linux/mm.h index 9af86f39d928..2ca799f0d762 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) } #endif +#ifdef CONFIG_MMU +static inline void mm_nr_ptes_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_ptes, 0); +} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_ptes); +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_ptes); +} + +static inline void mm_dec_nr_ptes(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_ptes); +} +#else +static inline void mm_nr_ptes_init(struct mm_struct *mm) {} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} +static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} +#endif + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 
e9e561e02d22..e42048020664 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -401,7 +401,9 @@ struct mm_struct { */ atomic_t mm_count; +#ifdef CONFIG_MMU atomic_long_t nr_ptes; /* PTE page table pages */ +#endif #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index a4eb6f289365..946922a30ede 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,7 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); + mm_nr_ptes_init(mm); mm_nr_pmds_init(mm); mm_nr_puds_init(mm); mm->map_count = 0; @@ -873,9 +873,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (atomic_long_read(&mm->nr_ptes)) + if (mm_nr_ptes(mm)) pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - atomic_long_read(&mm->nr_ptes)); + mm_nr_ptes(mm)); if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); diff --git a/mm/debug.c b/mm/debug.c index a12d826bb774..c9888a6d7875 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm) mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_ptes(mm), mm_nr_pmds(mm), mm_nr_puds(mm), mm->map_count, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cc65fb87c9db..3610d81c062a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); return true; } @@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); } set_pmd_at(mm, addr, pmd, entry); @@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); @@ -1695,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 43cb3043311b..ea4ff259b671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) _pmd = pmdp_collapse_flush(vma, addr, pmd); spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - 
atomic_long_dec(&vma->vm_mm->nr_ptes); + mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } } diff --git a/mm/memory.c b/mm/memory.c index 6bbd4078ec98..6dec21b182b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -666,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); pmd_populate(mm, pmd, new); new = NULL; } @@ -3238,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf) goto map_pte; } - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; @@ -3297,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) * We are going to consume the prealloc table, * count that as nr_ptes. */ - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f642a45b7f14..f9300141480e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,8 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + - mm_nr_puds(p->mm); + mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm); task_unlock(p); /* @@ -417,7 +416,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - atomic_long_read(&task->mm->nr_ptes), + mm_nr_ptes(task->mm), mm_nr_pmds(task->mm), mm_nr_puds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), -- cgit v1.2.3 From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:40 -0800 Subject: mm: consolidate page table accounting Currently, we account page tables separately for each page table level, but that's redundant -- we only make use of the total memory allocated to page tables for the oom_badness calculation. We also provide the information to userspace, but it has dubious value there too. This patch switches page table accounting to a single counter. mm->pgtables_bytes is now used to account for all page table levels. We use bytes because the page table size may differ between levels of the page table tree. The change has a user-visible effect: VmPMD and VmPUD are no longer reported in /proc/[pid]/status. Not sure if anybody uses them. (As an alternative, we could always report 0 kB for them.) The OOM-killer report also changes slightly: we now report pgtables_bytes instead of nr_ptes, nr_pmds and nr_puds. Apart from reducing the number of per-mm counters, the benefit is that we now calculate oom_badness() more correctly for machines whose page table size differs by level, or where page tables are smaller than a page. The only downside is debuggability: we no longer know which page table level could be leaking.
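For readers following the conversion, here is a worked sketch of how the byte-based counter behaves. The helpers are the ones this patch introduces, but the constants assume a typical x86-64 4-level configuration (512 entries of 8 bytes per table, PAGE_SIZE of 4096); the function itself is illustrative only and is not part of the patch:

#include <linux/mm.h>

/* Sketch: account one freshly allocated table at each level. */
static void pgtables_bytes_demo(struct mm_struct *mm)
{
	mm_inc_nr_ptes(mm);	/* pgtables_bytes += 512 * sizeof(pte_t) = 4096 */
	mm_inc_nr_pmds(mm);	/* pgtables_bytes += 512 * sizeof(pmd_t) = 4096 */
	mm_inc_nr_puds(mm);	/* pgtables_bytes += 512 * sizeof(pud_t) = 4096 */

	/*
	 * oom_badness() divides by PAGE_SIZE to get back to pages:
	 * 12288 / 4096 = 3, assuming the counter started at zero.
	 */
	WARN_ON(mm_pgtables_bytes(mm) / PAGE_SIZE != 3);
}

On such a configuration every table at every level is exactly one page, so the reported totals are unchanged; the byte-based sum only diverges -- and becomes more accurate -- where a table is not exactly a page.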
But I do not remember many bugs that would be caught by separate counters so I wouldn't lose sleep over this. [akpm@linux-foundation.org: fix mm/huge_memory.c] Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko [kirill.shutemov@linux.intel.com: fix build] Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 1 - Documentation/sysctl/vm.txt | 8 +++--- fs/proc/task_mmu.c | 11 ++------ include/linux/mm.h | 58 ++++++++------------------------------ include/linux/mm_types.h | 8 +----- kernel/fork.c | 16 +++-------- mm/debug.c | 7 ++--- mm/huge_memory.c | 2 +- mm/oom_kill.c | 14 ++++----- 9 files changed, 32 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index adba21b5ada7..ec571b9bb18a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries - VmPMD size of second level page tables VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 2729e8db9492..3e579740b49f 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -629,10 +629,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, -oom_score_adj score, and name. This is helpful to determine why the OOM -killer was invoked, to identify the rogue task that caused it, and to -determine why the OOM killer chose the task it did to kill. +pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. 
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9bd2a0294ac1..875231c36cb3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; + unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -50,9 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm); - pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); - puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -68,8 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" - "VmPMD:\t%8lu kB\n" - "VmPUD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -82,9 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - ptes >> 10, - pmds >> 10, - puds >> 10, + mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca799f0d762..7c1e82a1aa77 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, { return 0; } - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return 0; -} - -static inline void mm_nr_puds_init(struct mm_struct *mm) {} static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); -static inline void mm_nr_puds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_puds, 0); -} - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_puds); -} - static inline void mm_inc_nr_puds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_puds); + atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_puds); + atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif @@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, return 0; } -static inline void mm_nr_pmds_init(struct mm_struct *mm) {} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return 0; -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); -static inline void mm_nr_pmds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_pmds, 0); -} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_pmds); -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_pmds); + atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void 
mm_dec_nr_pmds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_pmds); + atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU -static inline void mm_nr_ptes_init(struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { - atomic_long_set(&mm->nr_ptes, 0); + atomic_long_set(&mm->pgtables_bytes, 0); } -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { - return atomic_long_read(&mm->nr_ptes); + return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_ptes); + atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_ptes); + atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else -static inline void mm_nr_ptes_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e42048020664..09643e0472fc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -402,13 +402,7 @@ struct mm_struct { atomic_t mm_count; #ifdef CONFIG_MMU - atomic_long_t nr_ptes; /* PTE page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 2 - atomic_long_t nr_pmds; /* PMD page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 3 - atomic_long_t nr_puds; /* PUD page table pages */ + atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif int map_count; /* number of VMAs */ diff --git a/kernel/fork.c b/kernel/fork.c index 946922a30ede..006dc5899a1a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,9 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - mm_nr_ptes_init(mm); - mm_nr_pmds_init(mm); - mm_nr_puds_init(mm); + mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -873,15 +871,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (mm_nr_ptes(mm)) - pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - mm_nr_ptes(mm)); - if (mm_nr_pmds(mm)) - pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", - mm_nr_pmds(mm)); - if (mm_nr_puds(mm)) - pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", - mm_nr_puds(mm)); + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); diff --git a/mm/debug.c b/mm/debug.c index c9888a6d7875..d947f3e03b0d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,8 +105,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d\n" - "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -136,9 +135,7 @@ void dump_mm(const struct mm_struct *mm) 
mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - mm_nr_ptes(mm), - mm_nr_pmds(mm), - mm_nr_puds(mm), + mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3610d81c062a..86fe697e8bfb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9300141480e..26add8a0d1f7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm); + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* @@ -389,15 +389,15 @@ static void select_bad_process(struct oom_control *oc) * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, - * swapents, oom_score_adj value, and name. + * State information includes task's pid, uid, tgid, vm size, rss, + * pgtables_bytes, swapents, oom_score_adj value, and name. */ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -413,12 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - mm_nr_ptes(task->mm), - mm_nr_pmds(task->mm), - mm_nr_puds(task->mm), + mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:51 -0800 Subject: kmemcheck: remove annotations Patch series "kmemcheck: kill kmemcheck", v2. As discussed at LSF/MM, kill kmemcheck. KASAN is a replacement that is able to work without the limitations of kmemcheck (single CPU only, slow). KASAN is already upstream. We are also not aware of any users of kmemcheck (or users who don't consider KASAN a suitable replacement). The only objection was that, since KASAN wasn't supported by all GCC versions provided by distros at that time, we should hold off for two years and try again. Now that two years have passed and all distros provide a gcc that supports KASAN, kill kmemcheck again for the very same reasons.
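For anyone moving a debug configuration over, a minimal sketch of enabling the replacement. The symbols below are the upstream KASAN Kconfig options of this era; whether they are selectable depends on the architecture and compiler, so treat this as a starting point rather than a recipe:

# .config fragment (sketch, assuming x86_64 and a KASAN-capable gcc or clang)
CONFIG_KASAN=y
CONFIG_KASAN_INLINE=y	# faster checks, larger image; CONFIG_KASAN_OUTLINE=y is the smaller, slower variant
CONFIG_STACKTRACE=y	# fuller allocation/free stack traces in KASAN reports

Unlike kmemcheck, KASAN needs no per-structure annotations, which is why the series can simply delete the kmemcheck_* markers in the diffs below rather than translate them.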
This patch (of 4): Remove kmemcheck annotations, and calls to kmemcheck from the kernel. [alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs] Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/dma-iommu.h | 1 - arch/openrisc/include/asm/dma-mapping.h | 1 - arch/x86/Makefile | 5 ----- arch/x86/include/asm/dma-mapping.h | 1 - arch/x86/include/asm/xor.h | 5 +---- arch/x86/kernel/traps.c | 5 ----- arch/x86/mm/fault.c | 6 ------ drivers/char/random.c | 1 - drivers/misc/c2port/core.c | 2 -- fs/dcache.c | 2 -- include/linux/c2port.h | 4 ---- include/linux/dma-mapping.h | 8 +------- include/linux/filter.h | 2 -- include/linux/mm_types.h | 8 -------- include/linux/net.h | 3 --- include/linux/ring_buffer.h | 3 --- include/linux/skbuff.h | 3 --- include/net/inet_sock.h | 3 --- include/net/inet_timewait_sock.h | 4 ---- include/net/sock.h | 3 --- init/main.c | 1 - kernel/bpf/core.c | 6 ------ kernel/locking/lockdep.c | 3 --- kernel/trace/ring_buffer.c | 3 --- mm/kmemleak.c | 9 --------- mm/page_alloc.c | 14 -------------- mm/slab.c | 14 -------------- mm/slab.h | 2 -- mm/slub.c | 20 -------------------- net/core/skbuff.c | 5 ----- net/core/sock.c | 2 -- net/ipv4/inet_timewait_sock.c | 3 --- net/ipv4/tcp_input.c | 1 - net/socket.c | 1 - 34 files changed, 2 insertions(+), 152 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h index 0722ec6be692..6821f1249300 100644 --- a/arch/arm/include/asm/dma-iommu.h +++ b/arch/arm/include/asm/dma-iommu.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #define ARM_MAPPING_ERROR (~(dma_addr_t)0x0) diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h index f41bd3cb76d9..e212a1f0b6d2 100644 --- a/arch/openrisc/include/asm/dma-mapping.h +++ b/arch/openrisc/include/asm/dma-mapping.h @@ -23,7 +23,6 @@ */ #include -#include #include extern const struct dma_map_ops or1k_dma_map_ops; diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a20eacd9c7e9..3e73bc255e4e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32 endif export CONFIG_X86_X32_ABI -# Don't unroll struct assignments with kmemcheck enabled -ifeq ($(CONFIG_KMEMCHECK),y) - KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) -endif - # # If the function graph tracer is used with mcount instead of fentry, # '-maccumulate-outgoing-args' is needed to prevent a GCC bug diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 43cbe843de8d..0350d99bb8fd 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -7,7 +7,6 @@ * Documentation/DMA-API.txt for documentation. 
*/ -#include #include #include #include diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 1f5c5161ead6..45c8605467f1 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -1,7 +1,4 @@ -#ifdef CONFIG_KMEMCHECK -/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ -# include -#elif !defined(_ASM_X86_XOR_H) +#ifndef _ASM_X86_XOR_H #define _ASM_X86_XOR_H /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b7b0f74a2150..989514c94a55 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,7 +42,6 @@ #include #endif -#include #include #include #include @@ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions! */ - if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) - goto exit; - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3109ba6c6ede..78ca9a8ee454 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -20,7 +20,6 @@ #include /* boot_cpu_has, ... */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ -#include /* kmemcheck_*(), ... */ #include /* VSYSCALL_ADDR */ #include /* emulate_vsyscall */ #include /* struct vm86 */ @@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. */ - if (kmemcheck_active(regs)) - kmemcheck_hide(regs); prefetchw(&mm->mmap_sem); if (unlikely(kmmio_fault(regs, address))) @@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; - - if (kmemcheck_fault(regs, address, error_code)) - return; } /* Can handle a stale RO->RW TLB: */ diff --git a/drivers/char/random.c b/drivers/char/random.c index 6c7ccac2679e..ec42c8bb9b0d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -259,7 +259,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c index 1922cb8f6b88..1c5b7aec13d4 100644 --- a/drivers/misc/c2port/core.c +++ b/drivers/misc/c2port/core.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -904,7 +903,6 @@ struct c2port_device *c2port_device_register(char *name, return ERR_PTR(-EINVAL); c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); - kmemcheck_annotate_bitfield(c2dev, flags); if (unlikely(!c2dev)) return ERR_PTR(-ENOMEM); diff --git a/fs/dcache.c b/fs/dcache.c index bcc9f6981569..5c7df1df81ff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target) */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN); - kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); diff --git a/include/linux/c2port.h b/include/linux/c2port.h index 4efabcb51347..f2736348ca26 100644 --- a/include/linux/c2port.h +++ b/include/linux/c2port.h @@ -9,8 +9,6 @@ * the Free Software Foundation */ -#include - #define C2PORT_NAME_LEN 32 struct device; @@ -22,10 +20,8 @@ struct device; /* Main struct */ struct 
c2port_ops; struct c2port_device { - kmemcheck_bitfield_begin(flags); unsigned int access:1; unsigned int flash_access:1; - kmemcheck_bitfield_end(flags); int id; char name[C2PORT_NAME_LEN]; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index eee1499db396..e8f8e8fb244d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -232,7 +231,6 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; - kmemcheck_mark_initialized(ptr, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, @@ -265,11 +263,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - int i, ents; - struct scatterlist *s; + int ents; - for_each_sg(sg, s, nents, i) - kmemcheck_mark_initialized(sg_virt(s), s->length); BUG_ON(!valid_dma_direction(dir)); ents = ops->map_sg(dev, sg, nents, dir, attrs); BUG_ON(ents < 0); @@ -299,7 +294,6 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev, const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; - kmemcheck_mark_initialized(page_address(page) + offset, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, page, offset, size, dir, attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, false); diff --git a/include/linux/filter.h b/include/linux/filter.h index 48ec57e70f9f..42197b16dd78 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -454,13 +454,11 @@ struct bpf_binary_header { struct bpf_prog { u16 pages; /* Number of allocated pages */ - kmemcheck_bitfield_begin(meta); u16 jited:1, /* Is our filter JIT'ed? */ locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1; /* Do we need dst entry? */ - kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 09643e0472fc..cfd0ac4e5e0e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -209,14 +209,6 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ -#ifdef CONFIG_KMEMCHECK - /* - * kmemcheck wants to track the status of each byte in a page; this - * is a pointer to such a status block. NULL if not tracked. 
- */ - void *shadow; -#endif - #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif diff --git a/include/linux/net.h b/include/linux/net.h index d97d80d7fdf8..caeb159abda5 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -22,7 +22,6 @@ #include #include #include /* For O_CLOEXEC and O_NONBLOCK */ -#include #include #include #include @@ -111,9 +110,7 @@ struct socket_wq { struct socket { socket_state state; - kmemcheck_bitfield_begin(type); short type; - kmemcheck_bitfield_end(type); unsigned long flags; diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fa6ace66fea5..289e4d54e3e0 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -2,7 +2,6 @@ #ifndef _LINUX_RING_BUFFER_H #define _LINUX_RING_BUFFER_H -#include #include #include #include @@ -14,9 +13,7 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { - kmemcheck_bitfield_begin(bitfield); u32 type_len:5, time_delta:27; - kmemcheck_bitfield_end(bitfield); u32 array[]; }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d448a4804aea..aa1341474916 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -15,7 +15,6 @@ #define _LINUX_SKBUFF_H #include -#include #include #include #include @@ -704,7 +703,6 @@ struct sk_buff { /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ - kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ @@ -723,7 +721,6 @@ struct sk_buff { head_frag:1, xmit_more:1, __unused:1; /* one bit hole */ - kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index db8162dd8c0b..8e51b4a69088 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -17,7 +17,6 @@ #define _INET_SOCK_H #include -#include #include #include #include @@ -84,7 +83,6 @@ struct inet_request_sock { #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family - kmemcheck_bitfield_begin(flags); u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, @@ -93,7 +91,6 @@ struct inet_request_sock { ecn_ok : 1, acked : 1, no_srccheck: 1; - kmemcheck_bitfield_end(flags); u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 6a75d67a30fd..1356fa6a7566 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -15,8 +15,6 @@ #ifndef _INET_TIMEWAIT_SOCK_ #define _INET_TIMEWAIT_SOCK_ - -#include #include #include #include @@ -69,14 +67,12 @@ struct inet_timewait_sock { /* Socket demultiplex comparisons on incoming packets. */ /* these three are in inet_sock */ __be16 tw_sport; - kmemcheck_bitfield_begin(flags); /* And these are ours. 
*/ unsigned int tw_kill : 1, tw_transparent : 1, tw_flowlabel : 20, tw_pad : 2, /* 2 bits hole */ tw_tos : 8; - kmemcheck_bitfield_end(flags); struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; diff --git a/include/net/sock.h b/include/net/sock.h index c577286dbffb..a63e6a8bb7e0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -436,7 +436,6 @@ struct sock { #define SK_FL_TYPE_MASK 0xffff0000 #endif - kmemcheck_bitfield_begin(flags); unsigned int sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, @@ -445,8 +444,6 @@ struct sock { sk_protocol : 8, sk_type : 16; #define SK_PROTOCOL_MAX U8_MAX - kmemcheck_bitfield_end(flags); - u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; diff --git a/init/main.c b/init/main.c index 3bdd8da90f69..859a786f7c0a 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7b62df86be1d..11ad089f2c74 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - kmemcheck_annotate_bitfield(fp, meta); - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { - kmemcheck_annotate_bitfield(fp, meta); - memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; @@ -662,8 +658,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); if (fp != NULL) { - kmemcheck_annotate_bitfield(fp, meta); - /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index db933d063bfc..9776da8db180 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, { int i; - kmemcheck_mark_initialized(lock, sizeof(*lock)); - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) lock->class_cache[i] = NULL; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 845f3805c73d..d57fede84b38 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -13,7 +13,6 @@ #include #include #include /* for self test */ -#include #include #include #include @@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); /* account for padding bytes */ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); @@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index fca3452e56c1..e4738d5e9b8c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -110,7 +110,6 @@ #include #include -#include #include #include @@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object) { u32 old_csum = object->checksum; - if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) - return false; - kasan_disable_current(); object->checksum = crc32(0, (void *)object->pointer, object->size); kasan_enable_current(); @@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end, if (scan_should_stop()) break; - /* don't scan uninitialized memory */ - if (!kmemcheck_is_obj_initialized((unsigned long)ptr, - BYTES_PER_POINTER)) - continue; - kasan_disable_current(); pointer = *ptr; kasan_enable_current(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e6106d7e9eb0..30a464b47366 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -1013,7 +1012,6 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); /* * Check tail pages before head page information is cleared to @@ -2669,15 +2667,6 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); -#ifdef CONFIG_KMEMCHECK - /* - * Split shadow pages too, because free(page[0]) would - * otherwise free the whole shadow. 
- */ - if (kmemcheck_page_is_tracked(page)) - split_page(virt_to_page(page[0].shadow), order); -#endif - for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order); @@ -4223,9 +4212,6 @@ out: page = NULL; } - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; diff --git a/mm/slab.c b/mm/slab.c index 7a5e0888a401..c84365e9a591 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -114,7 +114,6 @@ #include #include #include -#include #include #include #include @@ -1433,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); - if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { - kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); - - if (cachep->ctor) - kmemcheck_mark_uninitialized_pages(page, nr_pages); - else - kmemcheck_mark_unallocated_pages(page, nr_pages); - } - return page; } @@ -1453,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) int order = cachep->gfporder; unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, order); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else @@ -3515,8 +3503,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - kmemcheck_slab_free(cachep, objp, cachep->object_size); - /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which diff --git a/mm/slab.h b/mm/slab.h index e19255638cb6..e60a3d1d8f6f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -40,7 +40,6 @@ struct kmem_cache { #include #include -#include #include #include #include @@ -439,7 +438,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, for (i = 0; i < size; i++) { void *object = p[i]; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); kasan_slab_alloc(s, object, flags); diff --git a/mm/slub.c b/mm/slub.c index 51484f0fc068..ac3b50b9abec 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -1377,7 +1376,6 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) unsigned long flags; local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); debug_check_no_locks_freed(x, s->object_size); local_irq_restore(flags); } @@ -1598,22 +1596,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && - !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { - int pages = 1 << oo_order(oo); - - kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); - - /* - * Objects from caches that have a constructor don't get - * cleared when they're allocated, so we need to do it here. 
- */ - if (s->ctor) - kmemcheck_mark_uninitialized_pages(page, pages); - else - kmemcheck_mark_unallocated_pages(page, pages); - } - page->objects = oo_objects(oo); order = compound_order(page); @@ -1689,8 +1671,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) check_object(s, page, p, SLUB_RED_INACTIVE); } - kmemcheck_free_shadow(page, compound_order(page)); - mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e140ba49b30a..6cd057b41f34 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -234,14 +233,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); - kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); @@ -301,7 +298,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); return skb; } @@ -1283,7 +1279,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (!n) return NULL; - kmemcheck_annotate_bitfield(n, flags1); n->fclone = SKB_FCLONE_UNAVAILABLE; } diff --git a/net/core/sock.c b/net/core/sock.c index 415f441c63b9..78401fa33ce8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1469,8 +1469,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { - kmemcheck_annotate_bitfield(sk, flags); - if (security_sk_alloc(sk, family, priority)) goto out_free; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b039159e67a..d451b9f19b59 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include @@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, if (tw) { const struct inet_sock *inet = inet_sk(sk); - kmemcheck_annotate_bitfield(tw, flags); - tw->tw_dr = dr; /* Give us an identity. 
*/ tw->tw_daddr = inet->inet_daddr; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 887585045b27..c04d60a677a7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6195,7 +6195,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, if (req) { struct inet_request_sock *ireq = inet_rsk(req); - kmemcheck_annotate_bitfield(ireq, flags); ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; diff --git a/net/socket.c b/net/socket.c index c729625eb5d3..42d8e9c9ccd5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -568,7 +568,6 @@ struct socket *sock_alloc(void) sock = SOCKET_I(inode); - kmemcheck_annotate_bitfield(sock, type); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); -- cgit v1.2.3 From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:54 -0800 Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK Convert all allocations that used a NOTRACK flag to stop using it. Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgalloc.h | 2 +- arch/arm64/include/asm/pgalloc.h | 2 +- arch/powerpc/include/asm/pgalloc.h | 2 +- arch/sh/kernel/dwarf.c | 4 ++-- arch/sh/kernel/process.c | 2 +- arch/sparc/mm/init_64.c | 4 ++-- arch/unicore32/include/asm/pgalloc.h | 2 +- arch/x86/kernel/espfix_64.c | 2 +- arch/x86/mm/init.c | 3 +-- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pageattr.c | 10 +++++----- arch/x86/mm/pgtable.c | 2 +- arch/x86/platform/efi/efi_64.c | 2 +- crypto/xor.c | 7 +------ include/linux/thread_info.h | 5 ++--- init/do_mounts.c | 3 +-- kernel/fork.c | 12 ++++++------ kernel/signal.c | 3 +-- mm/kmemcheck.c | 2 +- mm/slab.c | 2 +- mm/slab.h | 5 ++--- mm/slab_common.c | 2 +- mm/slub.c | 4 +--- 23 files changed, 36 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index b2902a5cd780..2d7344f0e208 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) static inline void clean_pte_table(pte_t *pte) { diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index d25f4f137c2a..5ca6a573a701 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -26,7 +26,7 @@ #define check_pgt_cache() do { } while (0) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index a14203c005f1..e11f03007b57 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) } #endif /* MODULE */ -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) 
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #ifdef CONFIG_PPC_BOOK3S #include diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index e1d751ae2498..1a2526676a87 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c @@ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void) dwarf_frame_cachep = kmem_cache_create("dwarf_frames", sizeof(struct dwarf_frame), 0, - SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); dwarf_reg_cachep = kmem_cache_create("dwarf_regs", sizeof(struct dwarf_reg), 0, - SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, dwarf_frame_cachep); diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index b2d9963d5978..68b1a67533ce 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -59,7 +59,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC | SLAB_NOTRACK, NULL); + SLAB_PANIC, NULL); } #ifdef CONFIG_SH_FPU_EMU diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 61bdc1270d19..2de22d703076 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2927,7 +2927,7 @@ void __flush_tlb_all(void) pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); pte_t *pte = NULL; if (page) @@ -2939,7 +2939,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return NULL; if (!pgtable_page_ctor(page)) { diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 26775793c204..f0fdb268f8f2 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); #define pgd_alloc(mm) get_pgd_slow(mm) #define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* * Allocate one PTE table. 
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 7d7715dde901..e5ec3cafa72e 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -57,7 +57,7 @@ # error "Need more virtual address space for the ESPFIX hack" #endif -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* This contains the *bottom* address of the espfix stack */ DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a22c2b95e513..ef94620ceb8a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num) unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); - return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | - __GFP_ZERO, order); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index adcea90a2046..5fa3a58b5d78 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -184,7 +184,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3fe68483463c..85cf12219dea 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) @@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) static int alloc_pte_page(pmd_t *pmd) { - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; @@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd) static int alloc_pmd_page(pud_t *pud) { - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; @@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { - p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; @@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { - pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17ebc5a978cc..96d456a94b03 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -7,7 +7,7 @@ #include #include -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 9e4ee5b04b2d..6a151ce70e86 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void) if 
(efi_enabled(EFI_OLD_MEMMAP)) return 0; - gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; + gfp_mask = GFP_KERNEL | __GFP_ZERO; efi_pgd = (pgd_t *)__get_free_page(gfp_mask); if (!efi_pgd) return -ENOMEM; diff --git a/crypto/xor.c b/crypto/xor.c index 263af9fb45ea..bce9fe7af40a 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -122,12 +122,7 @@ calibrate_xor_blocks(void) goto out; } - /* - * Note: Since the memory is not actually used for _anything_ but to - * test the XOR speed, we don't really want kmemcheck to warn about - * reading uninitialized bytes here. - */ - b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2); + b1 = (void *) __get_free_pages(GFP_KERNEL, 2); if (!b1) { printk(KERN_WARNING "xor: Yikes! No memory available.\n"); return -ENOMEM; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 4bcdf00c110f..34f053a150a9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -44,10 +44,9 @@ enum { #endif #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ - __GFP_ZERO) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #else -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT) #endif /* diff --git a/init/do_mounts.c b/init/do_mounts.c index f6d4dd764a52..7cf4f6dafd5f 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -380,8 +380,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) void __init mount_block_root(char *name, int flags) { - struct page *page = alloc_page(GFP_KERNEL | - __GFP_NOTRACK_FALSE_POSITIVE); + struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); char *p; #ifdef CONFIG_BLOCK diff --git a/kernel/fork.c b/kernel/fork.c index 006dc5899a1a..4e55eedba8d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -469,7 +469,7 @@ void __init fork_init(void) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -2205,18 +2205,18 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); + SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* * FIXME! 
The "sizeof(struct mm_struct)" currently includes the @@ -2227,7 +2227,7 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..aa1fb9f905db 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, - override_rlimit); + q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 800d64b854ea..b3a4d61d341c 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -18,7 +18,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) * With kmemcheck enabled, we need to allocate a memory area for the * shadow bits as well. */ - shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); + shadow = alloc_pages_node(node, flags, order); if (!shadow) { if (printk_ratelimit()) pr_err("kmemcheck: failed to allocate shadow bitmap\n"); diff --git a/mm/slab.c b/mm/slab.c index c84365e9a591..183e996dde5f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1410,7 +1410,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, flags |= cachep->allocflags; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); + page = __alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) { slab_out_of_memory(cachep, flags, nodeid); return NULL; diff --git a/mm/slab.h b/mm/slab.h index e60a3d1d8f6f..ad657ffa44e5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -141,10 +141,10 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size, #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_TEMPORARY | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif @@ -163,7 +163,6 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size, SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ - SLAB_NOTRACK | \ SLAB_ACCOUNT) int __kmem_cache_shutdown(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 175e86637afd..c8cb36774ba1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. 
diff --git a/mm/slub.c b/mm/slub.c index ac3b50b9abec..91aa99b4b836 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1436,8 +1436,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, struct page *page; int order = oo_order(oo); - flags |= __GFP_NOTRACK; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else @@ -3774,7 +3772,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); -- cgit v1.2.3 From d8be75663cec0069b85f80191abd2682ce4a512f Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:58 -0800 Subject: kmemcheck: remove what's left of NOTRACK flags Now that kmemcheck is gone, we don't need the NOTRACK flags. Link: http://lkml.kernel.org/r/20171007030159.22241-5-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/include/asm/pgtable_types.h | 13 ------------- include/linux/gfp.h | 9 --------- include/linux/slab.h | 6 ------ include/trace/events/mmflags.h | 1 - mm/slub.c | 2 -- tools/perf/builtin-kmem.c | 1 - 7 files changed, 37 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f735c3016325..09f9e1e00e3b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -667,11 +667,6 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return false; } -static inline int pte_hidden(pte_t pte) -{ - return pte_flags(pte) & _PAGE_HIDDEN; -} - static inline int pmd_present(pmd_t pmd) { /* diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 9e9b05fc4860..3696398a9475 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -32,7 +32,6 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 -#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 @@ -79,18 +78,6 @@ #define _PAGE_KNL_ERRATUM_MASK 0 #endif -#ifdef CONFIG_KMEMCHECK -#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) -#else -#define _PAGE_HIDDEN (_AT(pteval_t, 0)) -#endif - -/* - * The same hidden bit is used by kmemcheck, but since kmemcheck - * works on kernel pages while soft-dirty engine on user space, - * they do not conflict with each other. - */ - #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 710143741eb5..b041f94678de 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -37,7 +37,6 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x40000u #define ___GFP_ATOMIC 0x80000u #define ___GFP_ACCOUNT 0x100000u -#define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u @@ -201,19 +200,11 @@ struct vm_area_struct; * __GFP_COMP address compound page metadata. * * __GFP_ZERO returns a zeroed page on success. - - * __GFP_NOTRACK avoids tracking with kmemcheck.
- * - * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of - * distinguishing in the source between false positives and allocations that - * cannot be supported (e.g. page tables). */ #define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) -#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) -#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) diff --git a/include/linux/slab.h b/include/linux/slab.h index ebddfca9e24b..79f6532f8a0b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -89,12 +89,6 @@ /* Avoid kmemleak tracing */ #define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) -/* Don't track use of uninitialized memory */ -#ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000U) -#else -# define SLAB_NOTRACK 0 -#endif /* Fault injection mark */ #ifdef CONFIG_FAILSLAB # define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 648cbf603736..72162f3a03fa 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -46,7 +46,6 @@ {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ - {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ diff --git a/mm/slub.c b/mm/slub.c index 91aa99b4b836..c2c41e178acf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5635,8 +5635,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; - if (!(s->flags & SLAB_NOTRACK)) - *p++ = 't'; if (s->flags & SLAB_ACCOUNT) *p++ = 'A'; if (p != name + 1) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 557d391f564a..cbf70738ef5f 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -655,7 +655,6 @@ static const struct { { "__GFP_RECLAIMABLE", "RC" }, { "__GFP_MOVABLE", "M" }, { "__GFP_ACCOUNT", "AC" }, - { "__GFP_NOTRACK", "NT" }, { "__GFP_WRITE", "WR" }, { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, -- cgit v1.2.3 From 4675ff05de2d76d167336b368bd07f3fef6ed5a6 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:36:02 -0800 Subject: kmemcheck: rip it out Fix up makefiles, remove references, and git rm kmemcheck. Link: http://lkml.kernel.org/r/20171007030159.22241-4-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Steven Rostedt Cc: Vegard Nossum Cc: Pekka Enberg Cc: Michal Hocko Cc: Eric W. 
Biederman Cc: Alexander Potapenko Cc: Tim Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 7 - Documentation/dev-tools/index.rst | 1 - Documentation/dev-tools/kmemcheck.rst | 733 ------------------------ MAINTAINERS | 10 - arch/x86/Kconfig | 3 +- arch/x86/include/asm/kmemcheck.h | 42 -- arch/x86/include/asm/string_32.h | 9 - arch/x86/include/asm/string_64.h | 8 - arch/x86/kernel/cpu/intel.c | 15 - arch/x86/mm/Makefile | 2 - arch/x86/mm/init.c | 5 +- arch/x86/mm/kmemcheck/Makefile | 1 - arch/x86/mm/kmemcheck/error.c | 227 -------- arch/x86/mm/kmemcheck/error.h | 15 - arch/x86/mm/kmemcheck/kmemcheck.c | 658 --------------------- arch/x86/mm/kmemcheck/opcode.c | 106 ---- arch/x86/mm/kmemcheck/opcode.h | 9 - arch/x86/mm/kmemcheck/pte.c | 22 - arch/x86/mm/kmemcheck/pte.h | 10 - arch/x86/mm/kmemcheck/selftest.c | 70 --- arch/x86/mm/kmemcheck/selftest.h | 6 - arch/x86/mm/kmemcheck/shadow.c | 173 ------ arch/x86/mm/kmemcheck/shadow.h | 18 - include/linux/interrupt.h | 15 - include/linux/kmemcheck.h | 171 ------ kernel/softirq.c | 10 - kernel/sysctl.c | 10 - lib/Kconfig.debug | 6 +- lib/Kconfig.kmemcheck | 94 --- mm/Kconfig.debug | 1 - mm/Makefile | 2 - mm/kmemcheck.c | 125 ---- mm/slub.c | 5 +- scripts/kernel-doc | 2 - tools/include/linux/kmemcheck.h | 8 - 35 files changed, 7 insertions(+), 2592 deletions(-) delete mode 100644 Documentation/dev-tools/kmemcheck.rst delete mode 100644 arch/x86/mm/kmemcheck/Makefile delete mode 100644 arch/x86/mm/kmemcheck/kmemcheck.c delete mode 100644 arch/x86/mm/kmemcheck/shadow.c delete mode 100644 lib/Kconfig.kmemcheck (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b74e13312fdc..00bb04972612 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1864,13 +1864,6 @@ Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, the default is off. - kmemcheck= [X86] Boot-time kmemcheck enable/disable/one-shot mode - Valid arguments: 0, 1, 2 - kmemcheck=0 (disabled) - kmemcheck=1 (enabled) - kmemcheck=2 (one-shot mode) - Default: 2 (one-shot mode) - kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index a81787cd47d7..e313925fb0fa 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -21,7 +21,6 @@ whole; patches welcome! kasan ubsan kmemleak - kmemcheck gdb-kernel-debugging kgdb kselftest diff --git a/Documentation/dev-tools/kmemcheck.rst b/Documentation/dev-tools/kmemcheck.rst deleted file mode 100644 index 7f3d1985de74..000000000000 --- a/Documentation/dev-tools/kmemcheck.rst +++ /dev/null @@ -1,733 +0,0 @@ -Getting started with kmemcheck -============================== - -Vegard Nossum - - -Introduction ------------- - -kmemcheck is a debugging feature for the Linux Kernel. More specifically, it -is a dynamic checker that detects and warns about some uses of uninitialized -memory. - -Userspace programmers might be familiar with Valgrind's memcheck. The main -difference between memcheck and kmemcheck is that memcheck works for userspace -programs only, and kmemcheck works for the kernel only. The implementations -are of course vastly different. 
Because of this, kmemcheck is not as accurate -as memcheck, but it turns out to be good enough in practice to discover real -programmer errors that the compiler is not able to find through static -analysis. - -Enabling kmemcheck on a kernel will probably slow it down to the extent that -the machine will not be usable for normal workloads such as e.g. an -interactive desktop. kmemcheck will also cause the kernel to use about twice -as much memory as normal. For this reason, kmemcheck is strictly a debugging -feature. - - -Downloading ------------ - -As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel. - - -Configuring and compiling -------------------------- - -kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of -configuration variables must have specific settings in order for the kmemcheck -menu to even appear in "menuconfig". These are: - -- ``CONFIG_CC_OPTIMIZE_FOR_SIZE=n`` - This option is located under "General setup" / "Optimize for size". - - Without this, gcc will use certain optimizations that usually lead to - false positive warnings from kmemcheck. An example of this is a 16-bit - field in a struct, where gcc may load 32 bits, then discard the upper - 16 bits. kmemcheck sees only the 32-bit load, and may trigger a - warning for the upper 16 bits (if they're uninitialized). - -- ``CONFIG_SLAB=y`` or ``CONFIG_SLUB=y`` - This option is located under "General setup" / "Choose SLAB - allocator". - -- ``CONFIG_FUNCTION_TRACER=n`` - This option is located under "Kernel hacking" / "Tracers" / "Kernel - Function Tracer" - - When function tracing is compiled in, gcc emits a call to another - function at the beginning of every function. This means that when the - page fault handler is called, the ftrace framework will be called - before kmemcheck has had a chance to handle the fault. If ftrace then - modifies memory that was tracked by kmemcheck, the result is an - endless recursive page fault. - -- ``CONFIG_DEBUG_PAGEALLOC=n`` - This option is located under "Kernel hacking" / "Memory Debugging" - / "Debug page memory allocations". - -In addition, I highly recommend turning on ``CONFIG_DEBUG_INFO=y``. This is also -located under "Kernel hacking". With this, you will be able to get line number -information from the kmemcheck warnings, which is extremely valuable in -debugging a problem. This option is not mandatory, however, because it slows -down the compilation process and produces a much bigger kernel image. - -Now the kmemcheck menu should be visible (under "Kernel hacking" / "Memory -Debugging" / "kmemcheck: trap use of uninitialized memory"). Here follows -a description of the kmemcheck configuration variables: - -- ``CONFIG_KMEMCHECK`` - This must be enabled in order to use kmemcheck at all... - -- ``CONFIG_KMEMCHECK_``[``DISABLED`` | ``ENABLED`` | ``ONESHOT``]``_BY_DEFAULT`` - This option controls the status of kmemcheck at boot-time. "Enabled" - will enable kmemcheck right from the start, "disabled" will boot the - kernel as normal (but with the kmemcheck code compiled in, so it can - be enabled at run-time after the kernel has booted), and "one-shot" is - a special mode which will turn kmemcheck off automatically after - detecting the first use of uninitialized memory. - - If you are using kmemcheck to actively debug a problem, then you - probably want to choose "enabled" here. 
- - The one-shot mode is mostly useful in automated test setups because it - can prevent floods of warnings and increase the chances of the machine - surviving in case something is really wrong. In other cases, the one- - shot mode could actually be counter-productive because it would turn - itself off at the very first error -- in the case of a false positive - too -- and this would come in the way of debugging the specific - problem you were interested in. - - If you would like to use your kernel as normal, but with a chance to - enable kmemcheck in case of some problem, it might be a good idea to - choose "disabled" here. When kmemcheck is disabled, most of the run- - time overhead is not incurred, and the kernel will be almost as fast - as normal. - -- ``CONFIG_KMEMCHECK_QUEUE_SIZE`` - Select the maximum number of error reports to store in an internal - (fixed-size) buffer. Since errors can occur virtually anywhere and in - any context, we need a temporary storage area which is guaranteed not - to generate any other page faults when accessed. The queue will be - emptied as soon as a tasklet may be scheduled. If the queue is full, - new error reports will be lost. - - The default value of 64 is probably fine. If some code produces more - than 64 errors within an irqs-off section, then the code is likely to - produce many, many more, too, and these additional reports seldom give - any more information (the first report is usually the most valuable - anyway). - - This number might have to be adjusted if you are not using serial - console or similar to capture the kernel log. If you are using the - "dmesg" command to save the log, then getting a lot of kmemcheck - warnings might overflow the kernel log itself, and the earlier reports - will get lost in that way instead. Try setting this to 10 or so on - such a setup. - -- ``CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT`` - Select the number of shadow bytes to save along with each entry of the - error-report queue. These bytes indicate what parts of an allocation - are initialized, uninitialized, etc. and will be displayed when an - error is detected to help the debugging of a particular problem. - - The number entered here is actually the logarithm of the number of - bytes that will be saved. So if you pick for example 5 here, kmemcheck - will save 2^5 = 32 bytes. - - The default value should be fine for debugging most problems. It also - fits nicely within 80 columns. - -- ``CONFIG_KMEMCHECK_PARTIAL_OK`` - This option (when enabled) works around certain GCC optimizations that - produce 32-bit reads from 16-bit variables where the upper 16 bits are - thrown away afterwards. - - The default value (enabled) is recommended. This may of course hide - some real errors, but disabling it would probably produce a lot of - false positives. - -- ``CONFIG_KMEMCHECK_BITOPS_OK`` - This option silences warnings that would be generated for bit-field - accesses where not all the bits are initialized at the same time. This - may also hide some real bugs. - - This option is probably obsolete, or it should be replaced with - the kmemcheck-/bitfield-annotations for the code in question. The - default value is therefore fine. - -Now compile the kernel as usual. - - -How to use ----------- - -Booting -~~~~~~~ - -First some information about the command-line options. There is only one -option specific to kmemcheck, and this is called "kmemcheck". It can be used -to override the default mode as chosen by the ``CONFIG_KMEMCHECK_*_BY_DEFAULT`` -option. 
Its possible settings are: - -- ``kmemcheck=0`` (disabled) -- ``kmemcheck=1`` (enabled) -- ``kmemcheck=2`` (one-shot mode) - -If SLUB debugging has been enabled in the kernel, it may take precedence over -kmemcheck in such a way that the slab caches which are under SLUB debugging -will not be tracked by kmemcheck. In order to ensure that this doesn't happen -(even though it shouldn't by default), use SLUB's boot option ``slub_debug``, -like this: ``slub_debug=-`` - -In fact, this option may also be used for fine-grained control over SLUB vs. -kmemcheck. For example, if the command line includes -``kmemcheck=1 slub_debug=,dentry``, then SLUB debugging will be used only -for the "dentry" slab cache, and with kmemcheck tracking all the other -caches. This is advanced usage, however, and is not generally recommended. - - -Run-time enable/disable -~~~~~~~~~~~~~~~~~~~~~~~ - -When the kernel has booted, it is possible to enable or disable kmemcheck at -run-time. WARNING: This feature is still experimental and may cause false -positive warnings to appear. Therefore, try not to use this. If you find that -it doesn't work properly (e.g. you see an unreasonable amount of warnings), I -will be happy to take bug reports. - -Use the file ``/proc/sys/kernel/kmemcheck`` for this purpose, e.g.:: - - $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck - -The numbers are the same as for the ``kmemcheck=`` command-line option. - - -Debugging -~~~~~~~~~ - -A typical report will look something like this:: - - WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - ^ - - Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A - RIP: 0010:[] [] __dequeue_signal+0xc8/0x190 - RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002 - RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 - RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84 - RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000 - R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e - R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8 - FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000 - CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 - CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0 - DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 - DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400 - [] dequeue_signal+0x8e/0x170 - [] get_signal_to_deliver+0x98/0x390 - [] do_notify_resume+0xad/0x7d0 - [] int_signal+0x12/0x17 - [] 0xffffffffffffffff - -The single most valuable information in this report is the RIP (or EIP on 32- -bit) value. This will help us pinpoint exactly which instruction that caused -the warning. - -If your kernel was compiled with ``CONFIG_DEBUG_INFO=y``, then all we have to do -is give this address to the addr2line program, like this:: - - $ addr2line -e vmlinux -i ffffffff8104ede8 - arch/x86/include/asm/string_64.h:12 - include/asm-generic/siginfo.h:287 - kernel/signal.c:380 - kernel/signal.c:410 - -The "``-e vmlinux``" tells addr2line which file to look in. **IMPORTANT:** -This must be the vmlinux of the kernel that produced the warning in the -first place! If not, the line number information will almost certainly be -wrong. - -The "``-i``" tells addr2line to also print the line numbers of inlined -functions. 
In this case, the flag was very important, because otherwise, -it would only have printed the first line, which is just a call to -``memcpy()``, which could be called from a thousand places in the kernel, and -is therefore not very useful. These inlined functions would not show up in -the stack trace above, simply because the kernel doesn't load the extra -debugging information. This technique can of course be used with ordinary -kernel oopses as well. - -In this case, it's the caller of ``memcpy()`` that is interesting, and it can be -found in ``include/asm-generic/siginfo.h``, line 287:: - - 281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) - 282 { - 283 if (from->si_code < 0) - 284 memcpy(to, from, sizeof(*to)); - 285 else - 286 /* _sigchld is currently the largest know union member */ - 287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); - 288 } - -Since this was a read (kmemcheck usually warns about reads only, though it can -warn about writes to unallocated or freed memory as well), it was probably the -"from" argument which contained some uninitialized bytes. Following the chain -of calls, we move upwards to see where "from" was allocated or initialized, -``kernel/signal.c``, line 380:: - - 359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) - 360 { - ... - 367 list_for_each_entry(q, &list->list, list) { - 368 if (q->info.si_signo == sig) { - 369 if (first) - 370 goto still_pending; - 371 first = q; - ... - 377 if (first) { - 378 still_pending: - 379 list_del_init(&first->list); - 380 copy_siginfo(info, &first->info); - 381 __sigqueue_free(first); - ... - 392 } - 393 } - -Here, it is ``&first->info`` that is being passed on to ``copy_siginfo()``. The -variable ``first`` was found on a list -- passed in as the second argument to -``collect_signal()``. We continue our journey through the stack, to figure out -where the item on "list" was allocated or initialized. We move to line 410:: - - 395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - 396 siginfo_t *info) - 397 { - ... - 410 collect_signal(sig, pending, info); - ... - 414 } - -Now we need to follow the ``pending`` pointer, since that is being passed on to -``collect_signal()`` as ``list``. At this point, we've run out of lines from the -"addr2line" output. Not to worry, we just paste the next addresses from the -kmemcheck stack dump, i.e.:: - - [] dequeue_signal+0x8e/0x170 - [] get_signal_to_deliver+0x98/0x390 - [] do_notify_resume+0xad/0x7d0 - [] int_signal+0x12/0x17 - - $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \ - ffffffff8100b87d ffffffff8100c7b5 - kernel/signal.c:446 - kernel/signal.c:1806 - arch/x86/kernel/signal.c:805 - arch/x86/kernel/signal.c:871 - arch/x86/kernel/entry_64.S:694 - -Remember that since these addresses were found on the stack and not as the -RIP value, they actually point to the _next_ instruction (they are return -addresses). This becomes obvious when we look at the code for line 446:: - - 422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) - 423 { - ... - 431 signr = __dequeue_signal(&tsk->signal->shared_pending, - 432 mask, info); - 433 /* - 434 * itimer signal ? - 435 * - 436 * itimers are process shared and we restart periodic - 437 * itimers in the signal delivery path to prevent DoS - 438 * attacks in the high resolution timer case. 
This is - 439 * compliant with the old way of self restarting - 440 * itimers, as the SIGALRM is a legacy signal and only - 441 * queued once. Changing the restart behaviour to - 442 * restart the timer in the signal dequeue path is - 443 * reducing the timer noise on heavy loaded !highres - 444 * systems too. - 445 */ - 446 if (unlikely(signr == SIGALRM)) { - ... - 489 } - -So instead of looking at 446, we should be looking at 431, which is the line -that executes just before 446. Here we see that what we are looking for is -``&tsk->signal->shared_pending``. - -Our next task is now to figure out which function that puts items on this -``shared_pending`` list. A crude, but efficient tool, is ``git grep``:: - - $ git grep -n 'shared_pending' kernel/ - ... - kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending; - kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending; - ... - -There were more results, but none of them were related to list operations, -and these were the only assignments. We inspect the line numbers more closely -and find that this is indeed where items are being added to the list:: - - 816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - 817 int group) - 818 { - ... - 828 pending = group ? &t->signal->shared_pending : &t->pending; - ... - 851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - 852 (is_si_special(info) || - 853 info->si_code >= 0))); - 854 if (q) { - 855 list_add_tail(&q->list, &pending->list); - ... - 890 } - -and:: - - 1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) - 1310 { - .... - 1339 pending = group ? &t->signal->shared_pending : &t->pending; - 1340 list_add_tail(&q->list, &pending->list); - .... - 1347 } - -In the first case, the list element we are looking for, ``q``, is being -returned from the function ``__sigqueue_alloc()``, which looks like an -allocation function. Let's take a look at it:: - - 187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - 188 int override_rlimit) - 189 { - 190 struct sigqueue *q = NULL; - 191 struct user_struct *user; - 192 - 193 /* - 194 * We won't get problems with the target's UID changing under us - 195 * because changing it requires RCU be used, and if t != current, the - 196 * caller must be holding the RCU readlock (by way of a spinlock) and - 197 * we use RCU protection here - 198 */ - 199 user = get_uid(__task_cred(t)->user); - 200 atomic_inc(&user->sigpending); - 201 if (override_rlimit || - 202 atomic_read(&user->sigpending) <= - 203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) - 204 q = kmem_cache_alloc(sigqueue_cachep, flags); - 205 if (unlikely(q == NULL)) { - 206 atomic_dec(&user->sigpending); - 207 free_uid(user); - 208 } else { - 209 INIT_LIST_HEAD(&q->list); - 210 q->flags = 0; - 211 q->user = user; - 212 } - 213 - 214 return q; - 215 } - -We see that this function initializes ``q->list``, ``q->flags``, and -``q->user``. It seems that now is the time to look at the definition of -``struct sigqueue``, e.g.:: - - 14 struct sigqueue { - 15 struct list_head list; - 16 int flags; - 17 siginfo_t info; - 18 struct user_struct *user; - 19 }; - -And, you might remember, it was a ``memcpy()`` on ``&first->info`` that -caused the warning, so this makes perfect sense. It also seems reasonable -to assume that it is the caller of ``__sigqueue_alloc()`` that has the -responsibility of filling out (initializing) this member. 
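This division of labour is easy to reproduce in miniature: an allocator initializes some members, leaves the rest indeterminate, and a later whole-struct copy reads the indeterminate bytes. A reduced userspace sketch of the same shape (all names invented for illustration)::

    #include <stdlib.h>
    #include <string.h>

    struct item {
        struct item *next;   /* set by the allocator */
        int flags;           /* set by the allocator */
        int payload[4];      /* the caller is expected to fill this in */
    };

    static struct item *item_alloc(void)
    {
        struct item *q = malloc(sizeof(*q));
        if (q) {
            q->next = NULL;  /* only these two members are initialized... */
            q->flags = 0;
        }
        return q;            /* ...payload[] is still indeterminate */
    }

    int main(void)
    {
        struct item copy, *q = item_alloc();
        if (!q)
            return 1;
        memcpy(&copy, q, sizeof(copy));  /* reads 16 uninitialized payload bytes */
        free(q);
        return 0;
    }

kmemcheck reports the read itself, just as it did for ``copy_siginfo()`` above; a userspace tool such as Valgrind's memcheck would instead propagate the undefined bits through the copy and report only when they are later used.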
- -But just which fields of the struct were uninitialized? Let's look at -kmemcheck's report again:: - - WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - ^ - -These first two lines are the memory dump of the memory object itself, and -the shadow bytemap, respectively. The memory object itself is in this case -``&first->info``. Just beware that the start of this dump is NOT the start -of the object itself! The position of the caret (^) corresponds with the -address of the read (ffff88003e4a2024). - -The shadow bytemap dump legend is as follows: - -- i: initialized -- u: uninitialized -- a: unallocated (memory has been allocated by the slab layer, but has not - yet been handed off to anybody) -- f: freed (memory has been allocated by the slab layer, but has been freed - by the previous owner) - -In order to figure out where (relative to the start of the object) the -uninitialized memory was located, we have to look at the disassembly. For -that, we'll need the RIP address again:: - - RIP: 0010:[] [] __dequeue_signal+0xc8/0x190 - - $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8: - ffffffff8104edc8: mov %r8,0x8(%r8) - ffffffff8104edcc: test %r10d,%r10d - ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168> - ffffffff8104edd5: mov %rax,%rdx - ffffffff8104edd8: mov $0xc,%ecx - ffffffff8104eddd: mov %r13,%rdi - ffffffff8104ede0: mov $0x30,%eax - ffffffff8104ede5: mov %rdx,%rsi - ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi) - ffffffff8104edea: test $0x2,%al - ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0> - ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi) - ffffffff8104edf0: test $0x1,%al - ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5> - ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi) - ffffffff8104edf5: mov %r8,%rdi - ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free> - -As expected, it's the "``rep movsl``" instruction from the ``memcpy()`` -that causes the warning. We know about ``REP MOVSL`` that it uses the register -``RCX`` to count the number of remaining iterations. By taking a look at the -register dump again (from the kmemcheck report), we can figure out how many -bytes were left to copy:: - - RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 - -By looking at the disassembly, we also see that ``%ecx`` is being loaded -with the value ``$0xc`` just before (ffffffff8104edd8), so we are very -lucky. Keep in mind that this is the number of iterations, not bytes. And -since this is a "long" operation, we need to multiply by 4 to get the -number of bytes. So this means that the uninitialized value was encountered -at 4 * (0xc - 0x9) = 12 bytes from the start of the object. - -We can now try to figure out which field of the "``struct siginfo``" that -was not initialized. This is the beginning of the struct:: - - 40 typedef struct siginfo { - 41 int si_signo; - 42 int si_errno; - 43 int si_code; - 44 - 45 union { - .. - 92 } _sifields; - 93 } siginfo_t; - -On 64-bit, the int is 4 bytes long, so it must the union member that has -not been initialized. We can verify this using gdb:: - - $ gdb vmlinux - ... 
- (gdb) p &((struct siginfo *) 0)->_sifields - $1 = (union {...} *) 0x10 - -Actually, it seems that the union member is located at offset 0x10 -- which -means that gcc has inserted 4 bytes of padding between the members ``si_code`` -and ``_sifields``. We can now get a fuller picture of the memory dump:: - - _----------------------------=> si_code - / _--------------------=> (padding) - | / _------------=> _sifields(._kill._pid) - | | / _----=> _sifields(._kill._uid) - | | | / - -------|-------|-------|-------| - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - -This allows us to realize another important fact: ``si_code`` contains the -value 0x80. Remember that x86 is little endian, so the first 4 bytes -"80000000" are really the number 0x00000080. With a bit of research, we -find that this is actually the constant ``SI_KERNEL`` defined in -``include/asm-generic/siginfo.h``:: - - 144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */ - -This macro is used in exactly one place in the x86 kernel: In ``send_signal()`` -in ``kernel/signal.c``:: - - 816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - 817 int group) - 818 { - ... - 828 pending = group ? &t->signal->shared_pending : &t->pending; - ... - 851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - 852 (is_si_special(info) || - 853 info->si_code >= 0))); - 854 if (q) { - 855 list_add_tail(&q->list, &pending->list); - 856 switch ((unsigned long) info) { - ... - 865 case (unsigned long) SEND_SIG_PRIV: - 866 q->info.si_signo = sig; - 867 q->info.si_errno = 0; - 868 q->info.si_code = SI_KERNEL; - 869 q->info.si_pid = 0; - 870 q->info.si_uid = 0; - 871 break; - ... - 890 } - -Not only does this match with the ``.si_code`` member, it also matches the place -we found earlier when looking for where siginfo_t objects are enqueued on the -``shared_pending`` list. - -So to sum up: It seems that it is the padding introduced by the compiler -between two struct fields that is uninitialized, and this gets reported when -we do a ``memcpy()`` on the struct. This means that we have identified a false -positive warning. - -Normally, kmemcheck will not report uninitialized accesses in ``memcpy()`` calls -when both the source and destination addresses are tracked. (Instead, we copy -the shadow bytemap as well). In this case, the destination address clearly -was not tracked. We can dig a little deeper into the stack trace from above:: - - arch/x86/kernel/signal.c:805 - arch/x86/kernel/signal.c:871 - arch/x86/kernel/entry_64.S:694 - -And we clearly see that the destination siginfo object is located on the -stack:: - - 782 static void do_signal(struct pt_regs *regs) - 783 { - 784 struct k_sigaction ka; - 785 siginfo_t info; - ... - 804 signr = get_signal_to_deliver(&info, &ka, regs, NULL); - ... - 854 } - -And this ``&info`` is what eventually gets passed to ``copy_siginfo()`` as the -destination argument. - -Now, even though we didn't find an actual error here, the example is still a -good one, because it shows how one would go about to find out what the report -was all about. - - -Annotating false positives -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are a few different ways to make annotations in the source code that -will keep kmemcheck from checking and reporting certain allocations. 
Here -they are: - -- ``__GFP_NOTRACK_FALSE_POSITIVE`` - This flag can be passed to ``kmalloc()`` or ``kmem_cache_alloc()`` - (therefore also to other functions that end up calling one of - these) to indicate that the allocation should not be tracked - because it would lead to a false positive report. This is a "big - hammer" way of silencing kmemcheck; after all, even if the false - positive pertains to particular field in a struct, for example, we - will now lose the ability to find (real) errors in other parts of - the same struct. - - Example:: - - /* No warnings will ever trigger on accessing any part of x */ - x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE); - -- ``kmemcheck_bitfield_begin(name)``/``kmemcheck_bitfield_end(name)`` and - ``kmemcheck_annotate_bitfield(ptr, name)`` - The first two of these three macros can be used inside struct - definitions to signal, respectively, the beginning and end of a - bitfield. Additionally, this will assign the bitfield a name, which - is given as an argument to the macros. - - Having used these markers, one can later use - kmemcheck_annotate_bitfield() at the point of allocation, to indicate - which parts of the allocation is part of a bitfield. - - Example:: - - struct foo { - int x; - - kmemcheck_bitfield_begin(flags); - int flag_a:1; - int flag_b:1; - kmemcheck_bitfield_end(flags); - - int y; - }; - - struct foo *x = kmalloc(sizeof *x); - - /* No warnings will trigger on accessing the bitfield of x */ - kmemcheck_annotate_bitfield(x, flags); - - Note that ``kmemcheck_annotate_bitfield()`` can be used even before the - return value of ``kmalloc()`` is checked -- in other words, passing NULL - as the first argument is legal (and will do nothing). - - -Reporting errors ----------------- - -As we have seen, kmemcheck will produce false positive reports. Therefore, it -is not very wise to blindly post kmemcheck warnings to mailing lists and -maintainers. Instead, I encourage maintainers and developers to find errors -in their own code. If you get a warning, you can try to work around it, try -to figure out if it's a real error or not, or simply ignore it. Most -developers know their own code and will quickly and efficiently determine the -root cause of a kmemcheck report. This is therefore also the most efficient -way to work with kmemcheck. - -That said, we (the kmemcheck maintainers) will always be on the lookout for -false positives that we can annotate and silence. So whatever you find, -please drop us a note privately! Kernel configs and steps to reproduce (if -available) are of course a great help too. - -Happy hacking! - - -Technical description ---------------------- - -kmemcheck works by marking memory pages non-present. This means that whenever -somebody attempts to access the page, a page fault is generated. The page -fault handler notices that the page was in fact only hidden, and so it calls -on the kmemcheck code to make further investigations. - -When the investigations are completed, kmemcheck "shows" the page by marking -it present (as it would be under normal circumstances). This way, the -interrupted code can continue as usual. - -But after the instruction has been executed, we should hide the page again, so -that we can catch the next access too! Now kmemcheck makes use of a debugging -feature of the processor, namely single-stepping. When the processor has -finished the one instruction that generated the memory access, a debug -exception is raised. 
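The trap cycle is small enough to model in a few self-contained lines (plain C with invented names; nothing below is actual kernel API)::

    #include <stdbool.h>
    #include <stdio.h>

    static bool page_present;    /* false = hidden: any access faults */
    static bool single_stepping;

    static void page_fault(void)         /* the #PF described above */
    {
        page_present = true;             /* "show" so the access can complete */
        single_stepping = true;          /* regain control after one insn */
        printf("#PF: page shown, single-step armed\n");
    }

    static void debug_exception(void)    /* the #DB raised after the insn */
    {
        page_present = false;            /* hide again for the next access */
        single_stepping = false;
        printf("#DB: page hidden, single-step disarmed\n");
    }

    int main(void)
    {
        page_fault();
        debug_exception();
        return 0;
    }

In the toy above, debug_exception() is the moment the real kmemcheck regains control.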
From here, we simply hide the page again and continue -execution, this time with the single-stepping feature turned off. - -kmemcheck requires some assistance from the memory allocator in order to work. -The memory allocator needs to - - 1. Tell kmemcheck about newly allocated pages and pages that are about to - be freed. This allows kmemcheck to set up and tear down the shadow memory - for the pages in question. The shadow memory stores the status of each - byte in the allocation proper, e.g. whether it is initialized or - uninitialized. - - 2. Tell kmemcheck which parts of memory should be marked uninitialized. - There are actually a few more states, such as "not yet allocated" and - "recently freed". - -If a slab cache is set up using the SLAB_NOTRACK flag, it will never return -memory that can take page faults because of kmemcheck. - -If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still -request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags. -This does not prevent the page faults from occurring, however, but marks the -object in question as being initialized so that no warnings will ever be -produced for this object. - -Currently, the SLAB and SLUB allocators are supported by kmemcheck. diff --git a/MAINTAINERS b/MAINTAINERS index 7e9c887ad951..ac814d3dd1c1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7688,16 +7688,6 @@ F: include/linux/kdb.h F: include/linux/kgdb.h F: kernel/debug/ -KMEMCHECK -M: Vegard Nossum -M: Pekka Enberg -S: Maintained -F: Documentation/dev-tools/kmemcheck.rst -F: arch/x86/include/asm/kmemcheck.h -F: arch/x86/mm/kmemcheck/ -F: include/linux/kmemcheck.h -F: mm/kmemcheck.c - KMEMLEAK M: Catalin Marinas S: Maintained diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f08977d82ca0..cb678192da4a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -112,7 +112,6 @@ config X86 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_ARCH_KGDB - select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT @@ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT config X86_DIRECT_GBPAGES def_bool y - depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK + depends on X86_64 && !DEBUG_PAGEALLOC ---help--- Certain kernel features effectively disable kernel linear 1 GB mappings (even if the CPU otherwise diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h index 945a0337fbcf..ea32a7d3cf1b 100644 --- a/arch/x86/include/asm/kmemcheck.h +++ b/arch/x86/include/asm/kmemcheck.h @@ -1,43 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_KMEMCHECK_H -#define ASM_X86_KMEMCHECK_H - -#include -#include - -#ifdef CONFIG_KMEMCHECK -bool kmemcheck_active(struct pt_regs *regs); - -void kmemcheck_show(struct pt_regs *regs); -void kmemcheck_hide(struct pt_regs *regs); - -bool kmemcheck_fault(struct pt_regs *regs, - unsigned long address, unsigned long error_code); -bool kmemcheck_trap(struct pt_regs *regs); -#else -static inline bool kmemcheck_active(struct pt_regs *regs) -{ - return false; -} - -static inline void kmemcheck_show(struct pt_regs *regs) -{ -} - -static inline void kmemcheck_hide(struct pt_regs *regs) -{ -} - -static inline bool kmemcheck_fault(struct pt_regs *regs, - unsigned long address, unsigned long error_code) -{ - return false; -} - -static inline bool kmemcheck_trap(struct pt_regs *regs) -{ - return false; -} -#endif /* CONFIG_KMEMCHECK */ 
- -#endif diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 076502241eae..55d392c6bd29 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) * No 3D Now! */ -#ifndef CONFIG_KMEMCHECK - #if (__GNUC__ >= 4) #define memcpy(t, f, n) __builtin_memcpy(t, f, n) #else @@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) #endif -#else -/* - * kmemcheck becomes very happy if we use the REP instructions unconditionally, - * because it means that we know both memory operands in advance. - */ -#define memcpy(t, f, n) __memcpy((t), (f), (n)) -#endif #endif #endif /* !CONFIG_FORTIFY_SOURCE */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 0b1b4445f4c5..533f74c300c2 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #ifndef CONFIG_FORTIFY_SOURCE -#ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ ({ \ @@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len); __ret; \ }) #endif -#else -/* - * kmemcheck becomes very happy if we use the REP instructions unconditionally, - * because it means that we know both memory operands in advance. - */ -#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) -#endif #endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b720dacac051..b1af22073e28 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); -#ifdef CONFIG_KMEMCHECK - /* - * P4s have a "fast strings" feature which causes single- - * stepping REP instructions to only generate a #DB on - * cache-line boundaries. - * - * Ingo Molnar reported a Pentium D (model 6) and a Xeon - * (model 2) with the same problem. - */ - if (c->x86 == 15) - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) - pr_info("kmemcheck: Disabling fast string operations\n"); -#endif - /* * If fast string is not enabled in IA32_MISC_ENABLE for any reason, * clear the fast string and enhanced fast string CPU capabilities. diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 7ba7f3d7f477..8e13b8cc6bed 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck/ - KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ef94620ceb8a..6fdf91ef130a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -163,12 +163,11 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { /* - * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will - * use small pages. + * For pagealloc debugging, identity mapping will use small pages. 
* This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK)) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; else direct_gbpages = 0; diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile deleted file mode 100644 index 520b3bce4095..000000000000 --- a/arch/x86/mm/kmemcheck/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 872ec4159a68..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -1,228 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#include "error.h" -#include "shadow.h" - -enum kmemcheck_error_type { - KMEMCHECK_ERROR_INVALID_ACCESS, - KMEMCHECK_ERROR_BUG, -}; - -#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) - -struct kmemcheck_error { - enum kmemcheck_error_type type; - - union { - /* KMEMCHECK_ERROR_INVALID_ACCESS */ - struct { - /* Kind of access that caused the error */ - enum kmemcheck_shadow state; - /* Address and size of the erroneous read */ - unsigned long address; - unsigned int size; - }; - }; - - struct pt_regs regs; - struct stack_trace trace; - unsigned long trace_entries[32]; - - /* We compress it to a char. */ - unsigned char shadow_copy[SHADOW_COPY_SIZE]; - unsigned char memory_copy[SHADOW_COPY_SIZE]; -}; - -/* - * Create a ring queue of errors to output. We can't call printk() directly - * from the kmemcheck traps, since this may call the console drivers and - * result in a recursive fault. - */ -static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; -static unsigned int error_count; -static unsigned int error_rd; -static unsigned int error_wr; -static unsigned int error_missed_count; - -static struct kmemcheck_error *error_next_wr(void) -{ - struct kmemcheck_error *e; - - if (error_count == ARRAY_SIZE(error_fifo)) { - ++error_missed_count; - return NULL; - } - - e = &error_fifo[error_wr]; - if (++error_wr == ARRAY_SIZE(error_fifo)) - error_wr = 0; - ++error_count; - return e; -} - -static struct kmemcheck_error *error_next_rd(void) -{ - struct kmemcheck_error *e; - - if (error_count == 0) - return NULL; - - e = &error_fifo[error_rd]; - if (++error_rd == ARRAY_SIZE(error_fifo)) - error_rd = 0; - --error_count; - return e; -} - -void kmemcheck_error_recall(void) -{ - static const char *desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", - [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", - [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", - [KMEMCHECK_SHADOW_FREED] = "freed", - }; - - static const char short_desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', - [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', - [KMEMCHECK_SHADOW_INITIALIZED] = 'i', - [KMEMCHECK_SHADOW_FREED] = 'f', - }; - - struct kmemcheck_error *e; - unsigned int i; - - e = error_next_rd(); - if (!e) - return; - - switch (e->type) { - case KMEMCHECK_ERROR_INVALID_ACCESS: - printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", - 8 * e->size, e->state < ARRAY_SIZE(desc) ? 
- desc[e->state] : "(invalid shadow state)", - (void *) e->address); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) - printk(KERN_CONT "%02x", e->memory_copy[i]); - printk(KERN_CONT "\n"); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) { - if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) - printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); - else - printk(KERN_CONT " ?"); - } - printk(KERN_CONT "\n"); - printk(KERN_WARNING "%*c\n", 2 + 2 - * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); - break; - case KMEMCHECK_ERROR_BUG: - printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); - break; - } - - __show_regs(&e->regs, 1); - print_stack_trace(&e->trace, 0); -} - -static void do_wakeup(unsigned long data) -{ - while (error_count > 0) - kmemcheck_error_recall(); - - if (error_missed_count > 0) { - printk(KERN_WARNING "kmemcheck: Lost %d error reports because " - "the queue was too small\n", error_missed_count); - error_missed_count = 0; - } -} - -static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); - -/* - * Save the context of an error report. - */ -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs) -{ - static unsigned long prev_ip; - - struct kmemcheck_error *e; - void *shadow_copy; - void *memory_copy; - - /* Don't report several adjacent errors from the same EIP. */ - if (regs->ip == prev_ip) - return; - prev_ip = regs->ip; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_INVALID_ACCESS; - - e->state = state; - e->address = address; - e->size = size; - - /* Save regs */ - memcpy(&e->regs, regs, sizeof(*regs)); - - /* Save stack trace */ - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 0; - save_stack_trace_regs(regs, &e->trace); - - /* Round address down to nearest 16 bytes */ - shadow_copy = kmemcheck_shadow_lookup(address - & ~(SHADOW_COPY_SIZE - 1)); - BUG_ON(!shadow_copy); - - memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); - - kmemcheck_show_addr(address); - memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); - memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); - kmemcheck_hide_addr(address); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} - -/* - * Save the context of a kmemcheck bug. 
- */ -void kmemcheck_error_save_bug(struct pt_regs *regs) -{ - struct kmemcheck_error *e; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_BUG; - - memcpy(&e->regs, regs, sizeof(*regs)); - - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 1; - save_stack_trace(&e->trace); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h index 39f80d7a874d..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/error.h +++ b/arch/x86/mm/kmemcheck/error.h @@ -1,16 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H -#define ARCH__X86__MM__KMEMCHECK__ERROR_H - -#include - -#include "shadow.h" - -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs); - -void kmemcheck_error_save_bug(struct pt_regs *regs); - -void kmemcheck_error_recall(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c deleted file mode 100644 index 4515bae36bbe..000000000000 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ /dev/null @@ -1,658 +0,0 @@ -/** - * kmemcheck - a heavyweight memory checker for the linux kernel - * Copyright (C) 2007, 2008 Vegard Nossum - * (With a lot of help from Ingo Molnar and Pekka Enberg.) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2) as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "error.h" -#include "opcode.h" -#include "pte.h" -#include "selftest.h" -#include "shadow.h" - - -#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 0 -#endif - -#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 1 -#endif - -#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT -# define KMEMCHECK_ENABLED 2 -#endif - -int kmemcheck_enabled = KMEMCHECK_ENABLED; - -int __init kmemcheck_init(void) -{ -#ifdef CONFIG_SMP - /* - * Limit SMP to use a single CPU. We rely on the fact that this code - * runs before SMP is set up. - */ - if (setup_max_cpus > 1) { - printk(KERN_INFO - "kmemcheck: Limiting number of CPUs to 1.\n"); - setup_max_cpus = 1; - } -#endif - - if (!kmemcheck_selftest()) { - printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); - kmemcheck_enabled = 0; - return -EINVAL; - } - - printk(KERN_INFO "kmemcheck: Initialized\n"); - return 0; -} - -early_initcall(kmemcheck_init); - -/* - * We need to parse the kmemcheck= option before any memory is allocated. 
- */ -static int __init param_kmemcheck(char *str) -{ - int val; - int ret; - - if (!str) - return -EINVAL; - - ret = kstrtoint(str, 0, &val); - if (ret) - return ret; - kmemcheck_enabled = val; - return 0; -} - -early_param("kmemcheck", param_kmemcheck); - -int kmemcheck_show_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -int kmemcheck_hide_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -struct kmemcheck_context { - bool busy; - int balance; - - /* - * There can be at most two memory operands to an instruction, but - * each address can cross a page boundary -- so we may need up to - * four addresses that must be hidden/revealed for each fault. - */ - unsigned long addr[4]; - unsigned long n_addrs; - unsigned long flags; - - /* Data size of the instruction that caused a fault. */ - unsigned int size; -}; - -static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); - -bool kmemcheck_active(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - return data->balance > 0; -} - -/* Save an address that needs to be shown/hidden */ -static void kmemcheck_save_addr(unsigned long addr) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); - data->addr[data->n_addrs++] = addr; -} - -static unsigned int kmemcheck_show_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_show_addr(data->addr[i]); - - return n; -} - -static unsigned int kmemcheck_hide_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_hide_addr(data->addr[i]); - - return n; -} - -/* - * Called from the #PF handler. - */ -void kmemcheck_show(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 0)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->balance = 0; - return; - } - - /* - * None of the addresses actually belonged to kmemcheck. Note that - * this is not an error. - */ - if (kmemcheck_show_all() == 0) - return; - - ++data->balance; - - /* - * The IF needs to be cleared as well, so that the faulting - * instruction can run "uninterrupted". Otherwise, we might take - * an interrupt and start executing that before we've had a chance - * to hide the page again. - * - * NOTE: In the rare case of multiple faults, we must not override - * the original flags: - */ - if (!(regs->flags & X86_EFLAGS_TF)) - data->flags = regs->flags; - - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; -} - -/* - * Called from the #DB handler. 
- */ -void kmemcheck_hide(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - int n; - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 1)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->n_addrs = 0; - data->balance = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; - return; - } - - if (kmemcheck_enabled) - n = kmemcheck_hide_all(); - else - n = kmemcheck_show_all(); - - if (n == 0) - return; - - --data->balance; - - data->n_addrs = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; -} - -void kmemcheck_show_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -bool kmemcheck_page_is_tracked(struct page *p) -{ - /* This will also check the "hidden" flag of the PTE. */ - return kmemcheck_pte_lookup((unsigned long) page_address(p)); -} - -void kmemcheck_hide_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -/* Access may NOT cross page boundary */ -static void kmemcheck_read_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - enum kmemcheck_shadow status; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; - - /* Don't warn about it again. */ - kmemcheck_shadow_set(shadow, size); -} - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - enum kmemcheck_shadow status; - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return true; - - status = kmemcheck_shadow_test_all(shadow, size); - - return status == KMEMCHECK_SHADOW_INITIALIZED; -} - -/* Access may cross page boundary */ -static void kmemcheck_read(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_read_strict(regs, addr, size); - return; - } - - /* - * What we do is basically to split the access across the - * two pages and handle each part separately. Yes, this means - * that we may now see reads that are 3 + 5 bytes, for - * example (and if both are uninitialized, there will be two - * reports), but it makes the code a lot simpler. 
- */ - kmemcheck_read_strict(regs, addr, next_page - addr); - kmemcheck_read_strict(regs, next_page, next_addr - next_page); -} - -static void kmemcheck_write_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - kmemcheck_shadow_set(shadow, size); -} - -static void kmemcheck_write(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_write_strict(regs, addr, size); - return; - } - - /* See comment in kmemcheck_read(). */ - kmemcheck_write_strict(regs, addr, next_page - addr); - kmemcheck_write_strict(regs, next_page, next_addr - next_page); -} - -/* - * Copying is hard. We have two addresses, each of which may be split across - * a page (and each page will have different shadow addresses). - */ -static void kmemcheck_copy(struct pt_regs *regs, - unsigned long src_addr, unsigned long dst_addr, unsigned int size) -{ - uint8_t shadow[8]; - enum kmemcheck_shadow status; - - unsigned long page; - unsigned long next_addr; - unsigned long next_page; - - uint8_t *x; - unsigned int i; - unsigned int n; - - BUG_ON(size > sizeof(shadow)); - - page = src_addr & PAGE_MASK; - next_addr = src_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < size; ++i) - shadow[i] = x[i]; - } else { - for (i = 0; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } else { - n = next_page - src_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < n; ++i) - shadow[i] = x[i]; - } else { - /* Not tracked */ - for (i = 0; i < n; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) - shadow[i] = x[i - n]; - } else { - /* Not tracked */ - for (i = n; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - page = dst_addr & PAGE_MASK; - next_addr = dst_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < size; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } else { - n = next_page - dst_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < n; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) { - x[i - n] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } - - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, src_addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; -} - -enum kmemcheck_method { - KMEMCHECK_READ, - KMEMCHECK_WRITE, -}; - -static void 
kmemcheck_access(struct pt_regs *regs, - unsigned long fallback_address, enum kmemcheck_method fallback_method) -{ - const uint8_t *insn; - const uint8_t *insn_primary; - unsigned int size; - - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - /* Recursive fault -- ouch. */ - if (data->busy) { - kmemcheck_show_addr(fallback_address); - kmemcheck_error_save_bug(regs); - return; - } - - data->busy = true; - - insn = (const uint8_t *) regs->ip; - insn_primary = kmemcheck_opcode_get_primary(insn); - - kmemcheck_opcode_decode(insn, &size); - - switch (insn_primary[0]) { -#ifdef CONFIG_KMEMCHECK_BITOPS_OK - /* AND, OR, XOR */ - /* - * Unfortunately, these instructions have to be excluded from - * our regular checking since they access only some (and not - * all) bits. This clears out "bogus" bitfield-access warnings. - */ - case 0x80: - case 0x81: - case 0x82: - case 0x83: - switch ((insn_primary[1] >> 3) & 7) { - /* OR */ - case 1: - /* AND */ - case 4: - /* XOR */ - case 6: - kmemcheck_write(regs, fallback_address, size); - goto out; - - /* ADD */ - case 0: - /* ADC */ - case 2: - /* SBB */ - case 3: - /* SUB */ - case 5: - /* CMP */ - case 7: - break; - } - break; -#endif - - /* MOVS, MOVSB, MOVSW, MOVSD */ - case 0xa4: - case 0xa5: - /* - * These instructions are special because they take two - * addresses, but we only get one page fault. - */ - kmemcheck_copy(regs, regs->si, regs->di, size); - goto out; - - /* CMPS, CMPSB, CMPSW, CMPSD */ - case 0xa6: - case 0xa7: - kmemcheck_read(regs, regs->si, size); - kmemcheck_read(regs, regs->di, size); - goto out; - } - - /* - * If the opcode isn't special in any way, we use the data from the - * page fault handler to determine the address and type of memory - * access. - */ - switch (fallback_method) { - case KMEMCHECK_READ: - kmemcheck_read(regs, fallback_address, size); - goto out; - case KMEMCHECK_WRITE: - kmemcheck_write(regs, fallback_address, size); - goto out; - } - -out: - data->busy = false; -} - -bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - pte_t *pte; - - /* - * XXX: Is it safe to assume that memory accesses from virtual 86 - * mode or non-kernel code segments will _never_ access kernel - * memory (e.g. tracked pages)? For now, we need this to avoid - * invoking kmemcheck for PnP BIOS calls. - */ - if (regs->flags & X86_VM_MASK) - return false; - if (regs->cs != __KERNEL_CS) - return false; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return false; - - WARN_ON_ONCE(in_nmi()); - - if (error_code & 2) - kmemcheck_access(regs, address, KMEMCHECK_WRITE); - else - kmemcheck_access(regs, address, KMEMCHECK_READ); - - kmemcheck_show(regs); - return true; -} - -bool kmemcheck_trap(struct pt_regs *regs) -{ - if (!kmemcheck_active(regs)) - return false; - - /* We're done. 
*/ - kmemcheck_hide(regs); - return true; -} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index df8109ddf7fe..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -1,107 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include - -#include "opcode.h" - -static bool opcode_is_prefix(uint8_t b) -{ - return - /* Group 1 */ - b == 0xf0 || b == 0xf2 || b == 0xf3 - /* Group 2 */ - || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 - /* Group 3 */ - || b == 0x66 - /* Group 4 */ - || b == 0x67; -} - -#ifdef CONFIG_X86_64 -static bool opcode_is_rex_prefix(uint8_t b) -{ - return (b & 0xf0) == 0x40; -} -#else -static bool opcode_is_rex_prefix(uint8_t b) -{ - return false; -} -#endif - -#define REX_W (1 << 3) - -/* - * This is a VERY crude opcode decoder. We only need to find the size of the - * load/store that caused our #PF and this should work for all the opcodes - * that we care about. Moreover, the ones who invented this instruction set - * should be shot. - */ -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) -{ - /* Default operand size */ - int operand_size_override = 4; - - /* prefixes */ - for (; opcode_is_prefix(*op); ++op) { - if (*op == 0x66) - operand_size_override = 2; - } - - /* REX prefix */ - if (opcode_is_rex_prefix(*op)) { - uint8_t rex = *op; - - ++op; - if (rex & REX_W) { - switch (*op) { - case 0x63: - *size = 4; - return; - case 0x0f: - ++op; - - switch (*op) { - case 0xb6: - case 0xbe: - *size = 1; - return; - case 0xb7: - case 0xbf: - *size = 2; - return; - } - - break; - } - - *size = 8; - return; - } - } - - /* escape opcode */ - if (*op == 0x0f) { - ++op; - - /* - * This is move with zero-extend and sign-extend, respectively; - * we don't have to think about 0xb6/0xbe, because this is - * already handled in the conditional below. - */ - if (*op == 0xb7 || *op == 0xbf) - operand_size_override = 2; - } - - *size = (*op & 1) ? 
operand_size_override : 1; -} - -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) -{ - /* skip prefixes */ - while (opcode_is_prefix(*op)) - ++op; - if (opcode_is_rex_prefix(*op)) - ++op; - return op; -} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h index 51a1ce94c24a..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/opcode.h +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -1,10 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H -#define ARCH__X86__MM__KMEMCHECK__OPCODE_H - -#include - -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); - -#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c index 8a03be90272a..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/pte.c +++ b/arch/x86/mm/kmemcheck/pte.c @@ -1,23 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include - -#include - -#include "pte.h" - -pte_t *kmemcheck_pte_lookup(unsigned long address) -{ - pte_t *pte; - unsigned int level; - - pte = lookup_address(address, &level); - if (!pte) - return NULL; - if (level != PG_LEVEL_4K) - return NULL; - if (!pte_hidden(*pte)) - return NULL; - - return pte; -} - diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h index b595612382c2..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/pte.h +++ b/arch/x86/mm/kmemcheck/pte.h @@ -1,11 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H -#define ARCH__X86__MM__KMEMCHECK__PTE_H - -#include - -#include - -pte_t *kmemcheck_pte_lookup(unsigned long address); - -#endif diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index 7ce0be1f99eb..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,71 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include - -#include "opcode.h" -#include "selftest.h" - -struct selftest_opcode { - unsigned int expected_size; - const uint8_t *insn; - const char *desc; -}; - -static const struct selftest_opcode selftest_opcodes[] = { - /* REP MOVS */ - {1, "\xf3\xa4", "rep movsb , "}, - {4, "\xf3\xa5", "rep movsl , "}, - - /* MOVZX / MOVZXD */ - {1, "\x66\x0f\xb6\x51\xf8", "movzwq , "}, - {1, "\x0f\xb6\x51\xf8", "movzwq , "}, - - /* MOVSX / MOVSXD */ - {1, "\x66\x0f\xbe\x51\xf8", "movswq , "}, - {1, "\x0f\xbe\x51\xf8", "movswq , "}, - -#ifdef CONFIG_X86_64 - /* MOVZX / MOVZXD */ - {1, "\x49\x0f\xb6\x51\xf8", "movzbq , "}, - {2, "\x49\x0f\xb7\x51\xf8", "movzbq , "}, - - /* MOVSX / MOVSXD */ - {1, "\x49\x0f\xbe\x51\xf8", "movsbq , "}, - {2, "\x49\x0f\xbf\x51\xf8", "movsbq , "}, - {4, "\x49\x63\x51\xf8", "movslq , "}, -#endif -}; - -static bool selftest_opcode_one(const struct selftest_opcode *op) -{ - unsigned size; - - kmemcheck_opcode_decode(op->insn, &size); - - if (size == op->expected_size) - return true; - - printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", - op->desc, op->expected_size, size); - return false; -} - -static bool selftest_opcodes_all(void) -{ - bool pass = true; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) - pass = pass && selftest_opcode_one(&selftest_opcodes[i]); - - return pass; -} - -bool kmemcheck_selftest(void) -{ - bool pass = true; - - pass = pass && selftest_opcodes_all(); - - return pass; -} diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h index 8d759aae453d..ea32a7d3cf1b 100644 --- 
a/arch/x86/mm/kmemcheck/selftest.h +++ b/arch/x86/mm/kmemcheck/selftest.h @@ -1,7 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H -#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H - -bool kmemcheck_selftest(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c deleted file mode 100644 index c2638a7d2c10..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.c +++ /dev/null @@ -1,173 +0,0 @@ -#include -#include -#include - -#include -#include - -#include "pte.h" -#include "shadow.h" - -/* - * Return the shadow address for the given address. Returns NULL if the - * address is not tracked. - * - * We need to be extremely careful not to follow any invalid pointers, - * because this function can be called for *any* possible address. - */ -void *kmemcheck_shadow_lookup(unsigned long address) -{ - pte_t *pte; - struct page *page; - - if (!virt_addr_valid(address)) - return NULL; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return NULL; - - page = virt_to_page(address); - if (!page->shadow) - return NULL; - return page->shadow + (address & (PAGE_SIZE - 1)); -} - -static void mark_shadow(void *address, unsigned int n, - enum kmemcheck_shadow status) -{ - unsigned long addr = (unsigned long) address; - unsigned long last_addr = addr + n - 1; - unsigned long page = addr & PAGE_MASK; - unsigned long last_page = last_addr & PAGE_MASK; - unsigned int first_n; - void *shadow; - - /* If the memory range crosses a page boundary, stop there. */ - if (page == last_page) - first_n = n; - else - first_n = page + PAGE_SIZE - addr; - - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, first_n); - - addr += first_n; - n -= first_n; - - /* Do full-page memset()s. */ - while (n >= PAGE_SIZE) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, PAGE_SIZE); - - addr += PAGE_SIZE; - n -= PAGE_SIZE; - } - - /* Do the remaining page, if any. */ - if (n > 0) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, n); - } -} - -void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); -} - -void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); -} - -/* - * Fill the shadow memory of the given address such that the memory at that - * address is marked as being initialized. 
- */ -void kmemcheck_mark_initialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); -} -EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); - -void kmemcheck_mark_freed(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); -} - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); -} - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) -{ -#ifdef CONFIG_KMEMCHECK_PARTIAL_OK - uint8_t *x; - unsigned int i; - - x = shadow; - - /* - * Make sure _some_ bytes are initialized. Gcc frequently generates - * code to access neighboring bytes. - */ - for (i = 0; i < size; ++i) { - if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -#else - return kmemcheck_shadow_test_all(shadow, size); -#endif -} - -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - - /* All bytes must be initialized. */ - for (i = 0; i < size; ++i) { - if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -} - -void kmemcheck_shadow_set(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - for (i = 0; i < size; ++i) - x[i] = KMEMCHECK_SHADOW_INITIALIZED; -} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h index 49768dc18664..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/shadow.h +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -1,19 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H -#define ARCH__X86__MM__KMEMCHECK__SHADOW_H - -enum kmemcheck_shadow { - KMEMCHECK_SHADOW_UNALLOCATED, - KMEMCHECK_SHADOW_UNINITIALIZED, - KMEMCHECK_SHADOW_INITIALIZED, - KMEMCHECK_SHADOW_FREED, -}; - -void *kmemcheck_shadow_lookup(unsigned long address); - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, - unsigned int size); -void kmemcheck_shadow_set(void *shadow, unsigned int size); - -#endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index baeb872283d9..69c238210325 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -594,21 +594,6 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t) __tasklet_hi_schedule(t); } -extern void __tasklet_hi_schedule_first(struct tasklet_struct *t); - -/* - * This version avoids touching any other tasklets. Needed for kmemcheck - * in order not to take any page faults while enqueueing this tasklet; - * consider VERY carefully whether you really need this or - * tasklet_hi_schedule()... 
- */ -static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) - __tasklet_hi_schedule_first(t); -} - - static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 7b1d7bead7d9..ea32a7d3cf1b 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -1,172 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_KMEMCHECK_H -#define LINUX_KMEMCHECK_H - -#include -#include - -#ifdef CONFIG_KMEMCHECK -extern int kmemcheck_enabled; - -/* The slab-related functions. */ -void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node); -void kmemcheck_free_shadow(struct page *page, int order); -void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size); -void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); - -void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order, - gfp_t gfpflags); - -void kmemcheck_show_pages(struct page *p, unsigned int n); -void kmemcheck_hide_pages(struct page *p, unsigned int n); - -bool kmemcheck_page_is_tracked(struct page *p); - -void kmemcheck_mark_unallocated(void *address, unsigned int n); -void kmemcheck_mark_uninitialized(void *address, unsigned int n); -void kmemcheck_mark_initialized(void *address, unsigned int n); -void kmemcheck_mark_freed(void *address, unsigned int n); - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); - -int kmemcheck_show_addr(unsigned long address); -int kmemcheck_hide_addr(unsigned long address); - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); - -/* - * Bitfield annotations - * - * How to use: If you have a struct using bitfields, for example - * - * struct a { - * int x:8, y:8; - * }; - * - * then this should be rewritten as - * - * struct a { - * kmemcheck_bitfield_begin(flags); - * int x:8, y:8; - * kmemcheck_bitfield_end(flags); - * }; - * - * Now the "flags_begin" and "flags_end" members may be used to refer to the - * beginning and end, respectively, of the bitfield (and things like - * &x.flags_begin is allowed). 
As soon as the struct is allocated, the bit- - * fields should be annotated: - * - * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); - * kmemcheck_annotate_bitfield(a, flags); - */ -#define kmemcheck_bitfield_begin(name) \ - int name##_begin[0]; - -#define kmemcheck_bitfield_end(name) \ - int name##_end[0]; - -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - int _n; \ - \ - if (!ptr) \ - break; \ - \ - _n = (long) &((ptr)->name##_end) \ - - (long) &((ptr)->name##_begin); \ - BUILD_BUG_ON(_n < 0); \ - \ - kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ - } while (0) - -#define kmemcheck_annotate_variable(var) \ - do { \ - kmemcheck_mark_initialized(&(var), sizeof(var)); \ - } while (0) \ - -#else -#define kmemcheck_enabled 0 - -static inline void -kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) -{ -} - -static inline void -kmemcheck_free_shadow(struct page *page, int order) -{ -} - -static inline void -kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size) -{ -} - -static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, - size_t size) -{ -} - -static inline void kmemcheck_pagealloc_alloc(struct page *p, - unsigned int order, gfp_t gfpflags) -{ -} - -static inline bool kmemcheck_page_is_tracked(struct page *p) -{ - return false; -} - -static inline void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_initialized(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_freed(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_unallocated_pages(struct page *p, - unsigned int n) -{ -} - -static inline void kmemcheck_mark_uninitialized_pages(struct page *p, - unsigned int n) -{ -} - -static inline void kmemcheck_mark_initialized_pages(struct page *p, - unsigned int n) -{ -} - -static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - return true; -} - -#define kmemcheck_bitfield_begin(name) -#define kmemcheck_bitfield_end(name) -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - } while (0) - -#define kmemcheck_annotate_variable(var) \ - do { \ - } while (0) - -#endif /* CONFIG_KMEMCHECK */ - -#endif /* LINUX_KMEMCHECK_H */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 662f7b1b7a78..2f5e87f1bae2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); -void __tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - lockdep_assert_irqs_disabled(); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); -} -EXPORT_SYMBOL(__tasklet_hi_schedule_first); - static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9576bd582d4a..7638e2f7fff8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1173,15 +1172,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_thousand, }, -#endif -#ifdef CONFIG_KMEMCHECK - { - .procname = "kmemcheck", - .data = &kmemcheck_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { .procname = "panic_on_warn", diff --git 
a/lib/Kconfig.debug b/lib/Kconfig.debug index 07ce7449765a..5402e3954659 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -504,7 +504,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT config DEBUG_SLAB bool "Debug slab memory allocations" - depends on DEBUG_KERNEL && SLAB && !KMEMCHECK + depends on DEBUG_KERNEL && SLAB help Say Y here to have the kernel do limited verification on memory allocation as well as poisoning memory on free to catch use of freed @@ -516,7 +516,7 @@ config DEBUG_SLAB_LEAK config SLUB_DEBUG_ON bool "SLUB debugging on by default" - depends on SLUB && SLUB_DEBUG && !KMEMCHECK + depends on SLUB && SLUB_DEBUG default n help Boot with debugging on by default. SLUB boots by default with @@ -730,8 +730,6 @@ config DEBUG_STACKOVERFLOW If in doubt, say "N". -source "lib/Kconfig.kmemcheck" - source "lib/Kconfig.kasan" endmenu # "Memory Debugging" diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck deleted file mode 100644 index 846e039a86b4..000000000000 --- a/lib/Kconfig.kmemcheck +++ /dev/null @@ -1,94 +0,0 @@ -config HAVE_ARCH_KMEMCHECK - bool - -if HAVE_ARCH_KMEMCHECK - -menuconfig KMEMCHECK - bool "kmemcheck: trap use of uninitialized memory" - depends on DEBUG_KERNEL - depends on !X86_USE_3DNOW - depends on SLUB || SLAB - depends on !CC_OPTIMIZE_FOR_SIZE - depends on !FUNCTION_TRACER - select FRAME_POINTER - select STACKTRACE - default n - help - This option enables tracing of dynamically allocated kernel memory - to see if memory is used before it has been given an initial value. - Be aware that this requires half of your memory for bookkeeping and - will insert extra code at *every* read and write to tracked memory - thus slow down the kernel code (but user code is unaffected). - - The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable - or enable kmemcheck at boot-time. If the kernel is started with - kmemcheck=0, the large memory and CPU overhead is not incurred. - -choice - prompt "kmemcheck: default mode at boot" - depends on KMEMCHECK - default KMEMCHECK_ONESHOT_BY_DEFAULT - help - This option controls the default behaviour of kmemcheck when the - kernel boots and no kmemcheck= parameter is given. - -config KMEMCHECK_DISABLED_BY_DEFAULT - bool "disabled" - depends on KMEMCHECK - -config KMEMCHECK_ENABLED_BY_DEFAULT - bool "enabled" - depends on KMEMCHECK - -config KMEMCHECK_ONESHOT_BY_DEFAULT - bool "one-shot" - depends on KMEMCHECK - help - In one-shot mode, only the first error detected is reported before - kmemcheck is disabled. - -endchoice - -config KMEMCHECK_QUEUE_SIZE - int "kmemcheck: error queue size" - depends on KMEMCHECK - default 64 - help - Select the maximum number of errors to store in the queue. Since - errors can occur virtually anywhere and in any context, we need a - temporary storage area which is guarantueed not to generate any - other faults. The queue will be emptied as soon as a tasklet may - be scheduled. If the queue is full, new error reports will be - lost. - -config KMEMCHECK_SHADOW_COPY_SHIFT - int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)" - depends on KMEMCHECK - range 2 8 - default 5 - help - Select the number of shadow bytes to save along with each entry of - the queue. These bytes indicate what parts of an allocation are - initialized, uninitialized, etc. and will be displayed when an - error is detected to help the debugging of a particular problem. 
- -config KMEMCHECK_PARTIAL_OK - bool "kmemcheck: allow partially uninitialized memory" - depends on KMEMCHECK - default y - help - This option works around certain GCC optimizations that produce - 32-bit reads from 16-bit variables where the upper 16 bits are - thrown away afterwards. This may of course also hide some real - bugs. - -config KMEMCHECK_BITOPS_OK - bool "kmemcheck: allow bit-field manipulation" - depends on KMEMCHECK - default n - help - This option silences warnings that would be generated for bit-field - accesses where not all the bits are initialized at the same time. - This may also hide some real bugs. - -endif diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5b0adf1435de..e5e606ee5f71 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC - depends on !KMEMCHECK select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- diff --git a/mm/Makefile b/mm/Makefile index 4659b93cba43..e7ebd176fb93 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n KCOV_INSTRUMENT_page_alloc.o := n KCOV_INSTRUMENT_debug-pagealloc.o := n KCOV_INSTRUMENT_kmemleak.o := n -KCOV_INSTRUMENT_kmemcheck.o := n KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n @@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index b3a4d61d341c..cec594032515 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -1,126 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include "slab.h" -#include - -void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) -{ - struct page *shadow; - int pages; - int i; - - pages = 1 << order; - - /* - * With kmemcheck enabled, we need to allocate a memory area for the - * shadow bits as well. - */ - shadow = alloc_pages_node(node, flags, order); - if (!shadow) { - if (printk_ratelimit()) - pr_err("kmemcheck: failed to allocate shadow bitmap\n"); - return; - } - - for(i = 0; i < pages; ++i) - page[i].shadow = page_address(&shadow[i]); - - /* - * Mark it as non-present for the MMU so that our accesses to - * this memory will trigger a page fault and let us analyze - * the memory accesses. - */ - kmemcheck_hide_pages(page, pages); -} - -void kmemcheck_free_shadow(struct page *page, int order) -{ - struct page *shadow; - int pages; - int i; - - if (!kmemcheck_page_is_tracked(page)) - return; - - pages = 1 << order; - - kmemcheck_show_pages(page, pages); - - shadow = virt_to_page(page[0].shadow); - - for(i = 0; i < pages; ++i) - page[i].shadow = NULL; - - __free_pages(shadow, order); -} - -void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size) -{ - if (unlikely(!object)) /* Skip object if allocation failed */ - return; - - /* - * Has already been memset(), which initializes the shadow for us - * as well. - */ - if (gfpflags & __GFP_ZERO) - return; - - /* No need to initialize the shadow of a non-tracked slab. 
*/ - if (s->flags & SLAB_NOTRACK) - return; - - if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { - /* - * Allow notracked objects to be allocated from - * tracked caches. Note however that these objects - * will still get page faults on access, they just - * won't ever be flagged as uninitialized. If page - * faults are not acceptable, the slab cache itself - * should be marked NOTRACK. - */ - kmemcheck_mark_initialized(object, size); - } else if (!s->ctor) { - /* - * New objects should be marked uninitialized before - * they're returned to the called. - */ - kmemcheck_mark_uninitialized(object, size); - } -} - -void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) -{ - /* TODO: RCU freeing is unsupported for now; hide false positives. */ - if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - kmemcheck_mark_freed(object, size); -} - -void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, - gfp_t gfpflags) -{ - int pages; - - if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) - return; - - pages = 1 << order; - - /* - * NOTE: We choose to track GFP_ZERO pages too; in fact, they - * can become uninitialized by copying uninitialized memory - * into them. - */ - - /* XXX: Can use zone->node for node? */ - kmemcheck_alloc_shadow(page, order, gfpflags, -1); - - if (gfpflags & __GFP_ZERO) - kmemcheck_mark_initialized_pages(page, pages); - else - kmemcheck_mark_uninitialized_pages(page, pages); -} diff --git a/mm/slub.c b/mm/slub.c index c2c41e178acf..cfd56e5a35fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1371,7 +1371,7 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) * So in order to make the debug calls that expect irqs to be * disabled we need to disable interrupts temporarily. */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) +#ifdef CONFIG_LOCKDEP { unsigned long flags; @@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, * Compiler cannot detect this function can be removed if slab_free_hook() * evaluates to nothing. Thus, catch all relevant config debug options here. 
*/ -#if defined(CONFIG_KMEMCHECK) || \ - defined(CONFIG_LOCKDEP) || \ +#if defined(CONFIG_LOCKDEP) || \ defined(CONFIG_DEBUG_KMEMLEAK) || \ defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 67d051edd615..7bd52b8f63d4 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -2182,8 +2182,6 @@ sub dump_struct($$) { # strip comments: $members =~ s/\/\*.*?\*\///gos; $nested =~ s/\/\*.*?\*\///gos; - # strip kmemcheck_bitfield_{begin,end}.*; - $members =~ s/kmemcheck_bitfield_.*?;//gos; # strip attributes $members =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i; $members =~ s/__aligned\s*\([^;]*\)//gos; diff --git a/tools/include/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h index 2bccd2c7b897..ea32a7d3cf1b 100644 --- a/tools/include/linux/kmemcheck.h +++ b/tools/include/linux/kmemcheck.h @@ -1,9 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LIBLOCKDEP_LINUX_KMEMCHECK_H_ -#define _LIBLOCKDEP_LINUX_KMEMCHECK_H_ - -static inline void kmemcheck_mark_initialized(void *address, unsigned int n) -{ -} - -#endif -- cgit v1.2.3 From ea1f5f3712afe895dfa4176ec87376b4a9ac23be Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:27 -0800 Subject: mm: define memblock_virt_alloc_try_nid_raw * A new variant of memblock_virt_alloc_* allocations: memblock_virt_alloc_try_nid_raw() - Does not zero the allocated memory - Does not panic if the request cannot be satisfied * Optimize early system hash allocations Clients can call alloc_large_system_hash() with the HASH_ZERO flag to specify that the memory allocated for a system hash needs to be zeroed; otherwise the memory does not need to be zeroed, and the client will initialize it. If memory does not need to be zeroed, call the new memblock_virt_alloc_raw() interface, and thus improve boot performance. * Debugging for the raw allocator When CONFIG_DEBUG_VM is enabled, this patch sets all the memory that is returned by memblock_virt_alloc_try_nid_raw() to ones, to ensure that no caller expects zeroed memory. Link: http://lkml.kernel.org/r/20171013173214.27300-6-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 27 ++++++++++++++++++++++ mm/memblock.c | 60 +++++++++++++++++++++++++++++++++++++++++++------ mm/page_alloc.c | 15 ++++++------- 3 files changed, 87 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index fdf40ca04b3c..a53063e9d7d8 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -161,6 +161,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ +void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid); void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); @@ -177,6 +180,14 @@ static inline void * __init memblock_virt_alloc( NUMA_NO_NODE); } +static inline void * __init memblock_virt_alloc_raw( + phys_addr_t size, phys_addr_t align) +{ + return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + static inline void * __init memblock_virt_alloc_nopanic( phys_addr_t size, phys_addr_t align) { @@ -258,6 +269,14 @@ static inline void * __init memblock_virt_alloc( return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); } +static inline void * __init memblock_virt_alloc_raw( + phys_addr_t size, phys_addr_t align) +{ + if (!align) + align = SMP_CACHE_BYTES; + return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT); +} + static inline void * __init memblock_virt_alloc_nopanic( phys_addr_t size, phys_addr_t align) { @@ -310,6 +329,14 @@ static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size, min_addr); } +static inline void * __init memblock_virt_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, int nid) +{ + return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, + min_addr, max_addr); +} + static inline void * __init memblock_virt_alloc_try_nid_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) diff --git a/mm/memblock.c b/mm/memblock.c index 18dbb69086bc..46aacdfa4f4d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1327,7 +1327,6 @@ again: return NULL; done: ptr = phys_to_virt(alloc); - memset(ptr, 0, size); /* * The min_count is set to 0 so that bootmem allocated blocks @@ -1340,6 +1339,45 @@ done: return ptr; } +/** + * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing + * memory and without panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. 
Does not zero allocated memory, does not panic if request + * cannot be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); +#ifdef CONFIG_DEBUG_VM + if (ptr && size > 0) + memset(ptr, 0xff, size); +#endif + return ptr; +} + /** * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block * @size: size of memory block to be allocated in bytes @@ -1351,8 +1389,8 @@ done: * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides - * additional debug information (including caller info), if enabled. + * Public function, provides additional debug information (including caller + * info), if enabled. This function zeroes the allocated memory. * * RETURNS: * Virtual address of allocated memory block on success, NULL on failure. @@ -1362,11 +1400,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { + void *ptr; + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", __func__, (u64)size, (u64)align, nid, (u64)min_addr, (u64)max_addr, (void *)_RET_IP_); - return memblock_virt_alloc_internal(size, align, min_addr, - max_addr, nid); + + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + memset(ptr, 0, size); + return ptr; } /** @@ -1380,7 +1424,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * Public panicking version of memblock_virt_alloc_try_nid_nopanic() * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * @@ -1399,8 +1443,10 @@ void * __init memblock_virt_alloc_try_nid( (u64)max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); - if (ptr) + if (ptr) { + memset(ptr, 0, size); return ptr; + } panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", __func__, (u64)size, (u64)align, nid, (u64)min_addr, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4dee5082d3d7..805f30dd1c26 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7313,18 +7313,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); - /* - * memblock allocator returns zeroed memory already, so HASH_ZERO is - * currently not used when HASH_EARLY is specified. - */ gfp_flags = (flags & HASH_ZERO) ? 
GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) table = memblock_virt_alloc_nopanic(size, 0); else table = memblock_virt_alloc_raw(size, 0); } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); - else { + } else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which -- cgit v1.2.3 From a4a3ede2132ae0863e2d43e06f9b5697c51a7a3b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:31 -0800 Subject: mm: zero reserved and unavailable struct pages Some memory is reserved but unavailable: it is not present in memblock.memory (because it is not backed by physical pages), but it is present in memblock.reserved. Such memory has backing struct pages, but they are not initialized by going through __init_single_page(). In some cases these struct pages are accessed even if they do not contain any data. One example is that page_to_pfn() might access page->flags if this is where section information is stored (CONFIG_SPARSEMEM, SECTION_IN_PAGE_FLAGS). One example of such memory: trim_low_memory_range() unconditionally reserves from pfn 0, but e820__memblock_setup() might provide the existing memory from pfn 1 (i.e. KVM). Since struct pages are zeroed in __init_single_page(), and not at allocation time, we must zero such struct pages explicitly. The patch involves adding a new memblock iterator: for_each_resv_unavail_range(i, p_start, p_end), which iterates through the reserved && !memory ranges, and we zero struct pages explicitly by calling mm_zero_struct_page(). === Here is a more detailed example of the problem that this patch is addressing: The test was run on qemu with the following arguments: -enable-kvm -cpu kvm64 -m 512 -smp 2 This patch reports that there are 98 unavailable pages. They are: pfn 0 and pfns in the range [159, 255]. Note that trim_low_memory_range() reserves only pfns in the range [0, 15]; it does not reserve the [159, 255] ones. e820__memblock_setup() reports to Linux that the following physical ranges are available: [1, 158] [256, 130783] Notice that exactly the unavailable pfns are missing! Now, let's check what we have in zone 0: [1, 131039] pfn 0 is not part of the zone, but pfns [1, 158] are. However, the bigger problem if we do not initialize these struct pages is with memory hotplug, because that path operates at 2M boundaries (section_nr) and checks whether a 2M range of pages is hot-removable. It starts with the first pfn from the zone, rounds it down to a 2M boundary (struct pages are allocated at 2M boundaries when the vmemmap is created), and checks whether that section is hot-removable. In this case we start with pfn 1 and round it down to pfn 0. Later the pfn is converted to a struct page, and some fields are checked. Now, if we do not zero struct pages, we get unpredictable results. In fact, when CONFIG_DEBUG_VM is enabled and we explicitly set all vmemmap memory to ones, the following panic is observed in a kernel test without this patch applied: BUG: unable to handle kernel NULL pointer dereference at (null) IP: is_pageblock_removable_nolock+0x35/0x90 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT ... task: ffff88001f4e2900 task.stack: ffffc90000314000 RIP: 0010:is_pageblock_removable_nolock+0x35/0x90 Call Trace: ?
is_mem_section_removable+0x5a/0xd0 show_mem_removable+0x6b/0xa0 dev_attr_show+0x1b/0x50 sysfs_kf_seq_show+0xa1/0x100 kernfs_seq_show+0x22/0x30 seq_read+0x1ac/0x3a0 kernfs_fop_read+0x36/0x190 ? security_file_permission+0x90/0xb0 __vfs_read+0x16/0x30 vfs_read+0x81/0x130 SyS_read+0x44/0xa0 entry_SYSCALL_64_fastpath+0x1f/0xbd Link: http://lkml.kernel.org/r/20171013173214.27300-7-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 16 ++++++++++++++++ include/linux/mm.h | 15 +++++++++++++++ mm/page_alloc.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ce0e5634c2f9..7ed0f7782d16 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) +/** + * for_each_resv_unavail_range - iterate through reserved and unavailable memory + * @i: u64 used as loop variable + * @flags: pick from blocks based on memory attributes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over unavailable but reserved (reserved && !memory) areas of memblock. + * Available as soon as memblock is initialized. + * Note: because this memory does not belong to any physical node, flags and + * nid arguments do not make sense and thus not exported as arguments. + */ +#define for_each_resv_unavail_range(i, p_start, p_end) \ + for_each_mem_range(i, &memblock.reserved, &memblock.memory, \ + NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL) + static inline void memblock_set_region_flags(struct memblock_region *r, unsigned long flags) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c1e82a1aa77..703599fa8288 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -95,6 +95,15 @@ extern int mmap_rnd_compat_bits __read_mostly; #define mm_forbids_zeropage(X) (0) #endif +/* + * On some architectures it is expensive to call memset() for small sizes. + * Those architectures should provide their own implementation of "struct page" + * zeroing by defining this macro in . + */ +#ifndef mm_zero_struct_page +#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) +#endif + /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. 
Users can overwrite this number by sysctl but there is a @@ -2030,6 +2039,12 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state); #endif +#ifdef CONFIG_HAVE_MEMBLOCK +void zero_resv_unavail(void); +#else +static inline void zero_resv_unavail(void) {} +#endif + extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 805f30dd1c26..c37343ef2889 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6215,6 +6215,44 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } +#ifdef CONFIG_HAVE_MEMBLOCK +/* + * Only struct pages that are backed by physical memory are zeroed and + * initialized by going through __init_single_page(). But, there are some + * struct pages which are reserved in memblock allocator and their fields + * may be accessed (for example page_to_pfn() on some configuration accesses + * flags). We must explicitly zero those struct pages. + */ +void __paginginit zero_resv_unavail(void) +{ + phys_addr_t start, end; + unsigned long pfn; + u64 i, pgcnt; + + /* + * Loop through ranges that are reserved, but do not have reported + * physical memory backing. + */ + pgcnt = 0; + for_each_resv_unavail_range(i, &start, &end) { + for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + } + + /* + * Struct pages that do not have backing memory. This could be because + * firmware is using some of this memory, or for some other reasons. + * Once memblock is changed so such behaviour is not allowed: i.e. + * list of "reserved" memory must be a subset of list of "memory", then + * this code can be removed. + */ + if (pgcnt) + pr_info("Reserved but unavailable: %lld pages", pgcnt); +} +#endif /* CONFIG_HAVE_MEMBLOCK */ + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 @@ -6638,6 +6676,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + zero_resv_unavail(); } static int __init cmdline_parse_core(char *p, unsigned long *core) @@ -6801,6 +6840,7 @@ void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + zero_resv_unavail(); } static int page_alloc_cpu_dead(unsigned int cpu) -- cgit v1.2.3 From 736304f3245f39392895ff3392e1325d3e49e7d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:11 -0800 Subject: mm: speed up cancel_dirty_page() for clean pages Patch series "Speed up page cache truncation", v1. When rebasing our enterprise distro to a newer kernel (from 4.4 to 4.12) we have noticed a regression in bonnie++ benchmark when deleting files. Eventually we have tracked this down to a fact that page cache truncation got slower by about 10%. There were both gains and losses in the above interval of kernels but we have been able to identify that commit 83929372f629 ("filemap: prepare find and delete operations for huge pages") caused about 10% regression on its own. After some investigation it didn't seem easily possible to fix the regression while maintaining the THP in page cache functionality so we've decided to optimize the page cache truncation path instead to make up for the change. This series is a result of that effort. Patch 1 is an easy speedup of cancel_dirty_page(). 
Patches 2-6 refactor page cache truncation code so that it is easier to batch radix tree operations. Patch 7 implements batching of deletes from the radix tree which more than makes up for the original regression. This patch (of 7): cancel_dirty_page() does quite some work even for clean pages (fetching of mapping, locking of memcg, atomic bit op on page flags) so it accounts for ~2.5% of cost of truncation of a clean page. That is not much but still dumb for something we don't need at all. Check whether a page is actually dirty and avoid any work if not. Link: http://lkml.kernel.org/r/20171010151937.26984-2-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Hansen Cc: Dave Chinner Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 +++++++- mm/page-writeback.c | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 703599fa8288..c7b1d617dff6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1440,7 +1440,13 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void cancel_dirty_page(struct page *page); +void __cancel_dirty_page(struct page *page); +static inline void cancel_dirty_page(struct page *page) +{ + /* Avoid atomic ops, locking, etc. when not actually needed. */ + if (PageDirty(page)) + __cancel_dirty_page(page); +} int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 83c746577aea..436714917e03 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2608,7 +2608,7 @@ EXPORT_SYMBOL(set_page_dirty_lock); * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ -void cancel_dirty_page(struct page *page) +void __cancel_dirty_page(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -2629,7 +2629,7 @@ void cancel_dirty_page(struct page *page) ClearPageDirty(page); } } -EXPORT_SYMBOL(cancel_dirty_page); +EXPORT_SYMBOL(__cancel_dirty_page); /* * Clear a page's dirty flag, while caring for dirty memory accounting. -- cgit v1.2.3 From aa65c29ce1b6e1990cd2c7d8004bbea7ff3aff38 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:33 -0800 Subject: mm: batch radix tree operations when truncating pages Currently we remove pages from the radix tree one by one. To speed up page cache truncation, lock several pages at once and free them in one go. This allows us to batch radix tree operations in a more efficient way and also save round-trips on mapping->tree_lock. As a result we gain about 20% speed improvement in page cache truncation. Data from a simple benchmark timing 10000 truncates of 1024 pages (on ext4 on ramdisk but the filesystem is barely visible in the profiles). 
The range shows the 1st and 95th percentiles of the measured times:

	4.14-rc2	4.14-rc2 + batched truncation
	248-256		209-219
	249-258		209-217
	248-255		211-239
	248-255		209-217
	247-256		210-218

[jack@suse.cz: convert delete_from_page_cache_batch() to pagevec]
  Link: http://lkml.kernel.org/r/20171018111648.13714-1-jack@suse.cz
[akpm@linux-foundation.org: move struct pagevec forward declaration to top-of-file]
Link: http://lkml.kernel.org/r/20171010151937.26984-8-jack@suse.cz
Signed-off-by: Jan Kara
Acked-by: Mel Gorman
Reviewed-by: Andi Kleen
Cc: Dave Chinner
Cc: Dave Hansen
Cc: "Kirill A. Shutemov"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/pagemap.h |  4 +++
 mm/filemap.c            | 83 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/truncate.c           | 20 ++++++++++--
 3 files changed, 105 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 956d6a80e126..e0f7181118fe 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -16,6 +16,8 @@
 #include <linux/hardirq.h> /* for in_interrupt() */
 #include <linux/hugetlb_inline.h>
 
+struct pagevec;
+
 /*
  * Bits in mapping->flags.
  */
@@ -624,6 +626,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 extern void delete_from_page_cache(struct page *page);
 extern void __delete_from_page_cache(struct page *page, void *shadow);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
+void delete_from_page_cache_batch(struct address_space *mapping,
+				  struct pagevec *pvec);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
diff --git a/mm/filemap.c b/mm/filemap.c
index a11b42189436..a470dd8cd05b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -304,6 +304,89 @@ void delete_from_page_cache(struct page *page)
 }
 EXPORT_SYMBOL(delete_from_page_cache);
 
+/*
+ * page_cache_tree_delete_batch - delete several pages from page cache
+ * @mapping: the mapping to which pages belong
+ * @pvec: pagevec with pages to delete
+ *
+ * The function walks over mapping->page_tree and removes pages passed in @pvec
+ * from the radix tree. The function expects @pvec to be sorted by page index.
+ * It tolerates holes in @pvec (radix tree entries at those indices are not
+ * modified). The function expects only THP head pages to be present in the
+ * @pvec and takes care to delete all corresponding tail pages from the radix
+ * tree as well.
+ *
+ * The function expects mapping->tree_lock to be held.
+ */
+static void
+page_cache_tree_delete_batch(struct address_space *mapping,
+			     struct pagevec *pvec)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	int total_pages = 0;
+	int i = 0, tail_pages = 0;
+	struct page *page;
+	pgoff_t start;
+
+	start = pvec->pages[0]->index;
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+		if (i >= pagevec_count(pvec) && !tail_pages)
+			break;
+		page = radix_tree_deref_slot_protected(slot,
+						       &mapping->tree_lock);
+		if (radix_tree_exceptional_entry(page))
+			continue;
+		if (!tail_pages) {
+			/*
+			 * Some page got inserted in our range? Skip it. We
+			 * have our pages locked so they are protected from
+			 * being removed.
+ */ + if (page != pvec->pages[i]) + continue; + WARN_ON_ONCE(!PageLocked(page)); + if (PageTransHuge(page) && !PageHuge(page)) + tail_pages = HPAGE_PMD_NR - 1; + page->mapping = NULL; + /* + * Leave page->index set: truncation lookup relies + * upon it + */ + i++; + } else { + tail_pages--; + } + radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); + __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + workingset_update_node, mapping); + total_pages++; + } + mapping->nrpages -= total_pages; +} + +void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + int i; + unsigned long flags; + + if (!pagevec_count(pvec)) + return; + + spin_lock_irqsave(&mapping->tree_lock, flags); + for (i = 0; i < pagevec_count(pvec); i++) { + trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + + unaccount_page_cache_page(mapping, pvec->pages[i]); + } + page_cache_tree_delete_batch(mapping, pvec); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + for (i = 0; i < pagevec_count(pvec); i++) + page_cache_free_page(mapping, pvec->pages[i]); +} + int filemap_check_errors(struct address_space *mapping) { int ret = 0; diff --git a/mm/truncate.c b/mm/truncate.c index 383a530d511e..4a39a3150ee2 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -294,6 +294,14 @@ void truncate_inode_pages_range(struct address_space *mapping, while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* + * Pagevec array has exceptional entries and we may also fail + * to lock some pages. So we store pages that can be deleted + * in a new pagevec. + */ + struct pagevec locked_pvec; + + pagevec_init(&locked_pvec, 0); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -315,9 +323,17 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - truncate_inode_page(mapping, page); - unlock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + pagevec_add(&locked_pvec, page); } + for (i = 0; i < pagevec_count(&locked_pvec); i++) + truncate_cleanup_page(mapping, locked_pvec.pages[i]); + delete_from_page_cache_batch(mapping, &locked_pvec); + for (i = 0; i < pagevec_count(&locked_pvec); i++) + unlock_page(locked_pvec.pages[i]); pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); cond_resched(); -- cgit v1.2.3 From c7df8ad2910e965a6241b6d8f52fd122e26b0315 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:41 -0800 Subject: mm, truncate: do not check mapping for every page being truncated During truncation, the mapping has already been checked for shmem and dax so it's known that workingset_update_node is required. This patch avoids the checks on mapping for each page being truncated. In all other cases, a lookup helper is used to determine if workingset_update_node() needs to be called. The one danger is that the API is slightly harder to use as calling workingset_update_node directly without checking for dax or shmem mappings could lead to surprises. However, the API rarely needs to be used and hopefully the comment is enough to give people the hint. 
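To make the new calling convention concrete, here is a minimal
illustrative sketch (not part of the patch; the function name is
hypothetical). A call site that may see any mapping resolves the
callback up front via the workingset_lookup_update() helper added in
the include/linux/swap.h hunk below, while truncation, which has
already rejected DAX and shmem, keeps passing workingset_update_node()
directly:

	#include <linux/dax.h>
	#include <linux/fs.h>
	#include <linux/radix-tree.h>
	#include <linux/shmem_fs.h>
	#include <linux/swap.h>

	/*
	 * Hypothetical call site, for illustration only: the helper picks
	 * workingset_update_node() once per mapping (or NULL for DAX and
	 * shmem mappings), so the callback itself no longer needs the
	 * mapping to filter those cases out.
	 */
	static void sketch_clear_slot(struct address_space *mapping,
				      struct radix_tree_node *node,
				      void __rcu **slot)
	{
		__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
				     workingset_lookup_update(mapping));
	}
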
sparsetruncate (tiny)
                            4.14.0-rc4             4.14.0-rc4
                           oneirq-v1r1        pickhelper-v1r1
Min          Time      141.00 (   0.00%)      140.00 (   0.71%)
1st-qrtle    Time      142.00 (   0.00%)      141.00 (   0.70%)
2nd-qrtle    Time      142.00 (   0.00%)      142.00 (   0.00%)
3rd-qrtle    Time      143.00 (   0.00%)      143.00 (   0.00%)
Max-90%      Time      144.00 (   0.00%)      144.00 (   0.00%)
Max-95%      Time      147.00 (   0.00%)      145.00 (   1.36%)
Max-99%      Time      195.00 (   0.00%)      191.00 (   2.05%)
Max          Time      230.00 (   0.00%)      205.00 (  10.87%)
Amean        Time      144.37 (   0.00%)      143.82 (   0.38%)
Stddev       Time       10.44 (   0.00%)        9.00 (  13.74%)
Coeff        Time        7.23 (   0.00%)        6.26 (  13.41%)
Best99%Amean Time      143.72 (   0.00%)      143.34 (   0.26%)
Best95%Amean Time      142.37 (   0.00%)      142.00 (   0.26%)
Best90%Amean Time      142.19 (   0.00%)      141.85 (   0.24%)
Best75%Amean Time      141.92 (   0.00%)      141.58 (   0.24%)
Best50%Amean Time      141.69 (   0.00%)      141.31 (   0.27%)
Best25%Amean Time      141.38 (   0.00%)      140.97 (   0.29%)

As you'd expect, the gain is marginal but it can be detected. The
differences in bonnie are all within the noise, which is not surprising
given the impact on the microbenchmark.

radix_tree_update_node_t is a callback for some radix operations that
optionally passes in a private field. The only user of the callback is
workingset_update_node and, as it no longer requires a mapping, the
private field is removed.

Link: http://lkml.kernel.org/r/20171018075952.10627-3-mgorman@techsingularity.net
Signed-off-by: Mel Gorman
Acked-by: Johannes Weiner
Reviewed-by: Jan Kara
Cc: Andi Kleen
Cc: Dave Chinner
Cc: Dave Hansen
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/dax.c                              |  2 +-
 include/linux/radix-tree.h            |  7 +++----
 include/linux/swap.h                  | 13 ++++++++++++-
 lib/idr.c                             |  2 +-
 lib/radix-tree.c                      | 30 +++++++++++++-----------------
 mm/filemap.c                          |  7 ++++---
 mm/shmem.c                            |  2 +-
 mm/truncate.c                         |  2 +-
 mm/workingset.c                       | 10 ++--------
 tools/testing/radix-tree/multiorder.c |  2 +-
 10 files changed, 39 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 9ec797424e4f..165fdfb6e508 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -565,7 +565,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
 		WARN_ON_ONCE(ret != entry);
 		__radix_tree_replace(page_tree, node, slot,
-				     new_entry, NULL, NULL);
+				     new_entry, NULL);
 		entry = new_entry;
 	}
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 567ebb5eaab0..0ca448c1cb42 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -301,18 +301,17 @@ void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
 void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
 void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
 					unsigned long index);
-typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
+typedef void (*radix_tree_update_node_t)(struct radix_tree_node *);
 void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
 			  void __rcu **slot, void *entry,
-			  radix_tree_update_node_t update_node, void *private);
+			  radix_tree_update_node_t update_node);
 void radix_tree_iter_replace(struct radix_tree_root *,
 		const struct radix_tree_iter *, void __rcu **slot, void *entry);
 void radix_tree_replace_slot(struct radix_tree_root *,
 			     void __rcu **slot, void *entry);
 void __radix_tree_delete_node(struct radix_tree_root *,
 			      struct radix_tree_node *,
-			      radix_tree_update_node_t update_node,
-			      void *private);
+			      radix_tree_update_node_t update_node);
 void radix_tree_iter_delete(struct
radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/include/linux/swap.h b/include/linux/swap.h index 8b8a6f965785..454f042bcdd5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -298,7 +298,18 @@ struct vma_swap_readahead { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -void workingset_update_node(struct radix_tree_node *node, void *private); + +/* Do not use directly, use workingset_lookup_update */ +void workingset_update_node(struct radix_tree_node *node); + +/* Returns workingset_update_node() if the mapping has shadow entries. */ +#define workingset_lookup_update(mapping) \ +({ \ + radix_tree_update_node_t __helper = workingset_update_node; \ + if (dax_mapping(mapping) || shmem_mapping(mapping)) \ + __helper = NULL; \ + __helper; \ +}) /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/lib/idr.c b/lib/idr.c index edd9b2be1651..2593ce513a18 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -171,7 +171,7 @@ void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id) if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); - __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL, NULL); + __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL); return entry; } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8b1feca1230a..c8d55565fafa 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -677,8 +677,7 @@ out: * @root radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root, - radix_tree_update_node_t update_node, - void *private) + radix_tree_update_node_t update_node) { bool shrunk = false; @@ -739,7 +738,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; if (update_node) - update_node(node, private); + update_node(node); } WARN_ON_ONCE(!list_empty(&node->private_list)); @@ -752,7 +751,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, static bool delete_node(struct radix_tree_root *root, struct radix_tree_node *node, - radix_tree_update_node_t update_node, void *private) + radix_tree_update_node_t update_node) { bool deleted = false; @@ -762,8 +761,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->rnode)) - deleted |= radix_tree_shrink(root, update_node, - private); + deleted |= radix_tree_shrink(root, + update_node); return deleted; } @@ -1173,7 +1172,6 @@ static int calculate_count(struct radix_tree_root *root, * @slot: pointer to slot in @node * @item: new item to store in the slot. * @update_node: callback for changing leaf nodes - * @private: private data to pass to @update_node * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. 
@@ -1181,7 +1179,7 @@ static int calculate_count(struct radix_tree_root *root, void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, - radix_tree_update_node_t update_node, void *private) + radix_tree_update_node_t update_node) { void *old = rcu_dereference_raw(*slot); int exceptional = !!radix_tree_exceptional_entry(item) - @@ -1201,9 +1199,9 @@ void __radix_tree_replace(struct radix_tree_root *root, return; if (update_node) - update_node(node, private); + update_node(node); - delete_node(root, node, update_node, private); + delete_node(root, node, update_node); } /** @@ -1225,7 +1223,7 @@ void __radix_tree_replace(struct radix_tree_root *root, void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { - __radix_tree_replace(root, NULL, slot, item, NULL, NULL); + __radix_tree_replace(root, NULL, slot, item, NULL); } EXPORT_SYMBOL(radix_tree_replace_slot); @@ -1242,7 +1240,7 @@ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { - __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); + __radix_tree_replace(root, iter->node, slot, item, NULL); } #ifdef CONFIG_RADIX_TREE_MULTIORDER @@ -1972,7 +1970,6 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); * @root: radix tree root * @node: node containing @index * @update_node: callback for changing leaf nodes - * @private: private data to pass to @update_node * * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the @@ -1980,10 +1977,9 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); */ void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node, - radix_tree_update_node_t update_node, - void *private) + radix_tree_update_node_t update_node) { - delete_node(root, node, update_node, private); + delete_node(root, node, update_node); } static bool __radix_tree_delete(struct radix_tree_root *root, @@ -2001,7 +1997,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, exceptional); - return node && delete_node(root, node, NULL, NULL); + return node && delete_node(root, node, NULL); } /** diff --git a/mm/filemap.c b/mm/filemap.c index a470dd8cd05b..155370fc87f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "internal.h" @@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping, *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); mapping->nrpages++; return 0; } @@ -162,7 +163,7 @@ static void page_cache_tree_delete(struct address_space *mapping, radix_tree_clear_tags(&mapping->page_tree, node, slot); __radix_tree_replace(&mapping->page_tree, node, slot, shadow, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); } page->mapping = NULL; @@ -359,7 +360,7 @@ page_cache_tree_delete_batch(struct address_space *mapping, } radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); total_pages++; } mapping->nrpages -= total_pages; diff --git a/mm/shmem.c b/mm/shmem.c index 07a1d22807be..a72f68aee6a4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -338,7 
+338,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 	if (item != expected)
 		return -ENOENT;
 	__radix_tree_replace(&mapping->page_tree, node, pslot,
-			     replacement, NULL, NULL);
+			     replacement, NULL);
 	return 0;
 }
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 4a39a3150ee2..02a0c0466c78 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -42,7 +42,7 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
 	if (*slot != entry)
 		goto unlock;
 	__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
-			     workingset_update_node, mapping);
+			     workingset_update_node);
 	mapping->nrexceptional--;
 unlock:
 	spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/workingset.c b/mm/workingset.c
index b997c9de28f6..b7d616a3bbbe 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -340,14 +340,8 @@ out:
 
 static struct list_lru shadow_nodes;
 
-void workingset_update_node(struct radix_tree_node *node, void *private)
+void workingset_update_node(struct radix_tree_node *node)
 {
-	struct address_space *mapping = private;
-
-	/* Only regular page cache has shadow entries */
-	if (dax_mapping(mapping) || shmem_mapping(mapping))
-		return;
-
 	/*
 	 * Track non-empty nodes that contain only shadow entries;
 	 * unlink those that contain pages or are being freed.
@@ -475,7 +469,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 		goto out_invalid;
 	inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
 	__radix_tree_delete_node(&mapping->page_tree, node,
-				 workingset_update_node, mapping);
+				 workingset_lookup_update(mapping));
 
 out_invalid:
 	spin_unlock(&mapping->tree_lock);
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 06c71178d07d..59245b3d587c 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -618,7 +618,7 @@ static void multiorder_account(void)
 	__radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
 
 	__radix_tree_lookup(&tree, 1 << 5, &node, &slot);
 	assert(node->count == node->exceptional * 2);
-	__radix_tree_replace(&tree, node, slot, NULL, NULL, NULL);
+	__radix_tree_replace(&tree, node, slot, NULL, NULL);
 	assert(node->exceptional == 0);
 
 	item_kill_tree(&tree);
-- cgit v1.2.3


From d9ed0d08b6c6a882da1d8e75bb3162fc889fd199 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 15 Nov 2017 17:37:48 -0800
Subject: mm: only drain per-cpu pagevecs once per pagevec usage

When a pagevec is initialised on the stack, it is generally used
multiple times over a range of pages, looking up entries and then
releasing them. On each pagevec_release, the per-cpu deferred LRU
pagevecs are drained on the grounds that the page being released may be
on those queues and the pages may be cache hot. In many cases only the
first drain is necessary as it's unlikely that the range of pages being
walked is racing against LRU addition. Even if there is such a race,
the impact is marginal, whereas constantly redraining the LRU pagevecs
has a cost.

This patch ensures that a pagevec is only drained once in a given
lifecycle without increasing the cache footprint of the pagevec
structure. Only sparsetruncate tiny is shown here as large files have
many exceptional entries and call pagevec_release less frequently.
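As an illustration of the lifecycle being optimised, here is a minimal
hedged sketch (the walker is hypothetical, not taken from the patch).
With the drained flag introduced below, only the first release in the
loop pays for lru_add_drain():

	#include <linux/fs.h>
	#include <linux/pagevec.h>

	/*
	 * Hypothetical walker, for illustration only. pagevec_init()
	 * clears the new drained flag; the first pagevec_release() calls
	 * lru_add_drain() and sets the flag, so later iterations skip the
	 * redundant drain.
	 */
	static void sketch_walk(struct address_space *mapping)
	{
		struct pagevec pvec;
		pgoff_t index = 0;

		pagevec_init(&pvec, 0);
		while (pagevec_lookup(&pvec, mapping, &index)) {
			/* ... inspect pvec.pages[0 .. pagevec_count(&pvec) - 1] ... */
			pagevec_release(&pvec);	/* drains at most once per lifecycle */
		}
	}
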
sparsetruncate (tiny)
                            4.14.0-rc4             4.14.0-rc4
                      batchshadow-v1r1          onedrain-v1r1
Min          Time      141.00 (   0.00%)      141.00 (   0.00%)
1st-qrtle    Time      142.00 (   0.00%)      142.00 (   0.00%)
2nd-qrtle    Time      142.00 (   0.00%)      142.00 (   0.00%)
3rd-qrtle    Time      143.00 (   0.00%)      143.00 (   0.00%)
Max-90%      Time      144.00 (   0.00%)      144.00 (   0.00%)
Max-95%      Time      146.00 (   0.00%)      145.00 (   0.68%)
Max-99%      Time      198.00 (   0.00%)      194.00 (   2.02%)
Max          Time      254.00 (   0.00%)      208.00 (  18.11%)
Amean        Time      145.12 (   0.00%)      144.30 (   0.56%)
Stddev       Time       12.74 (   0.00%)        9.62 (  24.49%)
Coeff        Time        8.78 (   0.00%)        6.67 (  24.06%)
Best99%Amean Time      144.29 (   0.00%)      143.82 (   0.32%)
Best95%Amean Time      142.68 (   0.00%)      142.31 (   0.26%)
Best90%Amean Time      142.52 (   0.00%)      142.19 (   0.24%)
Best75%Amean Time      142.26 (   0.00%)      141.98 (   0.20%)
Best50%Amean Time      141.90 (   0.00%)      141.71 (   0.13%)
Best25%Amean Time      141.80 (   0.00%)      141.43 (   0.26%)

The impact on bonnie is marginal and within the noise because a
significant percentage of the file being truncated has been reclaimed
and consists of shadow entries which reduce the hotness of the
pagevec_release path.

Link: http://lkml.kernel.org/r/20171018075952.10627-5-mgorman@techsingularity.net
Signed-off-by: Mel Gorman
Cc: Andi Kleen
Cc: Dave Chinner
Cc: Dave Hansen
Cc: Jan Kara
Cc: Johannes Weiner
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/pagevec.h | 4 +++-
 mm/swap.c               | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 4f56e0ad9d00..95c75c858d1f 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -17,7 +17,8 @@ struct address_space;
 
 struct pagevec {
 	unsigned long nr;
-	unsigned long cold;
+	bool cold;
+	bool drained;
 	struct page *pages[PAGEVEC_SIZE];
 };
 
@@ -54,6 +55,7 @@ static inline void pagevec_init(struct pagevec *pvec, int cold)
 {
 	pvec->nr = 0;
 	pvec->cold = cold;
+	pvec->drained = false;
 }
 
 static inline void pagevec_reinit(struct pagevec *pvec)
diff --git a/mm/swap.c b/mm/swap.c
index 4edac536fe24..3e564a95ee73 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -833,7 +833,10 @@ EXPORT_SYMBOL(release_pages);
  */
 void __pagevec_release(struct pagevec *pvec)
 {
-	lru_add_drain();
+	if (!pvec->drained) {
+		lru_add_drain();
+		pvec->drained = true;
+	}
 	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
 	pagevec_reinit(pvec);
 }
-- cgit v1.2.3


From 8667982014d6048e0b5e286b6247ff24f48d4cc6 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 15 Nov 2017 17:37:52 -0800
Subject: mm, pagevec: remove cold parameter for pagevecs

Every pagevec_init user claims the pages being released are hot even in
cases where it is unlikely the pages are hot. As no one cares about the
hotness of pages being released to the allocator, just ditch the
parameter.

No performance impact is expected as the overhead is marginal. The
parameter is removed simply because it is a bit stupid to have a useless
parameter copied everywhere.
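For callers, the conversion is mechanical. A minimal illustrative
sketch (the helper is hypothetical; the pattern mirrors the
ceph_release_pages() conversion in the diff below) using the new
one-argument initialiser:

	#include <linux/pagevec.h>

	/*
	 * Hypothetical helper, for illustration only: releasing an array
	 * of pages through a pagevec. Initialisation is now simply
	 * pagevec_init(&pvec); before this patch every caller wrote
	 * pagevec_init(&pvec, 0).
	 */
	static void sketch_release_array(struct page **pages, int n)
	{
		struct pagevec pvec;
		int i;

		pagevec_init(&pvec);
		for (i = 0; i < n; i++)
			if (pagevec_add(&pvec, pages[i]) == 0)
				pagevec_release(&pvec);	/* pagevec full: flush it */
		pagevec_release(&pvec);			/* flush the remainder */
	}
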
Link: http://lkml.kernel.org/r/20171018075952.10627-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +- fs/afs/write.c | 4 ++-- fs/btrfs/extent_io.c | 4 ++-- fs/buffer.c | 4 ++-- fs/cachefiles/rdwr.c | 4 ++-- fs/ceph/addr.c | 4 ++-- fs/dax.c | 2 +- fs/ext4/inode.c | 6 +++--- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 8 ++++---- fs/fscache/page.c | 2 +- fs/gfs2/aops.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/nilfs2/btree.c | 2 +- fs/nilfs2/page.c | 8 ++++---- fs/nilfs2/segment.c | 4 ++-- include/linux/pagevec.h | 4 +--- mm/filemap.c | 2 +- mm/mlock.c | 4 ++-- mm/page-writeback.c | 2 +- mm/shmem.c | 6 +++--- mm/swap.c | 4 ++-- mm/truncate.c | 8 ++++---- 24 files changed, 45 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index ad524cb0f6fc..7982ad817c11 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1859,7 +1859,7 @@ static void i915_address_space_init(struct i915_address_space *vm, INIT_LIST_HEAD(&vm->unbound_list); list_add_tail(&vm->global_link, &dev_priv->vm_list); - pagevec_init(&vm->free_pages, false); + pagevec_init(&vm->free_pages); } static void i915_address_space_fini(struct i915_address_space *vm) diff --git a/fs/afs/write.c b/fs/afs/write.c index d62a6b54152d..11dd0526b96b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -308,7 +308,7 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error, _enter("{%x:%u},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("kill %lx-%lx", first, last); @@ -602,7 +602,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) ASSERT(wb != NULL); - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("done %lx-%lx", first, last); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 93fdaac81212..16045ea86fc1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3797,7 +3797,7 @@ int btree_write_cache_pages(struct address_space *mapping, int scanned = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3936,7 +3936,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; diff --git a/fs/buffer.c b/fs/buffer.c index 1c18a22a6013..0736a6a2e2f0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1592,7 +1592,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { count = pagevec_count(&pvec); for (i = 0; i < count; i++) { @@ -3514,7 +3514,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, if (length <= 0) return -ENOENT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); do { unsigned nr_pages, i; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 18d7aa61ef0f..23097cca2674 100644 --- 
a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -710,7 +710,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; op->op.flags |= FSCACHE_OP_ASYNC; @@ -844,7 +844,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op, ret = cachefiles_has_space(cache, 0, *nr_pages); if (ret == 0) { - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); list_for_each_entry(page, pages, lru) { if (pagevec_add(&pagevec, page) == 0) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 26c682db94ee..dbf07051aacd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -680,7 +680,7 @@ static void ceph_release_pages(struct page **pages, int num) struct pagevec pvec; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); for (i = 0; i < num; i++) { if (pagevec_add(&pvec, pages[i]) == 0) pagevec_release(&pvec); @@ -811,7 +811,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; diff --git a/fs/dax.c b/fs/dax.c index 165fdfb6e508..3652b26a0048 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -794,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, tag_pages_for_writeback(mapping, start_index, end_index); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b239a41dbeeb..8d2b582fb141 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1719,7 +1719,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, ext4_es_remove_extent(inode, start, last - start + 1); } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end) { nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) @@ -2345,7 +2345,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) lblk = start << bpp_bits; pblock = mpd->map.m_pblk; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (start <= end) { nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &start, end); @@ -2616,7 +2616,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) else tag = PAGECACHE_TAG_DIRTY; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6124f8710dc3..0bb8e2c022d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -314,7 +314,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, }; struct blk_plug plug; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); blk_start_plug(&plug); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 687703755824..7b3ad5d8e2e9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1635,7 +1635,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d6e4df0bb622..b33dac9592ca 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1282,7 +1282,7 @@ static struct page 
*last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) struct page *last_page = NULL; int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, @@ -1436,7 +1436,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, @@ -1547,7 +1547,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, int ret = 0; int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next_step: index = 0; @@ -1648,7 +1648,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) int ret2, ret = 0; int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, PAGECACHE_TAG_WRITEBACK))) { diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 0ad3fd3ad0b4..961029e04027 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1175,7 +1175,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, return; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = 0; do { if (!pagevec_lookup(&pvec, mapping, &next)) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 3fea3d7780b0..1daf15a1f00c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -371,7 +371,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6adb17bb1a38..1e76730aac0d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -407,7 +407,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = start; while (next < end) { /* diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 35989c7bb065..16a7a67a11c9 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2156,7 +2156,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY)) { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 1c16726915c1..68241512d7c1 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -255,7 +255,7 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pgoff_t index = 0; int err = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; @@ -309,7 +309,7 @@ void nilfs_copy_back_pages(struct address_space *dmap, pgoff_t index = 0; int err; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: n = pagevec_lookup(&pvec, smap, &index); if (!n) @@ -373,7 +373,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY)) { @@ -518,7 +518,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); nblocks_in_page 
= 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 19366ab20bea..f65392fecb5c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -708,7 +708,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: if (unlikely(index > last) || !pagevec_lookup_range_tag(&pvec, mapping, &index, last, @@ -753,7 +753,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY)) { diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 95c75c858d1f..eebefd209424 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,6 @@ struct address_space; struct pagevec { unsigned long nr; - bool cold; bool drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -51,10 +50,9 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } -static inline void pagevec_init(struct pagevec *pvec, int cold) +static inline void pagevec_init(struct pagevec *pvec) { pvec->nr = 0; - pvec->cold = cold; pvec->drained = false; } diff --git a/mm/filemap.c b/mm/filemap.c index 155370fc87f2..90a9f261f85f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -519,7 +519,7 @@ static void __filemap_fdatawait_range(struct address_space *mapping, if (end_byte < start_byte) return; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end) { unsigned i; diff --git a/mm/mlock.c b/mm/mlock.c index 46af369c13e5..ed37cb208d19 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) struct pagevec pvec_putback; int pgrescued = 0; - pagevec_init(&pvec_putback, 0); + pagevec_init(&pvec_putback); /* Phase 1: page isolation */ spin_lock_irq(zone_lru_lock(zone)); @@ -448,7 +448,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, struct pagevec pvec; struct zone *zone; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 436714917e03..05313f402ba8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2168,7 +2168,7 @@ int write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; diff --git a/mm/shmem.c b/mm/shmem.c index a72f68aee6a4..7ea8b276ba8b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -747,7 +747,7 @@ void shmem_unlock_mapping(struct address_space *mapping) pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. 
*/ @@ -790,7 +790,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end) { pvec.nr = find_get_entries(mapping, index, @@ -2528,7 +2528,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, bool done = false; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); pvec.nr = 1; /* start small: we may be there already */ while (!done) { pvec.nr = find_get_entries(mapping, index, diff --git a/mm/swap.c b/mm/swap.c index 3e564a95ee73..88a19b6cdf7c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, } if (pgdat) spin_unlock_irqrestore(&pgdat->lru_lock, flags); - release_pages(pvec->pages, pvec->nr, pvec->cold); + release_pages(pvec->pages, pvec->nr, 0); pagevec_reinit(pvec); } @@ -837,7 +837,7 @@ void __pagevec_release(struct pagevec *pvec) lru_add_drain(); pvec->drained = true; } - release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); + release_pages(pvec->pages, pagevec_count(pvec), 0); pagevec_reinit(pvec); } EXPORT_SYMBOL(__pagevec_release); diff --git a/mm/truncate.c b/mm/truncate.c index c30e8fa3d063..e4b4cf0f4070 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -330,7 +330,7 @@ void truncate_inode_pages_range(struct address_space *mapping, else end = (lend + 1) >> PAGE_SHIFT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), @@ -342,7 +342,7 @@ void truncate_inode_pages_range(struct address_space *mapping, */ struct pagevec locked_pvec; - pagevec_init(&locked_pvec, 0); + pagevec_init(&locked_pvec); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -553,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { @@ -683,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (mapping->nrpages == 0 && mapping->nrexceptional == 0) goto out; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, -- cgit v1.2.3 From c6f92f9fbe7dbcc8903a67229aa88b4077ae4422 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:55 -0800 Subject: mm: remove cold parameter for release_pages All callers of release_pages claim the pages being released are cache hot. As no one cares about the hotness of pages being released to the allocator, just ditch the parameter. No performance impact is expected as the overhead is marginal. The parameter is removed simply because it is a bit stupid to have a useless parameter copied everywhere. 
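A hedged sketch of a converted call site (the unpin helper is
hypothetical; the pattern mirrors the GPU get_user_pages error paths in
the diff below):

	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/*
	 * Hypothetical helper, for illustration only: dropping the
	 * references on a batch of previously pinned pages. The trailing
	 * "bool cold" argument, which every caller passed as false, is
	 * gone.
	 */
	static void sketch_unpin_pages(struct page **pages, int npages)
	{
		release_pages(pages, npages);	/* was: release_pages(pages, npages, false) */
	}
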
Link: http://lkml.kernel.org/r/20171018075952.10627-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 ++---- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 6 +++--- drivers/gpu/drm/i915/i915_gem_userptr.c | 4 ++-- drivers/gpu/drm/radeon/radeon_ttm.c | 2 +- fs/fuse/dev.c | 2 +- include/linux/pagemap.h | 2 +- include/linux/swap.h | 2 +- mm/swap.c | 8 ++++---- mm/swap_state.c | 2 +- 11 files changed, 18 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 60d8bedb694d..cd664832f9e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -553,8 +553,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, * invalidated it. Free it and try again */ release_pages(e->user_pages, - e->robj->tbo.ttm->num_pages, - false); + e->robj->tbo.ttm->num_pages); kvfree(e->user_pages); e->user_pages = NULL; } @@ -691,8 +690,7 @@ error_free_pages: continue; release_pages(e->user_pages, - e->robj->tbo.ttm->num_pages, - false); + e->robj->tbo.ttm->num_pages); kvfree(e->user_pages); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 6149a47fe63d..0bda8f2a188a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -347,7 +347,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, return 0; free_pages: - release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages, false); + release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages); unlock_mmap_sem: up_read(¤t->mm->mmap_sem); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index bc746131987f..d792959fac43 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -659,7 +659,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) return 0; release_pages: - release_pages(pages, pinned, 0); + release_pages(pages, pinned); return r; } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 57881167ccd2..bcc8c2d7c7c9 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -779,7 +779,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( up_read(&mm->mmap_sem); if (ret < 0) { - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); return ERR_PTR(ret); } @@ -852,7 +852,7 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj) } } - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); work = kmalloc(sizeof(*work), GFP_KERNEL); @@ -886,7 +886,7 @@ static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj) if (etnaviv_obj->pages) { int npages = etnaviv_obj->base.size >> PAGE_SHIFT; - release_pages(etnaviv_obj->pages, npages, 0); + release_pages(etnaviv_obj->pages, npages); kvfree(etnaviv_obj->pages); } put_task_struct(etnaviv_obj->userptr.task); diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 709efe2357ea..aa22361bd5a1 100644 --- 
a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -554,7 +554,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) } mutex_unlock(&obj->mm.lock); - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); i915_gem_object_put(obj); @@ -668,7 +668,7 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) __i915_gem_userptr_set_active(obj, true); if (IS_ERR(pages)) - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); return pages; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index bf69bf9086bf..1fdfc7a46072 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -597,7 +597,7 @@ release_sg: kfree(ttm->sg); release_pages: - release_pages(ttm->pages, pinned, 0); + release_pages(ttm->pages, pinned); return r; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a42d89371748..17f0d05bfd4c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1636,7 +1636,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, false); + release_pages(req->pages, req->num_pages); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e0f7181118fe..4c6790bb7afb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -118,7 +118,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } -void release_pages(struct page **pages, int nr, bool cold); +void release_pages(struct page **pages, int nr); /* * speculatively take a reference to a page. diff --git a/include/linux/swap.h b/include/linux/swap.h index 454f042bcdd5..c2b8128799c1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,7 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) #define free_page_and_swap_cache(page) \ put_page(page) #define free_pages_and_swap_cache(pages, nr) \ - release_pages((pages), (nr), false); + release_pages((pages), (nr)); static inline void show_swap_cache_info(void) { diff --git a/mm/swap.c b/mm/swap.c index 88a19b6cdf7c..29cf75f1a860 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, } if (pgdat) spin_unlock_irqrestore(&pgdat->lru_lock, flags); - release_pages(pvec->pages, pvec->nr, 0); + release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -740,7 +740,7 @@ void lru_add_drain_all(void) * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. 
 */
-void release_pages(struct page **pages, int nr, bool cold)
+void release_pages(struct page **pages, int nr)
 {
 	int i;
 	LIST_HEAD(pages_to_free);
@@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr, bool cold)
 	spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
 
 	mem_cgroup_uncharge_list(&pages_to_free);
-	free_hot_cold_page_list(&pages_to_free, cold);
+	free_hot_cold_page_list(&pages_to_free, 0);
 }
 EXPORT_SYMBOL(release_pages);
 
@@ -837,7 +837,7 @@ void __pagevec_release(struct pagevec *pvec)
 		lru_add_drain();
 		pvec->drained = true;
 	}
-	release_pages(pvec->pages, pagevec_count(pvec), 0);
+	release_pages(pvec->pages, pagevec_count(pvec));
 	pagevec_reinit(pvec);
 }
 EXPORT_SYMBOL(__pagevec_release);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 374d446f7a0a..39ae7cfad90f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
 	lru_add_drain();
 	for (i = 0; i < nr; i++)
 		free_swap_cache(pagep[i]);
-	release_pages(pagep, nr, false);
+	release_pages(pagep, nr);
 }
 
 /*
-- cgit v1.2.3


From 2d4894b5d2ae0fe1725ea7abd57b33bfbbe45492 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 15 Nov 2017 17:37:59 -0800
Subject: mm: remove cold parameter from free_hot_cold_page*

Most callers of free_hot_cold_page claim the pages being released are
cache hot. The exception is the page reclaim paths where it is likely
that enough pages will be freed in the near future that the per-cpu
lists are going to be recycled and the cache hotness information is
lost. As no one really cares about the hotness of pages being released
to the allocator, just ditch the parameter.

The APIs are renamed to indicate that it's no longer about hot/cold
pages. It should also be less confusing as there are subtle differences
between them. __free_pages drops a reference and frees a page when the
refcount reaches zero. free_hot_cold_page handled pages whose refcount
was already zero, which is non-obvious from the name. free_unref_page
should be more obvious.

No performance impact is expected as the overhead is marginal. The
parameter is removed simply because it is a bit stupid to have a useless
parameter copied everywhere.
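To illustrate the contract the new name exposes, here is a hedged
sketch (the helper name is hypothetical; the pattern mirrors the
powerpc page-table hunks below):

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/*
	 * Illustrative sketch: free_unref_page() takes an order-0 page
	 * whose refcount has already dropped to zero, unlike
	 * __free_pages(), which performs the final put itself. The old
	 * name free_hot_cold_page() hid this distinction.
	 */
	static void sketch_free_table_page(struct page *page)
	{
		if (put_page_testzero(page))	/* refcount is zero from here on */
			free_unref_page(page);
	}
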
[mgorman@techsingularity.net: add pages to head, not tail] Link: http://lkml.kernel.org/r/20171019154321.qtpzaeftoyyw4iey@techsingularity.net Link: http://lkml.kernel.org/r/20171018075952.10627-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mmu_context_book3s64.c | 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- arch/sparc/mm/init_64.c | 2 +- arch/tile/mm/homecache.c | 2 +- include/linux/gfp.h | 4 ++-- include/trace/events/kmem.h | 11 ++++------- mm/page_alloc.c | 29 ++++++++++++----------------- mm/rmap.c | 2 +- mm/swap.c | 4 ++-- mm/vmscan.c | 6 +++--- 10 files changed, 28 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 05e15386d4cb..a7e998158f37 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -200,7 +200,7 @@ static void destroy_pagetable_page(struct mm_struct *mm) /* We allow PTE_FRAG_NR fragments from a PTE page */ if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { pgtable_page_dtor(page); - free_hot_cold_page(page, 0); + free_unref_page(page); } } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index ac0717a90ca6..1ec3aee43624 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -404,7 +404,7 @@ void pte_fragment_free(unsigned long *table, int kernel) if (put_page_testzero(page)) { if (!kernel) pgtable_page_dtor(page); - free_hot_cold_page(page, 0); + free_unref_page(page); } } diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 051f73401793..55ba62957e64 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2939,7 +2939,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, if (!page) return NULL; if (!pgtable_page_ctor(page)) { - free_hot_cold_page(page, 0); + free_unref_page(page); return NULL; } return (pte_t *) page_address(page); diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index b51cc28acd0a..4432f31e8479 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c @@ -409,7 +409,7 @@ void __homecache_free_pages(struct page *page, unsigned int order) if (put_page_testzero(page)) { homecache_change_page_home(page, order, PAGE_HOME_HASH); if (order == 0) { - free_hot_cold_page(page, false); + free_unref_page(page); } else { init_page_count(page); __free_pages(page, order); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b041f94678de..f7e62d9096fe 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -530,8 +530,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -extern void free_hot_cold_page(struct page *page, bool cold); -extern void free_hot_cold_page_list(struct list_head *list, bool cold); +extern void free_unref_page(struct page *page); +extern void free_unref_page_list(struct list_head *list); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 285feeadac39..eb57e3037deb 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -172,24 +172,21 @@ TRACE_EVENT(mm_page_free, 
TRACE_EVENT(mm_page_free_batched, - TP_PROTO(struct page *page, int cold), + TP_PROTO(struct page *page), - TP_ARGS(page, cold), + TP_ARGS(page), TP_STRUCT__entry( __field( unsigned long, pfn ) - __field( int, cold ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->cold = cold; ), - TP_printk("page=%p pfn=%lu order=0 cold=%d", + TP_printk("page=%p pfn=%lu order=0", pfn_to_page(__entry->pfn), - __entry->pfn, - __entry->cold) + __entry->pfn) ); TRACE_EVENT(mm_page_alloc, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a3c4a1d513f..f265d37b3152 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2611,7 +2611,7 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -static bool free_hot_cold_page_prepare(struct page *page, unsigned long pfn) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn) { int migratetype; @@ -2623,8 +2623,7 @@ static bool free_hot_cold_page_prepare(struct page *page, unsigned long pfn) return true; } -static void free_hot_cold_page_commit(struct page *page, unsigned long pfn, - bool cold) +static void free_unref_page_commit(struct page *page, unsigned long pfn) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -2649,10 +2648,7 @@ static void free_hot_cold_page_commit(struct page *page, unsigned long pfn, } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (!cold) - list_add(&page->lru, &pcp->lists[migratetype]); - else - list_add_tail(&page->lru, &pcp->lists[migratetype]); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); @@ -2663,25 +2659,24 @@ static void free_hot_cold_page_commit(struct page *page, unsigned long pfn, /* * Free a 0-order page - * cold == true ? free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, bool cold) +void free_unref_page(struct page *page) { unsigned long flags; unsigned long pfn = page_to_pfn(page); - if (!free_hot_cold_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn)) return; local_irq_save(flags); - free_hot_cold_page_commit(page, pfn, cold); + free_unref_page_commit(page, pfn); local_irq_restore(flags); } /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, bool cold) +void free_unref_page_list(struct list_head *list) { struct page *page, *next; unsigned long flags, pfn; @@ -2689,7 +2684,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_hot_cold_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn)) list_del(&page->lru); set_page_private(page, pfn); } @@ -2699,8 +2694,8 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) unsigned long pfn = page_private(page); set_page_private(page, 0); - trace_mm_page_free_batched(page, cold); - free_hot_cold_page_commit(page, pfn, cold); + trace_mm_page_free_batched(page); + free_unref_page_commit(page, pfn); } local_irq_restore(flags); } @@ -4301,7 +4296,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } @@ -4359,7 +4354,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) unsigned int order = compound_order(page); if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, 
order); } diff --git a/mm/rmap.c b/mm/rmap.c index 6b5a0f219ac0..47db27f8049e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1321,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound) * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, + * before us: so leave the reset to free_unref_page, * and remember that it's only reliable while mapped. * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. diff --git a/mm/swap.c b/mm/swap.c index 29cf75f1a860..b480279c760c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -76,7 +76,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, false); + free_unref_page(page); } static void __put_compound_page(struct page *page) @@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr) spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); - free_hot_cold_page_list(&pages_to_free, 0); + free_unref_page_list(&pages_to_free); } EXPORT_SYMBOL(release_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 2852b8c5a917..c02c850ea349 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1349,7 +1349,7 @@ keep: mem_cgroup_uncharge_list(&free_pages); try_to_unmap_flush(); - free_hot_cold_page_list(&free_pages, true); + free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1824,7 +1824,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&page_list); - free_hot_cold_page_list(&page_list, true); + free_unref_page_list(&page_list); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -2063,7 +2063,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); - free_hot_cold_page_list(&l_hold, true); + free_unref_page_list(&l_hold); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } -- cgit v1.2.3 From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:03 -0800 Subject: mm: remove __GFP_COLD As the page free path makes no distinction between cache hot and cold pages, there is no real useful ordering of pages in the free list that allocation requests can take advantage of. Judging from the users of __GFP_COLD, it is likely that a number of them are the result of copying other sites instead of actually measuring the impact. Remove the __GFP_COLD parameter which simplifies a number of paths in the page allocator. This is potentially controversial but bear in mind that the size of the per-cpu pagelists versus modern cache sizes means that the whole per-cpu list can often fit in the L3 cache. Hence, there is only a potential benefit for microbenchmarks that alloc/free pages in a tight loop. It's even worse when THP is taken into account, which has little or no chance of getting a cache-hot page as the per-cpu list is bypassed and the zeroing of multiple pages will thrash the cache anyway. The truncate microbenchmarks are not shown as this patch affects the allocation path and not the free path.
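On the allocation side the flag's only real effect was to choose which end of the per-cpu free list a page came from; a simplified sketch of the old __rmqueue_pcplist() behaviour (the mm/page_alloc.c hunk below collapses both branches to list_first_entry()):

	if (cold)
		/* oldest entry, most likely cache-cold */
		page = list_last_entry(list, struct page, lru);
	else
		/* newest entry, most likely cache-hot */
		page = list_first_entry(list, struct page, lru);
	list_del(&page->lru);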
A page fault microbenchmark was tested but it showed no significant difference, which is not surprising given that the __GFP_COLD branches are a minuscule percentage of the fault path. Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 2 +- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 3 +-- .../net/ethernet/cavium/liquidio/octeon_network.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 5 ++--- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 3 +-- drivers/net/ethernet/sfc/falcon/rx.c | 2 +- drivers/net/ethernet/sfc/rx.c | 2 +- drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- drivers/net/virtio_net.c | 1 - drivers/staging/lustre/lustre/mdc/mdc_request.c | 2 +- fs/cachefiles/rdwr.c | 6 ++---- include/linux/gfp.h | 5 ----- include/linux/pagemap.h | 8 +------- include/linux/skbuff.h | 2 +- include/linux/slab.h | 3 --- include/trace/events/mmflags.h | 1 - kernel/power/snapshot.c | 4 ++-- mm/filemap.c | 6 +++--- mm/page_alloc.c | 20 ++++++-------------- mm/percpu-vm.c | 2 +- net/core/skbuff.c | 4 ++-- tools/perf/builtin-kmem.c | 1 - 25 files changed, 32 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index c6bd5e24005d..fbbbd8b3eb45 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -517,7 +517,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rc = ena_alloc_rx_page(rx_ring, rx_info, - __GFP_COLD | GFP_ATOMIC | __GFP_COMP); + GFP_ATOMIC | __GFP_COMP); if (unlikely(rc < 0)) { netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, "failed to alloc buffer for rx queue %d\n", diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index 45d92304068e..cc1e4f820e64 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -295,7 +295,7 @@ again: order = alloc_order; /* Try to obtain pages, decreasing order if necessary */ - gfp = GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_NOWARN; + gfp = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN; while (order >= 0) { pages = alloc_pages_node(node, gfp, order); if (pages) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 0654e0c76bc2..519ca6534b85 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -304,8 +304,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self) buff->flags = 0U; buff->len = AQ_CFG_RX_FRAME_MAX; - buff->page = alloc_pages(GFP_ATOMIC | __GFP_COLD | - __GFP_COMP, pages_order); + buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order); if (!buff->page) { err = -ENOMEM; goto err_exit; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h index 9e36319cead6..57853eead4b5 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h @@ -195,7 +195,7 @@ static inline void
struct sk_buff *skb; struct octeon_skb_page_info *skb_pg_info; - page = alloc_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_page(GFP_ATOMIC); if (unlikely(!page)) return NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b97a55c827eb..ffead38cf5da 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -193,7 +193,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) if (mlx4_en_prepare_rx_desc(priv, ring, ring->actual_size, - GFP_KERNEL | __GFP_COLD)) { + GFP_KERNEL)) { if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { en_err(priv, "Failed to allocate enough rx buffers\n"); return -ENOMEM; @@ -552,8 +552,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv, do { if (mlx4_en_prepare_rx_desc(priv, ring, ring->prod & ring->size_mask, - GFP_ATOMIC | __GFP_COLD | - __GFP_MEMALLOC)) + GFP_ATOMIC | __GFP_MEMALLOC)) break; ring->prod++; } while (likely(--missing)); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index e118b5f23996..ffb12dc13a5a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1185,7 +1185,7 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr) } else { struct page *page; - page = alloc_page(GFP_KERNEL | __GFP_COLD); + page = alloc_page(GFP_KERNEL); frag = page ? page_address(page) : NULL; } if (!frag) { @@ -1212,7 +1212,7 @@ static void *nfp_net_napi_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr) } else { struct page *page; - page = alloc_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_page(GFP_ATOMIC); frag = page ? page_address(page) : NULL; } if (!frag) { diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 29fea74bff2e..7b97a9969046 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -1092,8 +1092,7 @@ static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring, { if (!rx_ring->pg_chunk.page) { u64 map; - rx_ring->pg_chunk.page = alloc_pages(__GFP_COLD | __GFP_COMP | - GFP_ATOMIC, + rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC, qdev->lbq_buf_order); if (unlikely(!rx_ring->pg_chunk.page)) { netif_err(qdev, drv, qdev->ndev, diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c index 6a8406dc0c2b..91097aea6c41 100644 --- a/drivers/net/ethernet/sfc/falcon/rx.c +++ b/drivers/net/ethernet/sfc/falcon/rx.c @@ -163,7 +163,7 @@ static int ef4_init_rx_buffers(struct ef4_rx_queue *rx_queue, bool atomic) do { page = ef4_reuse_page(rx_queue); if (page == NULL) { - page = alloc_pages(__GFP_COLD | __GFP_COMP | + page = alloc_pages(__GFP_COMP | (atomic ? GFP_ATOMIC : GFP_KERNEL), efx->rx_buffer_order); if (unlikely(page == NULL)) diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 42443f434569..0004c50d3c83 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -163,7 +163,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic) do { page = efx_reuse_page(rx_queue); if (page == NULL) { - page = alloc_pages(__GFP_COLD | __GFP_COMP | + page = alloc_pages(__GFP_COMP | (atomic ? 
GFP_ATOMIC : GFP_KERNEL), efx->rx_buffer_order); if (unlikely(page == NULL)) diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c index e9672b1f9968..031cf9c3435a 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c @@ -335,7 +335,7 @@ static int xlgmac_alloc_pages(struct xlgmac_pdata *pdata, dma_addr_t pages_dma; /* Try to obtain pages, decreasing order if necessary */ - gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN; + gfp |= __GFP_COMP | __GFP_NOWARN; while (order >= 0) { pages = alloc_pages(gfp, order); if (pages) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 437d36289786..50d2b76771b5 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -906,7 +906,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) sw_data[0] = (u32)bufptr; } else { /* Allocate a secondary receive queue entry */ - page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD); + page = alloc_page(GFP_ATOMIC | GFP_DMA); if (unlikely(!page)) { dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n"); goto fail; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 511f8339fa96..5eec09d63fc0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -988,7 +988,6 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, int err; bool oom; - gfp |= __GFP_COLD; do { if (vi->mergeable_rx_bufs) err = add_recvbuf_mergeable(vi, rq, gfp); diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 9e538a59f09d..03e55bca4ada 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -1152,7 +1152,7 @@ static int mdc_read_page_remote(void *data, struct page *page0) } for (npages = 1; npages < max_pages; npages++) { - page = page_cache_alloc_cold(inode->i_mapping); + page = page_cache_alloc(inode->i_mapping); if (!page) break; page_pool[npages] = page; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 23097cca2674..883bc7bb12c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem_monitor; } @@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f7e62d9096fe..1a4582b44d32 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,7 +24,6 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u -#define ___GFP_COLD 0x100u #define ___GFP_NOWARN 0x200u #define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u @@ -192,16 +191,12 @@ struct vm_area_struct; /* * Action modifiers * - * __GFP_COLD indicates that the caller does not expect to be used in the near - * future. Where possible, a cache-cold page will be returned. - * * __GFP_NOWARN suppresses allocation failure reports. 
* * __GFP_COMP address compound page metadata. * * __GFP_ZERO returns a zeroed page on success. */ -#define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4c6790bb7afb..34ce3ebf97d5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -234,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x) return __page_cache_alloc(mapping_gfp_mask(x)); } -static inline struct page *page_cache_alloc_cold(struct address_space *x) -{ - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); -} - static inline gfp_t readahead_gfp_mask(struct address_space *x) { - return mapping_gfp_mask(x) | - __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN; + return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(void *, struct page *); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index aa1341474916..7c46fd0b8b64 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2672,7 +2672,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ - gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC; + gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); } diff --git a/include/linux/slab.h b/include/linux/slab.h index 79f6532f8a0b..50697a1d6621 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -467,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * - * %__GFP_COLD - Request cache-cold pages instead of - * trying to return cache-warm pages. - * * %__GFP_HIGH - This allocation has high priority and may use emergency pools. * * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 72162f3a03fa..dbe1bb058c09 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -32,7 +32,6 @@ {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "__GFP_IO"}, \ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ - {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a917a301e201..bce0464524d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) */ static inline int get_highmem_buffer(int safe_needed) { - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + buffer = get_image_page(GFP_ATOMIC, safe_needed); return buffer ? 
0 : -ENOMEM; } @@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm, while (nr_pages-- > 0) { struct page *page; - page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_image_page(GFP_ATOMIC); if (!page) goto err_out; memory_bm_set_bit(copy_bm, page_to_pfn(page)); diff --git a/mm/filemap.c b/mm/filemap.c index 90a9f261f85f..923fc2ebd74a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2272,7 +2272,7 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc(mapping); if (!page) { error = -ENOMEM; goto out; @@ -2384,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) int ret; do { - page = __page_cache_alloc(gfp_mask|__GFP_COLD); + page = __page_cache_alloc(gfp_mask); if (!page) return -ENOMEM; @@ -2788,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping, repeat: page = find_get_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp | __GFP_COLD); + page = __page_cache_alloc(gfp); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, gfp); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f265d37b3152..370b64d03e3f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2336,7 +2336,7 @@ retry: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, bool cold) + int migratetype) { int i, alloced = 0; @@ -2358,10 +2358,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - if (likely(!cold)) - list_add(&page->lru, list); - else - list_add_tail(&page->lru, list); + list_add(&page->lru, list); list = &page->lru; alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) @@ -2795,7 +2792,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) /* Remove page from the per-cpu list, caller must protect the list */ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, - bool cold, struct per_cpu_pages *pcp, + struct per_cpu_pages *pcp, struct list_head *list) { struct page *page; @@ -2804,16 +2801,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, - migratetype, cold); + migratetype); if (unlikely(list_empty(list))) return NULL; } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - + page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count--; } while (check_new_pcp(page)); @@ -2828,14 +2821,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, { struct per_cpu_pages *pcp; struct list_head *list; - bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; unsigned long flags; local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 15dab691ea70..9158e5a81391 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, static int pcpu_alloc_pages(struct 
pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; unsigned int cpu, tcpu; int i; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6cd057b41f34..9c68555bb906 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -353,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { - return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __netdev_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(netdev_alloc_frag); @@ -366,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) void *napi_alloc_frag(unsigned int fragsz) { - return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(napi_alloc_frag); diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index cbf70738ef5f..ae11e4c3516a 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -641,7 +641,6 @@ static const struct { { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, - { "__GFP_COLD", "CO" }, { "__GFP_NOWARN", "NWR" }, { "__GFP_RETRY_MAYFAIL", "R" }, { "__GFP_NOFAIL", "NF" }, -- cgit v1.2.3 From 7f0b5fb953e750a7410cc96c67a656d79db48bcb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:10 -0800 Subject: mm, pagevec: rename pagevec drained field According to Vlastimil Babka, the drained field in pagevec is potentially misleading because it might be interpreted as draining this pagevec instead of the percpu lru pagevecs. Rename the field for clarity. Link: http://lkml.kernel.org/r/20171019093346.ylahzdpzmoriyf4v@techsingularity.net Signed-off-by: Mel Gorman Suggested-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 4 ++-- mm/swap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index eebefd209424..5fb6580f7f23 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,7 @@ struct address_space; struct pagevec { unsigned long nr; - bool drained; + bool percpu_pvec_drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -53,7 +53,7 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, static inline void pagevec_init(struct pagevec *pvec) { pvec->nr = 0; - pvec->drained = false; + pvec->percpu_pvec_drained = false; } static inline void pagevec_reinit(struct pagevec *pvec) diff --git a/mm/swap.c b/mm/swap.c index b480279c760c..38e1b6374a97 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -833,9 +833,9 @@ EXPORT_SYMBOL(release_pages); */ void __pagevec_release(struct pagevec *pvec) { - if (!pvec->drained) { + if (!pvec->percpu_pvec_drained) { lru_add_drain(); - pvec->drained = true; + pvec->percpu_pvec_drained = true; } release_pages(pvec->pages, pagevec_count(pvec)); pagevec_reinit(pvec); -- cgit v1.2.3 From 4518085e127dff97e74f74a8780d7564e273bec8 Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Wed, 15 Nov 2017 17:38:22 -0800 Subject: mm, sysctl: make NUMA stats configurable This is the second step, which introduces a tunable interface that allows numa stats to be made configurable, for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang.
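The runtime switch is implemented with a static branch, so the disabled case costs only a patched jump; a sketch of the guard used at each numa counter update site (mirroring the mm/mempolicy.c and mm/page_alloc.c hunks below):

	if (!static_branch_likely(&vm_numa_stat_key))
		return;		/* numa stats disabled: skip the counter update */
	/* otherwise fall through to the normal numa counter update */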
========================================================================= When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter updates are ignored. We can see about a *4.8%* (185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and an *8.1%* (343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer (increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench ========================================================================= When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is the system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments that helped improve the original patch. [keescook@chromium.org: make sure mutex is a global static] Link: http://lkml.kernel.org/r/20171107213809.GA4314@beast Link: http://lkml.kernel.org/r/1508290927-8518-1-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Signed-off-by: Kees Cook Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Luis R . Rodriguez" Cc: Kees Cook Cc: Jonathan Corbet Cc: Mel Gorman Cc: Johannes Weiner Cc: Christopher Lameter Cc: Sebastian Andrzej Siewior Cc: Andrey Ryabinin Cc: Tim Chen Cc: Andi Kleen Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 16 ++++++++++ include/linux/vmstat.h | 10 +++++++ kernel/sysctl.c | 9 ++++++ mm/mempolicy.c | 3 ++ mm/page_alloc.c | 6 ++++ mm/vmstat.c | 71 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 115 insertions(+) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3e579740b49f..055c8b3e1018 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - stat_refresh +- numa_stat - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -799,6 +800,21 @@ with no ill effects: errors and warnings on these stats are suppressed.) ============================================================== +numa_stat + +This interface allows runtime configuration of numa statistics.
+ +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo 0 > /proc/sys/vm/numa_stat + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo 1 > /proc/sys/vm/numa_stat + +============================================================== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1e0cb72e0598..1779c9817b39 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -7,9 +7,19 @@ #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> +#include <linux/static_key.h> extern int sysctl_stat_interval; +#ifdef CONFIG_NUMA +#define ENABLE_NUMA_STAT 1 +#define DISABLE_NUMA_STAT 0 +extern int sysctl_vm_numa_stat; +DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); +extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7638e2f7fff8..4a13a389e99b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1356,6 +1356,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "hugetlb_shm_group", diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dad166b736ba..4ce44d3ff03d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1915,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, struct page *page; page = __alloc_pages(gfp, order, nid); + /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return page; if (page && page_to_nid(page) == nid) { preempt_disable(); __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ca668e946e5..67f523c4711a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -82,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); #endif +DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); + #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -2777,6 +2779,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; + /* skip numa counters update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return; + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; diff --git a/mm/vmstat.c b/mm/vmstat.c index 7d11554861e4..40b2db6db6b1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -32,6 +32,77 @@ #define NUMA_STATS_THRESHOLD (U16_MAX - 2) +#ifdef CONFIG_NUMA +int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; + +/* zero numa counters within a zone */ +static void zero_zone_numa_counters(struct zone *zone) +{ + int item, cpu; + + for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { + atomic_long_set(&zone->vm_numa_stat[item], 0); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] + = 0; + } +} + +/* zero numa counters of all the populated zones */ +static void zero_zones_numa_counters(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zero_zone_numa_counters(zone); +} + +/* zero global numa counters */ +static void zero_global_numa_counters(void) +{ + int item; + + for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) + atomic_long_set(&vm_numa_stat[item], 0); +} + +static void invalid_numa_statistics(void) +{ + zero_zones_numa_counters(); + zero_global_numa_counters(); +} + +static DEFINE_MUTEX(vm_numa_stat_lock); + +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret, oldval; + + mutex_lock(&vm_numa_stat_lock); + if (write) + oldval = sysctl_vm_numa_stat; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + goto out; + + if (oldval == sysctl_vm_numa_stat) + goto out; + else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) { + static_branch_enable(&vm_numa_stat_key); + pr_info("enable numa statistics\n"); + } else { + static_branch_disable(&vm_numa_stat_key); + invalid_numa_statistics(); + pr_info("disable numa statistics, and clear numa counters\n"); + } + +out: + mutex_unlock(&vm_numa_stat_lock); + return ret; +} +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); -- cgit v1.2.3 From d135e5750205a21a212a19dbb05aeb339e2cbea7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:38:41 -0800 Subject: mm/page_alloc.c: broken deferred calculation In reset_deferred_meminit() we determine the number of pages that must not be deferred. We initialize pages for at least 2G of memory, but also pages for reserved memory in this node. The reserved memory is determined in this function: memblock_reserved_memory_within(), which operates over physical addresses and returns a size in bytes. However, reset_deferred_meminit() assumes that this function operates with pfns and returns a page count. The result is that in the best case the machine boots slower than expected due to initializing more pages than needed in a single thread, and in the worst case it panics because fewer pages than needed are initialized early.
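The fix converts explicitly between page frame numbers and the byte-based memblock interface; a condensed sketch of the corrected arithmetic (taken from the mm/page_alloc.c hunk below):

	start_addr = PFN_PHYS(pgdat->node_start_pfn);	/* pfn -> bytes */
	end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
	reserved = memblock_reserved_memory_within(start_addr, end_addr);
	max_pgcnt += PHYS_PFN(reserved);		/* bytes -> pages */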
Link: http://lkml.kernel.org/r/20171021011707.15191-1-pasha.tatashin@oracle.com Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing") Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 27 ++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39ccaa9c428b..67f2e3c38939 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -700,7 +700,8 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; - unsigned long static_init_size; + /* Number of non-deferred pages */ + unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd1a686e40fe..8f2b9ad2e23f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -291,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized during early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it to ULONG_MAX. + */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -339,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; -- cgit v1.2.3 From 2bce774e8245e95db81872ec39522cde8b486fc8 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Wed, 15 Nov 2017 17:39:03 -0800 Subject: writeback: remove unused function parameter The parameter `struct bdi_writeback *wb` is not used in the function body. Remove it.
Link: http://lkml.kernel.org/r/1509685485-15278-1-git-send-email-wanglong19@meituan.com Signed-off-by: Wang Long Reviewed-by: Jan Kara Acked-by: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- mm/page-writeback.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 7360e79e9a2f..e54e7e0033eb 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -93,7 +93,7 @@ extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long wb_stat_error(struct bdi_writeback *wb) +static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 05313f402ba8..8a1551154285 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1545,7 +1545,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * actually dirty; with m+n sitting in the percpu * deltas. */ - if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { + if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { @@ -1803,7 +1803,7 @@ pause: * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ - if (sdtc->wb_dirty <= wb_stat_error(wb)) + if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) -- cgit v1.2.3 From 0205f75571e3a70c35f0dd5e608773cce97d9dbb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:39:14 -0800 Subject: mm: simplify nodemask printing warn_alloc() and dump_header() have to explicitly handle a NULL nodemask, which forces both paths to use pr_cont. We can do better. printk already handles NULL pointers properly, so all we need is to teach nodemask_pr_args to handle a NULL nodemask carefully. This allows simplification of both warn_alloc() and dump_header() and gets rid of pr_cont altogether. This patch has been motivated by a patch from Joe Perches http://lkml.kernel.org/r/b31236dfe3fc924054fd7842bde678e71d193638.1509991345.git.joe@perches.com [akpm@linux-foundation.org: fix tile warning, per Arnd] Link: http://lkml.kernel.org/r/20171109100531.3cn2hcqnuj7mjaju@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Joe Perches Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 4 +++- mm/oom_kill.c | 12 ++++-------- mm/page_alloc.c | 12 +++--------- 3 files changed, 10 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index de1c50b93c61..15cab3967d6d 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -104,7 +104,9 @@ extern nodemask_t _unused_nodemask_arg_; * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ -#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits +#define nodemask_pr_args(maskp) \ + ((maskp) != NULL) ? MAX_NUMNODES : 0, \ + ((maskp) != NULL) ?
(maskp)->bits : NULL /* * The inline keyword gives the compiler room to decide to inline, or diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a501a0a1f0f8..c86fbd1b590e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -426,14 +426,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", - current->comm, oc->gfp_mask, &oc->gfp_mask); - if (oc->nodemask) - pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); - else - pr_cont("(null)"); - pr_cont(", order=%d, oom_score_adj=%hd\n", - oc->order, current->signal->oom_score_adj); + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, + nodemask_pr_args(oc->nodemask), oc->order, + current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f2b9ad2e23f..7a199767dcee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3279,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; - pr_warn("%s: ", current->comm); - va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_cont("%pV", &vaf); + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + current->comm, &vaf, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); va_end(args); - pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); - if (nodemask) - pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); - else - pr_cont("(null)\n"); - cpuset_print_current_mems_allowed(); dump_stack(); -- cgit v1.2.3 From 2f62b5aa4814be2c511553fd6afb4d35b6c2503b Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 12:53:37 +0300 Subject: fs, nfs: convert nfs_lock_context.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to a use-after-free situation and be exploitable. The variable nfs_lock_context.count is used as a pure reference counter. Convert it to refcount_t and fix up the operations.
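The resulting pattern (a sketch mirroring the fs/nfs/inode.c hunks below):

	refcount_set(&l_ctx->count, 1);		/* one initial holder */
	refcount_inc(&l_ctx->count);		/* take another reference */
	if (refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) {
		/* last reference dropped: unlink and free under the lock */
		list_del(&l_ctx->list);
		spin_unlock(&inode->i_lock);
	}

Unlike atomic_t, refcount_t saturates on overflow and warns on increment-from-zero, turning a potential use-after-free into a loud but contained failure.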
Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 12 ++++++------ include/linux/nfs_fs.h | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 134d9f560240..52a60e399a2d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -783,7 +783,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { - atomic_set(&l_ctx->count, 1); + refcount_set(&l_ctx->count, 1); l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); @@ -797,7 +797,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context do { if (pos->lockowner != current->files) continue; - atomic_inc(&pos->count); + refcount_inc(&pos->count); return pos; } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); return NULL; @@ -836,7 +836,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) struct nfs_open_context *ctx = l_ctx->open_context; struct inode *inode = d_inode(ctx->dentry); - if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; list_del(&l_ctx->list); spin_unlock(&inode->i_lock); @@ -913,7 +913,7 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->lock_context.count); + refcount_inc(&ctx->lock_context.count); return ctx; } EXPORT_SYMBOL_GPL(get_nfs_open_context); @@ -924,11 +924,11 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) struct super_block *sb = ctx->dentry->d_sb; if (!list_empty(&ctx->list)) { - if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - } else if (!atomic_dec_and_test(&ctx->lock_context.count)) + } else if (!refcount_dec_and_test(&ctx->lock_context.count)) return; if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 453f491a5fda..e6706913e6c2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rbtree.h> +#include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/wait.h> @@ -55,7 +56,7 @@ struct nfs_access_entry { }; struct nfs_lock_context { - atomic_t count; + refcount_t count; struct list_head list; struct nfs_open_context *open_context; fl_owner_t lockowner; -- cgit v1.2.3 From 212bf41d88c06afc23e03f9b274eebf1e8dba197 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 12:53:38 +0300 Subject: fs, nfs: convert nfs_client.cl_count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to a use-after-free situation and be exploitable. The variable nfs_client.cl_count is used as a pure reference counter.
Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 10 +++++----- fs/nfs/filelayout/filelayout.c | 12 ++++++------ fs/nfs/flexfilelayout/flexfilelayout.c | 12 ++++++------ fs/nfs/nfs4client.c | 10 +++++----- fs/nfs/nfs4proc.c | 12 ++++++------ fs/nfs/nfs4state.c | 6 +++--- include/linux/nfs_fs_sb.h | 3 ++- 7 files changed, 33 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 22880ef6d8dd..0ac2fb1c6b63 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -163,7 +163,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; - atomic_set(&clp->cl_count, 1); + refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); @@ -269,7 +269,7 @@ void nfs_put_client(struct nfs_client *clp) nn = net_generic(clp->cl_net, nfs_net_id); - if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { + if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); @@ -314,7 +314,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat sap)) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); return clp; } return NULL; @@ -1006,7 +1006,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; - atomic_inc(&server->nfs_client->cl_count); + refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; @@ -1166,7 +1166,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v) clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), - atomic_read(&clp->cl_count), + refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 508126eb49f9..4e54d8b5413a 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -471,10 +471,10 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) return PNFS_NOT_ATTEMPTED; dprintk("%s USE DS: %s cl_count %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); /* No multipath support. 
Use first DS */ - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -515,10 +515,10 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); hdr->pgio_done_cb = filelayout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -1064,9 +1064,9 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; dprintk("%s ino %lu, how %d cl_count %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count)); data->commit_done_cb = filelayout_commit_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index ff55a0a1b2e8..c75ad982bcfc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1726,10 +1726,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) vers = nfs4_ff_layout_ds_version(lseg, idx); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_read_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) @@ -1785,11 +1785,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); @@ -1863,11 +1863,11 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) vers = nfs4_ff_layout_ds_version(lseg, idx); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), vers); data->commit_done_cb = ff_layout_commit_done_cb; data->cred = ds_cred; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index e9bea90dc017..31b5bc0f10a4 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -483,7 +483,7 @@ static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, * ID and serverowner fields. Wait for CREATE_SESSION * to finish. 
*/ if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(*prev); @@ -559,7 +559,7 @@ int nfs40_walk_client_list(struct nfs_client *new, * way that a SETCLIENTID_CONFIRM to pos can succeed is * if new and pos point to the same server: */ - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(prev); @@ -715,7 +715,7 @@ int nfs41_walk_client_list(struct nfs_client *new, continue; found: - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); *result = pos; status = 0; break; @@ -749,7 +749,7 @@ nfs4_find_client_ident(struct net *net, int cb_ident) spin_lock(&nn->nfs_client_lock); clp = idr_find(&nn->cb_ident_idr, cb_ident); if (clp) - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } @@ -804,7 +804,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, sid->data, NFS4_MAX_SESSIONID_LEN) != 0) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index be8c75a2cbbe..82e5ed2ee6ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4870,7 +4870,7 @@ static void nfs4_renew_release(void *calldata) struct nfs4_renewdata *data = calldata; struct nfs_client *clp = data->client; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(data); @@ -4918,7 +4918,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (renew_flags == 0) return 0; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); if (data == NULL) { @@ -7499,7 +7499,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, struct nfs41_exchange_id_data *calldata; int status; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return ERR_PTR(-EIO); status = -ENOMEM; @@ -8099,7 +8099,7 @@ static void nfs41_sequence_release(void *data) struct nfs4_sequence_data *calldata = data; struct nfs_client *clp = calldata->clp; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(calldata); @@ -8128,7 +8128,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) trace_nfs4_sequence(clp, task->tk_status); if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); - if (atomic_read(&clp->cl_count) == 1) + if (refcount_read(&clp->cl_count) == 1) goto out; if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { @@ -8179,7 +8179,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_task *ret; ret = ERR_PTR(-EIO); - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) goto out_err; ret = ERR_PTR(-ENOMEM); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1887134d5231..3bd79b8c016b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1177,7 +1177,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; __module_get(THIS_MODULE); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); /* The rcu_read_lock() is not strictly necessary, as the 
state * manager is the only thread that ever changes the rpc_xprt @@ -1269,7 +1269,7 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, nfs_wait_bit_killable, TASK_KILLABLE); if (res) @@ -2510,7 +2510,7 @@ static void nfs4_state_manager(struct nfs_client *clp) break; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) break; - } while (atomic_read(&clp->cl_count) > 1); + } while (refcount_read(&clp->cl_count) > 1); return; out_error: if (strlen(section)) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 74c44665e6d3..efcfe9ded9ea 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -9,6 +9,7 @@ #include #include +#include struct nfs4_session; struct nfs_iostats; @@ -24,7 +25,7 @@ struct nfs41_impl_id; * The nfs_client identifies our client state to the server. */ struct nfs_client { - atomic_t cl_count; + refcount_t cl_count; atomic_t cl_mds_count; int cl_cons_state; /* current construction state (-ve: init error) */ #define NFS_CS_READY 0 /* ready to be used */ -- cgit v1.2.3 From 2232df5ece121fd7049ccff95cbb3acfab278d75 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Oct 2017 16:21:57 -0400 Subject: rpcrdma: Remove C structure definitions of XDR data items Clean up: C-structure style XDR encoding and decoding logic has been replaced over the past several merge windows on both the client and server. These data structures are no longer used. Signed-off-by: Chuck Lever Reviewed-by: Devesh Sharma Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rpc_rdma.h | 59 ----------------------------------------- net/sunrpc/xprtrdma/rpc_rdma.c | 6 ++--- net/sunrpc/xprtrdma/xprt_rdma.h | 6 ----- 3 files changed, 3 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index b7e85b341a54..840afac16272 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -50,65 +50,6 @@ enum { RPCRDMA_V1_DEF_INLINE_SIZE = 1024, }; -struct rpcrdma_segment { - __be32 rs_handle; /* Registered memory handle */ - __be32 rs_length; /* Length of the chunk in bytes */ - __be64 rs_offset; /* Chunk virtual address or offset */ -}; - -/* - * read chunk(s), encoded as a linked list. - */ -struct rpcrdma_read_chunk { - __be32 rc_discrim; /* 1 indicates presence */ - __be32 rc_position; /* Position in XDR stream */ - struct rpcrdma_segment rc_target; -}; - -/* - * write chunk, and reply chunk. - */ -struct rpcrdma_write_chunk { - struct rpcrdma_segment wc_target; -}; - -/* - * write chunk(s), encoded as a counted array. 
- */ -struct rpcrdma_write_array { - __be32 wc_discrim; /* 1 indicates presence */ - __be32 wc_nchunks; /* Array count */ - struct rpcrdma_write_chunk wc_array[0]; -}; - -struct rpcrdma_msg { - __be32 rm_xid; /* Mirrors the RPC header xid */ - __be32 rm_vers; /* Version of this protocol */ - __be32 rm_credit; /* Buffers requested/granted */ - __be32 rm_type; /* Type of message (enum rpcrdma_proc) */ - union { - - struct { /* no chunks */ - __be32 rm_empty[3]; /* 3 empty chunk lists */ - } rm_nochunks; - - struct { /* no chunks and padded */ - __be32 rm_align; /* Padding alignment */ - __be32 rm_thresh; /* Padding threshold */ - __be32 rm_pempty[3]; /* 3 empty chunk lists */ - } rm_padded; - - struct { - __be32 rm_err; - __be32 rm_vers_low; - __be32 rm_vers_high; - } rm_error; - - __be32 rm_chunks[0]; /* read, write and reply chunks */ - - } rm_body; -}; - /* * XDR sizes, in quads */ diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4fdeaac6ebe6..45cb5497b37f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -75,11 +75,11 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) /* Maximum Read list size */ maxsegs += 2; /* segment for head and tail buffers */ - size = maxsegs * sizeof(struct rpcrdma_read_chunk); + size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ size += sizeof(__be32); /* segment count */ - size += sizeof(struct rpcrdma_segment); + size += rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ dprintk("RPC: %s: max call header size = %u\n", @@ -102,7 +102,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /* Maximum Write list size */ maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ - size += maxsegs * sizeof(struct rpcrdma_segment); + size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ dprintk("RPC: %s: max reply header size = %u\n", diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 6e64c8259d34..8b9954c41ee4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -145,12 +145,6 @@ rdmab_lkey(struct rpcrdma_regbuf *rb) return rb->rg_iov.lkey; } -static inline struct rpcrdma_msg * -rdmab_to_msg(struct rpcrdma_regbuf *rb) -{ - return (struct rpcrdma_msg *)rb->rg_base; -} - static inline struct ib_device * rdmab_device(struct rpcrdma_regbuf *rb) { -- cgit v1.2.3 From 62b56a675565a2e40f2cdf50455977448fd87413 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Oct 2017 16:22:14 -0400 Subject: xprtrdma: Update copyright notices Credit work contributed by Oracle engineers since 2014. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rpc_rdma.h | 1 + net/sunrpc/xprtrdma/rpc_rdma.c | 1 + net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtrdma/verbs.c | 1 + net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 5 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 840afac16272..8f144db73e38 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2015-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 45cb5497b37f..ed34dc0f144c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9fdd11e4758c..646c24494ea7 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index be61c29432d0..710b3f77db82 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8b9954c41ee4..51686d9eac5f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two -- cgit v1.2.3 From 1334be3657dd02af0591d6d8adf0e6a60a7710a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:26:12 -0800 Subject: mm: fix nodemask printing The cleanup caused build warnings for constant mask pointers: mm/mempolicy.c: In function `mpol_to_str': ./include/linux/nodemask.h:108:11: warning: the comparison will always evaluate as `true' for the address of `nodes' will never be NULL [-Waddress] An earlier workaround I suggested was incorporated in the version that got merged, but that only solved the problem for gcc-7 and higher, while gcc-4.6 through gcc-6.x still warn. This changes the printing again to use inline functions that make it clear to the compiler that the line that does the NULL check has no idea whether the argument is a constant NULL. Link: http://lkml.kernel.org/r/20171117101545.119689-1-arnd@arndb.de Fixes: 0205f75571e3 ("mm: simplify nodemask printing") Signed-off-by: Arnd Bergmann Cc: Michal Hocko Cc: Stephen Rothwell Cc: Zhangshaokun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 15cab3967d6d..1fbde8a880d9 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -104,9 +104,16 @@ extern nodemask_t _unused_nodemask_arg_; * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ -#define nodemask_pr_args(maskp) \ - ((maskp) != NULL) ? MAX_NUMNODES : 0, \ - ((maskp) != NULL) ? (maskp)->bits : NULL +#define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ + __nodemask_pr_bits(maskp) +static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) +{ + return m ? 
MAX_NUMNODES : 0; +} +static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) +{ + return m ? m->bits : NULL; +} /* * The inline keyword gives the compiler room to decide to inline, or -- cgit v1.2.3 From 21dc7e023611fbcf8e38f255731bcf3cc38e7638 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 17 Nov 2017 15:26:30 -0800 Subject: mm, compaction: persistently skip hugetlbfs pageblocks It is pointless to migrate hugetlb memory as part of memory compaction if the hugetlb size is equal to the pageblock order. No defragmentation is occurring in this condition. It is also pointless for the freeing scanner to scan a pageblock where a hugetlb page is pinned. Unconditionally skip these pageblocks, and do so persistently so that they are not rescanned until it is observed that these hugepages are no longer pinned. It would also be possible to do this by involving the hugetlb subsystem in marking pageblocks to no longer be skipped when the hugetlb pages are freed. This is a simple solution that doesn't involve any additional subsystems in pageblock skip manipulation. [rientjes@google.com: fix build] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708201734390.117182@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708151639130.106658@chino.kir.corp.google.com Signed-off-by: David Rientjes Tested-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pageblock-flags.h | 11 ++++++++ mm/compaction.c | 56 ++++++++++++++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e942558b3585..9132c5cb41f1 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -96,6 +96,17 @@ void set_pfnblock_flags_mask(struct page *page, #define set_pageblock_skip(page) \ set_pageblock_flags_group(page, 1, PB_migrate_skip, \ PB_migrate_skip) +#else +static inline bool get_pageblock_skip(struct page *page) +{ + return false; +} +static inline void clear_pageblock_skip(struct page *page) +{ +} +static inline void set_pageblock_skip(struct page *page) +{ +} #endif /* CONFIG_COMPACTION */ #endif /* PAGEBLOCK_FLAGS_H */ diff --git a/mm/compaction.c b/mm/compaction.c index ad40d67421f3..94b5c0865dd1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -218,6 +218,20 @@ static void reset_cached_positions(struct zone *zone) pageblock_start_pfn(zone_end_pfn(zone) - 1); } +/* + * Hugetlbfs pages should consistently be skipped until updated by the hugetlb + * subsystem. It is always pointless to compact pages of pageblock_order and + * the free scanner can reconsider when no longer huge.
+ */ +static bool pageblock_skip_persistent(struct page *page, unsigned int order) +{ + if (!PageHuge(page)) + return false; + if (order != pageblock_order) + return false; + return true; +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -242,6 +256,8 @@ static void __reset_isolation_suitable(struct zone *zone) continue; if (zone != page_zone(page)) continue; + if (pageblock_skip_persistent(page, compound_order(page))) + continue; clear_pageblock_skip(page); } @@ -307,7 +323,13 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static void update_pageblock_skip(struct compact_control *cc, +static inline bool pageblock_skip_persistent(struct page *page, + unsigned int order) +{ + return false; +} + +static inline void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, bool migrate_scanner) { @@ -449,13 +471,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * and the only danger is skipping too much. */ if (PageCompound(page)) { - unsigned int comp_order = compound_order(page); - - if (likely(comp_order < MAX_ORDER)) { - blockpfn += (1UL << comp_order) - 1; - cursor += (1UL << comp_order) - 1; + const unsigned int order = compound_order(page); + + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + blockpfn = end_pfn; + } else if (likely(order < MAX_ORDER)) { + blockpfn += (1UL << order) - 1; + cursor += (1UL << order) - 1; } - goto isolate_fail; } @@ -772,11 +796,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * danger is skipping too much. */ if (PageCompound(page)) { - unsigned int comp_order = compound_order(page); - - if (likely(comp_order < MAX_ORDER)) - low_pfn += (1UL << comp_order) - 1; + const unsigned int order = compound_order(page); + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + low_pfn = end_pfn; + } else if (likely(order < MAX_ORDER)) + low_pfn += (1UL << order) - 1; goto isolate_fail; } @@ -838,7 +864,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * is safe to read and it's 0 for tail pages. */ if (unlikely(PageCompound(page))) { - low_pfn += (1UL << compound_order(page)) - 1; + const unsigned int order = compound_order(page); + + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + low_pfn = end_pfn; + } else + low_pfn += (1UL << order) - 1; goto isolate_fail; } } -- cgit v1.2.3 From aaf5dcfb223617ac2d16113e4b500199c65689de Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:06 -0800 Subject: kernel debug: support resetting WARN_ONCE for all architectures Some architectures store the WARN_ONCE state in the flags field of the bug_entry. Clear that one too when resetting once state through /sys/kernel/debug/clear_warn_once. Pointed out by Michael Ellerman. Improves the earlier patch that added clear_warn_once.
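To see why two clearing steps are needed, consider a minimal sketch of the generic once mechanism; the macro and section names here are assumptions for illustration, not the kernel's exact definitions. The once state is a static flag placed in a dedicated linker section, which is why the memset() over __start_once..__end_once in the patch below can re-arm every such warning in one pass:

	/* Sketch only: a once-guard whose flag lives in a dedicated section,
	 * so a write to clear_warn_once can memset the section to re-arm it.
	 * The section name ".data.once" and macro name are assumptions. */
	#define my_warn_once(fmt, ...)                                          \
	({                                                                      \
		static bool __warned __attribute__((section(".data.once")));   \
		if (!__warned) {                                                \
			__warned = true;                                        \
			printk(fmt, ##__VA_ARGS__);                             \
		}                                                               \
	})

Architectures with GENERIC_BUG instead keep the warned state in bug_entry::flags, which is what the new generic_bug_clear_once() below resets by clearing BUGFLAG_DONE.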
[ak@linux.intel.com: add a missing ifdef CONFIG_MODULES] Link: http://lkml.kernel.org/r/20171020170633.9593-1-andi@firstfloor.org [akpm@linux-foundation.org: fix unused var warning] [akpm@linux-foundation.org: Use 0200 for clear_warn_once file, per mpe] [akpm@linux-foundation.org: clear BUGFLAG_DONE in clear_once_table(), per mpe] Link: http://lkml.kernel.org/r/20171019204642.7404-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 5 +++++ kernel/panic.c | 3 ++- lib/bug.c | 23 +++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bug.h b/include/linux/bug.h index da4231c905c8..fe5916550da8 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -43,6 +43,8 @@ enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); +void generic_bug_clear_once(void); + #else /* !CONFIG_GENERIC_BUG */ static inline enum bug_trap_type report_bug(unsigned long bug_addr, @@ -51,6 +53,9 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, return BUG_TRAP_TYPE_BUG; } + +static inline void generic_bug_clear_once(void) {} + #endif /* CONFIG_GENERIC_BUG */ /* diff --git a/kernel/panic.c b/kernel/panic.c index 672a91dc20fe..67cebf2a3b67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -595,6 +595,7 @@ EXPORT_SYMBOL(warn_slowpath_null); static int clear_warn_once_set(void *data, u64 val) { + generic_bug_clear_once(); memset(__start_once, 0, __end_once - __start_once); return 0; } @@ -607,7 +608,7 @@ DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0644, NULL, + debugfs_create_file("clear_warn_once", 0200, NULL, NULL, &clear_warn_once_fops); return 0; } diff --git a/lib/bug.c b/lib/bug.c index 1e094408c893..f66be6bf6206 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -196,3 +196,26 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_BUG; } + +static void clear_once_table(struct bug_entry *start, struct bug_entry *end) +{ + struct bug_entry *bug; + + for (bug = start; bug < end; bug++) + bug->flags &= ~BUGFLAG_DONE; +} + +void generic_bug_clear_once(void) +{ +#ifdef CONFIG_MODULES + struct module *mod; + + rcu_read_lock_sched(); + list_for_each_entry_rcu(mod, &module_bug_list, bug_list) + clear_once_table(mod->bug_table, + mod->bug_table + mod->num_bugs); + rcu_read_unlock_sched(); +#endif + + clear_once_table(__start___bug_table, __stop___bug_table); +} -- cgit v1.2.3 From d15809f3649e2ee04713a9dba9aa7bd2c208ad82 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:27:13 -0800 Subject: iopoll: avoid -Wint-in-bool-context warning When we pass the result of a multiplication as the timeout or the delay, we can get a warning from gcc-7: drivers/mmc/host/bcm2835.c:596:149: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/mfd/arizona-core.c:247:195: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c:49:27: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] The warning is a bit questionable inside of a macro, but this is intentional on the side of the gcc developers. 
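To make the warning concrete, here is the sort of call site that triggers it; the register offset, bit and interval variable are invented for illustration, while readx_poll_timeout() is the macro being patched:

	/* Hypothetical call site: both arguments are products, so the old
	 * macro's "if (timeout_us && ...)" test placed a '*' expression in
	 * boolean context. Offsets and names are invented. */
	static int example_wait_ready(void __iomem *base,
				      unsigned long poll_interval_us)
	{
		u32 val;

		return readx_poll_timeout(readl, base + 0x04 /* status */, val,
					  val & BIT(0) /* ready */,
					  2 * poll_interval_us, /* sleep_us */
					  10 * USEC_PER_MSEC);  /* timeout_us */
	}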
It is also an indication of another problem: we evaluate the timeout and sleep arguments multiple times, which can have undesired side-effects when those are complex expressions. This changes the two iopoll variants to use local variables for storing copies of the timeouts. This adds some more type safety, and avoids both the double-evaluation and the gcc warning. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81484 Link: http://lkml.kernel.org/r/20170726133756.2161367-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20171102114048.1526955-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/iopoll.h | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index d29e1e21bf3f..b1d861caca16 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -42,18 +42,21 @@ */ #define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ - might_sleep_if(sleep_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __sleep_us = (sleep_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ + might_sleep_if((__sleep_us) != 0); \ for (;;) { \ (val) = op(addr); \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + if (__timeout_us && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ (val) = op(addr); \ break; \ } \ - if (sleep_us) \ - usleep_range((sleep_us >> 2) + 1, sleep_us); \ + if (__sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ } \ (cond) ? 0 : -ETIMEDOUT; \ }) @@ -77,17 +80,20 @@ */ #define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __delay_us = (delay_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ for (;;) { \ (val) = op(addr); \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + if (__timeout_us && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ (val) = op(addr); \ break; \ } \ - if (delay_us) \ - udelay(delay_us); \ + if (__delay_us) \ + udelay(__delay_us); \ } \ (cond) ? 0 : -ETIMEDOUT; \ }) -- cgit v1.2.3 From 4ca59b14e588f873795a11cdc77a25c686a29d23 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 17 Nov 2017 15:27:28 -0800 Subject: include/linux/compiler-clang.h: handle randomizable anonymous structs The GCC randomize layout plugin can randomize the member offsets of sensitive kernel data structures. To use this feature, certain annotations and members are added to the structures which affect the member offsets even if this plugin is not used. All of these structures are completely randomized, except for task_struct which leaves out some of its members. All the other members are wrapped within an anonymous struct with the __randomize_layout attribute. This is done using the randomized_struct_fields_start and randomized_struct_fields_end defines. When the plugin is disabled, the behaviour of this attribute can vary based on the GCC version. For GCC 5.1+, this attribute maps to __designated_init otherwise it is just an empty define but the anonymous structure is still present. 
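As a simplified illustration of the layout problem (field names arbitrary), wrapping members between the two markers produces an anonymous struct only when the markers expand to something:

	/* Illustrative structure using the markers defined below for Clang. */
	struct example {
		int a;
		randomized_struct_fields_start
		pid_t pid;
		char comm[16];
		randomized_struct_fields_end
	};
	/* With the new defines, Clang now sees
	 *   struct example { int a; struct { pid_t pid; char comm[16]; }; };
	 * so offsetof(struct example, pid) matches what a GCC build without
	 * the plugin computes, instead of omitting the anonymous struct. */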
For other compilers, both randomized_struct_fields_start and randomized_struct_fields_end default to empty defines, meaning the anonymous structure is not introduced at all. So, if a module compiled with Clang, such as a BPF program, needs to access task_struct fields such as pid and comm, the offsets of these members as recognized by Clang are different from those recognized by modules compiled with GCC. If GCC 4.6+ is used to build the kernel, this can be solved by introducing appropriate defines for Clang so that the anonymous structure is seen when determining the offsets for the members. Link: http://lkml.kernel.org/r/20171109064645.25581-1-sandipan@linux.vnet.ibm.com Signed-off-by: Sandipan Das Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Kate Stewart Cc: Thomas Gleixner Cc: Naveen N. Rao Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index a06583e41f80..3b609edffa8f 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -16,3 +16,6 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +#define randomized_struct_fields_start struct { +#define randomized_struct_fields_end }; -- cgit v1.2.3 From 8001541cc333b1a3c2c38e3b34475fa446b053da Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:27:49 -0800 Subject: include/linux/bitfield.h: include <linux/build_bug.h> instead of <linux/bug.h> Since commit bc6245e5efd7 ("bug: split BUILD_BUG stuff out into <linux/build_bug.h>"), #include <linux/build_bug.h> is better to pull in the minimal headers needed for the BUILD_BUG() family. Link: http://lkml.kernel.org/r/1505700775-19826-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Jakub Kicinski Cc: Dinan Gunawardena Cc: Kalle Valo Cc: Ian Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index f2deb71958b2..1030651f8309 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -15,7 +15,7 @@ #ifndef _LINUX_BITFIELD_H #define _LINUX_BITFIELD_H -#include <linux/bug.h> +#include <linux/build_bug.h> /* * Bitfield access macros -- cgit v1.2.3 From f5bba9d11a256ad2a1c2f8e7fc6aabe6416b7890 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:27:53 -0800 Subject: include/linux/radix-tree.h: remove unneeded #include <linux/bug.h> This include was added by commit 187f1882b5b0 ("BUG: headers with BUG/BUG_ON etc. need linux/bug.h") because BUG_ON() was used in this header at that time. Some time later, commit 6d75f366b924 ("lib: radix-tree: check accounting of existing slot replacement users") removed the use of BUG_ON() from this header. Since then, there is no reason to include <linux/bug.h>.
Link: http://lkml.kernel.org/r/1505660151-4383-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Matthew Wilcox Cc: Masahiro Yamada Cc: Jan Kara Cc: Johannes Weiner Cc: Chris Mi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 0ca448c1cb42..23a9c89c7ad9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -22,7 +22,6 @@ #define _LINUX_RADIX_TREE_H #include -#include <linux/bug.h> #include #include #include -- cgit v1.2.3 From 36a3d1dd4e16bcd0d2ddfb4a2ec7092f0ae0d931 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Fri, 17 Nov 2017 15:28:16 -0800 Subject: lib/genalloc.c: make the avail variable an atomic_long_t If the amount of resources allocated to a gen_pool exceeds 2^32 then the avail atomic overflows and this causes problems when clients try and borrow resources from the pool. This is only expected to be an issue on 64 bit systems. Add the <linux/atomic.h> header to pull in atomic_long* operations, so that 32-bit systems continue to use 32-bit atomics while 64-bit systems can use atomic64_t. Link: http://lkml.kernel.org/r/1509033843-25667-1-git-send-email-sbates@raithlin.com Signed-off-by: Stephen Bates Reviewed-by: Logan Gunthorpe Reviewed-by: Mathieu Desnoyers Reviewed-by: Daniel Mentz Cc: Jonathan Corbet Cc: Andrew Morton Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 3 ++- lib/genalloc.c | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 6dfec4d638df..872f930f1b06 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -32,6 +32,7 @@ #include #include +#include <linux/atomic.h> struct device; struct device_node; @@ -71,7 +72,7 @@ struct gen_pool { */ struct gen_pool_chunk { struct list_head next_chunk; /* next chunk in pool */ - atomic_t avail; + atomic_long_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ unsigned long start_addr; /* start address of memory chunk */ unsigned long end_addr; /* end address of memory chunk (inclusive) */ diff --git a/lib/genalloc.c b/lib/genalloc.c index 144fe6b1a03e..ca06adc4f445 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -194,7 +194,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy chunk->phys_addr = phys; chunk->start_addr = virt; chunk->end_addr = virt + size - 1; - atomic_set(&chunk->avail, size); + atomic_long_set(&chunk->avail, size); spin_lock(&pool->lock); list_add_rcu(&chunk->next_chunk, &pool->chunks); @@ -304,7 +304,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, nbits = (size + (1UL << order) - 1) >> order; rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (size > atomic_read(&chunk->avail)) + if (size > atomic_long_read(&chunk->avail)) continue; start_bit = 0; @@ -324,7 +324,7 @@ retry: addr = chunk->start_addr + ((unsigned long)start_bit << order); size = nbits << order; - atomic_sub(size, &chunk->avail); + atomic_long_sub(size, &chunk->avail); break; } rcu_read_unlock(); @@ -390,7 +390,7 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); BUG_ON(remain); size = nbits << order; - atomic_add(size, &chunk->avail); + atomic_long_add(size, &chunk->avail);
rcu_read_unlock(); return; } @@ -464,7 +464,7 @@ size_t gen_pool_avail(struct gen_pool *pool) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) - avail += atomic_read(&chunk->avail); + avail += atomic_long_read(&chunk->avail); rcu_read_unlock(); return avail; } -- cgit v1.2.3 From 7a8d181949fb2c16be00f8cdb354794a30e46b39 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:24 -0800 Subject: pipe: add proc_dopipe_max_size() to safely assign pipe_max_size pipe_max_size is assigned directly via procfs sysctl: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) ... and then later rounded in-place a few statements later: ... pipe_max_size = round_pipe_size(pipe_max_size); ... This leaves a window of time between initial assignment and rounding that may be visible to other threads. (For example, one thread sets a non-rounded value to pipe_max_size while another reads its value.) Similar reads of pipe_max_size are potentially racy: pipe.c :: alloc_pipe_info() pipe.c :: pipe_set_size() Add a new proc_dopipe_max_size() that consolidates reading the new value from the user buffer, verifying bounds, and calling round_pipe_size() with a single assignment to pipe_max_size. Link: http://lkml.kernel.org/r/1507658689-11669-4-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 18 +++-------------- include/linux/pipe_fs_i.h | 1 + include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index f0f4ab36c444..6d98566201ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1020,7 +1020,7 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -static inline unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; @@ -1125,25 +1125,13 @@ out_revert_acct: } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size * will return an error. 
*/ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { - unsigned int rounded_pipe_max_size; - int ret; - - ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); - if (ret < 0 || !write) - return ret; - - rounded_pipe_max_size = round_pipe_size(pipe_max_size); - if (rounded_pipe_max_size == 0) - return -EINVAL; - - pipe_max_size = rounded_pipe_max_size; - return ret; + return proc_dopipe_max_size(table, write, buf, lenp, ppos); } /* diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 6a80cfc63e0c..2dc5e9870fcd 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -191,5 +191,6 @@ long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file); int create_pipe_files(struct file **, int); +unsigned int round_pipe_size(unsigned int size); #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd..992bc9948232 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,6 +51,9 @@ extern int proc_dointvec_minmax(struct ctl_table *, int, extern int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d42183b4c98..138b6484f277 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -2620,6 +2621,47 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, &param); } +struct do_proc_dopipe_max_size_conv_param { + unsigned int *min; +}; + +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_dopipe_max_size_conv_param *param = data; + + if (write) { + unsigned int val = round_pipe_size(*lvalp); + + if (val == 0) + return -EINVAL; + + if (param->min && *param->min > val) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dopipe_max_size_conv_param param = { + .min = (unsigned int *) table->extra1, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, &param); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit v1.2.3 From
95846ecf9dac5089aed4b144d912225f8ef86ae4 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:30 -0800 Subject: pid: replace pid bitmap implementation with IDR API Patch series "Replacing PID bitmap implementation with IDR API", v4. This series replaces the kernel's bitmap implementation of PID allocation with the IDR API. These patches are written to simplify the kernel by replacing custom code with calls to generic code. The following are the stats for the pid and pid_namespace object files before and after the replacement; note the sizeable reduction moving from the bitmap to the IDR implementation.

Before:
   text    data    bss     dec    hex  filename
   8447    3894     64   12405   3075  kernel/pid.o
After:
   text    data    bss     dec    hex  filename
   3397     304      0    3701    e75  kernel/pid.o

Before:
   text    data    bss     dec    hex  filename
   5692    1842    192    7726   1e2e  kernel/pid_namespace.o
After:
   text    data    bss     dec    hex  filename
   2854     216     16    3086    c0e  kernel/pid_namespace.o

The following are the stats for ps, pstree and calling readdir on /proc for 10,000 processes.

ps:
        With IDR API    With bitmap
real    0m1.479s        0m2.319s
user    0m0.070s        0m0.060s
sys     0m0.289s        0m0.516s

pstree:
        With IDR API    With bitmap
real    0m1.024s        0m1.794s
user    0m0.348s        0m0.612s
sys     0m0.184s        0m0.264s

proc:
        With IDR API    With bitmap
real    0m0.059s        0m0.074s
user    0m0.000s        0m0.004s
sys     0m0.016s        0m0.016s

This patch (of 2): Replace the current bitmap implementation for Process ID allocation. Functions that are no longer required, for example free_pidmap(), alloc_pidmap(), etc., are removed. The rest of the functions are modified to use the IDR API. The change was made to make PID allocation less complex by replacing custom code with calls to a generic API. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-2-git-send-email-gs051095@gmail.com [avagin@openvz.org: restore the old behaviour of the ns_last_pid sysctl] Link: http://lkml.kernel.org/r/20171106183144.16368-1-avagin@openvz.org Link: http://lkml.kernel.org/r/1507583624-22146-2-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Acked-by: Oleg Nesterov Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Eric W.
Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- fs/proc/loadavg.c | 2 +- include/linux/pid_namespace.h | 14 +-- init/main.c | 2 +- kernel/pid.c | 201 ++++++------------------------ kernel/pid_namespace.c | 53 ++++---- 6 files changed, 65 insertions(+), 209 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 1fbb5da17dd2..e47761cdcb98 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bc5c58c00ee..a000d7547479 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c78af6061644..92c6aa509d2e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -10,15 +10,8 @@ #include #include #include +#include -struct pidmap { - atomic_t nr_free; - void *page; -}; - -#define BITS_PER_PAGE (PAGE_SIZE * 8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE) struct fs_pin; @@ -30,9 +23,8 @@ enum { /* definitions for pid_namespace's hide_pid field */ struct pid_namespace { struct kref kref; - struct pidmap pidmap[PIDMAP_ENTRIES]; + struct idr idr; struct rcu_head rcu; - int last_pid; unsigned int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; @@ -106,6 +98,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); -void pidmap_init(void); +void pid_idr_init(void); #endif /* _LINUX_PID_NS_H */ diff --git a/init/main.c b/init/main.c index 859a786f7c0a..d0cbcfc06124 100644 --- a/init/main.c +++ b/init/main.c @@ -669,7 +669,7 @@ asmlinkage __visible void __init start_kernel(void) if (late_time_init) late_time_init(); calibrate_delay(); - pidmap_init(); + pid_idr_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled(EFI_RUNTIME_SERVICES)) diff --git a/kernel/pid.c b/kernel/pid.c index 020dedbdf066..0ce59369632f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -39,6 +39,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -static inline int mk_pid(struct pid_namespace *pid_ns, - struct pidmap *map, int off) -{ - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; -} - -#define find_next_offset(map, off) \ - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) /* * PID-map pages start out as NULL, they get allocated upon @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, */ struct pid_namespace init_pid_ns = { .kref = 
KREF_INIT(2), - .pidmap = { - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, + .idr = IDR_INIT, .nr_hashed = PIDNS_HASH_ADDING, .level = 0, .child_reaper = &init_task, @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static void free_pidmap(struct upid *upid) -{ - int nr = upid->nr; - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; - int offset = nr & BITS_PER_PAGE_MASK; - - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); -} - -/* - * If we started walking pids at 'base', is 'a' seen before 'b'? - */ -static int pid_before(int base, int a, int b) -{ - /* - * This is the same as saying - * - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT - * and that mapping orders 'a' and 'b' with respect to 'base'. - */ - return (unsigned)(a - base) < (unsigned)(b - base); -} - -/* - * We might be racing with someone else trying to set pid_ns->last_pid - * at the pid allocation time (there's also a sysctl for this, but racing - * with this one is OK, see comment in kernel/pid_namespace.c about it). - * We want the winner to have the "later" value, because if the - * "earlier" value prevails, then a pid may get reused immediately. - * - * Since pids rollover, it is not sufficient to just pick the bigger - * value. We have to consider where we started counting from. - * - * 'base' is the value of pid_ns->last_pid that we observed when - * we started looking for a pid. - * - * 'pid' is the pid that we eventually found. - */ -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) -{ - int prev; - int last_write = base; - do { - prev = last_write; - last_write = cmpxchg(&pid_ns->last_pid, prev, pid); - } while ((prev != last_write) && (pid_before(base, last_write, pid))); -} - -static int alloc_pidmap(struct pid_namespace *pid_ns) -{ - int i, offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - /* - * If last_pid points into the middle of the map->page we - * want to scan this bitmap block twice, the second time - * we start with offset == 0 (or RESERVED_PIDS). 
- */ - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; - for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (!map->page) { - map->page = page; - page = NULL; - } - spin_unlock_irq(&pidmap_lock); - kfree(page); - if (unlikely(!map->page)) - return -ENOMEM; - } - if (likely(atomic_read(&map->nr_free))) { - for ( ; ; ) { - if (!test_and_set_bit(offset, map->page)) { - atomic_dec(&map->nr_free); - set_last_pid(pid_ns, last, pid); - return pid; - } - offset = find_next_offset(map, offset); - if (offset >= BITS_PER_PAGE) - break; - pid = mk_pid(pid_ns, map, offset); - if (pid >= pid_max) - break; - } - } - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { - ++map; - offset = 0; - } else { - map = &pid_ns->pidmap[0]; - offset = RESERVED_PIDS; - if (unlikely(last == offset)) - break; - } - pid = mk_pid(pid_ns, map, offset); - } - return -EAGAIN; -} - -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) -{ - int offset; - struct pidmap *map, *end; - - if (last >= PID_MAX_LIMIT) - return -1; - - offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; - for (; map < end; map++, offset = 0) { - if (unlikely(!map->page)) - continue; - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); - if (offset < BITS_PER_PAGE) - return mk_pid(pid_ns, map, offset); - } - return -1; -} - void put_pid(struct pid *pid) { struct pid_namespace *ns; @@ -266,7 +124,7 @@ void free_pid(struct pid *pid) struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); - switch(--ns->nr_hashed) { + switch (--ns->nr_hashed) { case 2: case 1: /* When all that is left in the pid namespace @@ -284,12 +142,11 @@ void free_pid(struct pid *pid) schedule_work(&ns->proc_work); break; } + + idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); - call_rcu(&pid->rcu, delayed_put_pid); } @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = ns; pid->level = ns->level; + for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + int pid_min = 1; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&pidmap_lock); + + /* + * init really needs pid 1, but after reaching the maximum + * wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). + */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + spin_unlock_irq(&pidmap_lock); + idr_preload_end(); + if (nr < 0) { retval = nr; goto out_free; @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + /* Make the PID visible to find_pid_ns. 
*/ + idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->nr_hashed++; } spin_unlock_irq(&pidmap_lock); @@ -350,8 +230,11 @@ out_unlock: put_pid_ns(ns); out_free: + spin_lock_irq(&pidmap_lock); while (++i <= ns->level) - free_pidmap(pid->numbers + i); + idr_remove(&ns->idr, (pid->numbers + i)->nr); + + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns); */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { - struct pid *pid; - - do { - pid = find_pid_ns(nr, ns); - if (pid) - break; - nr = next_pidmap(ns, nr); - } while (nr > 0); - - return pid; + return idr_get_next(&ns->idr, &nr); } /* @@ -578,7 +452,7 @@ void __init pidhash_init(void) 0, 4096); } -void __init pidmap_init(void) +void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); @@ -590,10 +464,7 @@ void __init pidmap_init(void) PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pid_ns.pidmap[0].page); - atomic_dec(&init_pid_ns.pidmap[0].nr_free); + idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4918314893bc..ca7c8a8823b1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include struct pid_cache { int nr_ids; @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; - int i; int err; err = -EINVAL; @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns == NULL) goto out_dec; - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; + idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level + 1); if (ns->pid_cachep == NULL) - goto out_free_map; + goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) - goto out_free_map; + goto out_free_idr; ns->ns.ops = &pidns_operations; kref_init(&ns->kref); @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - return ns; -out_free_map: - kfree(ns->pidmap[0].page); -out_free: +out_free_idr: + idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { - int i; - ns_free_inum(&ns->ns); - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); + + idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 
1 : 2; + struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * maintain a tasklist for each pid namespace. * */ + rcu_read_lock(); read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - rcu_read_lock(); - - task = pid_task(find_vpid(nr), PIDTYPE_PID); + nr = 2; + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { + task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) send_sig_info(SIGKILL, SEND_SIG_FORCED, task); - - rcu_read_unlock(); - - nr = next_pidmap(pid_ns, nr); } read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. @@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; + int ret, next; if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = &pid_ns->last_pid; - return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + next = idr_get_cursor(&pid_ns->idr) - 1; + + tmp.data = &next; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (!ret && write) + idr_set_cursor(&pid_ns->idr, next + 1); + + return ret; } extern int pid_max; -- cgit v1.2.3 From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:34 -0800 Subject: pid: remove pidhash pidhash is no longer required as all the information can be looked up from idr tree. nr_hashed represented the number of pids that had been hashed. Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant, it has been renamed to pid_allocated and PIDNS_ADDING respectively. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Tested-by: Tony Luck [ia64] Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Eric W. 
Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/asm-offsets.c | 4 ++-- include/linux/init_task.h | 1 - include/linux/pid.h | 2 -- include/linux/pid_namespace.h | 4 ++-- init/main.c | 1 - kernel/fork.c | 2 +- kernel/pid.c | 48 +++++++++--------------------------------- kernel/pid_namespace.c | 6 +++--- 8 files changed, 18 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index f7693f49c573..f4db2168d1b8 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -31,8 +31,8 @@ void foo(void) DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe)); DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info)); - BUILD_BUG_ON(sizeof(struct upid) != 32); - DEFINE(IA64_UPID_SHIFT, 5); + BUILD_BUG_ON(sizeof(struct upid) != 16); + DEFINE(IA64_UPID_SHIFT, 4); BLANK(); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 8062e6cc607c..6a532629c983 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -105,7 +105,6 @@ extern struct group_info init_groups; .numbers = { { \ .nr = 0, \ .ns = &init_pid_ns, \ - .pid_chain = { .next = NULL, .pprev = NULL }, \ }, } \ } diff --git a/include/linux/pid.h b/include/linux/pid.h index dfd684ce0787..7633d55d9a24 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -51,10 +51,8 @@ enum pid_type */ struct upid { - /* Try to keep pid_chain in the same cacheline as nr for find_vpid */ int nr; struct pid_namespace *ns; - struct hlist_node pid_chain; }; struct pid diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 92c6aa509d2e..49538b172483 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -25,7 +25,7 @@ struct pid_namespace { struct kref kref; struct idr idr; struct rcu_head rcu; - unsigned int nr_hashed; + unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -49,7 +49,7 @@ struct pid_namespace { extern struct pid_namespace init_pid_ns; -#define PIDNS_HASH_ADDING (1U << 31) +#define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) diff --git a/init/main.c b/init/main.c index d0cbcfc06124..dfec3809e740 100644 --- a/init/main.c +++ b/init/main.c @@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void) * kmem_cache_init() */ setup_log_buf(0); - pidhash_init(); vfs_caches_init_early(); sort_main_extable(); trap_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 4e55eedba8d6..432eadf6b58c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process( retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } - if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } diff --git a/kernel/pid.c b/kernel/pid.c index 0ce59369632f..b13b624e2c49 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,10 +41,6 @@ #include #include -#define pid_hashfn(nr, ns) \ - hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -static struct hlist_head *pid_hash; -static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = 
PID_MAX_LIMIT; - /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max @@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT; struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), .idr = IDR_INIT, - .nr_hashed = PIDNS_HASH_ADDING, + .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, @@ -123,8 +118,7 @@ void free_pid(struct pid *pid) for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; - hlist_del_rcu(&upid->pid_chain); - switch (--ns->nr_hashed) { + switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace @@ -133,10 +127,10 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; - case PIDNS_HASH_ADDING: + case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); - ns->nr_hashed = 0; + ns->pid_allocated = 0; /* fall through */ case 0: schedule_work(&ns->proc_work); @@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) + if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { - hlist_add_head_rcu(&upid->pid_chain, - &pid_hash[pid_hashfn(upid->nr, upid->ns)]); /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); - upid->ns->nr_hashed++; + upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); @@ -243,21 +235,13 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); - ns->nr_hashed &= ~PIDNS_HASH_ADDING; + ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct upid *pnr; - - hlist_for_each_entry_rcu(pnr, - &pid_hash[pid_hashfn(nr, ns)], pid_chain) - if (pnr->nr == nr && pnr->ns == ns) - return container_of(pnr, struct pid, - numbers[ns->level]); - - return NULL; + return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); @@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (type != PIDTYPE_PID) { if (type == __PIDTYPE_TGID) type = PIDTYPE_PID; + task = task->group_leader; } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); @@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } -/* - * The pid hash table is scaled according to the amount of memory in the - * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or - * more. 
- */ -void __init pidhash_init(void) -{ - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL | HASH_ZERO, - &pidhash_shift, NULL, - 0, 4096); -} - void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ - BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca7c8a8823b1..0b53eef7d34b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - ns->nr_hashed = PIDNS_HASH_ADDING; + ns->pid_allocated = PIDNS_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * sys_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), * pid_ns can not go away until proc_kill_sb() drops the reference. * * But this ns can also have other tasks injected by setns()+fork(). @@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (pid_ns->nr_hashed == init_pids) + if (pid_ns->pid_allocated == init_pids) break; schedule(); } -- cgit v1.2.3 From 4efb442cc12eb66535b7c7ed06005fd7889c1d77 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 17 Nov 2017 15:30:38 -0800 Subject: kernel/panic.c: add TAINT_AUX This is the gist of a patch which we've been forward-porting in our kernels for a long time now, and it would probably make good sense to have such a TAINT_AUX flag upstream, which each distro etc. can use as they see fit. This way, we won't need to forward-port a distro-only version indefinitely. Add an auxiliary taint flag to be used by distros and others. This obviates the need to forward-port whatever internal solutions people have in favor of a single flag which they can map arbitrarily to a definition of their choosing. The "X" mnemonic could also mean eXternal, which would be taint from a distro or something else but not the upstream kernel. We will use it to mark modules for which we don't provide support. I.e., a really eXternal module.
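For illustration only, a minimal sketch of how a distro component might apply the new flag from module code (the module itself and its trigger policy are hypothetical, not part of this patch):

#include <linux/kernel.h>
#include <linux/module.h>

static int __init distro_taint_init(void)
{
        /*
         * Mark the running kernel with the new auxiliary taint; it shows
         * up as 'X' in oopses and in /proc/sys/kernel/tainted.
         * LOCKDEP_STILL_OK indicates lockdep state remains trustworthy.
         */
        add_taint(TAINT_AUX, LOCKDEP_STILL_OK);
        return 0;
}
module_init(distro_taint_init);
MODULE_LICENSE("GPL");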
Link: http://lkml.kernel.org/r/20170911134533.dp5mtyku5bongx4c@pd.tnic Signed-off-by: Borislav Petkov Cc: Kees Cook Cc: Jessica Yu Cc: Peter Zijlstra Cc: Jiri Slaby Cc: Jiri Olsa Cc: Michal Marek Cc: Jiri Kosina Cc: Takashi Iwai Cc: Petr Mladek Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 3 ++- kernel/panic.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4b484ab9e163..ce51455e2adf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -549,7 +549,8 @@ extern enum system_states { #define TAINT_UNSIGNED_MODULE 13 #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 -#define TAINT_FLAGS_COUNT 16 +#define TAINT_AUX 16 +#define TAINT_FLAGS_COUNT 17 struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index 3242b64b1956..2cfef408fec9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -324,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ { 'K', ' ', true }, /* TAINT_LIVEPATCH */ + { 'X', ' ', true }, /* TAINT_AUX */ }; /** @@ -345,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. * 'K' - Kernel has been live patched. + * 'X' - Auxiliary taint, for distros' use. * * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.2.3 From ded97d2c2b2c5f1dcced0bc57133f7753b037dfc Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:46 -0800 Subject: kcov: support comparison operands collection Enables kcov to collect comparison operands from instrumented code. This is done by using Clang's -fsanitize=trace-cmp instrumentation (currently not available for GCC). The comparison operands help a lot in fuzz testing. E.g. they are used in Syzkaller to cover the interiors of conditional statements with far fewer attempts and thus make previously unreachable code reachable. To allow separate collection of coverage and comparison operands, two different work modes are implemented. Mode selection is now done via a KCOV_ENABLE ioctl call with the corresponding argument value. Link: http://lkml.kernel.org/r/20171011095459.70721-1-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kcov.h | 12 ++- include/uapi/linux/kcov.h | 24 ++++++ kernel/kcov.c | 214 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 211 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcov.h b/include/linux/kcov.h index f5d8ce4f4f86..3ecf6f5e3a5f 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -8,19 +8,23 @@ struct task_struct; #ifdef CONFIG_KCOV -void kcov_task_init(struct task_struct *t); -void kcov_task_exit(struct task_struct *t); - enum kcov_mode { /* Coverage collection is not enabled yet. */ KCOV_MODE_DISABLED = 0, + /* KCOV was initialized, but tracing mode hasn't been chosen yet. */ + KCOV_MODE_INIT = 1, /* * Tracing coverage collection mode. * Covered PCs are collected in a per-task buffer.
*/ - KCOV_MODE_TRACE = 1, + KCOV_MODE_TRACE_PC = 2, + /* Collecting comparison operands mode. */ + KCOV_MODE_TRACE_CMP = 3, }; +void kcov_task_init(struct task_struct *t); +void kcov_task_exit(struct task_struct *t); + #else static inline void kcov_task_init(struct task_struct *t) {} diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h index 33eabbb8ada1..9529867717a8 100644 --- a/include/uapi/linux/kcov.h +++ b/include/uapi/linux/kcov.h @@ -8,4 +8,28 @@ #define KCOV_ENABLE _IO('c', 100) #define KCOV_DISABLE _IO('c', 101) +enum { + /* + * Tracing coverage collection mode. + * Covered PCs are collected in a per-task buffer. + * In the new KCOV version the mode is chosen by calling + * ioctl(fd, KCOV_ENABLE, mode). In older versions the mode argument + * was supposed to be 0 in such a call. So, for reasons of backward + * compatibility, we have chosen the value KCOV_TRACE_PC to be 0. + */ + KCOV_TRACE_PC = 0, + /* Collecting comparison operands mode. */ + KCOV_TRACE_CMP = 1, +}; + +/* + * The format for the types of collected comparisons. + * + * Bit 0 shows whether one of the arguments is a compile-time constant. + * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes. + */ +#define KCOV_CMP_CONST (1 << 0) +#define KCOV_CMP_SIZE(n) ((n) << 1) +#define KCOV_CMP_MASK KCOV_CMP_SIZE(3) + #endif /* _LINUX_KCOV_IOCTLS_H */ diff --git a/kernel/kcov.c b/kernel/kcov.c index d9f9fa9cacc6..15f33faf4013 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -22,13 +22,21 @@ #include #include +/* Number of 64-bit words written per one comparison: */ +#define KCOV_WORDS_PER_CMP 4 + /* * kcov descriptor (one per opened debugfs file). * State transitions of the descriptor: * - initial state after open() * - then there must be a single ioctl(KCOV_INIT_TRACE) call * - then, mmap() call (several calls are allowed but not useful) - * - then, repeated enable/disable for a task (only one task a time allowed) + * - then, ioctl(KCOV_ENABLE, arg), where arg is + * KCOV_TRACE_PC - to trace only the PCs + * or + * KCOV_TRACE_CMP - to trace only the comparison operands + * - then, ioctl(KCOV_DISABLE) to disable collection for the task. + * Enabling/disabling ioctls can be repeated (only one task at a time allowed). */ struct kcov { /* @@ -48,51 +56,176 @@ struct kcov { struct task_struct *t; }; -/* - * Entry point from instrumented code. - * This is called once per basic-block/edge. - */ -void notrace __sanitizer_cov_trace_pc(void) +static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { - struct task_struct *t; enum kcov_mode mode; - t = current; /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. */ if (!in_task()) - return; + return false; mode = READ_ONCE(t->kcov_mode); - if (mode == KCOV_MODE_TRACE) { - unsigned long *area; - unsigned long pos; - unsigned long ip = _RET_IP_; + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + return mode == needed_mode; +} +static unsigned long canonicalize_ip(unsigned long ip) +{ #ifdef CONFIG_RANDOMIZE_BASE - ip -= kaslr_offset(); + ip -= kaslr_offset(); #endif + return ip; +} - /* - * There is some code that runs in interrupts but for which - * in_interrupt() returns false (e.g. preempt_schedule_irq()).
- * READ_ONCE()/barrier() effectively provides load-acquire wrt - * interrupts, there are paired barrier()/WRITE_ONCE() in - * kcov_ioctl_locked(). - */ - barrier(); - area = t->kcov_area; - /* The first word is number of subsequent PCs. */ - pos = READ_ONCE(area[0]) + 1; - if (likely(pos < t->kcov_size)) { - area[pos] = ip; - WRITE_ONCE(area[0], pos); - } +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void notrace __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + unsigned long *area; + unsigned long ip = canonicalize_ip(_RET_IP_); + unsigned long pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t)) + return; + + area = t->kcov_area; + /* The first 64-bit word is the number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = ip; + WRITE_ONCE(area[0], pos); } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS +static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +{ + struct task_struct *t; + u64 *area; + u64 count, start_index, end_pos, max_pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t)) + return; + + ip = canonicalize_ip(ip); + + /* + * We write all comparison arguments and types as u64. + * The buffer was allocated for t->kcov_size unsigned longs. + */ + area = (u64 *)t->kcov_area; + max_pos = t->kcov_size * sizeof(unsigned long); + + count = READ_ONCE(area[0]); + + /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */ + start_index = 1 + count * KCOV_WORDS_PER_CMP; + end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); + if (likely(end_pos <= max_pos)) { + area[start_index] = type; + area[start_index + 1] = arg1; + area[start_index + 2] = arg2; + area[start_index + 3] = ip; + WRITE_ONCE(area[0], count + 1); + } +} + +void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1); + +void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); + +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4); + +void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8); + +void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1); + +void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); + +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4); + +void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8); + +void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) +{ + u64 i; + u64 count = cases[0]; + u64 size = cases[1]; + u64 type = KCOV_CMP_CONST; + + switch (size) { + case 8: + type |= KCOV_CMP_SIZE(0);
+ break; + case 16: + type |= KCOV_CMP_SIZE(1); + break; + case 32: + type |= KCOV_CMP_SIZE(2); + break; + case 64: + type |= KCOV_CMP_SIZE(3); + break; + default: + return; + } + for (i = 0; i < count; i++) + write_comp_data(type, cases[i + 2], val, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_switch); +#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ + static void kcov_get(struct kcov *kcov) { atomic_inc(&kcov->refcount); @@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t) /* Just to not leave dangling references behind. */ kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; spin_unlock(&kcov->lock); kcov_put(kcov); } @@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) spin_lock(&kcov->lock); size = kcov->size * sizeof(unsigned long); - if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; @@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep) kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); if (!kcov) return -ENOMEM; + kcov->mode = KCOV_MODE_DISABLED; atomic_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; @@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (size < 2 || size > INT_MAX / sizeof(unsigned long)) return -EINVAL; kcov->size = size; - kcov->mode = KCOV_MODE_TRACE; + kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: /* @@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, * at task exit or voluntary by KCOV_DISABLE. After that it can * be enabled for another task. */ - unused = arg; - if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || - kcov->area == NULL) + if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; if (kcov->t != NULL) return -EBUSY; + if (arg == KCOV_TRACE_PC) + kcov->mode = KCOV_MODE_TRACE_PC; + else if (arg == KCOV_TRACE_CMP) +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS + kcov->mode = KCOV_MODE_TRACE_CMP; +#else + return -ENOTSUPP; +#endif + else + return -EINVAL; t = current; /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; - /* See comment in __sanitizer_cov_trace_pc(). */ + /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, kcov->mode); t->kcov = kcov; @@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; kcov_put(kcov); return 0; default: -- cgit v1.2.3 From 2d8364bae4db144df75ba85e92d2b8619ba8eedc Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 17 Nov 2017 15:30:57 -0800 Subject: kernel/reboot.c: add devm_register_reboot_notifier() Add devm_* wrapper around register_reboot_notifier to simplify device specific reboot notifier registration/unregistration. 
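A usage sketch for the new wrapper (driver and device names hypothetical); because the unregistration is devres-managed, the driver needs no matching call in its remove or error paths:

#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/platform_device.h>
#include <linux/reboot.h>

static int foo_reboot_handler(struct notifier_block *nb,
                              unsigned long action, void *data)
{
        /* quiesce the (hypothetical) device before restart/halt */
        return NOTIFY_DONE;
}

static struct notifier_block foo_reboot_nb = {
        .notifier_call = foo_reboot_handler,
};

static int foo_probe(struct platform_device *pdev)
{
        /* unregistered automatically when the driver detaches */
        return devm_register_reboot_notifier(&pdev->dev, &foo_reboot_nb);
}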
[akpm@linux-foundation.org: move `struct device' forward decl to top-of-file] Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 4 ++++ kernel/reboot.c | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index d03da0eb95ca..e63799a6e895 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -6,6 +6,8 @@ #include #include +struct device; + #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN #define SYS_HALT 0x0002 /* Notify of system halt */ @@ -39,6 +41,8 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); + extern int register_restart_handler(struct notifier_block *); extern int unregister_restart_handler(struct notifier_block *); extern void do_kernel_restart(char *cmd); diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. -- cgit v1.2.3 From b8fd99838435f9b420c3e848192bd43abc648b7f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:08 -0800 Subject: sysvipc: unteach ids->next_id for !CHECKPOINT_RESTORE Patch series "sysvipc: ipc-key management improvements". Here are a few improvements I spotted while eyeballing Guillaume's rhashtable implementation for ipc keys. The first and fourth patches are the interesting ones, the middle two are trivial. This patch (of 4): The next_id object-allocation functionality was introduced in commit 03f595668017 ("ipc: add sysctl to specify desired next object id"). Given that these new entries are _only_ exported under the CONFIG_CHECKPOINT_RESTORE option, there is no point for the common case to even know about ->next_id. As such, rewrite ipc_buildid() such that it can do away with the field as well as unnecessary branches when adding a new identifier. The end result also better differentiates both cases, so the code ends up being cleaner; albeit with some small duplication in the default case.
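For reference, the id arithmetic that both ipc_buildid() flavors rely on, with the helper macros as defined in ipc/util.h and a worked example (values illustrative):

#define SEQ_MULTIPLIER  (IPCMNI)        /* IPCMNI == 32768 */
#define ipcid_to_idx(id)  ((id) % SEQ_MULTIPLIER)
#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)

/*
 * idr index 5 allocated while seq == 3:
 *   id = SEQ_MULTIPLIER * 3 + 5 == 98309
 * and splitting the id back up:
 *   ipcid_to_idx(98309)  == 5   (idr slot)
 *   ipcid_to_seqx(98309) == 3   (sequence number)
 * which is exactly the decoding the CHECKPOINT_RESTORE next_id path uses.
 */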
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170831172049.14576-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 ++ ipc/util.c | 60 ++++++++++++++++++++++++++++++++----------- ipc/util.h | 5 ---- 3 files changed, 47 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 474812abe773..d7cf3a850853 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,7 +19,9 @@ struct ipc_ids { bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; +#ifdef CONFIG_CHECKPOINT_RESTORE int next_id; +#endif struct rhashtable key_ht; }; diff --git a/ipc/util.c b/ipc/util.c index 79b30eee32cd..429c06bdb8ef 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -116,13 +116,15 @@ int ipc_init_ids(struct ipc_ids *ids) int err; ids->in_use = 0; ids->seq = 0; - ids->next_id = -1; init_rwsem(&ids->rwsem); err = rhashtable_init(&ids->key_ht, &ipc_kht_params); if (err) return err; idr_init(&ids->ipcs_idr); ids->tables_initialized = true; +#ifdef CONFIG_CHECKPOINT_RESTORE + ids->next_id = -1; +#endif return 0; } @@ -216,6 +218,46 @@ int ipc_get_maxid(struct ipc_ids *ids) return max_id; } +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * Specify desired id for next allocated IPC object. + */ +#define ipc_idr_alloc(ids, new) \ + idr_alloc(&(ids)->ipcs_idr, (new), \ + (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\ + 0, GFP_NOWAIT) + +static inline int ipc_buildid(int id, struct ipc_ids *ids, + struct kern_ipc_perm *new) +{ + if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */ + new->seq = ids->seq++; + if (ids->seq > IPCID_SEQ_MAX) + ids->seq = 0; + } else { + new->seq = ipcid_to_seqx(ids->next_id); + ids->next_id = -1; + } + + return SEQ_MULTIPLIER * new->seq + id; +} + +#else +#define ipc_idr_alloc(ids, new) \ + idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT) + +static inline int ipc_buildid(int id, struct ipc_ids *ids, + struct kern_ipc_perm *new) +{ + new->seq = ids->seq++; + if (ids->seq > IPCID_SEQ_MAX) + ids->seq = 0; + + return SEQ_MULTIPLIER * new->seq + id; +} + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set @@ -234,7 +276,6 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) kuid_t euid; kgid_t egid; int id, err; - int next_id = ids->next_id; if (size > IPCMNI) size = IPCMNI; @@ -254,9 +295,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) new->cuid = new->uid = euid; new->gid = new->cgid = egid; - id = idr_alloc(&ids->ipcs_idr, new, - (next_id < 0) ? 
0 : ipcid_to_idx(next_id), 0, - GFP_NOWAIT); + id = ipc_idr_alloc(ids, new); idr_preload_end(); if (id >= 0 && new->key != IPC_PRIVATE) { @@ -274,17 +313,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) } ids->in_use++; + new->id = ipc_buildid(id, ids, new); - if (next_id < 0) { - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - } else { - new->seq = ipcid_to_seqx(next_id); - ids->next_id = -1; - } - - new->id = ipc_buildid(id, new->seq); return id; } diff --git a/ipc/util.h b/ipc/util.h index 579112d90016..0cd6201fe63a 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -146,11 +146,6 @@ extern struct msg_msg *load_msg(const void __user *src, size_t len); extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); -static inline int ipc_buildid(int id, int seq) -{ - return SEQ_MULTIPLIER * seq + id; -} - static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) { return uid / SEQ_MULTIPLIER != ipcp->seq; -- cgit v1.2.3 From 15df03c87983660a4d1eedb4541778592bd97684 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:18 -0800 Subject: sysvipc: make get_maxid O(1) again For a custom microbenchmark on a 3.30GHz Xeon SandyBridge, which calls IPC_STAT over and over, it was calculated that, on average, the cost of ipc_get_maxid() for increasing numbers of keys was:
10 keys: ~900 cycles
100 keys: ~15000 cycles
1000 keys: ~150000 cycles
10000 keys: ~2100000 cycles
This is unsurprising as maxid is currently O(n). By having the max_id available in O(1) we save all those cycles for each semctl(_STAT) command; the idr_find can be expensive -- which some real (customer) workloads actually poll on. Note that this used to be the case, until commit 7ca7e564e04 ("ipc: store ipcs into IDRs"). The cost is the extra idr_find when doing RMIDs, but we simply go backwards, and it should not take too many iterations to find the new value. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170831172049.14576-5-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 1 + ipc/util.c | 43 +++++++++++++------------------------------ ipc/util.h | 21 ++++++++++++++++++--- 3 files changed, 32 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index d7cf3a850853..b5630c8eb2f3 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,6 +19,7 @@ struct ipc_ids { bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; + int max_id; #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif diff --git a/ipc/util.c b/ipc/util.c index e09bf76610ef..ff045fec8d83 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -122,6 +122,7 @@ int ipc_init_ids(struct ipc_ids *ids) return err; idr_init(&ids->ipcs_idr); ids->tables_initialized = true; + ids->max_id = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif @@ -188,36 +189,6 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) return NULL; } -/** - * ipc_get_maxid - get the last assigned id - * @ids: ipc identifier set - * - * Called with ipc_ids.rwsem held.
- */ -int ipc_get_maxid(struct ipc_ids *ids) -{ - struct kern_ipc_perm *ipc; - int max_id = -1; - int total, id; - - if (ids->in_use == 0) - return -1; - - if (ids->in_use == IPCMNI) - return IPCMNI - 1; - - /* Look for the last assigned id */ - total = 0; - for (id = 0; id < IPCMNI && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) { - max_id = id; - total++; - } - } - return max_id; -} - #ifdef CONFIG_CHECKPOINT_RESTORE /* * Specify desired id for next allocated IPC object. @@ -313,6 +284,9 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) } ids->in_use++; + if (id > ids->max_id) + ids->max_id = id; + new->id = ipc_buildid(id, ids, new); return id; @@ -459,6 +433,15 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; + + if (unlikely(lid == ids->max_id)) { + do { + lid--; + if (lid == -1) + break; + } while (!idr_find(&ids->ipcs_idr, lid)); + ids->max_id = lid; + } } /** diff --git a/ipc/util.h b/ipc/util.h index 0cd6201fe63a..89b8ec176fc4 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -13,6 +13,7 @@ #include #include +#include #define SEQ_MULTIPLIER (IPCMNI) @@ -99,9 +100,6 @@ void __init ipc_init_proc_interface(const char *path, const char *header, /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -/* must be called with ids->rwsem acquired for reading */ -int ipc_get_maxid(struct ipc_ids *); - /* must be called with both locks acquired. */ void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); @@ -111,6 +109,23 @@ void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *); /* must be called with ipcp locked */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); +/** + * ipc_get_maxid - get the last assigned id + * @ids: ipc identifier set + * + * Called with ipc_ids.rwsem held for reading. + */ +static inline int ipc_get_maxid(struct ipc_ids *ids) +{ + if (ids->in_use == 0) + return -1; + + if (ids->in_use == IPCMNI) + return IPCMNI - 1; + + return ids->max_id; +} + /* * For allocation that need to be freed by RCU. * Objects are reference counted, they start with reference count 1. -- cgit v1.2.3 From 5a1c269f15dc9f964f829d864ae8abebcaa3d3be Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 3 Aug 2017 12:19:40 -0600 Subject: NTB: switchtec: Move structure definitions into a common header Create the switchtec.h header in include/linux with hardware defines and the switchtec_dev structure. Both moved directly from switchtec.c. This is a prep patch for creating an NTB driver for Switchtec. 
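A sketch of the intended payoff (the function below is illustrative, not part of the patch): with the definitions now in include/linux/switchtec.h, an upper-layer driver can resolve the management state from a bare struct device:

#include <linux/switchtec.h>

static void example_show_partition(struct device *dev)
{
        struct switchtec_dev *stdev = to_stdev(dev);

        dev_info(dev, "switchtec partition %d of %d\n",
                 stdev->partition, stdev->partition_count);
}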
Signed-off-by: Logan Gunthorpe Reviewed-by: Stephen Bates Reviewed-by: Kurt Schwemmer Acked-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Signed-off-by: Jon Mason --- MAINTAINERS | 1 + drivers/pci/switch/switchtec.c | 260 +------------------------------------- include/linux/switchtec.h | 279 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+), 259 deletions(-) create mode 100644 include/linux/switchtec.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index b1ae5c0faab7..07d288f11a03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10347,6 +10347,7 @@ F: Documentation/switchtec.txt F: Documentation/ABI/testing/sysfs-class-switchtec F: drivers/pci/switch/switchtec* F: include/uapi/linux/switchtec_ioctl.h +F: include/linux/switchtec.h PCI DRIVER FOR MVEBU (Marvell Armada 370 and Armada XP SOC support) M: Thomas Petazzoni diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index af81b2dec42e..5b75d3008ff8 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -13,6 +13,7 @@ * */ +#include #include #include @@ -20,8 +21,6 @@ #include #include #include -#include -#include #include MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver"); @@ -37,263 +36,6 @@ static dev_t switchtec_devt; static struct class *switchtec_class; static DEFINE_IDA(switchtec_minor_ida); -#define MICROSEMI_VENDOR_ID 0x11f8 -#define MICROSEMI_NTB_CLASSCODE 0x068000 -#define MICROSEMI_MGMT_CLASSCODE 0x058000 - -#define SWITCHTEC_MRPC_PAYLOAD_SIZE 1024 -#define SWITCHTEC_MAX_PFF_CSR 48 - -#define SWITCHTEC_EVENT_OCCURRED BIT(0) -#define SWITCHTEC_EVENT_CLEAR BIT(0) -#define SWITCHTEC_EVENT_EN_LOG BIT(1) -#define SWITCHTEC_EVENT_EN_CLI BIT(2) -#define SWITCHTEC_EVENT_EN_IRQ BIT(3) -#define SWITCHTEC_EVENT_FATAL BIT(4) - -enum { - SWITCHTEC_GAS_MRPC_OFFSET = 0x0000, - SWITCHTEC_GAS_TOP_CFG_OFFSET = 0x1000, - SWITCHTEC_GAS_SW_EVENT_OFFSET = 0x1800, - SWITCHTEC_GAS_SYS_INFO_OFFSET = 0x2000, - SWITCHTEC_GAS_FLASH_INFO_OFFSET = 0x2200, - SWITCHTEC_GAS_PART_CFG_OFFSET = 0x4000, - SWITCHTEC_GAS_NTB_OFFSET = 0x10000, - SWITCHTEC_GAS_PFF_CSR_OFFSET = 0x134000, -}; - -struct mrpc_regs { - u8 input_data[SWITCHTEC_MRPC_PAYLOAD_SIZE]; - u8 output_data[SWITCHTEC_MRPC_PAYLOAD_SIZE]; - u32 cmd; - u32 status; - u32 ret_value; -} __packed; - -enum mrpc_status { - SWITCHTEC_MRPC_STATUS_INPROGRESS = 1, - SWITCHTEC_MRPC_STATUS_DONE = 2, - SWITCHTEC_MRPC_STATUS_ERROR = 0xFF, - SWITCHTEC_MRPC_STATUS_INTERRUPTED = 0x100, -}; - -struct sw_event_regs { - u64 event_report_ctrl; - u64 reserved1; - u64 part_event_bitmap; - u64 reserved2; - u32 global_summary; - u32 reserved3[3]; - u32 stack_error_event_hdr; - u32 stack_error_event_data; - u32 reserved4[4]; - u32 ppu_error_event_hdr; - u32 ppu_error_event_data; - u32 reserved5[4]; - u32 isp_error_event_hdr; - u32 isp_error_event_data; - u32 reserved6[4]; - u32 sys_reset_event_hdr; - u32 reserved7[5]; - u32 fw_exception_hdr; - u32 reserved8[5]; - u32 fw_nmi_hdr; - u32 reserved9[5]; - u32 fw_non_fatal_hdr; - u32 reserved10[5]; - u32 fw_fatal_hdr; - u32 reserved11[5]; - u32 twi_mrpc_comp_hdr; - u32 twi_mrpc_comp_data; - u32 reserved12[4]; - u32 twi_mrpc_comp_async_hdr; - u32 twi_mrpc_comp_async_data; - u32 reserved13[4]; - u32 cli_mrpc_comp_hdr; - u32 cli_mrpc_comp_data; - u32 reserved14[4]; - u32 cli_mrpc_comp_async_hdr; - u32 cli_mrpc_comp_async_data; - u32 reserved15[4]; - u32 gpio_interrupt_hdr; - u32 gpio_interrupt_data; - u32 reserved16[4]; -} __packed; - -enum { - SWITCHTEC_CFG0_RUNNING = 
0x04, - SWITCHTEC_CFG1_RUNNING = 0x05, - SWITCHTEC_IMG0_RUNNING = 0x03, - SWITCHTEC_IMG1_RUNNING = 0x07, -}; - -struct sys_info_regs { - u32 device_id; - u32 device_version; - u32 firmware_version; - u32 reserved1; - u32 vendor_table_revision; - u32 table_format_version; - u32 partition_id; - u32 cfg_file_fmt_version; - u16 cfg_running; - u16 img_running; - u32 reserved2[57]; - char vendor_id[8]; - char product_id[16]; - char product_revision[4]; - char component_vendor[8]; - u16 component_id; - u8 component_revision; -} __packed; - -struct flash_info_regs { - u32 flash_part_map_upd_idx; - - struct active_partition_info { - u32 address; - u32 build_version; - u32 build_string; - } active_img; - - struct active_partition_info active_cfg; - struct active_partition_info inactive_img; - struct active_partition_info inactive_cfg; - - u32 flash_length; - - struct partition_info { - u32 address; - u32 length; - } cfg0; - - struct partition_info cfg1; - struct partition_info img0; - struct partition_info img1; - struct partition_info nvlog; - struct partition_info vendor[8]; -}; - -struct ntb_info_regs { - u8 partition_count; - u8 partition_id; - u16 reserved1; - u64 ep_map; - u16 requester_id; -} __packed; - -struct part_cfg_regs { - u32 status; - u32 state; - u32 port_cnt; - u32 usp_port_mode; - u32 usp_pff_inst_id; - u32 vep_pff_inst_id; - u32 dsp_pff_inst_id[47]; - u32 reserved1[11]; - u16 vep_vector_number; - u16 usp_vector_number; - u32 port_event_bitmap; - u32 reserved2[3]; - u32 part_event_summary; - u32 reserved3[3]; - u32 part_reset_hdr; - u32 part_reset_data[5]; - u32 mrpc_comp_hdr; - u32 mrpc_comp_data[5]; - u32 mrpc_comp_async_hdr; - u32 mrpc_comp_async_data[5]; - u32 dyn_binding_hdr; - u32 dyn_binding_data[5]; - u32 reserved4[159]; -} __packed; - -enum { - SWITCHTEC_PART_CFG_EVENT_RESET = 1 << 0, - SWITCHTEC_PART_CFG_EVENT_MRPC_CMP = 1 << 1, - SWITCHTEC_PART_CFG_EVENT_MRPC_ASYNC_CMP = 1 << 2, - SWITCHTEC_PART_CFG_EVENT_DYN_PART_CMP = 1 << 3, -}; - -struct pff_csr_regs { - u16 vendor_id; - u16 device_id; - u32 pci_cfg_header[15]; - u32 pci_cap_region[48]; - u32 pcie_cap_region[448]; - u32 indirect_gas_window[128]; - u32 indirect_gas_window_off; - u32 reserved[127]; - u32 pff_event_summary; - u32 reserved2[3]; - u32 aer_in_p2p_hdr; - u32 aer_in_p2p_data[5]; - u32 aer_in_vep_hdr; - u32 aer_in_vep_data[5]; - u32 dpc_hdr; - u32 dpc_data[5]; - u32 cts_hdr; - u32 cts_data[5]; - u32 reserved3[6]; - u32 hotplug_hdr; - u32 hotplug_data[5]; - u32 ier_hdr; - u32 ier_data[5]; - u32 threshold_hdr; - u32 threshold_data[5]; - u32 power_mgmt_hdr; - u32 power_mgmt_data[5]; - u32 tlp_throttling_hdr; - u32 tlp_throttling_data[5]; - u32 force_speed_hdr; - u32 force_speed_data[5]; - u32 credit_timeout_hdr; - u32 credit_timeout_data[5]; - u32 link_state_hdr; - u32 link_state_data[5]; - u32 reserved4[174]; -} __packed; - -struct switchtec_dev { - struct pci_dev *pdev; - struct device dev; - struct cdev cdev; - - int partition; - int partition_count; - int pff_csr_count; - char pff_local[SWITCHTEC_MAX_PFF_CSR]; - - void __iomem *mmio; - struct mrpc_regs __iomem *mmio_mrpc; - struct sw_event_regs __iomem *mmio_sw_event; - struct sys_info_regs __iomem *mmio_sys_info; - struct flash_info_regs __iomem *mmio_flash_info; - struct ntb_info_regs __iomem *mmio_ntb; - struct part_cfg_regs __iomem *mmio_part_cfg; - struct part_cfg_regs __iomem *mmio_part_cfg_all; - struct pff_csr_regs __iomem *mmio_pff_csr; - - /* - * The mrpc mutex must be held when accessing the other - * mrpc_ fields, alive flag and stuser->state 
field - */ - struct mutex mrpc_mutex; - struct list_head mrpc_queue; - int mrpc_busy; - struct work_struct mrpc_work; - struct delayed_work mrpc_timeout; - bool alive; - - wait_queue_head_t event_wq; - atomic_t event_cnt; -}; - -static struct switchtec_dev *to_stdev(struct device *dev) -{ - return container_of(dev, struct switchtec_dev, dev); -} - enum mrpc_state { MRPC_IDLE = 0, MRPC_QUEUED, diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h new file mode 100644 index 000000000000..1cbd0e63b0ab --- /dev/null +++ b/include/linux/switchtec.h @@ -0,0 +1,279 @@ +/* + * Microsemi Switchtec PCIe Driver + * Copyright (c) 2017, Microsemi Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef _SWITCHTEC_H +#define _SWITCHTEC_H + +#include +#include + +#define MICROSEMI_VENDOR_ID 0x11f8 +#define MICROSEMI_NTB_CLASSCODE 0x068000 +#define MICROSEMI_MGMT_CLASSCODE 0x058000 + +#define SWITCHTEC_MRPC_PAYLOAD_SIZE 1024 +#define SWITCHTEC_MAX_PFF_CSR 48 + +#define SWITCHTEC_EVENT_OCCURRED BIT(0) +#define SWITCHTEC_EVENT_CLEAR BIT(0) +#define SWITCHTEC_EVENT_EN_LOG BIT(1) +#define SWITCHTEC_EVENT_EN_CLI BIT(2) +#define SWITCHTEC_EVENT_EN_IRQ BIT(3) +#define SWITCHTEC_EVENT_FATAL BIT(4) + +enum { + SWITCHTEC_GAS_MRPC_OFFSET = 0x0000, + SWITCHTEC_GAS_TOP_CFG_OFFSET = 0x1000, + SWITCHTEC_GAS_SW_EVENT_OFFSET = 0x1800, + SWITCHTEC_GAS_SYS_INFO_OFFSET = 0x2000, + SWITCHTEC_GAS_FLASH_INFO_OFFSET = 0x2200, + SWITCHTEC_GAS_PART_CFG_OFFSET = 0x4000, + SWITCHTEC_GAS_NTB_OFFSET = 0x10000, + SWITCHTEC_GAS_PFF_CSR_OFFSET = 0x134000, +}; + +struct mrpc_regs { + u8 input_data[SWITCHTEC_MRPC_PAYLOAD_SIZE]; + u8 output_data[SWITCHTEC_MRPC_PAYLOAD_SIZE]; + u32 cmd; + u32 status; + u32 ret_value; +} __packed; + +enum mrpc_status { + SWITCHTEC_MRPC_STATUS_INPROGRESS = 1, + SWITCHTEC_MRPC_STATUS_DONE = 2, + SWITCHTEC_MRPC_STATUS_ERROR = 0xFF, + SWITCHTEC_MRPC_STATUS_INTERRUPTED = 0x100, +}; + +struct sw_event_regs { + u64 event_report_ctrl; + u64 reserved1; + u64 part_event_bitmap; + u64 reserved2; + u32 global_summary; + u32 reserved3[3]; + u32 stack_error_event_hdr; + u32 stack_error_event_data; + u32 reserved4[4]; + u32 ppu_error_event_hdr; + u32 ppu_error_event_data; + u32 reserved5[4]; + u32 isp_error_event_hdr; + u32 isp_error_event_data; + u32 reserved6[4]; + u32 sys_reset_event_hdr; + u32 reserved7[5]; + u32 fw_exception_hdr; + u32 reserved8[5]; + u32 fw_nmi_hdr; + u32 reserved9[5]; + u32 fw_non_fatal_hdr; + u32 reserved10[5]; + u32 fw_fatal_hdr; + u32 reserved11[5]; + u32 twi_mrpc_comp_hdr; + u32 twi_mrpc_comp_data; + u32 reserved12[4]; + u32 twi_mrpc_comp_async_hdr; + u32 twi_mrpc_comp_async_data; + u32 reserved13[4]; + u32 cli_mrpc_comp_hdr; + u32 cli_mrpc_comp_data; + u32 reserved14[4]; + u32 cli_mrpc_comp_async_hdr; + u32 cli_mrpc_comp_async_data; + u32 reserved15[4]; + u32 gpio_interrupt_hdr; + u32 gpio_interrupt_data; + u32 reserved16[4]; +} __packed; + +enum { + SWITCHTEC_CFG0_RUNNING = 0x04, + SWITCHTEC_CFG1_RUNNING = 0x05, + SWITCHTEC_IMG0_RUNNING = 0x03, + SWITCHTEC_IMG1_RUNNING = 0x07, +}; + +struct sys_info_regs { + u32 device_id; + u32 device_version; + u32 
firmware_version; + u32 reserved1; + u32 vendor_table_revision; + u32 table_format_version; + u32 partition_id; + u32 cfg_file_fmt_version; + u16 cfg_running; + u16 img_running; + u32 reserved2[57]; + char vendor_id[8]; + char product_id[16]; + char product_revision[4]; + char component_vendor[8]; + u16 component_id; + u8 component_revision; +} __packed; + +struct flash_info_regs { + u32 flash_part_map_upd_idx; + + struct active_partition_info { + u32 address; + u32 build_version; + u32 build_string; + } active_img; + + struct active_partition_info active_cfg; + struct active_partition_info inactive_img; + struct active_partition_info inactive_cfg; + + u32 flash_length; + + struct partition_info { + u32 address; + u32 length; + } cfg0; + + struct partition_info cfg1; + struct partition_info img0; + struct partition_info img1; + struct partition_info nvlog; + struct partition_info vendor[8]; +}; + +struct ntb_info_regs { + u8 partition_count; + u8 partition_id; + u16 reserved1; + u64 ep_map; + u16 requester_id; +} __packed; + +struct part_cfg_regs { + u32 status; + u32 state; + u32 port_cnt; + u32 usp_port_mode; + u32 usp_pff_inst_id; + u32 vep_pff_inst_id; + u32 dsp_pff_inst_id[47]; + u32 reserved1[11]; + u16 vep_vector_number; + u16 usp_vector_number; + u32 port_event_bitmap; + u32 reserved2[3]; + u32 part_event_summary; + u32 reserved3[3]; + u32 part_reset_hdr; + u32 part_reset_data[5]; + u32 mrpc_comp_hdr; + u32 mrpc_comp_data[5]; + u32 mrpc_comp_async_hdr; + u32 mrpc_comp_async_data[5]; + u32 dyn_binding_hdr; + u32 dyn_binding_data[5]; + u32 reserved4[159]; +} __packed; + +enum { + SWITCHTEC_PART_CFG_EVENT_RESET = 1 << 0, + SWITCHTEC_PART_CFG_EVENT_MRPC_CMP = 1 << 1, + SWITCHTEC_PART_CFG_EVENT_MRPC_ASYNC_CMP = 1 << 2, + SWITCHTEC_PART_CFG_EVENT_DYN_PART_CMP = 1 << 3, +}; + +struct pff_csr_regs { + u16 vendor_id; + u16 device_id; + u32 pci_cfg_header[15]; + u32 pci_cap_region[48]; + u32 pcie_cap_region[448]; + u32 indirect_gas_window[128]; + u32 indirect_gas_window_off; + u32 reserved[127]; + u32 pff_event_summary; + u32 reserved2[3]; + u32 aer_in_p2p_hdr; + u32 aer_in_p2p_data[5]; + u32 aer_in_vep_hdr; + u32 aer_in_vep_data[5]; + u32 dpc_hdr; + u32 dpc_data[5]; + u32 cts_hdr; + u32 cts_data[5]; + u32 reserved3[6]; + u32 hotplug_hdr; + u32 hotplug_data[5]; + u32 ier_hdr; + u32 ier_data[5]; + u32 threshold_hdr; + u32 threshold_data[5]; + u32 power_mgmt_hdr; + u32 power_mgmt_data[5]; + u32 tlp_throttling_hdr; + u32 tlp_throttling_data[5]; + u32 force_speed_hdr; + u32 force_speed_data[5]; + u32 credit_timeout_hdr; + u32 credit_timeout_data[5]; + u32 link_state_hdr; + u32 link_state_data[5]; + u32 reserved4[174]; +} __packed; + +struct switchtec_dev { + struct pci_dev *pdev; + struct device dev; + struct cdev cdev; + + int partition; + int partition_count; + int pff_csr_count; + char pff_local[SWITCHTEC_MAX_PFF_CSR]; + + void __iomem *mmio; + struct mrpc_regs __iomem *mmio_mrpc; + struct sw_event_regs __iomem *mmio_sw_event; + struct sys_info_regs __iomem *mmio_sys_info; + struct flash_info_regs __iomem *mmio_flash_info; + struct ntb_info_regs __iomem *mmio_ntb; + struct part_cfg_regs __iomem *mmio_part_cfg; + struct part_cfg_regs __iomem *mmio_part_cfg_all; + struct pff_csr_regs __iomem *mmio_pff_csr; + + /* + * The mrpc mutex must be held when accessing the other + * mrpc_ fields, alive flag and stuser->state field + */ + struct mutex mrpc_mutex; + struct list_head mrpc_queue; + int mrpc_busy; + struct work_struct mrpc_work; + struct delayed_work mrpc_timeout; + bool alive; + + 
wait_queue_head_t event_wq; + atomic_t event_cnt; +}; + +static inline struct switchtec_dev *to_stdev(struct device *dev) +{ + return container_of(dev, struct switchtec_dev, dev); +} + +#endif -- cgit v1.2.3 From 302e994d3af7e4be8d0f789aa66422166ccd89c2 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 3 Aug 2017 12:19:41 -0600 Subject: NTB: switchtec: Export class symbol for use in upper layer driver We export the class pointer symbol and add an extern define in the Switchtec header file. Signed-off-by: Logan Gunthorpe Reviewed-by: Stephen Bates Reviewed-by: Kurt Schwemmer Acked-by: Bjorn Helgaas Signed-off-by: Jon Mason --- drivers/pci/switch/switchtec.c | 4 +++- include/linux/switchtec.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 5b75d3008ff8..39c9218d733f 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -33,9 +33,11 @@ module_param(max_devices, int, 0644); MODULE_PARM_DESC(max_devices, "max number of switchtec device instances"); static dev_t switchtec_devt; -static struct class *switchtec_class; static DEFINE_IDA(switchtec_minor_ida); +struct class *switchtec_class; +EXPORT_SYMBOL_GPL(switchtec_class); + enum mrpc_state { MRPC_IDLE = 0, MRPC_QUEUED, diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 1cbd0e63b0ab..18e388101855 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -276,4 +276,6 @@ static inline struct switchtec_dev *to_stdev(struct device *dev) return container_of(dev, struct switchtec_dev, dev); } +extern struct class *switchtec_class; + #endif -- cgit v1.2.3 From c082b04c9d40f69dacd93a151db114299f5de6eb Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 3 Aug 2017 12:19:42 -0600 Subject: NTB: switchtec: Add NTB hardware register definitions There are two additional regions: ctrl and dbmsg. The first is for generic NTB control and memory windows. The second is for doorbells and message registers. This patch also adds a number of related constants for using these registers. 
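A sketch of how the doorbell portion of this layout might be driven (illustrative helpers; assumes 64-bit MMIO accessors such as those provided by linux/io-64-nonatomic-lo-hi.h):

#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/switchtec.h>

static void example_ring_peer_db(struct ntb_dbmsg_regs __iomem *dbmsg)
{
        /* request doorbell 0 on the peer via the outgoing doorbell register */
        iowrite64(BIT_ULL(0), &dbmsg->odb);
}

static u64 example_pending_db(struct ntb_dbmsg_regs __iomem *dbmsg)
{
        /* incoming doorbell bits, subject to the bits set in idb_mask */
        return ioread64(&dbmsg->idb);
}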
Signed-off-by: Logan Gunthorpe Reviewed-by: Stephen Bates Reviewed-by: Kurt Schwemmer Signed-off-by: Jon Mason --- include/linux/switchtec.h | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'include/linux') diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 18e388101855..9c2be7631257 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -156,6 +156,12 @@ struct flash_info_regs { struct partition_info vendor[8]; }; +enum { + SWITCHTEC_NTB_REG_INFO_OFFSET = 0x0000, + SWITCHTEC_NTB_REG_CTRL_OFFSET = 0x4000, + SWITCHTEC_NTB_REG_DBMSG_OFFSET = 0x64000, +}; + struct ntb_info_regs { u8 partition_count; u8 partition_id; @@ -190,6 +196,84 @@ struct part_cfg_regs { u32 reserved4[159]; } __packed; +enum { + NTB_CTRL_PART_OP_LOCK = 0x1, + NTB_CTRL_PART_OP_CFG = 0x2, + NTB_CTRL_PART_OP_RESET = 0x3, + + NTB_CTRL_PART_STATUS_NORMAL = 0x1, + NTB_CTRL_PART_STATUS_LOCKED = 0x2, + NTB_CTRL_PART_STATUS_LOCKING = 0x3, + NTB_CTRL_PART_STATUS_CONFIGURING = 0x4, + NTB_CTRL_PART_STATUS_RESETTING = 0x5, + + NTB_CTRL_BAR_VALID = 1 << 0, + NTB_CTRL_BAR_DIR_WIN_EN = 1 << 4, + NTB_CTRL_BAR_LUT_WIN_EN = 1 << 5, + + NTB_CTRL_REQ_ID_EN = 1 << 0, + + NTB_CTRL_LUT_EN = 1 << 0, + + NTB_PART_CTRL_ID_PROT_DIS = 1 << 0, +}; + +struct ntb_ctrl_regs { + u32 partition_status; + u32 partition_op; + u32 partition_ctrl; + u32 bar_setup; + u32 bar_error; + u16 lut_table_entries; + u16 lut_table_offset; + u32 lut_error; + u16 req_id_table_size; + u16 req_id_table_offset; + u32 req_id_error; + u32 reserved1[7]; + struct { + u32 ctl; + u32 win_size; + u64 xlate_addr; + } bar_entry[6]; + u32 reserved2[216]; + u32 req_id_table[256]; + u32 reserved3[512]; + u64 lut_entry[512]; +} __packed; + +#define NTB_DBMSG_IMSG_STATUS BIT_ULL(32) +#define NTB_DBMSG_IMSG_MASK BIT_ULL(40) + +struct ntb_dbmsg_regs { + u32 reserved1[1024]; + u64 odb; + u64 odb_mask; + u64 idb; + u64 idb_mask; + u8 idb_vec_map[64]; + u32 msg_map; + u32 reserved2; + struct { + u32 msg; + u32 status; + } omsg[4]; + + struct { + u32 msg; + u8 status; + u8 mask; + u8 src; + u8 reserved; + } imsg[4]; + + u8 reserved3[3928]; + u8 msix_table[1024]; + u8 reserved4[3072]; + u8 pba[24]; + u8 reserved5[4072]; +} __packed; + enum { SWITCHTEC_PART_CFG_EVENT_RESET = 1 << 0, SWITCHTEC_PART_CFG_EVENT_MRPC_CMP = 1 << 1, -- cgit v1.2.3 From 48c302dc8f3acc1b0ef56a28569bc6a35b9b0e60 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 3 Aug 2017 12:19:43 -0600 Subject: NTB: switchtec: Add link event notifier callback In order for the Switchtec NTB code to handle link change events we create a notifier callback in the switchtec code which gets called whenever an appropriate event interrupt occurs. In order to preserve userspace's ability to follow these events, we compare the event count with a stored copy from last time we checked. 
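Consumer-side sketch (handler name hypothetical): an upper-layer driver hooks the new callback during setup and is then invoked from link_event_work() whenever a link_state event count changes:

#include <linux/switchtec.h>

static void example_link_event(struct switchtec_dev *stdev)
{
        dev_dbg(&stdev->dev, "link state event, rechecking peer\n");
}

static void example_hook_link_events(struct switchtec_dev *stdev)
{
        stdev->link_notifier = example_link_event;
}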
Signed-off-by: Logan Gunthorpe Reviewed-by: Stephen Bates Reviewed-by: Kurt Schwemmer Acked-by: Bjorn Helgaas Signed-off-by: Jon Mason --- drivers/pci/switch/switchtec.c | 51 ++++++++++++++++++++++++++++++++++++++++++ include/linux/switchtec.h | 4 ++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 39c9218d733f..807569e62bee 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -978,6 +978,49 @@ static const struct file_operations switchtec_fops = { .compat_ioctl = switchtec_dev_ioctl, }; +static void link_event_work(struct work_struct *work) +{ + struct switchtec_dev *stdev; + + stdev = container_of(work, struct switchtec_dev, link_event_work); + + if (stdev->link_notifier) + stdev->link_notifier(stdev); +} + +static void check_link_state_events(struct switchtec_dev *stdev) +{ + int idx; + u32 reg; + int count; + int occurred = 0; + + for (idx = 0; idx < stdev->pff_csr_count; idx++) { + reg = ioread32(&stdev->mmio_pff_csr[idx].link_state_hdr); + dev_dbg(&stdev->dev, "link_state: %d->%08x\n", idx, reg); + count = (reg >> 5) & 0xFF; + + if (count != stdev->link_event_count[idx]) { + occurred = 1; + stdev->link_event_count[idx] = count; + } + } + + if (occurred) + schedule_work(&stdev->link_event_work); +} + +static void enable_link_state_events(struct switchtec_dev *stdev) +{ + int idx; + + for (idx = 0; idx < stdev->pff_csr_count; idx++) { + iowrite32(SWITCHTEC_EVENT_CLEAR | + SWITCHTEC_EVENT_EN_IRQ, + &stdev->mmio_pff_csr[idx].link_state_hdr); + } +} + static void stdev_release(struct device *dev) { struct switchtec_dev *stdev = to_stdev(dev); @@ -1030,6 +1073,7 @@ static struct switchtec_dev *stdev_create(struct pci_dev *pdev) stdev->mrpc_busy = 0; INIT_WORK(&stdev->mrpc_work, mrpc_event_work); INIT_DELAYED_WORK(&stdev->mrpc_timeout, mrpc_timeout_work); + INIT_WORK(&stdev->link_event_work, link_event_work); init_waitqueue_head(&stdev->event_wq); atomic_set(&stdev->event_cnt, 0); @@ -1073,6 +1117,9 @@ static int mask_event(struct switchtec_dev *stdev, int eid, int idx) if (!(hdr & SWITCHTEC_EVENT_OCCURRED && hdr & SWITCHTEC_EVENT_EN_IRQ)) return 0; + if (eid == SWITCHTEC_IOCTL_EVENT_LINK_STATE) + return 0; + dev_dbg(&stdev->dev, "%s: %d %d %x\n", __func__, eid, idx, hdr); hdr &= ~(SWITCHTEC_EVENT_EN_IRQ | SWITCHTEC_EVENT_OCCURRED); iowrite32(hdr, hdr_reg); @@ -1092,6 +1139,7 @@ static int mask_all_events(struct switchtec_dev *stdev, int eid) for (idx = 0; idx < stdev->pff_csr_count; idx++) { if (!stdev->pff_local[idx]) continue; + count += mask_event(stdev, eid, idx); } } else { @@ -1116,6 +1164,8 @@ static irqreturn_t switchtec_event_isr(int irq, void *dev) iowrite32(reg, &stdev->mmio_part_cfg->mrpc_comp_hdr); } + check_link_state_events(stdev); + for (eid = 0; eid < SWITCHTEC_IOCTL_MAX_EVENTS; eid++) event_count += mask_all_events(stdev, eid); @@ -1242,6 +1292,7 @@ static int switchtec_pci_probe(struct pci_dev *pdev, iowrite32(SWITCHTEC_EVENT_CLEAR | SWITCHTEC_EVENT_EN_IRQ, &stdev->mmio_part_cfg->mrpc_comp_hdr); + enable_link_state_events(stdev); rc = cdev_device_add(&stdev->cdev, &stdev->dev); if (rc) diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 9c2be7631257..d8159944f013 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -353,6 +353,10 @@ struct switchtec_dev { wait_queue_head_t event_wq; atomic_t event_cnt; + + struct work_struct link_event_work; + void (*link_notifier)(struct switchtec_dev *stdev); + u8 
link_event_count[SWITCHTEC_MAX_PFF_CSR]; }; static inline struct switchtec_dev *to_stdev(struct device *dev) -- cgit v1.2.3 From fa5ab66e36de56f7e40dad57780d53e059dced86 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 3 Aug 2017 12:19:45 -0600 Subject: NTB: Add check and comment for link up to mw_count() and mw_get_align() Adds a comment and a check to ntb_mw_get_align() so that it always fails if the function is called before the link is up. Also adds a comment to ntb_mw_count() to note that it may return 0 if it is called before the link is up. This is to prevent accidental misuse in clients that are testing on hardware where this doesn't matter. Signed-off-by: Logan Gunthorpe Acked-by: Allen Hubbe Signed-off-by: Jon Mason --- include/linux/ntb.h | 8 ++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 609e232c00da..47f2966cfd7f 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -730,7 +730,8 @@ static inline int ntb_link_disable(struct ntb_dev *ntb) * Hardware and topology may support a different number of memory windows. * Moreover different peer devices can support different number of memory * windows. Simply speaking this method returns the number of possible inbound - * memory windows to share with specified peer device. + * memory windows to share with specified peer device. Note: this may return + * zero if the link is not up yet. * * Return: the number of memory windows. */ @@ -751,7 +752,7 @@ static inline int ntb_mw_count(struct ntb_dev *ntb, int pidx) * Get the alignments of an inbound memory window with specified index. * NULL may be given for any output parameter if the value is not needed. * The alignment and size parameters may be used for allocation of proper - * shared memory. + * shared memory. Note: this must only be called when the link is up. * * Return: Zero on success, otherwise a negative error number. */ @@ -760,6 +761,9 @@ static inline int ntb_mw_get_align(struct ntb_dev *ntb, int pidx, int widx, resource_size_t *size_align, resource_size_t *size_max) { + if (!(ntb_link_is_up(ntb, NULL, NULL) & (1 << pidx))) + return -ENOTCONN; + return ntb->ops->mw_get_align(ntb, pidx, widx, addr_align, size_align, size_max); } -- cgit v1.2.3 From 33dea5aae0320345af26ae9aba0894a930e0d4ec Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 3 Aug 2017 12:19:46 -0600 Subject: NTB: switchtec_ntb: Introduce initial NTB driver Since the Switchtec NTB hardware shares the same endpoint as the management endpoint, we utilize the class_interface API to register an NTB driver for every Switchtec device in the system that has the NTB class code.
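Tying back to the ntb_mw_get_align() change above, a client-side sketch (names illustrative) of the semantics it now enforces; callers should only query window alignment once the link to peer pidx is up:

#include <linux/ntb.h>

static int example_mw_query(struct ntb_dev *ntb, int pidx, int widx)
{
        resource_size_t addr_align, size_align, size_max;

        if (!(ntb_link_is_up(ntb, NULL, NULL) & BIT_ULL(pidx)))
                return -ENOTCONN;       /* mirrors the new check */

        return ntb_mw_get_align(ntb, pidx, widx, &addr_align,
                                &size_align, &size_max);
}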
Signed-off-by: Logan Gunthorpe Reviewed-by: Stephen Bates Reviewed-by: Kurt Schwemmer Acked-by: Allen Hubbe Acked-by: Bjorn Helgaas Signed-off-by: Jon Mason --- MAINTAINERS | 1 + drivers/ntb/hw/Kconfig | 1 + drivers/ntb/hw/Makefile | 1 + drivers/ntb/hw/mscc/Kconfig | 9 ++++ drivers/ntb/hw/mscc/Makefile | 1 + drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 81 ++++++++++++++++++++++++++++++++++ drivers/pci/switch/switchtec.c | 3 ++ include/linux/switchtec.h | 4 ++ 8 files changed, 101 insertions(+) create mode 100644 drivers/ntb/hw/mscc/Kconfig create mode 100644 drivers/ntb/hw/mscc/Makefile create mode 100644 drivers/ntb/hw/mscc/ntb_hw_switchtec.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 07d288f11a03..2d781533c3af 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10348,6 +10348,7 @@ F: Documentation/ABI/testing/sysfs-class-switchtec F: drivers/pci/switch/switchtec* F: include/uapi/linux/switchtec_ioctl.h F: include/linux/switchtec.h +F: drivers/ntb/hw/mscc/ PCI DRIVER FOR MVEBU (Marvell Armada 370 and Armada XP SOC support) M: Thomas Petazzoni diff --git a/drivers/ntb/hw/Kconfig b/drivers/ntb/hw/Kconfig index a89243c9fdd3..e51b581fd102 100644 --- a/drivers/ntb/hw/Kconfig +++ b/drivers/ntb/hw/Kconfig @@ -1,3 +1,4 @@ source "drivers/ntb/hw/amd/Kconfig" source "drivers/ntb/hw/idt/Kconfig" source "drivers/ntb/hw/intel/Kconfig" +source "drivers/ntb/hw/mscc/Kconfig" diff --git a/drivers/ntb/hw/Makefile b/drivers/ntb/hw/Makefile index 87332c3905f0..923c442db750 100644 --- a/drivers/ntb/hw/Makefile +++ b/drivers/ntb/hw/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_NTB_AMD) += amd/ obj-$(CONFIG_NTB_IDT) += idt/ obj-$(CONFIG_NTB_INTEL) += intel/ +obj-$(CONFIG_NTB_SWITCHTEC) += mscc/ diff --git a/drivers/ntb/hw/mscc/Kconfig b/drivers/ntb/hw/mscc/Kconfig new file mode 100644 index 000000000000..013ed6716438 --- /dev/null +++ b/drivers/ntb/hw/mscc/Kconfig @@ -0,0 +1,9 @@ +config NTB_SWITCHTEC + tristate "MicroSemi Switchtec Non-Transparent Bridge Support" + select PCI_SW_SWITCHTEC + help + Enables NTB support for Switchtec PCI switches. This also + selects the Switchtec management driver as they share the same + hardware interface. + + If unsure, say N. diff --git a/drivers/ntb/hw/mscc/Makefile b/drivers/ntb/hw/mscc/Makefile new file mode 100644 index 000000000000..064686ead1ba --- /dev/null +++ b/drivers/ntb/hw/mscc/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_NTB_SWITCHTEC) += ntb_hw_switchtec.o diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c new file mode 100644 index 000000000000..253efba72275 --- /dev/null +++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c @@ -0,0 +1,81 @@ +/* + * Microsemi Switchtec(tm) PCIe Management Driver + * Copyright (c) 2017, Microsemi Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#include <linux/switchtec.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver"); +MODULE_VERSION("0.1"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Microsemi Corporation"); + +struct switchtec_ntb { + struct switchtec_dev *stdev; +}; + +static int switchtec_ntb_add(struct device *dev, + struct class_interface *class_intf) +{ + struct switchtec_dev *stdev = to_stdev(dev); + struct switchtec_ntb *sndev; + + stdev->sndev = NULL; + + if (stdev->pdev->class != MICROSEMI_NTB_CLASSCODE) + return -ENODEV; + + sndev = kzalloc_node(sizeof(*sndev), GFP_KERNEL, dev_to_node(dev)); + if (!sndev) + return -ENOMEM; + + sndev->stdev = stdev; + + stdev->sndev = sndev; + dev_info(dev, "NTB device registered"); + + return 0; +} + +void switchtec_ntb_remove(struct device *dev, + struct class_interface *class_intf) +{ + struct switchtec_dev *stdev = to_stdev(dev); + struct switchtec_ntb *sndev = stdev->sndev; + + if (!sndev) + return; + + stdev->sndev = NULL; + kfree(sndev); + dev_info(dev, "ntb device unregistered"); +} + +static struct class_interface switchtec_interface = { + .add_dev = switchtec_ntb_add, + .remove_dev = switchtec_ntb_remove, +}; + +static int __init switchtec_ntb_init(void) +{ + switchtec_interface.class = switchtec_class; + return class_interface_register(&switchtec_interface); +} +module_init(switchtec_ntb_init); + +static void __exit switchtec_ntb_exit(void) +{ + class_interface_unregister(&switchtec_interface); +} +module_exit(switchtec_ntb_exit); diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 807569e62bee..58edc24ea381 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -1275,6 +1275,9 @@ static int switchtec_pci_probe(struct pci_dev *pdev, struct switchtec_dev *stdev; int rc; + if (pdev->class == MICROSEMI_NTB_CLASSCODE) + request_module_nowait("ntb_hw_switchtec"); + stdev = stdev_create(pdev); if (IS_ERR(stdev)) return PTR_ERR(stdev); diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index d8159944f013..09d73d0d1aa8 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -320,6 +320,8 @@ struct pff_csr_regs { u32 reserved4[174]; } __packed; +struct switchtec_ntb; + struct switchtec_dev { struct pci_dev *pdev; struct device dev; @@ -357,6 +359,8 @@ struct switchtec_dev { struct work_struct link_event_work; void (*link_notifier)(struct switchtec_dev *stdev); u8 link_event_count[SWITCHTEC_MAX_PFF_CSR]; + + struct switchtec_ntb *sndev; }; static inline struct switchtec_dev *to_stdev(struct device *dev) -- cgit v1.2.3 From e099b45b7c27b4fc6510918ea8c7d18980787283 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 3 Aug 2017 12:19:49 -0600 Subject: NTB: switchtec_ntb: Add skeleton NTB driver Add a skeleton NTB driver which will be filled out in subsequent patches.
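The stubs below all receive a struct ntb_dev *. Once the driver is filled out, each callback will recover the enclosing switchtec_ntb via container_of() on the embedded ntb member; roughly (the ntb_sndev helper name is assumed here and is not part of this patch):

static struct switchtec_ntb *ntb_sndev(struct ntb_dev *ntb)
{
	return container_of(ntb, struct switchtec_ntb, ntb);
}

This is the standard idiom for ops structures that hand back only the generic device.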
Signed-off-by: Logan Gunthorpe Reviewed-by: Stephen Bates Reviewed-by: Kurt Schwemmer Acked-by: Allen Hubbe Signed-off-by: Jon Mason --- drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 148 ++++++++++++++++++++++++++++++++- include/linux/ntb.h | 3 + 2 files changed, 150 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c index 175ac0baa7a0..158ed310cbaf 100644 --- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c +++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c @@ -18,6 +18,7 @@ #include #include #include +#include MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver"); MODULE_VERSION("0.1"); @@ -71,6 +72,7 @@ struct shared_mw { #define LUT_SIZE SZ_64K struct switchtec_ntb { + struct ntb_dev ntb; struct switchtec_dev *stdev; int self_partition; @@ -161,10 +163,148 @@ static int switchtec_ntb_part_op(struct switchtec_ntb *sndev, return -EIO; } +static int switchtec_ntb_mw_count(struct ntb_dev *ntb, int pidx) +{ + return 0; +} + +static int switchtec_ntb_mw_get_align(struct ntb_dev *ntb, int pidx, + int widx, resource_size_t *addr_align, + resource_size_t *size_align, + resource_size_t *size_max) +{ + return 0; +} + +static int switchtec_ntb_mw_set_trans(struct ntb_dev *ntb, int pidx, int widx, + dma_addr_t addr, resource_size_t size) +{ + return 0; +} + +static int switchtec_ntb_peer_mw_count(struct ntb_dev *ntb) +{ + return 0; +} + +static int switchtec_ntb_peer_mw_get_addr(struct ntb_dev *ntb, int idx, + phys_addr_t *base, + resource_size_t *size) +{ + return 0; +} + +static u64 switchtec_ntb_link_is_up(struct ntb_dev *ntb, + enum ntb_speed *speed, + enum ntb_width *width) +{ + return 0; +} + +static int switchtec_ntb_link_enable(struct ntb_dev *ntb, + enum ntb_speed max_speed, + enum ntb_width max_width) +{ + return 0; +} + +static int switchtec_ntb_link_disable(struct ntb_dev *ntb) +{ + return 0; +} + +static u64 switchtec_ntb_db_valid_mask(struct ntb_dev *ntb) +{ + return 0; +} + +static int switchtec_ntb_db_vector_count(struct ntb_dev *ntb) +{ + return 0; +} + +static u64 switchtec_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector) +{ + return 0; +} + +static u64 switchtec_ntb_db_read(struct ntb_dev *ntb) +{ + return 0; +} + +static int switchtec_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits) +{ + return 0; +} + +static int switchtec_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits) +{ + return 0; +} + +static int switchtec_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits) +{ + return 0; +} + +static int switchtec_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits) +{ + return 0; +} + +static int switchtec_ntb_spad_count(struct ntb_dev *ntb) +{ + return 0; +} + +static u32 switchtec_ntb_spad_read(struct ntb_dev *ntb, int idx) +{ + return 0; +} + +static int switchtec_ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val) +{ + return 0; +} + +static int switchtec_ntb_peer_spad_write(struct ntb_dev *ntb, int pidx, + int sidx, u32 val) +{ + return 0; +} + +static const struct ntb_dev_ops switchtec_ntb_ops = { + .mw_count = switchtec_ntb_mw_count, + .mw_get_align = switchtec_ntb_mw_get_align, + .mw_set_trans = switchtec_ntb_mw_set_trans, + .peer_mw_count = switchtec_ntb_peer_mw_count, + .peer_mw_get_addr = switchtec_ntb_peer_mw_get_addr, + .link_is_up = switchtec_ntb_link_is_up, + .link_enable = switchtec_ntb_link_enable, + .link_disable = switchtec_ntb_link_disable, + .db_valid_mask = switchtec_ntb_db_valid_mask, + .db_vector_count = switchtec_ntb_db_vector_count, + .db_vector_mask = 
switchtec_ntb_db_vector_mask, + .db_read = switchtec_ntb_db_read, + .db_clear = switchtec_ntb_db_clear, + .db_set_mask = switchtec_ntb_db_set_mask, + .db_clear_mask = switchtec_ntb_db_clear_mask, + .peer_db_set = switchtec_ntb_peer_db_set, + .spad_count = switchtec_ntb_spad_count, + .spad_read = switchtec_ntb_spad_read, + .spad_write = switchtec_ntb_spad_write, + .peer_spad_write = switchtec_ntb_peer_spad_write, +}; + static void switchtec_ntb_init_sndev(struct switchtec_ntb *sndev) { u64 part_map; + sndev->ntb.pdev = sndev->stdev->pdev; + sndev->ntb.topo = NTB_TOPO_SWITCH; + sndev->ntb.ops = &switchtec_ntb_ops; + sndev->self_partition = sndev->stdev->partition; sndev->mmio_ntb = sndev->stdev->mmio_ntb; @@ -517,7 +657,6 @@ static int switchtec_ntb_add(struct device *dev, return -ENOMEM; sndev->stdev = stdev; - switchtec_ntb_init_sndev(sndev); switchtec_ntb_init_mw(sndev); switchtec_ntb_init_db(sndev); @@ -535,11 +674,17 @@ static int switchtec_ntb_add(struct device *dev, if (rc) goto deinit_shared_and_exit; + rc = ntb_register_device(&sndev->ntb); + if (rc) + goto deinit_and_exit; + stdev->sndev = sndev; dev_info(dev, "NTB device registered"); return 0; +deinit_and_exit: + switchtec_ntb_deinit_db_msg_irq(sndev); deinit_shared_and_exit: switchtec_ntb_deinit_shared_mw(sndev); free_and_exit: @@ -558,6 +703,7 @@ void switchtec_ntb_remove(struct device *dev, return; stdev->sndev = NULL; + ntb_unregister_device(&sndev->ntb); switchtec_ntb_deinit_db_msg_irq(sndev); switchtec_ntb_deinit_shared_mw(sndev); kfree(sndev); diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 47f2966cfd7f..c308964777eb 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -70,6 +70,7 @@ struct pci_dev; * @NTB_TOPO_SEC: On secondary side of remote ntb. * @NTB_TOPO_B2B_USD: On primary side of local ntb upstream of remote ntb. * @NTB_TOPO_B2B_DSD: On primary side of local ntb downstream of remote ntb. + * @NTB_TOPO_SWITCH: Connected via a switch which supports ntb. */ enum ntb_topo { NTB_TOPO_NONE = -1, @@ -77,6 +78,7 @@ enum ntb_topo { NTB_TOPO_SEC, NTB_TOPO_B2B_USD, NTB_TOPO_B2B_DSD, + NTB_TOPO_SWITCH, }; static inline int ntb_topo_is_b2b(enum ntb_topo topo) @@ -97,6 +99,7 @@ static inline char *ntb_topo_string(enum ntb_topo topo) case NTB_TOPO_SEC: return "NTB_TOPO_SEC"; case NTB_TOPO_B2B_USD: return "NTB_TOPO_B2B_USD"; case NTB_TOPO_B2B_DSD: return "NTB_TOPO_B2B_DSD"; + case NTB_TOPO_SWITCH: return "NTB_TOPO_SWITCH"; } return "NTB_TOPO_INVALID"; } -- cgit v1.2.3 From 288b3de55aace830f13280985ec9e6bcbff33b1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:54 -0800 Subject: bpf: offload: move offload device validation out to the drivers With TC shared block changes we can't depend on correct netdev pointer being available in cls_bpf. Move the device validation to the driver. Core will only make sure that offloaded programs are always attached in the driver (or in HW by the driver). We trust that drivers which implement offload callbacks will perform necessary checks. Moving the checks to the driver is generally a useful thing, in practice the check should be against a switchdev instance, not a netdev, given that most ASICs will probably allow using the same program on many ports. 
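Stated in isolation, the rule each driver now has to enforce is small; a sketch (hypothetical helper name, mirroring the real nfp hunk below):

#include <linux/bpf.h>
#include <linux/netdevice.h>

/* A device-bound program may only be attached to the netdev it was bound to. */
static int example_offload_check(struct bpf_prog *prog,
				 struct net_device *netdev)
{
	struct bpf_dev_offload *offload = prog->aux->offload;

	if (!offload)			/* program was never device-bound */
		return -EINVAL;
	if (offload->netdev != netdev)	/* bound to a different device */
		return -EINVAL;
	return 0;
}

A switchdev-aware driver could relax the second test to a check against its switchdev instance, as noted above.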
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Jiri Pirko Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 10 ++++++++-- include/linux/bpf.h | 4 ++-- kernel/bpf/syscall.c | 23 ++++++++++++----------- net/core/dev.c | 7 ++----- net/sched/cls_bpf.c | 8 +++----- 5 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index b6cee71f49d3..bc879aeb62d4 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -214,8 +214,14 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog, { int err; - if (prog && !prog->aux->offload) - return -EINVAL; + if (prog) { + struct bpf_dev_offload *offload = prog->aux->offload; + + if (!offload) + return -EINVAL; + if (offload->netdev != nn->dp.netdev) + return -EINVAL; + } if (prog && old_prog) { u8 cap; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c397934f91dd..f82be640731e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -336,7 +336,7 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev); + bool attach_drv); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); @@ -433,7 +433,7 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev) + bool attach_drv) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8e9d065bb7cd..38da55905ab0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,22 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_can_attach(struct bpf_prog *prog, - enum bpf_prog_type *attach_type, - struct net_device *netdev) +static bool bpf_prog_get_ok(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, bool attach_drv) { - struct bpf_dev_offload *offload = prog->aux->offload; + /* not an attachment, just a refcount inc, always allow */ + if (!attach_type) + return true; if (prog->type != *attach_type) return false; - if (offload && offload->netdev != netdev) + if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) return false; return true; } static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, - struct net_device *netdev) + bool attach_drv) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1080,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { + if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1093,12 +1094,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL, NULL); + return __bpf_prog_get(ufd, NULL, false); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog 
= __bpf_prog_get(ufd, &type, NULL); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1107,9 +1108,9 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev) + bool attach_drv) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); diff --git a/net/core/dev.c b/net/core/dev.c index 8ee29f4f5fa9..09525a27319c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7139,11 +7139,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - if (bpf_op == ops->ndo_bpf) - prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, - dev); - else - prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + bpf_op == ops->ndo_bpf); if (IS_ERR(prog)) return PTR_ERR(prog); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index fb680dafac5a..a9f3e317055c 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -382,15 +382,13 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, { struct bpf_prog *fp; char *name = NULL; + bool skip_sw; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW; - if (gen_flags & TCA_CLS_FLAGS_SKIP_SW) - fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, - qdisc_dev(tp->q)); - else - fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); + fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw); if (IS_ERR(fp)) return PTR_ERR(fp); -- cgit v1.2.3 From 479321e9c31a6c05426790b11888427400f75ac8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:56 -0800 Subject: bpf: turn bpf_prog_get_type() into a wrapper bpf_prog_get_type() is identical to bpf_prog_get_type_dev(), with false passed as attach_drv. Instead of keeping it as an exported symbol, turn it into a static inline wrapper.
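Call sites are unaffected by the conversion; a hypothetical user still does:

#include <linux/bpf.h>

static int example_get(u32 ufd)
{
	struct bpf_prog *prog;

	prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SCHED_CLS);
	if (IS_ERR(prog))
		return PTR_ERR(prog);
	/* ... use prog ... */
	bpf_prog_put(prog);
	return 0;
}

Only the fd-to-prog lookup and the tracepoint now happen inside bpf_prog_get_type_dev().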
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 13 ++++++------- kernel/bpf/syscall.c | 10 ---------- 2 files changed, 6 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f82be640731e..37bbab8c0f56 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -334,7 +334,6 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); -struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); @@ -425,12 +424,6 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } -static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, - enum bpf_prog_type type) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) @@ -514,6 +507,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, } #endif /* CONFIG_BPF_SYSCALL */ +static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, + enum bpf_prog_type type) +{ + return bpf_prog_get_type_dev(ufd, type, false); +} + int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 38da55905ab0..41509cf825d8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1097,16 +1097,6 @@ struct bpf_prog *bpf_prog_get(u32 ufd) return __bpf_prog_get(ufd, NULL, false); } -struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) -{ - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; -} -EXPORT_SYMBOL_GPL(bpf_prog_get_type); - struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { -- cgit v1.2.3 From 1ee640095f049e7ac4ec36b985abada497b98cc2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:59 -0800 Subject: bpf: revert report offload info to user space This reverts commit bd601b6ada11 ("bpf: report offload info to user space"). The ifindex by itself is not sufficient, we should provide information on which network namespace this ifindex belongs to. After considering some options we concluded that it's best to just remove this API for now, and rework it in -next. 
Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 - include/uapi/linux/bpf.h | 6 ------ kernel/bpf/offload.c | 12 ------------ kernel/bpf/syscall.c | 5 ----- 4 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 37bbab8c0f56..76c577281d78 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -515,7 +515,6 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); -u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3f626df42516..4c223ab30293 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -897,10 +897,6 @@ enum sk_action { #define BPF_TAG_SIZE 8 -enum bpf_prog_status { - BPF_PROG_STATUS_DEV_BOUND = (1 << 0), -}; - struct bpf_prog_info { __u32 type; __u32 id; @@ -914,8 +910,6 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; - __u32 ifindex; - __u32 status; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index d4267c674fec..68ec884440b7 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -149,18 +149,6 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } -u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) -{ - struct bpf_dev_offload *offload = prog->aux->offload; - u32 ifindex; - - rtnl_lock(); - ifindex = offload->netdev ? offload->netdev->ifindex : 0; - rtnl_unlock(); - - return ifindex; -} - const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 41509cf825d8..2c4cfeaa8d5e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1616,11 +1616,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } - if (bpf_prog_is_dev_bound(prog->aux)) { - info.status |= BPF_PROG_STATUS_DEV_BOUND; - info.ifindex = bpf_prog_offload_ifindex(prog); - } - done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 1438019479349d262b76f8767ace3273d11b6dcb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:22:00 -0800 Subject: bpf: make bpf_prog_offload_verifier_prep() static inline Header implementation of bpf_prog_offload_verifier_prep() which is used if CONFIG_NET=n should be a static inline. 
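The underlying C issue, in miniature: a plain function definition in a header is emitted by every translation unit that includes it, so the link fails with a multiple-definition error as soon as two such objects meet. A sketch with a hypothetical helper:

/* broken: every includer emits its own out-of-line foo() */
int foo(void) { return -EOPNOTSUPP; }

/* fixed: static inline keeps the stub local to each includer */
static inline int foo(void) { return -EOPNOTSUPP; }

The CONFIG_NET=n stub below gets the same treatment.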
Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 07b96aaca256..b61482d354a2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -171,7 +171,7 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); #else -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) { return -EOPNOTSUPP; } -- cgit v1.2.3 From aa5222e92f8000ed3c1c38dddf11c83222aadfb3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Oct 2017 10:01:22 +0300 Subject: sched/deadline: Don't use dubious signed bitfields It doesn't cause a run-time bug, but these bitfields should be unsigned. When it's signed ->dl_throttled is set to either 0 or -1, instead of 0 and 1 as expected. The sched.h file is included into tons of places so Sparse generates a flood of warnings like this: ./include/linux/sched.h:477:54: error: dubious one-bit signed bitfield Reported-by: Matthew Wilcox Reported-by: Xin Long Signed-off-by: Dan Carpenter Reviewed-by: Luca Abeni Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Cc: luca abeni Link: http://lkml.kernel.org/r/20171013070121.dzcncojuj2f4utij@mwanda Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a5dc7c98b0a2..21991d668d35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -473,10 +473,10 @@ struct sched_dl_entity { * conditions between the inactive timer handler and the wakeup * code. */ - int dl_throttled : 1; - int dl_boosted : 1; - int dl_yielded : 1; - int dl_non_contending : 1; + unsigned int dl_throttled : 1; + unsigned int dl_boosted : 1; + unsigned int dl_yielded : 1; + unsigned int dl_non_contending : 1; /* * Bandwidth enforcement timer. Each -deadline task has its -- cgit v1.2.3 From bca237a52ca0035b0a0380003283d8bf590188d5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 28 Aug 2017 15:03:41 -0700 Subject: block/laptop_mode: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. 
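The general shape of such a conversion, with hypothetical my_ctx/my_timeout names (timer_setup() and from_timer() are the real interfaces):

#include <linux/timer.h>

struct my_ctx {
	struct timer_list timer;
	unsigned int hits;
};

static void my_timeout(struct timer_list *t)
{
	struct my_ctx *ctx = from_timer(ctx, t, timer);

	ctx->hits++;	/* context recovered without casting through .data */
}

static void my_start(struct my_ctx *ctx)
{
	timer_setup(&ctx->timer, my_timeout, 0);
	mod_timer(&ctx->timer, jiffies + HZ);
}

The laptop_mode hunks below follow this shape, except that the timer lives inside backing_dev_info, so from_timer() recovers that structure rather than the request queue.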
Cc: Jens Axboe Cc: Michal Hocko Cc: Andrew Morton Cc: Jan Kara Cc: Johannes Weiner Cc: Nicholas Piggin Cc: Vladimir Davydov Cc: Matthew Wilcox Cc: Jeff Layton Cc: linux-block@vger.kernel.org Cc: linux-mm@kvack.org Signed-off-by: Kees Cook --- block/blk-core.c | 10 +++++----- include/linux/writeback.h | 2 +- mm/page-writeback.c | 7 ++++--- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 1038706edd87..b8881750a3ac 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -863,9 +863,9 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } -static void blk_rq_timed_out_timer(unsigned long data) +static void blk_rq_timed_out_timer(struct timer_list *t) { - struct request_queue *q = (struct request_queue *)data; + struct request_queue *q = from_timer(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } @@ -901,9 +901,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info->name = "block"; q->node = node_id; - setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, - laptop_mode_timer_fn, (unsigned long) q); - setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, + laptop_mode_timer_fn, 0); + timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, NULL); INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f42d85631d17..fdfd04e348f6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -308,7 +308,7 @@ static inline void cgroup_writeback_umount(void) void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_sync(struct work_struct *work); -void laptop_mode_timer_fn(unsigned long data); +void laptop_mode_timer_fn(struct timer_list *t); #else static inline void laptop_sync_completion(void) { } #endif diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8a1551154285..e7095030aa1f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1993,11 +1993,12 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, } #ifdef CONFIG_BLOCK -void laptop_mode_timer_fn(unsigned long data) +void laptop_mode_timer_fn(struct timer_list *t) { - struct request_queue *q = (struct request_queue *)data; + struct backing_dev_info *backing_dev_info = + from_timer(backing_dev_info, t, laptop_mode_wb_timer); - wakeup_flusher_threads_bdi(q->backing_dev_info, WB_REASON_LAPTOP_TIMER); + wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* -- cgit v1.2.3 From 7eeb6b893bd28c68b6d664de1d3120e49b855cdb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2017 23:13:49 -0700 Subject: timer: Remove init_timer() interface All users of init_timer() have been updated. Remove the ancient interface. 
Cc: Thomas Gleixner Cc: Jonathan Corbet Signed-off-by: Kees Cook --- Documentation/core-api/local_ops.rst | 10 +++------- include/linux/timer.h | 3 --- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/local_ops.rst b/Documentation/core-api/local_ops.rst index 1062ddba62c7..2ac3f9f29845 100644 --- a/Documentation/core-api/local_ops.rst +++ b/Documentation/core-api/local_ops.rst @@ -177,18 +177,14 @@ Here is a sample module which implements a basic per cpu counter using printk("Read : CPU %d, count %ld\n", cpu, local_read(&per_cpu(counters, cpu))); } - del_timer(&test_timer); - test_timer.expires = jiffies + 1000; - add_timer(&test_timer); + mod_timer(&test_timer, jiffies + 1000); } static int __init test_init(void) { /* initialize the timer that will increment the counter */ - init_timer(&test_timer); - test_timer.function = do_test_timer; - test_timer.expires = jiffies + 1; - add_timer(&test_timer); + timer_setup(&test_timer, do_test_timer, 0); + mod_timer(&test_timer, jiffies + 1); return 0; } diff --git a/include/linux/timer.h b/include/linux/timer.h index bf781acfc6d8..a58097ea7cfc 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -117,9 +117,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, init_timer_on_stack_key((_timer), (_flags), NULL, NULL) #endif -#define init_timer(timer) \ - __init_timer((timer), 0) - #define __setup_timer(_timer, _fn, _data, _flags) \ do { \ __init_timer((_timer), (_flags)); \ -- cgit v1.2.3 From 513ae785c63c30741e46f43960213d4ae5382ec0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:02:27 -0700 Subject: timer: Remove setup_*timer() interface With all callers converted to timer_setup(), the old setup_*timer() interface can be removed. 
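For the flag-carrying variants the replacement folds the flag into timer_setup()'s third argument; for example, reusing the hypothetical my_ctx/my_timeout from the sketch a couple of patches up:

static void example_start_deferrable(struct my_ctx *ctx)
{
	/* was: setup_deferrable_timer(&ctx->timer, fn, (unsigned long)ctx); */
	timer_setup(&ctx->timer, my_timeout, TIMER_DEFERRABLE);
	mod_timer(&ctx->timer, jiffies + HZ);
}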
Cc: Thomas Gleixner Signed-off-by: Kees Cook --- include/linux/timer.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index a58097ea7cfc..47615dca4c5c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -131,23 +131,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, (_timer)->data = (_data); \ } while (0) -#define setup_timer(timer, fn, data) \ - __setup_timer((timer), (fn), (data), 0) -#define setup_pinned_timer(timer, fn, data) \ - __setup_timer((timer), (fn), (data), TIMER_PINNED) -#define setup_deferrable_timer(timer, fn, data) \ - __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE) -#define setup_pinned_deferrable_timer(timer, fn, data) \ - __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) -#define setup_timer_on_stack(timer, fn, data) \ - __setup_timer_on_stack((timer), (fn), (data), 0) -#define setup_pinned_timer_on_stack(timer, fn, data) \ - __setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED) -#define setup_deferrable_timer_on_stack(timer, fn, data) \ - __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE) -#define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ - __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) - #ifndef CONFIG_LOCKDEP static inline void timer_setup(struct timer_list *timer, void (*callback)(struct timer_list *), -- cgit v1.2.3 From c1eba5bcb6430868427e0b9d1cd1205a07302f06 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:18:19 -0700 Subject: timer: Pass timer_list pointer to callbacks unconditionally Now that all timer callbacks are already taking their struct timer_list pointer as the callback argument, just do this unconditionally and remove the .data field. Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- include/linux/timer.h | 4 ---- kernel/time/timer.c | 17 +++++++---------- 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 47615dca4c5c..20a6e7af5fd6 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -18,7 +18,6 @@ struct timer_list { struct hlist_node entry; unsigned long expires; void (*function)(unsigned long); - unsigned long data; u32 flags; #ifdef CONFIG_LOCKDEP @@ -70,7 +69,6 @@ struct timer_list { #define __TIMER_INITIALIZER(_function, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ - .data = (_data), \ .flags = (_flags), \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ @@ -121,14 +119,12 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, do { \ __init_timer((_timer), (_flags)); \ (_timer)->function = (_fn); \ - (_timer)->data = (_data); \ } while (0) #define __setup_timer_on_stack(_timer, _fn, _data, _flags) \ do { \ __init_timer_on_stack((_timer), (_flags)); \ (_timer)->function = (_fn); \ - (_timer)->data = (_data); \ } while (0) #ifndef CONFIG_LOCKDEP diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af0b8bae4502..a07eb124332f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1107,12 +1107,12 @@ EXPORT_SYMBOL(timer_reduce); * add_timer - start a timer * @timer: the timer to be added * - * The kernel will do a ->function(->data) callback from the + * The kernel will do a ->function(@timer) callback from the * timer interrupt at the ->expires point in the future. 
The * current time is 'jiffies'. * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. + * The timer's ->expires, ->function fields must be set prior calling this + * function. * * Timers with an ->expires field in the past will be executed in the next * timer tick. @@ -1284,8 +1284,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), - unsigned long data) +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) { int count = preempt_count(); @@ -1309,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer); - fn(data); + fn((TIMER_DATA_TYPE)timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1332,7 +1331,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(unsigned long); - unsigned long data; timer = hlist_entry(head->first, struct timer_list, entry); @@ -1340,15 +1338,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) detach_timer(timer, true); fn = timer->function; - data = timer->data; if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); + call_timer_fn(timer, fn); raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); + call_timer_fn(timer, fn); raw_spin_lock_irq(&base->lock); } } -- cgit v1.2.3 From 354b46b1a0adda1dd5b7f0bc2a5604cca091be5f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 19:15:40 -0700 Subject: timer: Switch callback prototype to take struct timer_list * argument Since all callbacks have been converted, we can switch the core prototype to "struct timer_list *" now too. 
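One effect of the new prototype: a callback that only needs the timer itself gets it as the argument, so a self-rearming tick needs no context plumbing at all (sketch, hypothetical name):

static void tick(struct timer_list *t)
{
	mod_timer(t, jiffies + HZ);	/* the argument is the timer to re-arm */
}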
Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- include/linux/timer.h | 4 ++-- kernel/time/timer.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 20a6e7af5fd6..a6d04fb72c9e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -17,7 +17,7 @@ struct timer_list { */ struct hlist_node entry; unsigned long expires; - void (*function)(unsigned long); + void (*function)(struct timer_list *); u32 flags; #ifdef CONFIG_LOCKDEP @@ -63,7 +63,7 @@ struct timer_list { #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) -#define TIMER_DATA_TYPE unsigned long +#define TIMER_DATA_TYPE struct timer_list * #define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) #define __TIMER_INITIALIZER(_function, _data, _flags) { \ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a07eb124332f..0f0d49a02d04 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1284,7 +1284,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) +static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) { int count = preempt_count(); @@ -1308,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer); - fn((TIMER_DATA_TYPE)timer); + fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1330,7 +1330,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) { while (!hlist_empty(head)) { struct timer_list *timer; - void (*fn)(unsigned long); + void (*fn)(struct timer_list *); timer = hlist_entry(head->first, struct timer_list, entry); -- cgit v1.2.3 From 1fe66ba572b455270dc35a2c099dd7328cec9e4c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:22:50 -0700 Subject: timer: Remove unused data arguments from macros With the .data field removed, the ignored data arguments in timer macros can be removed. 
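A static timer now needs nothing but a name and a callback; for instance (hypothetical names, using the two-argument DEFINE_TIMER() visible in the hunk below):

static void heartbeat(struct timer_list *t);
static DEFINE_TIMER(heartbeat_timer, heartbeat);

static void heartbeat(struct timer_list *t)
{
	pr_info("heartbeat\n");
	mod_timer(t, jiffies + HZ);
}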
Cc: Thomas Gleixner Cc: Tejun Heo Cc: Lai Jiangshan Cc: Jens Axboe Cc: Andrew Morton Cc: Shaohua Li Signed-off-by: Kees Cook --- include/linux/kthread.h | 2 -- include/linux/timer.h | 18 ++++++++---------- include/linux/workqueue.h | 3 --- 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 3203e36b2ee8..b855c5b72b26 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -119,7 +119,6 @@ struct kthread_delayed_work { #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\ - (TIMER_DATA_TYPE)&(dwork.timer), \ TIMER_IRQSAFE), \ } @@ -167,7 +166,6 @@ extern void __kthread_init_worker(struct kthread_worker *worker, kthread_init_work(&(dwork)->work, (fn)); \ __setup_timer(&(dwork)->timer, \ (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\ - (TIMER_DATA_TYPE)&(dwork)->timer, \ TIMER_IRQSAFE); \ } while (0) diff --git a/include/linux/timer.h b/include/linux/timer.h index a6d04fb72c9e..e6bab51db13d 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -66,7 +66,7 @@ struct timer_list { #define TIMER_DATA_TYPE struct timer_list * #define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) -#define __TIMER_INITIALIZER(_function, _data, _flags) { \ +#define __TIMER_INITIALIZER(_function, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .flags = (_flags), \ @@ -76,7 +76,7 @@ struct timer_list { #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ - __TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0, 0) + __TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0) void init_timer_key(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); @@ -115,13 +115,13 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, init_timer_on_stack_key((_timer), (_flags), NULL, NULL) #endif -#define __setup_timer(_timer, _fn, _data, _flags) \ +#define __setup_timer(_timer, _fn, _flags) \ do { \ __init_timer((_timer), (_flags)); \ (_timer)->function = (_fn); \ } while (0) -#define __setup_timer_on_stack(_timer, _fn, _data, _flags) \ +#define __setup_timer_on_stack(_timer, _fn, _flags) \ do { \ __init_timer_on_stack((_timer), (_flags)); \ (_timer)->function = (_fn); \ @@ -132,16 +132,14 @@ static inline void timer_setup(struct timer_list *timer, void (*callback)(struct timer_list *), unsigned int flags) { - __setup_timer(timer, (TIMER_FUNC_TYPE)callback, - (TIMER_DATA_TYPE)timer, flags); + __setup_timer(timer, (TIMER_FUNC_TYPE)callback, flags); } static inline void timer_setup_on_stack(struct timer_list *timer, void (*callback)(struct timer_list *), unsigned int flags) { - __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback, - (TIMER_DATA_TYPE)timer, flags); + __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback, flags); } #else /* @@ -151,11 +149,11 @@ static inline void timer_setup_on_stack(struct timer_list *timer, */ # define timer_setup(timer, callback, flags) \ __setup_timer((timer), (TIMER_FUNC_TYPE)(callback), \ - (TIMER_DATA_TYPE)(timer), (flags)) + (flags)) # define timer_setup_on_stack(timer, callback, flags) \ __setup_timer_on_stack((timer), \ (TIMER_FUNC_TYPE)(callback), \ - (TIMER_DATA_TYPE)(timer), (flags)) + (flags)) #endif #define from_timer(var, callback_timer, timer_fieldname) \ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 01a050fc6650..8d11580237f5 
100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -177,7 +177,6 @@ struct execute_work { #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)delayed_work_timer_fn,\ - (TIMER_DATA_TYPE)&(n.timer), \ (tflags) | TIMER_IRQSAFE), \ } @@ -244,7 +243,6 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } INIT_WORK(&(_work)->work, (_func)); \ __setup_timer(&(_work)->timer, \ (TIMER_FUNC_TYPE)delayed_work_timer_fn, \ - (TIMER_DATA_TYPE)&(_work)->timer, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) @@ -253,7 +251,6 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __setup_timer_on_stack(&(_work)->timer, \ (TIMER_FUNC_TYPE)delayed_work_timer_fn,\ - (TIMER_DATA_TYPE)&(_work)->timer,\ (_tflags) | TIMER_IRQSAFE); \ } while (0) -- cgit v1.2.3 From 188665b2d67db8953899551d1a9d4481b2a0ac60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:14:46 -0700 Subject: timer: Pass function down to initialization routines In preparation for removing more macros, pass the function down to the initialization routines instead of doing it in macros. Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- include/linux/timer.h | 33 ++++++++++++++++++--------------- kernel/time/timer.c | 21 +++++++++++++++------ 2 files changed, 33 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index e6bab51db13d..aff73b1c8f7b 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -78,53 +78,56 @@ struct timer_list { struct timer_list _name = \ __TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0) -void init_timer_key(struct timer_list *timer, unsigned int flags, +void init_timer_key(struct timer_list *timer, + void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void init_timer_on_stack_key(struct timer_list *timer, + void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); extern void destroy_timer_on_stack(struct timer_list *timer); #else static inline void destroy_timer_on_stack(struct timer_list *timer) { } static inline void init_timer_on_stack_key(struct timer_list *timer, - unsigned int flags, const char *name, + void (*func)(struct timer_list *), + unsigned int flags, + const char *name, struct lock_class_key *key) { - init_timer_key(timer, flags, name, key); + init_timer_key(timer, func, flags, name, key); } #endif #ifdef CONFIG_LOCKDEP -#define __init_timer(_timer, _flags) \ +#define __init_timer(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ - init_timer_key((_timer), (_flags), #_timer, &__key); \ + init_timer_key((_timer), (_fn), (_flags), #_timer, &__key);\ } while (0) -#define __init_timer_on_stack(_timer, _flags) \ +#define __init_timer_on_stack(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ - init_timer_on_stack_key((_timer), (_flags), #_timer, &__key); \ + init_timer_on_stack_key((_timer), (_fn), (_flags), \ + #_timer, &__key); \ } while (0) #else -#define __init_timer(_timer, _flags) \ - init_timer_key((_timer), (_flags), NULL, NULL) -#define __init_timer_on_stack(_timer, _flags) \ - init_timer_on_stack_key((_timer), (_flags), NULL, NULL) +#define __init_timer(_timer, _fn, _flags) \ + init_timer_key((_timer), 
(_fn), (_flags), NULL, NULL) +#define __init_timer_on_stack(_timer, _fn, _flags) \ + init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL) #endif #define __setup_timer(_timer, _fn, _flags) \ do { \ - __init_timer((_timer), (_flags)); \ - (_timer)->function = (_fn); \ + __init_timer((_timer), (_fn), (_flags)); \ } while (0) #define __setup_timer_on_stack(_timer, _fn, _flags) \ do { \ - __init_timer_on_stack((_timer), (_flags)); \ - (_timer)->function = (_fn); \ + __init_timer_on_stack((_timer), (_fn), (_flags)); \ } while (0) #ifndef CONFIG_LOCKDEP diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0f0d49a02d04..ffebcf878fba 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -707,14 +707,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer) debug_object_assert_init(timer, &timer_debug_descr); } -static void do_init_timer(struct timer_list *timer, unsigned int flags, +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, +void init_timer_on_stack_key(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); - do_init_timer(timer, flags, name, key); + do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); @@ -755,10 +759,13 @@ static inline void debug_assert_init(struct timer_list *timer) debug_timer_assert_init(timer); } -static void do_init_timer(struct timer_list *timer, unsigned int flags, +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key) { timer->entry.pprev = NULL; + timer->function = func; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } @@ -766,6 +773,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, /** * init_timer_key - initialize a timer * @timer: the timer to be initialized + * @func: timer callback function * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer @@ -774,11 +782,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, * init_timer_key() must be done to a timer prior calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, unsigned int flags, +void init_timer_key(struct timer_list *timer, + void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); - do_init_timer(timer, flags, name, key); + do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL(init_timer_key); -- cgit v1.2.3 From 919b250f8570618e84af544c3e18dad5210eb9b6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:48:43 -0700 Subject: timer: Remove redundant __setup_timer*() macros With __init_timer*() now matching __setup_timer*(), remove the redundant internal interface, clean up the resulting definitions and add more documentation. 
Cc: Thomas Gleixner Cc: Tejun Heo Cc: Lai Jiangshan Cc: Shaohua Li Cc: Jens Axboe Cc: Andrew Morton Signed-off-by: Kees Cook --- include/linux/kthread.h | 6 ++--- include/linux/timer.h | 56 +++++++++++++++++------------------------------ include/linux/workqueue.h | 12 +++++----- 3 files changed, 29 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index b855c5b72b26..dc850d257ea2 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -164,9 +164,9 @@ extern void __kthread_init_worker(struct kthread_worker *worker, #define kthread_init_delayed_work(dwork, fn) \ do { \ kthread_init_work(&(dwork)->work, (fn)); \ - __setup_timer(&(dwork)->timer, \ - (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\ - TIMER_IRQSAFE); \ + __init_timer(&(dwork)->timer, \ + kthread_delayed_work_timer_fn, \ + TIMER_IRQSAFE); \ } while (0) int kthread_worker_fn(void *worker_ptr); diff --git a/include/linux/timer.h b/include/linux/timer.h index aff73b1c8f7b..b1ae64b112c2 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -78,6 +78,9 @@ struct timer_list { struct timer_list _name = \ __TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0) +/* + * LOCKDEP and DEBUG timer interfaces. + */ void init_timer_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); @@ -87,9 +90,7 @@ extern void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); -extern void destroy_timer_on_stack(struct timer_list *timer); #else -static inline void destroy_timer_on_stack(struct timer_list *timer) { } static inline void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, @@ -120,43 +121,26 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL) #endif -#define __setup_timer(_timer, _fn, _flags) \ - do { \ - __init_timer((_timer), (_fn), (_flags)); \ - } while (0) - -#define __setup_timer_on_stack(_timer, _fn, _flags) \ - do { \ - __init_timer_on_stack((_timer), (_fn), (_flags)); \ - } while (0) +/** + * timer_setup - prepare a timer for first use + * @timer: the timer in question + * @callback: the function to call when timer expires + * @flags: any TIMER_* flags + * + * Regular timer initialization should use either DEFINE_TIMER() above, + * or timer_setup(). For timers on the stack, timer_setup_on_stack() must + * be used and must be balanced with a call to destroy_timer_on_stack(). + */ +#define timer_setup(timer, callback, flags) \ + __init_timer((timer), (callback), (flags)) -#ifndef CONFIG_LOCKDEP -static inline void timer_setup(struct timer_list *timer, - void (*callback)(struct timer_list *), - unsigned int flags) -{ - __setup_timer(timer, (TIMER_FUNC_TYPE)callback, flags); -} +#define timer_setup_on_stack(timer, callback, flags) \ + __init_timer_on_stack((timer), (callback), (flags)) -static inline void timer_setup_on_stack(struct timer_list *timer, - void (*callback)(struct timer_list *), - unsigned int flags) -{ - __setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback, flags); -} +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS +extern void destroy_timer_on_stack(struct timer_list *timer); #else -/* - * Under LOCKDEP, the timer lock_class_key (set up in __init_timer) needs - * to be tied to the caller's context, so an inline (above) won't work. 
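The on-stack pairing that the new kernel-doc spells out looks like this in practice (sketch, hypothetical names):

#include <linux/timer.h>

static void expired(struct timer_list *t)
{
	pr_info("timed out\n");
}

static void wait_briefly(void)
{
	struct timer_list t;

	timer_setup_on_stack(&t, expired, 0);
	mod_timer(&t, jiffies + HZ);
	/* ... do work bounded by the timeout ... */
	del_timer_sync(&t);
	destroy_timer_on_stack(&t);	/* required balance for on-stack timers */
}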
We - * do want to keep the inline for argument type checking, though. - */ -# define timer_setup(timer, callback, flags) \ - __setup_timer((timer), (TIMER_FUNC_TYPE)(callback), \ - (flags)) -# define timer_setup_on_stack(timer, callback, flags) \ - __setup_timer_on_stack((timer), \ - (TIMER_FUNC_TYPE)(callback), \ - (flags)) +static inline void destroy_timer_on_stack(struct timer_list *timer) { } #endif #define from_timer(var, callback_timer, timer_fieldname) \ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8d11580237f5..bff39faba793 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -241,17 +241,17 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ - __setup_timer(&(_work)->timer, \ - (TIMER_FUNC_TYPE)delayed_work_timer_fn, \ - (_tflags) | TIMER_IRQSAFE); \ + __init_timer(&(_work)->timer, \ + delayed_work_timer_fn, \ + (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ - __setup_timer_on_stack(&(_work)->timer, \ - (TIMER_FUNC_TYPE)delayed_work_timer_fn,\ - (_tflags) | TIMER_IRQSAFE); \ + __init_timer_on_stack(&(_work)->timer, \ + delayed_work_timer_fn, \ + (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ -- cgit v1.2.3 From 841b86f3289dbe858daeceec36423d4ea286fac2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 23 Oct 2017 09:40:42 +0200 Subject: treewide: Remove TIMER_FUNC_TYPE and TIMER_DATA_TYPE casts With all callbacks converted, and the timer callback prototype switched over, the TIMER_FUNC_TYPE cast is no longer needed, so remove it. Conversion was done with the following scripts: perl -pi -e 's|\(TIMER_FUNC_TYPE\)||g' \ $(git grep TIMER_FUNC_TYPE | cut -d: -f1 | sort -u) perl -pi -e 's|\(TIMER_DATA_TYPE\)||g' \ $(git grep TIMER_DATA_TYPE | cut -d: -f1 | sort -u) The now unused macros are also dropped from include/linux/timer.h. 
Signed-off-by: Kees Cook --- drivers/base/power/wakeup.c | 2 +- drivers/block/aoe/aoecmd.c | 2 +- drivers/block/swim3.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/input/input.c | 2 +- drivers/media/common/saa7146/saa7146_vbi.c | 2 +- drivers/net/ethernet/ti/tlan.c | 6 +++--- drivers/net/hamradio/scc.c | 8 ++++---- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 2 +- drivers/net/wireless/ray_cs.c | 12 ++++++------ drivers/s390/char/sclp.c | 4 ++-- drivers/s390/scsi/zfcp_fsf.c | 4 ++-- drivers/scsi/aic94xx/aic94xx_hwi.c | 2 +- drivers/scsi/aic94xx/aic94xx_tmf.c | 2 +- drivers/scsi/be2iscsi/be_main.c | 4 ++-- drivers/scsi/cxgbi/cxgb3i/cxgb3i.c | 4 ++-- drivers/scsi/cxgbi/cxgb4i/cxgb4i.c | 4 ++-- drivers/scsi/hisi_sas/hisi_sas_main.c | 4 ++-- drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 6 +++--- drivers/scsi/ipr.c | 8 ++++---- drivers/scsi/libfc/fc_fcp.c | 6 +++--- drivers/scsi/libsas/sas_expander.c | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 2 +- drivers/scsi/mvsas/mv_sas.c | 4 ++-- drivers/scsi/pm8001/pm8001_sas.c | 4 ++-- drivers/scsi/pmcraid.c | 10 +++++----- drivers/staging/irda/include/net/irda/timer.h | 2 +- drivers/tty/serial/8250/8250_core.c | 4 ++-- include/linux/kthread.h | 2 +- include/linux/timer.h | 5 +---- include/linux/workqueue.h | 2 +- kernel/kthread.c | 2 +- kernel/workqueue.c | 2 +- net/atm/lec.c | 6 +++--- net/can/proc.c | 4 ++-- net/lapb/lapb_timer.c | 4 ++-- net/netrom/af_netrom.c | 2 +- net/netrom/nr_timer.c | 2 +- net/rose/rose_link.c | 4 ++-- net/rose/rose_timer.c | 12 ++++++------ net/sunrpc/svc_xprt.c | 2 +- net/x25/af_x25.c | 2 +- net/x25/x25_timer.c | 2 +- sound/usb/line6/driver.c | 2 +- 44 files changed, 84 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 680ee1d36ac9..38559f04db2c 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -481,7 +481,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws) * Use timer struct to check if the given source is initialized * by wakeup_source_add. 
*/ - return ws->timer.function != (TIMER_FUNC_TYPE)pm_wakeup_timer_fn; + return ws->timer.function != pm_wakeup_timer_fn; } /* diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 55ab25f79a08..812fed069708 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1429,7 +1429,7 @@ aoecmd_ata_id(struct aoedev *d) d->rttavg = RTTAVG_INIT; d->rttdev = RTTDEV_INIT; - d->timer.function = (TIMER_FUNC_TYPE)rexmit_timer; + d->timer.function = rexmit_timer; skb = skb_clone(skb, GFP_ATOMIC); if (skb) { diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index e620e423102b..af51015d056e 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -397,7 +397,7 @@ static void set_timeout(struct floppy_state *fs, int nticks, if (fs->timeout_pending) del_timer(&fs->timeout); fs->timeout.expires = jiffies + nticks; - fs->timeout.function = (TIMER_FUNC_TYPE)proc; + fs->timeout.function = proc; add_timer(&fs->timeout); fs->timeout_pending = 1; } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index db46b7b53fb4..162475aeeedd 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3819,7 +3819,7 @@ void nes_port_ibevent(struct nes_vnic *nesvnic) if (!nesvnic->event_timer.function) { ib_dispatch_event(&event); nesvnic->last_dispatched_event = event.event; - nesvnic->event_timer.function = (TIMER_FUNC_TYPE)nes_handle_delayed_event; + nesvnic->event_timer.function = nes_handle_delayed_event; nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY; add_timer(&nesvnic->event_timer); } else { diff --git a/drivers/input/input.c b/drivers/input/input.c index 44916ef4a424..e30642db50d5 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -2047,7 +2047,7 @@ static void devm_input_device_unregister(struct device *dev, void *res) */ void input_enable_softrepeat(struct input_dev *dev, int delay, int period) { - dev->timer.function = (TIMER_FUNC_TYPE)input_repeat_key; + dev->timer.function = input_repeat_key; dev->rep[REP_DELAY] = delay; dev->rep[REP_PERIOD] = period; } diff --git a/drivers/media/common/saa7146/saa7146_vbi.c b/drivers/media/common/saa7146/saa7146_vbi.c index ce8d78c137f0..e1d369b976ed 100644 --- a/drivers/media/common/saa7146/saa7146_vbi.c +++ b/drivers/media/common/saa7146/saa7146_vbi.c @@ -402,7 +402,7 @@ static int vbi_open(struct saa7146_dev *dev, struct file *file) sizeof(struct saa7146_buf), file, &dev->v4l2_lock); - vv->vbi_read_timeout.function = (TIMER_FUNC_TYPE)vbi_read_timeout; + vv->vbi_read_timeout.function = vbi_read_timeout; vv->vbi_read_timeout_file = file; /* initialize the brs */ diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c index 8f53d762fbc4..5a4e78fde530 100644 --- a/drivers/net/ethernet/ti/tlan.c +++ b/drivers/net/ethernet/ti/tlan.c @@ -254,7 +254,7 @@ tlan_set_timer(struct net_device *dev, u32 ticks, u32 type) spin_unlock_irqrestore(&priv->lock, flags); return; } - priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer; + priv->timer.function = tlan_timer; if (!in_irq()) spin_unlock_irqrestore(&priv->lock, flags); @@ -1425,7 +1425,7 @@ static u32 tlan_handle_tx_eof(struct net_device *dev, u16 host_int) tlan_dio_write8(dev->base_addr, TLAN_LED_REG, TLAN_LED_LINK | TLAN_LED_ACT); if (priv->timer.function == NULL) { - priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer; + priv->timer.function = tlan_timer; priv->timer.expires = jiffies + TLAN_TIMER_ACT_DELAY; priv->timer_set_at = jiffies; priv->timer_type = 
TLAN_TIMER_ACTIVITY; @@ -1576,7 +1576,7 @@ drop_and_reuse: tlan_dio_write8(dev->base_addr, TLAN_LED_REG, TLAN_LED_LINK | TLAN_LED_ACT); if (priv->timer.function == NULL) { - priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer; + priv->timer.function = tlan_timer; priv->timer.expires = jiffies + TLAN_TIMER_ACT_DELAY; priv->timer_set_at = jiffies; priv->timer_type = TLAN_TIMER_ACTIVITY; diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c index c9f7215c5dc2..3de272959090 100644 --- a/drivers/net/hamradio/scc.c +++ b/drivers/net/hamradio/scc.c @@ -1005,7 +1005,7 @@ static void __scc_start_tx_timer(struct scc_channel *scc, } else if (when != TIMER_OFF) { - scc->tx_t.function = (TIMER_FUNC_TYPE)handler; + scc->tx_t.function = handler; scc->tx_t.expires = jiffies + (when*HZ)/100; add_timer(&scc->tx_t); } @@ -1031,7 +1031,7 @@ static void scc_start_defer(struct scc_channel *scc) if (scc->kiss.maxdefer != 0 && scc->kiss.maxdefer != TIMER_OFF) { - scc->tx_wdog.function = (TIMER_FUNC_TYPE)t_busy; + scc->tx_wdog.function = t_busy; scc->tx_wdog.expires = jiffies + HZ*scc->kiss.maxdefer; add_timer(&scc->tx_wdog); } @@ -1047,7 +1047,7 @@ static void scc_start_maxkeyup(struct scc_channel *scc) if (scc->kiss.maxkeyup != 0 && scc->kiss.maxkeyup != TIMER_OFF) { - scc->tx_wdog.function = (TIMER_FUNC_TYPE)t_maxkeyup; + scc->tx_wdog.function = t_maxkeyup; scc->tx_wdog.expires = jiffies + HZ*scc->kiss.maxkeyup; add_timer(&scc->tx_wdog); } @@ -1428,7 +1428,7 @@ scc_start_calibrate(struct scc_channel *scc, int duration, unsigned char pattern del_timer(&scc->tx_wdog); - scc->tx_wdog.function = (TIMER_FUNC_TYPE)scc_stop_calibrate; + scc->tx_wdog.function = scc_stop_calibrate; scc->tx_wdog.expires = jiffies + HZ*duration; add_timer(&scc->tx_wdog); diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 7d6dc76c930a..6711e7fb6926 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -554,7 +554,7 @@ qtnf_scan(struct wiphy *wiphy, struct cfg80211_scan_request *request) return -EFAULT; } - mac->scan_timeout.function = (TIMER_FUNC_TYPE)qtnf_scan_timeout; + mac->scan_timeout.function = qtnf_scan_timeout; mod_timer(&mac->scan_timeout, jiffies + QTNF_SCAN_TIMEOUT_SEC * HZ); diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index d8afcdfca1ed..0133fcd4601b 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -569,7 +569,7 @@ static int dl_startup_params(struct net_device *dev) local->card_status = CARD_DL_PARAM; /* Start kernel timer to wait for dl startup to complete. 
*/ local->timer.expires = jiffies + HZ / 2; - local->timer.function = (TIMER_FUNC_TYPE)verify_dl_startup; + local->timer.function = verify_dl_startup; add_timer(&local->timer); dev_dbg(&link->dev, "ray_cs dl_startup_params started timer for verify_dl_startup\n"); @@ -1947,12 +1947,12 @@ static irqreturn_t ray_interrupt(int irq, void *dev_id) dev_dbg(&link->dev, "ray_cs interrupt network \"%s\" start failed\n", memtmp); - local->timer.function = (TIMER_FUNC_TYPE)start_net; + local->timer.function = start_net; } else { dev_dbg(&link->dev, "ray_cs interrupt network \"%s\" join failed\n", memtmp); - local->timer.function = (TIMER_FUNC_TYPE)join_net; + local->timer.function = join_net; } add_timer(&local->timer); } @@ -2417,9 +2417,9 @@ static void authenticate(ray_dev_t *local) del_timer(&local->timer); if (build_auth_frame(local, local->bss_id, OPEN_AUTH_REQUEST)) { - local->timer.function = (TIMER_FUNC_TYPE)join_net; + local->timer.function = join_net; } else { - local->timer.function = (TIMER_FUNC_TYPE)authenticate_timeout; + local->timer.function = authenticate_timeout; } local->timer.expires = jiffies + HZ * 2; add_timer(&local->timer); @@ -2502,7 +2502,7 @@ static void associate(ray_dev_t *local) del_timer(&local->timer); local->timer.expires = jiffies + HZ * 2; - local->timer.function = (TIMER_FUNC_TYPE)join_net; + local->timer.function = join_net; add_timer(&local->timer); local->card_status = CARD_ASSOC_FAILED; return; diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index 9b4c61c1e309..e4e2df7a478e 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -158,7 +158,7 @@ static inline void __sclp_set_request_timer(unsigned long time, void (*cb)(struct timer_list *)) { del_timer(&sclp_request_timer); - sclp_request_timer.function = (TIMER_FUNC_TYPE)cb; + sclp_request_timer.function = cb; sclp_request_timer.expires = jiffies + time; add_timer(&sclp_request_timer); } @@ -566,7 +566,7 @@ sclp_sync_wait(void) if (timer_pending(&sclp_request_timer) && get_tod_clock_fast() > timeout && del_timer(&sclp_request_timer)) - sclp_request_timer.function((TIMER_DATA_TYPE)&sclp_request_timer); + sclp_request_timer.function(&sclp_request_timer); cpu_relax(); } local_irq_disable(); diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 51b81c0a0652..b12cb81ad8a2 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -34,7 +34,7 @@ static void zfcp_fsf_request_timeout_handler(struct timer_list *t) static void zfcp_fsf_start_timer(struct zfcp_fsf_req *fsf_req, unsigned long timeout) { - fsf_req->timer.function = (TIMER_FUNC_TYPE)zfcp_fsf_request_timeout_handler; + fsf_req->timer.function = zfcp_fsf_request_timeout_handler; fsf_req->timer.expires = jiffies + timeout; add_timer(&fsf_req->timer); } @@ -42,7 +42,7 @@ static void zfcp_fsf_start_timer(struct zfcp_fsf_req *fsf_req, static void zfcp_fsf_start_erp_timer(struct zfcp_fsf_req *fsf_req) { BUG_ON(!fsf_req->erp_action); - fsf_req->timer.function = (TIMER_FUNC_TYPE)zfcp_erp_timeout_handler; + fsf_req->timer.function = zfcp_erp_timeout_handler; fsf_req->timer.expires = jiffies + 30 * HZ; add_timer(&fsf_req->timer); } diff --git a/drivers/scsi/aic94xx/aic94xx_hwi.c b/drivers/scsi/aic94xx/aic94xx_hwi.c index 5402b85b0bdc..2dbc8330d7d3 100644 --- a/drivers/scsi/aic94xx/aic94xx_hwi.c +++ b/drivers/scsi/aic94xx/aic94xx_hwi.c @@ -1175,7 +1175,7 @@ static void asd_start_scb_timers(struct list_head *list) struct asd_ascb *ascb; list_for_each_entry(ascb, list, list) { if 
(!ascb->uldd_timer) { - ascb->timer.function = (TIMER_FUNC_TYPE)asd_ascb_timedout; + ascb->timer.function = asd_ascb_timedout; ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT; add_timer(&ascb->timer); } diff --git a/drivers/scsi/aic94xx/aic94xx_tmf.c b/drivers/scsi/aic94xx/aic94xx_tmf.c index 4637119c09d8..2a01702d5ba7 100644 --- a/drivers/scsi/aic94xx/aic94xx_tmf.c +++ b/drivers/scsi/aic94xx/aic94xx_tmf.c @@ -42,7 +42,7 @@ static int asd_enqueue_internal(struct asd_ascb *ascb, ascb->tasklet_complete = tasklet_complete; ascb->uldd_timer = 1; - ascb->timer.function = (TIMER_FUNC_TYPE)timed_out; + ascb->timer.function = timed_out; ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT; add_timer(&ascb->timer); diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index be96aa1e5077..b3cfdd5f4d1c 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -5279,7 +5279,7 @@ static void beiscsi_hw_health_check(struct timer_list *t) if (!test_bit(BEISCSI_HBA_UER_SUPP, &phba->state)) return; /* modify this timer to check TPE */ - phba->hw_check.function = (TIMER_FUNC_TYPE)beiscsi_hw_tpe_check; + phba->hw_check.function = beiscsi_hw_tpe_check; } mod_timer(&phba->hw_check, @@ -5367,7 +5367,7 @@ static int beiscsi_enable_port(struct beiscsi_hba *phba) * Timer function gets modified for TPE detection. * Always reinit to do health check first. */ - phba->hw_check.function = (TIMER_FUNC_TYPE)beiscsi_hw_health_check; + phba->hw_check.function = beiscsi_hw_health_check; mod_timer(&phba->hw_check, jiffies + msecs_to_jiffies(BEISCSI_UE_DETECT_INTERVAL)); return 0; diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c index babd79361a46..bf07735275a4 100644 --- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c @@ -586,8 +586,8 @@ static int do_act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) cxgbi_sock_get(csk); spin_lock_bh(&csk->lock); if (rpl->status == CPL_ERR_CONN_EXIST && - csk->retry_timer.function != (TIMER_FUNC_TYPE)act_open_retry_timer) { - csk->retry_timer.function = (TIMER_FUNC_TYPE)act_open_retry_timer; + csk->retry_timer.function != act_open_retry_timer) { + csk->retry_timer.function = act_open_retry_timer; mod_timer(&csk->retry_timer, jiffies + HZ / 2); } else cxgbi_sock_fail_act_open(csk, diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c index 266eddf17a99..406e94312d4e 100644 --- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c @@ -963,8 +963,8 @@ static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb) spin_lock_bh(&csk->lock); if (status == CPL_ERR_CONN_EXIST && - csk->retry_timer.function != (TIMER_FUNC_TYPE)csk_act_open_retry_timer) { - csk->retry_timer.function = (TIMER_FUNC_TYPE)csk_act_open_retry_timer; + csk->retry_timer.function != csk_act_open_retry_timer) { + csk->retry_timer.function = csk_act_open_retry_timer; mod_timer(&csk->retry_timer, jiffies + HZ / 2); } else cxgbi_sock_fail_act_open(csk, diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 61a85ff8e459..5f503cb09508 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -839,7 +839,7 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, } task->task_done = hisi_sas_task_done; - task->slow_task->timer.function = (TIMER_FUNC_TYPE)hisi_sas_tmf_timedout; + task->slow_task->timer.function = 
hisi_sas_tmf_timedout; task->slow_task->timer.expires = jiffies + TASK_TIMEOUT*HZ; add_timer(&task->slow_task->timer); @@ -1451,7 +1451,7 @@ hisi_sas_internal_task_abort(struct hisi_hba *hisi_hba, task->dev = device; task->task_proto = device->tproto; task->task_done = hisi_sas_task_done; - task->slow_task->timer.function = (TIMER_FUNC_TYPE)hisi_sas_tmf_timedout; + task->slow_task->timer.function = hisi_sas_tmf_timedout; task->slow_task->timer.expires = jiffies + msecs_to_jiffies(110); add_timer(&task->slow_task->timer); diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index d02c2a791981..5d3467fd728d 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -1268,7 +1268,7 @@ static void link_timeout_enable_link(struct timer_list *t) } } - hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_disable_link; + hisi_hba->timer.function = link_timeout_disable_link; mod_timer(&hisi_hba->timer, jiffies + msecs_to_jiffies(900)); } @@ -1289,13 +1289,13 @@ static void link_timeout_disable_link(struct timer_list *t) } } - hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_enable_link; + hisi_hba->timer.function = link_timeout_enable_link; mod_timer(&hisi_hba->timer, jiffies + msecs_to_jiffies(100)); } static void set_link_timer_quirk(struct hisi_hba *hisi_hba) { - hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_disable_link; + hisi_hba->timer.function = link_timeout_disable_link; hisi_hba->timer.expires = jiffies + msecs_to_jiffies(1000); add_timer(&hisi_hba->timer); } diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index d53429371127..cc0187965eee 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -997,7 +997,7 @@ static void ipr_do_req(struct ipr_cmnd *ipr_cmd, ipr_cmd->done = done; ipr_cmd->timer.expires = jiffies + timeout; - ipr_cmd->timer.function = (TIMER_FUNC_TYPE)timeout_func; + ipr_cmd->timer.function = timeout_func; add_timer(&ipr_cmd->timer); @@ -8312,7 +8312,7 @@ static void ipr_reset_start_timer(struct ipr_cmnd *ipr_cmd, ipr_cmd->done = ipr_reset_ioa_job; ipr_cmd->timer.expires = jiffies + timeout; - ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_reset_timer_done; + ipr_cmd->timer.function = ipr_reset_timer_done; add_timer(&ipr_cmd->timer); } @@ -8397,7 +8397,7 @@ static int ipr_reset_next_stage(struct ipr_cmnd *ipr_cmd) } ipr_cmd->timer.expires = jiffies + stage_time * HZ; - ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_oper_timeout; + ipr_cmd->timer.function = ipr_oper_timeout; ipr_cmd->done = ipr_reset_ioa_job; add_timer(&ipr_cmd->timer); @@ -8468,7 +8468,7 @@ static int ipr_reset_enable_ioa(struct ipr_cmnd *ipr_cmd) } ipr_cmd->timer.expires = jiffies + (ioa_cfg->transop_timeout * HZ); - ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_oper_timeout; + ipr_cmd->timer.function = ipr_oper_timeout; ipr_cmd->done = ipr_reset_ioa_job; add_timer(&ipr_cmd->timer); list_add_tail(&ipr_cmd->queue, &ipr_cmd->hrrq->hrrq_pending_q); diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c index 1a4e701a8449..4fae253d4f3d 100644 --- a/drivers/scsi/libfc/fc_fcp.c +++ b/drivers/scsi/libfc/fc_fcp.c @@ -1214,7 +1214,7 @@ static int fc_fcp_cmd_send(struct fc_lport *lport, struct fc_fcp_pkt *fsp, fsp->seq_ptr = seq; fc_fcp_pkt_hold(fsp); /* hold for fc_fcp_pkt_destroy */ - fsp->timer.function = (TIMER_FUNC_TYPE)fc_fcp_timeout; + fsp->timer.function = fc_fcp_timeout; if (rpriv->flags & FC_RP_FLAGS_REC_SUPPORTED) fc_fcp_timer_set(fsp, get_fsp_rec_tov(fsp)); @@ -1307,7 +1307,7 
@@ static void fc_lun_reset_send(struct timer_list *t) return; if (fc_fcp_lock_pkt(fsp)) return; - fsp->timer.function = (TIMER_FUNC_TYPE)fc_lun_reset_send; + fsp->timer.function = fc_lun_reset_send; fc_fcp_timer_set(fsp, get_fsp_rec_tov(fsp)); fc_fcp_unlock_pkt(fsp); } @@ -1445,7 +1445,7 @@ static void fc_fcp_timeout(struct timer_list *t) if (fsp->lp->qfull) { FC_FCP_DBG(fsp, "fcp timeout, resetting timer delay %d\n", fsp->timer_delay); - fsp->timer.function = (TIMER_FUNC_TYPE)fc_fcp_timeout; + fsp->timer.function = fc_fcp_timeout; fc_fcp_timer_set(fsp, fsp->timer_delay); goto unlock; } diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 174e5eff6155..ca1566237ae7 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -92,7 +92,7 @@ static int smp_execute_task_sg(struct domain_device *dev, task->task_done = smp_task_done; - task->slow_task->timer.function = (TIMER_FUNC_TYPE)smp_task_timedout; + task->slow_task->timer.function = smp_task_timedout; task->slow_task->timer.expires = jiffies + SMP_TIMEOUT*HZ; add_timer(&task->slow_task->timer); diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 91795eb56206..58476b728c57 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -919,7 +919,7 @@ void sas_task_abort(struct sas_task *task) return; if (!del_timer(&slow->timer)) return; - slow->timer.function((TIMER_DATA_TYPE)&slow->timer); + slow->timer.function(&slow->timer); return; } diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index cff1c37b8d2e..cff43bd9f675 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -1310,7 +1310,7 @@ static int mvs_exec_internal_tmf_task(struct domain_device *dev, memcpy(&task->ssp_task, parameter, para_len); task->task_done = mvs_task_done; - task->slow_task->timer.function = (TIMER_FUNC_TYPE)mvs_tmf_timedout; + task->slow_task->timer.function = mvs_tmf_timedout; task->slow_task->timer.expires = jiffies + MVS_TASK_TIMEOUT*HZ; add_timer(&task->slow_task->timer); @@ -2020,7 +2020,7 @@ void mvs_int_port(struct mvs_info *mvi, int phy_no, u32 events) MVS_CHIP_DISP->write_port_irq_mask(mvi, phy_no, tmp | PHYEV_SIG_FIS); if (phy->timer.function == NULL) { - phy->timer.function = (TIMER_FUNC_TYPE)mvs_sig_time_out; + phy->timer.function = mvs_sig_time_out; phy->timer.expires = jiffies + 5*HZ; add_timer(&phy->timer); } diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 0e294e80c169..947d6017d004 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -695,7 +695,7 @@ static int pm8001_exec_internal_tmf_task(struct domain_device *dev, task->task_proto = dev->tproto; memcpy(&task->ssp_task, parameter, para_len); task->task_done = pm8001_task_done; - task->slow_task->timer.function = (TIMER_FUNC_TYPE)pm8001_tmf_timedout; + task->slow_task->timer.function = pm8001_tmf_timedout; task->slow_task->timer.expires = jiffies + PM8001_TASK_TIMEOUT*HZ; add_timer(&task->slow_task->timer); @@ -781,7 +781,7 @@ pm8001_exec_internal_task_abort(struct pm8001_hba_info *pm8001_ha, task->dev = dev; task->task_proto = dev->tproto; task->task_done = pm8001_task_done; - task->slow_task->timer.function = (TIMER_FUNC_TYPE)pm8001_tmf_timedout; + task->slow_task->timer.function = pm8001_tmf_timedout; task->slow_task->timer.expires = jiffies + PM8001_TASK_TIMEOUT * HZ; add_timer(&task->slow_task->timer); diff --git 
a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index 4f9f115fb6a0..e58be98430b0 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -604,7 +604,7 @@ static void pmcraid_start_bist(struct pmcraid_cmd *cmd) cmd->time_left = msecs_to_jiffies(PMCRAID_BIST_TIMEOUT); cmd->timer.expires = jiffies + msecs_to_jiffies(PMCRAID_BIST_TIMEOUT); - cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_bist_done; + cmd->timer.function = pmcraid_bist_done; add_timer(&cmd->timer); } @@ -636,7 +636,7 @@ static void pmcraid_reset_alert_done(struct timer_list *t) /* restart timer if some more time is available to wait */ cmd->time_left -= PMCRAID_CHECK_FOR_RESET_TIMEOUT; cmd->timer.expires = jiffies + PMCRAID_CHECK_FOR_RESET_TIMEOUT; - cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_reset_alert_done; + cmd->timer.function = pmcraid_reset_alert_done; add_timer(&cmd->timer); } } @@ -673,7 +673,7 @@ static void pmcraid_reset_alert(struct pmcraid_cmd *cmd) */ cmd->time_left = PMCRAID_RESET_TIMEOUT; cmd->timer.expires = jiffies + PMCRAID_CHECK_FOR_RESET_TIMEOUT; - cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_reset_alert_done; + cmd->timer.function = pmcraid_reset_alert_done; add_timer(&cmd->timer); iowrite32(DOORBELL_IOA_RESET_ALERT, @@ -923,7 +923,7 @@ static void pmcraid_send_cmd( if (timeout_func) { /* setup timeout handler */ cmd->timer.expires = jiffies + timeout; - cmd->timer.function = (TIMER_FUNC_TYPE)timeout_func; + cmd->timer.function = timeout_func; add_timer(&cmd->timer); } @@ -1951,7 +1951,7 @@ static void pmcraid_soft_reset(struct pmcraid_cmd *cmd) cmd->cmd_done = pmcraid_ioa_reset; cmd->timer.expires = jiffies + msecs_to_jiffies(PMCRAID_TRANSOP_TIMEOUT); - cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_timeout_handler; + cmd->timer.function = pmcraid_timeout_handler; if (!timer_pending(&cmd->timer)) add_timer(&cmd->timer); diff --git a/drivers/staging/irda/include/net/irda/timer.h b/drivers/staging/irda/include/net/irda/timer.h index a6635f0afae9..6dab15f5dae1 100644 --- a/drivers/staging/irda/include/net/irda/timer.h +++ b/drivers/staging/irda/include/net/irda/timer.h @@ -75,7 +75,7 @@ struct lap_cb; static inline void irda_start_timer(struct timer_list *ptimer, int timeout, void (*callback)(struct timer_list *)) { - ptimer->function = (TIMER_FUNC_TYPE) callback; + ptimer->function = callback; /* Set new value for timer (update or add timer). 
* We use mod_timer() because it's more efficient and also diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index d64afdd93872..9342fc2ee7df 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -325,7 +325,7 @@ static int univ8250_setup_irq(struct uart_8250_port *up) if (up->bugs & UART_BUG_THRE) { pr_debug("ttyS%d - using backup timer\n", serial_index(port)); - up->timer.function = (TIMER_FUNC_TYPE)serial8250_backup_timeout; + up->timer.function = serial8250_backup_timeout; mod_timer(&up->timer, jiffies + uart_poll_timeout(port) + HZ / 5); } @@ -348,7 +348,7 @@ static void univ8250_release_irq(struct uart_8250_port *up) struct uart_port *port = &up->port; del_timer_sync(&up->timer); - up->timer.function = (TIMER_FUNC_TYPE)serial8250_timeout; + up->timer.function = serial8250_timeout; if (port->irq) serial_unlink_irq_chain(up); } diff --git a/include/linux/kthread.h b/include/linux/kthread.h index dc850d257ea2..c1961761311d 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -118,7 +118,7 @@ struct kthread_delayed_work { #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ - .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\ + .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn,\ TIMER_IRQSAFE), \ } diff --git a/include/linux/timer.h b/include/linux/timer.h index b1ae64b112c2..04af640ea95b 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -63,9 +63,6 @@ struct timer_list { #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) -#define TIMER_DATA_TYPE struct timer_list * -#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) - #define __TIMER_INITIALIZER(_function, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ @@ -76,7 +73,7 @@ struct timer_list { #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ - __TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0) + __TIMER_INITIALIZER(_function, 0) /* * LOCKDEP and DEBUG timer interfaces. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bff39faba793..4a54ef96aff5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -176,7 +176,7 @@ struct execute_work { #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ - .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)delayed_work_timer_fn,\ + .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } diff --git a/kernel/kthread.c b/kernel/kthread.c index 8af313081b0d..cd50e99202b0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -843,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. 
This is for diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dde6298f6b22..8fdb710bfdd7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); diff --git a/net/atm/lec.c b/net/atm/lec.c index c976196da3ea..6676e3433261 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1798,7 +1798,7 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, else send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL); entry->timer.expires = jiffies + (1 * HZ); - entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_arp; + entry->timer.function = lec_arp_expire_arp; add_timer(&entry->timer); found = priv->mcast_vcc; } @@ -1998,7 +1998,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->old_recv_push = old_push; entry->status = ESI_UNKNOWN; entry->timer.expires = jiffies + priv->vcc_timeout_period; - entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc; + entry->timer.function = lec_arp_expire_vcc; hlist_add_head(&entry->next, &priv->lec_no_forward); add_timer(&entry->timer); dump_arp_table(priv); @@ -2082,7 +2082,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->status = ESI_UNKNOWN; hlist_add_head(&entry->next, &priv->lec_arp_empty_ones); entry->timer.expires = jiffies + priv->vcc_timeout_period; - entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc; + entry->timer.function = lec_arp_expire_vcc; add_timer(&entry->timer); pr_debug("After vcc was added\n"); dump_arp_table(priv); diff --git a/net/can/proc.c b/net/can/proc.c index d979b3dc49a6..0c59f876fe6f 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -221,7 +221,7 @@ static int can_stats_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); - if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) { + if (net->can.can_stattimer.function == can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", can_stats->total_rx_match_ratio); @@ -291,7 +291,7 @@ static int can_reset_stats_proc_show(struct seq_file *m, void *v) user_reset = 1; - if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) { + if (net->can.can_stattimer.function == can_stat_update) { seq_printf(m, "Scheduled statistic reset #%ld.\n", can_pstats->stats_reset + 1); } else { diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 8bb469cb3abe..5d4ae01951b5 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -42,7 +42,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb) { del_timer(&lapb->t1timer); - lapb->t1timer.function = (TIMER_FUNC_TYPE)lapb_t1timer_expiry; + lapb->t1timer.function = lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; add_timer(&lapb->t1timer); @@ -52,7 +52,7 @@ void lapb_start_t2timer(struct lapb_cb *lapb) { del_timer(&lapb->t2timer); - lapb->t2timer.function = (TIMER_FUNC_TYPE)lapb_t2timer_expiry; + lapb->t2timer.function = lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; add_timer(&lapb->t2timer); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 2dec3583c97d..7ed9d4422a73 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -284,7 +284,7 @@ void nr_destroy_socket(struct sock *sk) 
if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ - sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_destroy_timer; + sk->sk_timer.function = nr_destroy_timer; sk->sk_timer.expires = jiffies + 2 * HZ; add_timer(&sk->sk_timer); } else diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 43569aea0f5e..cbd51ed5a2d7 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -45,7 +45,7 @@ void nr_init_timers(struct sock *sk) timer_setup(&nr->idletimer, nr_idletimer_expiry, 0); /* initialized by sock_init_data */ - sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_heartbeat_expiry; + sk->sk_timer.function = nr_heartbeat_expiry; } void nr_start_t1timer(struct sock *sk) diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index cda4c6678ef1..62055d3069d2 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -37,7 +37,7 @@ void rose_start_ftimer(struct rose_neigh *neigh) { del_timer(&neigh->ftimer); - neigh->ftimer.function = (TIMER_FUNC_TYPE)rose_ftimer_expiry; + neigh->ftimer.function = rose_ftimer_expiry; neigh->ftimer.expires = jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout); @@ -48,7 +48,7 @@ static void rose_start_t0timer(struct rose_neigh *neigh) { del_timer(&neigh->t0timer); - neigh->t0timer.function = (TIMER_FUNC_TYPE)rose_t0timer_expiry; + neigh->t0timer.function = rose_t0timer_expiry; neigh->t0timer.expires = jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout); diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index ea613b2a9735..74555fb95615 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -36,7 +36,7 @@ void rose_start_heartbeat(struct sock *sk) { del_timer(&sk->sk_timer); - sk->sk_timer.function = (TIMER_FUNC_TYPE)rose_heartbeat_expiry; + sk->sk_timer.function = rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; add_timer(&sk->sk_timer); @@ -48,7 +48,7 @@ void rose_start_t1timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; + rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; add_timer(&rose->timer); @@ -60,7 +60,7 @@ void rose_start_t2timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; + rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; add_timer(&rose->timer); @@ -72,7 +72,7 @@ void rose_start_t3timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; + rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; add_timer(&rose->timer); @@ -84,7 +84,7 @@ void rose_start_hbtimer(struct sock *sk) del_timer(&rose->timer); - rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; + rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; add_timer(&rose->timer); @@ -97,7 +97,7 @@ void rose_start_idletimer(struct sock *sk) del_timer(&rose->idletimer); if (rose->idle > 0) { - rose->idletimer.function = (TIMER_FUNC_TYPE)rose_idletimer_expiry; + rose->idletimer.function = rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; add_timer(&rose->idletimer); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e8e0831229cf..f9307bd6644b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -745,7 +745,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt serv->sv_tmpcnt++; if (serv->sv_temptimer.function == NULL) { /* setup timer to age temp 
transports */ - serv->sv_temptimer.function = (TIMER_FUNC_TYPE)svc_age_temp_xprts; + serv->sv_temptimer.function = svc_age_temp_xprts; mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ea87143314f3..562cc11131f6 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -415,7 +415,7 @@ static void __x25_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; - sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_destroy_timer; + sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index 1dfba3c23459..fa3461002b3e 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -36,7 +36,7 @@ void x25_init_timers(struct sock *sk) timer_setup(&x25->timer, x25_timer_expiry, 0); /* initialized by sock_init_data */ - sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_heartbeat_expiry; + sk->sk_timer.function = x25_heartbeat_expiry; } void x25_start_heartbeat(struct sock *sk) diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c index 4f9613e5fc9e..c1376bfdc90b 100644 --- a/sound/usb/line6/driver.c +++ b/sound/usb/line6/driver.c @@ -201,7 +201,7 @@ static int line6_send_raw_message_async_part(struct message *msg, void line6_start_timer(struct timer_list *timer, unsigned long msecs, void (*function)(struct timer_list *t)) { - timer->function = (TIMER_FUNC_TYPE)function; + timer->function = function; mod_timer(timer, jiffies + msecs_to_jiffies(msecs)); } EXPORT_SYMBOL_GPL(line6_start_timer); -- cgit v1.2.3 From 860dd4424f344400b491b212ee4acb3a358ba9d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Nov 2017 14:23:37 +0100 Subject: scsi: dma-mapping: always provide dma_get_cache_alignment Provide the dummy version of dma_get_cache_alignment that always returns 1 even if CONFIG_HAS_DMA is not set, so that drivers and subsystems can use it without ifdefs. Cc: stable@vger.kernel.org Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/dma-mapping.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index eee1499db396..29cfd18360be 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -710,7 +710,6 @@ static inline void *dma_zalloc_coherent(struct device *dev, size_t size, return ret; } -#ifdef CONFIG_HAS_DMA static inline int dma_get_cache_alignment(void) { #ifdef ARCH_DMA_MINALIGN @@ -718,7 +717,6 @@ static inline int dma_get_cache_alignment(void) #endif return 1; } -#endif /* flags for the coherent memory api */ #define DMA_MEMORY_EXCLUSIVE 0x01 -- cgit v1.2.3 From db1ac4964fa172803a0fea83033cd35d380a8a77 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 22 Nov 2017 18:32:53 +0000 Subject: bpf: introduce ARG_PTR_TO_MEM_OR_NULL With the current ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM semantics, a helper argument can be NULL when the next argument type is ARG_CONST_SIZE_OR_ZERO and the verifier can prove the value of this next argument is 0. However, most helpers are just interested in handling (!NULL, size > 0), so forcing them to deal with (NULL, size = 0) makes the implementation of those helpers more complicated for no apparent benefits, requiring them to explicitly handle those corner cases with checks that bpf programs could start relying upon, preventing the possibility of removing them later.
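[Editor's illustration, not part of the original patch: a minimal sketch of what the old semantics forced on helper authors. The helper name and body are hypothetical, and the u64/u32/u8 types are assumed to come from linux/types.h. With ARG_PTR_TO_MEM plus ARG_CONST_SIZE_OR_ZERO, the pointer could legitimately arrive as NULL whenever the verifier proved size == 0, so the helper itself had to guard against that case:

static u64 example_sum_bytes(const void *ptr, u32 size)
{
	const u8 *p = ptr;
	u64 sum = 0;
	u32 i;

	/* Hypothetical guard: under the old semantics this branch is
	 * reachable (ptr == NULL implies size == 0), so it must come
	 * before any access, and bpf programs could grow to rely on it.
	 */
	if (!ptr)
		return 0;

	for (i = 0; i < size; i++)
		sum += p[i];

	return sum;
}

Under the new semantics described below, a helper declaring plain ARG_PTR_TO_MEM can assume a non-NULL pointer and drop the guard; only helpers that opt into ARG_PTR_TO_MEM_OR_NULL keep it.]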
Solve this by making ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM never accept NULL even when ARG_CONST_SIZE_OR_ZERO is set, and introduce a new argument type ARG_PTR_TO_MEM_OR_NULL to explicitly deal with the NULL case. Currently, the only helper that needs this is bpf_csum_diff_proto(), so change arg1 and arg3 to this new type as well. Also add a new battery of tests that explicitly test the !ARG_PTR_TO_MEM_OR_NULL combination: all the current ones testing the various (NULL, size) variations are focused on bpf_csum_diff, so cover also other helpers. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 4 +- net/core/filter.c | 4 +- tools/testing/selftests/bpf/test_verifier.c | 113 ++++++++++++++++++++++++++-- 4 files changed, 112 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 76c577281d78..e55e4255a210 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -78,6 +78,7 @@ enum bpf_arg_type { * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ + ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dd54d20ace2f..308b0638ec5d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1384,13 +1384,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type != expected_type) goto err_type; } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_MEM_OR_NULL || arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking.
*/ - if (register_is_null(*reg)) + if (register_is_null(*reg) && + arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && diff --git a/net/core/filter.c b/net/core/filter.c index 1afa17935954..6a85e67fafce 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1646,9 +1646,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM_OR_NULL, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 2a5267bef160..3c64f30cf63c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -5631,7 +5631,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { - "helper access to variable memory: size = 0 allowed on NULL", + "helper access to variable memory: size = 0 allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), @@ -5645,7 +5645,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size > 0 not allowed on NULL", + "helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), @@ -5663,7 +5663,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size = 0 allowed on != NULL stack pointer", + "helper access to variable memory: size = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), @@ -5680,7 +5680,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size = 0 allowed on != NULL map pointer", + "helper access to variable memory: size = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -5702,7 +5702,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size possible = 0 allowed on != NULL stack pointer", + "helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -5727,7 +5727,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size possible = 0 allowed on != NULL map pointer", + "helper access to variable memory: size possible = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -5750,7 +5750,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size possible = 0 allowed on != NULL packet pointer", + "helper access to variable memory: size possible = 0 allowed on != NULL packet pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, 
data)), @@ -5771,6 +5771,105 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "helper access to variable memory: size = 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "R1 type=inv expected=fp", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size > 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "R1 type=inv expected=fp", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size possible = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, { "helper access to variable memory: 8 bytes leak", .insns = { -- cgit v1.2.3 From c131187db2d3fa2f8bf32fdf4e9a4ef805168467 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 22 Nov 2017 16:42:05 -0800 Subject: bpf: fix branch pruning logic when the verifier detects 
that a register contains a runtime constant and it's compared with another constant, it will prune exploration of the branch that is guaranteed not to be taken at runtime. This is all correct, but a malicious program may be constructed in such a way that it always has a constant comparison and the other branch is never taken under any conditions. In this case, such a path through the program will not be explored by the verifier. It won't be taken at run-time either, but since all instructions are JITed, the malicious program may cause JITs to complain about using reserved fields, etc. To fix the issue we have to track the instructions explored by the verifier and sanitize instructions that are dead at run time with NOPs. We cannot reject such dead code, since llvm generates it for valid C code; llvm simply doesn't do as much data flow analysis as the verifier does. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b61482d354a2..c561b986bab0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,7 +115,7 @@ struct bpf_insn_aux_data { struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ - int converted_op_size; /* the valid value width after perceived conversion */ + bool seen; /* this insn was processed by the verifier */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 308b0638ec5d..d4593571c404 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3827,6 +3827,7 @@ static int do_check(struct bpf_verifier_env *env) return err; regs = cur_regs(env); + env->insn_aux_data[insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -4022,6 +4023,7 @@ process_bpf_exit: return err; insn_idx++; + env->insn_aux_data[insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -4204,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + int i; if (cnt == 1) return 0; @@ -4213,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + for (i = off; i < off + cnt - 1; i++) + new_data[i].seen = true; env->insn_aux_data = new_data; vfree(old_data); return 0; @@ -4231,6 +4236,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +/* The verifier does more data flow analysis than llvm and will not explore + * branches that are dead at run time. Malicious programs can have dead code + * too. Therefore replace all dead at-run-time code with nops.
+ */ +static void sanitize_dead_code(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++) { + if (aux_data[i].seen) + continue; + memcpy(insn + i, &nop, sizeof(nop)); + } +} + /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -4557,6 +4581,9 @@ skip_full_check: while (!pop_stack(env, NULL, NULL)); free_states(env); + if (ret == 0) + sanitize_dead_code(env); + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); -- cgit v1.2.3 From 0c19f846d582af919db66a5914a0189f9f92c936 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 21 Nov 2017 10:22:25 -0500 Subject: net: accept UFO datagrams from tuntap and packet Tuntap and similar devices can inject GSO packets. Accept type VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively. Processes are expected to use feature negotiation such as TUNSETOFFLOAD to detect supported offload types and refrain from injecting other packets. This process breaks down with live migration: guest kernels do not renegotiate flags, so destination hosts need to expose all features that the source host does. Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677. This patch introduces nearly(*) no new code to simplify verification. It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP insertion and software UFO segmentation. It does not reinstate protocol stack support, hardware offload (NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception of VIRTIO_NET_HDR_GSO_UDP packets in tuntap. To support SKB_GSO_UDP reappearing in the stack, also reinstate logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD by squashing in commit 939912216fa8 ("net: skb_needs_check() removes CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1 ("net: avoid skb_warn_bad_offload false positives on UFO"). (*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id, ipv6_proxy_select_ident is changed to return a __be32 and this is assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted at the end of the enum to minimize code churn. Tested Booted a v4.13 guest kernel with QEMU. On a host kernel before this patch `ethtool -k eth0` shows UFO disabled. After the patch, it is enabled, same as on a v4.13 host kernel. A UFO packet sent from the guest appears on the tap device: host: nc -l -p -u 8000 & tcpdump -n -i tap0 guest: dd if=/dev/zero of=payload.txt bs=1 count=2000 nc -u 192.16.1.1 8000 < payload.txt Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds, packets arriving fragmented: ./with_tap_pair.sh ./tap_send_ufo tap0 tap1 (from https://github.com/wdebruij/kerneltools/tree/master/tests) Changes v1 -> v2 - simplified set_offload change (review comment) - documented test procedure Link: http://lkml.kernel.org/r/ Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.") Reported-by: Michal Kubecek Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/tap.c | 2 +- drivers/net/tun.c | 2 + include/linux/netdev_features.h | 4 +- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 2 + include/linux/virtio_net.h | 5 ++- include/net/ipv6.h | 1 + net/core/dev.c | 3 +- net/ipv4/af_inet.c | 12 +++++- net/ipv4/udp_offload.c | 49 ++++++++++++++++++++++-- net/ipv6/output_core.c | 31 +++++++++++++++ net/ipv6/udp_offload.c | 85 +++++++++++++++++++++++++++++++++++++++-- net/openvswitch/datapath.c | 14 +++++++ net/openvswitch/flow.c | 6 ++- net/sched/act_csum.c | 6 +++ 15 files changed, 209 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index b13890953ebb..e9489b88407c 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1077,7 +1077,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd, case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | - TUN_F_TSO_ECN)) + TUN_F_TSO_ECN | TUN_F_UFO)) return -EINVAL; rtnl_lock(); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 5a2ea78a008f..6a7bde9bc4b2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2370,6 +2370,8 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } + + arg &= ~TUN_F_UFO; } /* This gives the user a way to test for new features in future by diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index dc8b4896b77b..b1b0ca7ccb2b 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -54,8 +54,9 @@ enum { NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ + NETIF_F_GSO_UDP_BIT, /* ... 
UFO, deprecated except tuntap */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_ESP_BIT, + NETIF_F_GSO_UDP_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -132,6 +133,7 @@ enum { #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) #define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) #define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP) +#define NETIF_F_GSO_UDP __NETIF_F(GSO_UDP) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6b274bfe489f..ef789e1d679e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4140,6 +4140,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ed06e1c28fc7..bc486ef23f20 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -568,6 +568,8 @@ enum { SKB_GSO_SCTP = 1 << 14, SKB_GSO_ESP = 1 << 15, + + SKB_GSO_UDP = 1 << 16, }; #if BITS_PER_LONG > 32 diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 210034c896e3..f144216febc6 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -9,7 +9,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { - unsigned short gso_type = 0; + unsigned int gso_type = 0; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { @@ -19,6 +19,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, case VIRTIO_NET_HDR_GSO_TCPV6: gso_type = SKB_GSO_TCPV6; break; + case VIRTIO_NET_HDR_GSO_UDP: + gso_type = SKB_GSO_UDP; + break; default: return -EINVAL; } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ec14f0d5a3a1..f73797e2fa60 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -767,6 +767,7 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); diff --git a/net/core/dev.c b/net/core/dev.c index 8ee29f4f5fa9..bbba19112f02 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2746,7 +2746,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL; + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ce4aa827be05..f00499a46927 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1223,9 +1223,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool fixedid = false, gso_partial, encap; + bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = 
ERR_PTR(-EINVAL); const struct net_offload *ops; + unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; @@ -1260,6 +1261,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ @@ -1279,7 +1281,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) { + if (udpfrag) { + iph->frag_off = htons(offset >> 3); + if (skb->next) + iph->frag_off |= htons(IP_MF); + offset += skb->len - nhoff - ihl; + tot_len = skb->len - nhoff; + } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e360d55be555..01801b77bd0d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -187,16 +187,57 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); -static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + __wsum csum; + struct udphdr *uh; + struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { segs = skb_udp_tunnel_segment(skb, features, false); + goto out; + } + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + uh = udp_hdr(skb); + iph = ip_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: return segs; } @@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_segment = udp4_tunnel_segment, + .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, }, diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 4a7e5ffa5108..4fe7c90962dd 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,6 +31,37 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } +/* This function exists only for tap drivers that must support broken + * clients requesting UFO without specifying an IPv6 fragment ID. + * + * This is similar to ipv6_select_ident() but we use an independent hash + * seed to limit information leakage. + * + * The network header must be set before calling this. 
+ */ +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) +{ + static u32 ip6_proxy_idents_hashrnd __read_mostly; + struct in6_addr buf[2]; + struct in6_addr *addrs; + u32 id; + + addrs = skb_header_pointer(skb, + skb_network_offset(skb) + + offsetof(struct ipv6hdr, saddr), + sizeof(buf), buf); + if (!addrs) + return 0; + + net_get_random_once(&ip6_proxy_idents_hashrnd, + sizeof(ip6_proxy_idents_hashrnd)); + + id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, + &addrs[1], &addrs[0]); + return htonl(id); +} +EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); + __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 455fd4e39333..a0f89ad76f9d 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,15 +17,94 @@ #include #include "ip6_offload.h" -static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + unsigned int unfrag_ip6hlen, unfrag_len; + struct frag_hdr *fptr; + u8 *packet_start, *prevhdr; + u8 nexthdr; + u8 frag_hdr_sz = sizeof(struct frag_hdr); + __wsum csum; + int tnl_hlen; + int err; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features, true); + else { + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + + uh = udp_hdr(skb); + ipv6h = ipv6_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v6_check(skb->len, &ipv6h->saddr, + &ipv6h->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Check if there is enough headroom to insert fragment header. */ + tnl_hlen = skb_tnl_header_len(skb); + if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. + */ + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + unfrag_ip6hlen = err; + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb); + + /* Fragment the skb. 
ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); + } +out: return segs; } @@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_segment = udp6_tunnel_segment, + .gso_segment = udp6_ufo_fragment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, }, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0dab33fb9844..99cfafc2a139 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -308,6 +308,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { + unsigned short gso_type = skb_shinfo(skb)->gso_type; + struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; @@ -318,9 +320,21 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (segs == NULL) return -EINVAL; + if (gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_key_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + } + /* Queue all of the segments. */ skb = segs; do { + if (gso_type & SKB_GSO_UDP && skb != segs) + key = &later_key; + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 864ddb1e3642..dbe2379329c5 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -631,7 +631,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_LATER; return 0; } - if (nh->frag_off & htons(IP_MF)) + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; else key->ip.frag = OVS_FRAG_TYPE_NONE; @@ -747,6 +748,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (key->ip.frag == OVS_FRAG_TYPE_LATER) return 0; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { if (tcphdr_ok(skb)) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 1c40caadcff9..d836f998117b 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -229,6 +229,9 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, const struct iphdr *iph; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, @@ -282,6 +285,9 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, const struct ipv6hdr *ip6h; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, -- cgit v1.2.3 From fd2fa6c18b729e19c51240453a521f76c766247e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 22 Nov 2017 16:13:37 -0600 Subject: x86/PCI: Remove unused HyperTransport interrupt support There are no in-tree callers of ht_create_irq(), the driver interface for HyperTransport interrupts, left. Remove the unused entry point and all the supporting code. See 8b955b0dddb3 ("[PATCH] Initial generic hypertransport interrupt support"). 
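For reference, the removed driver interface would have been used roughly as in the sketch below — hypothetical code (foo_handler() and foo_setup_irq() are made-up names) built from the prototypes in the now-deleted include/linux/htirq.h; nothing in the tree does this any more, which is why it can all go:

#include <linux/interrupt.h>
#include <linux/htirq.h>
#include <linux/pci.h>

/* Hypothetical driver, for illustration only; no such caller exists. */
static irqreturn_t foo_handler(int irq, void *dev_id)
{
	/* acknowledge the device here */
	return IRQ_HANDLED;
}

static int foo_setup_irq(struct pci_dev *pdev)
{
	int irq;

	/* idx 0: the first of the device's HT interrupt messages */
	irq = ht_create_irq(pdev, 0);
	if (irq < 0)
		return irq;	/* negative error value on failure */

	if (request_irq(irq, foo_handler, 0, "foo", pdev)) {
		ht_destroy_irq(irq);
		return -EBUSY;
	}
	return 0;
}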
Signed-off-by: Bjorn Helgaas Signed-off-by: Thomas Gleixner Acked-by: "Eric W. Biederman" Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: linux-pci@vger.kernel.org Cc: Benjamin Herrenschmidt Link: https://lkml.kernel.org/r/20171122221337.3877.23362.stgit@bhelgaas-glaptop.roam.corp.google.com --- arch/x86/include/asm/hw_irq.h | 8 -- arch/x86/include/asm/hypertransport.h | 46 -------- arch/x86/include/asm/irqdomain.h | 6 -- arch/x86/kernel/apic/Makefile | 1 - arch/x86/kernel/apic/htirq.c | 198 ---------------------------------- arch/x86/kernel/apic/vector.c | 5 +- drivers/pci/Kconfig | 9 -- drivers/pci/Makefile | 3 - drivers/pci/htirq.c | 135 ----------------------- include/linux/htirq.h | 39 ------- include/linux/pci.h | 6 -- 11 files changed, 2 insertions(+), 454 deletions(-) delete mode 100644 arch/x86/include/asm/hypertransport.h delete mode 100644 arch/x86/kernel/apic/htirq.c delete mode 100644 drivers/pci/htirq.c delete mode 100644 include/linux/htirq.h (limited to 'include/linux') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b80e46733909..2851077b6051 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -99,14 +99,6 @@ struct irq_alloc_info { void *dmar_data; }; #endif -#ifdef CONFIG_HT_IRQ - struct { - int ht_pos; - int ht_idx; - struct pci_dev *ht_dev; - void *ht_update; - }; -#endif #ifdef CONFIG_X86_UV struct { int uv_limit; diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h deleted file mode 100644 index 5d55df352879..000000000000 --- a/arch/x86/include/asm/hypertransport.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_HYPERTRANSPORT_H -#define _ASM_X86_HYPERTRANSPORT_H - -/* - * Constants for x86 Hypertransport Interrupts. 
- */ - -#define HT_IRQ_LOW_BASE 0xf8000000 - -#define HT_IRQ_LOW_VECTOR_SHIFT 16 -#define HT_IRQ_LOW_VECTOR_MASK 0x00ff0000 -#define HT_IRQ_LOW_VECTOR(v) \ - (((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK) - -#define HT_IRQ_LOW_DEST_ID_SHIFT 8 -#define HT_IRQ_LOW_DEST_ID_MASK 0x0000ff00 -#define HT_IRQ_LOW_DEST_ID(v) \ - (((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK) - -#define HT_IRQ_LOW_DM_PHYSICAL 0x0000000 -#define HT_IRQ_LOW_DM_LOGICAL 0x0000040 - -#define HT_IRQ_LOW_RQEOI_EDGE 0x0000000 -#define HT_IRQ_LOW_RQEOI_LEVEL 0x0000020 - - -#define HT_IRQ_LOW_MT_FIXED 0x0000000 -#define HT_IRQ_LOW_MT_ARBITRATED 0x0000004 -#define HT_IRQ_LOW_MT_SMI 0x0000008 -#define HT_IRQ_LOW_MT_NMI 0x000000c -#define HT_IRQ_LOW_MT_INIT 0x0000010 -#define HT_IRQ_LOW_MT_STARTUP 0x0000014 -#define HT_IRQ_LOW_MT_EXTINT 0x0000018 -#define HT_IRQ_LOW_MT_LINT1 0x000008c -#define HT_IRQ_LOW_MT_LINT0 0x0000098 - -#define HT_IRQ_LOW_IRQ_MASKED 0x0000001 - - -#define HT_IRQ_HIGH_DEST_ID_SHIFT 0 -#define HT_IRQ_HIGH_DEST_ID_MASK 0x00ffffff -#define HT_IRQ_HIGH_DEST_ID(v) \ - ((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK) - -#endif /* _ASM_X86_HYPERTRANSPORT_H */ diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index f695cc6b8e1f..139feef467f7 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -56,10 +56,4 @@ extern void arch_init_msi_domain(struct irq_domain *domain); static inline void arch_init_msi_domain(struct irq_domain *domain) { } #endif -#ifdef CONFIG_HT_IRQ -extern void arch_init_htirq_domain(struct irq_domain *domain); -#else -static inline void arch_init_htirq_domain(struct irq_domain *domain) { } -#endif - #endif diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index a9e08924927e..a6fcaf16cdbf 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -12,7 +12,6 @@ obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_PCI_MSI) += msi.o -obj-$(CONFIG_HT_IRQ) += htirq.o obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c deleted file mode 100644 index b07075dce8b7..000000000000 --- a/arch/x86/kernel/apic/htirq.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Support Hypertransport IRQ - * - * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo - * Moved from arch/x86/kernel/apic/io_apic.c. - * Jiang Liu - * Add support of hierarchical irqdomain - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct irq_domain *htirq_domain; - -/* - * Hypertransport interrupt support - */ -static int -ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_data *parent = data->parent_data; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0) { - struct ht_irq_msg msg; - struct irq_cfg *cfg = irqd_cfg(data); - - fetch_ht_irq_msg(data->irq, &msg); - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | - HT_IRQ_LOW_DEST_ID_MASK); - msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | - HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - write_ht_irq_msg(data->irq, &msg); - } - - return ret; -} - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .irq_mask = mask_ht_irq, - .irq_unmask = unmask_ht_irq, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = ht_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) -{ - struct ht_irq_cfg *ht_cfg; - struct irq_alloc_info *info = arg; - struct pci_dev *dev; - irq_hw_number_t hwirq; - int ret; - - if (nr_irqs > 1 || !info) - return -EINVAL; - - dev = info->ht_dev; - hwirq = (info->ht_idx & 0xFF) | - PCI_DEVID(dev->bus->number, dev->devfn) << 8 | - (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; - if (irq_find_mapping(domain, hwirq) > 0) - return -EEXIST; - - ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); - if (!ht_cfg) - return -ENOMEM; - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); - if (ret < 0) { - kfree(ht_cfg); - return ret; - } - - /* Initialize msg to a value that will never match the first write. */ - ht_cfg->msg.address_lo = 0xffffffff; - ht_cfg->msg.address_hi = 0xffffffff; - ht_cfg->dev = info->ht_dev; - ht_cfg->update = info->ht_update; - ht_cfg->pos = info->ht_pos; - ht_cfg->idx = 0x10 + (info->ht_idx * 2); - irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, - handle_edge_irq, ht_cfg, "edge"); - - return 0; -} - -static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) -{ - struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); - - BUG_ON(nr_irqs != 1); - kfree(irq_data->chip_data); - irq_domain_free_irqs_top(domain, virq, nr_irqs); -} - -static int htirq_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) -{ - struct ht_irq_msg msg; - struct irq_cfg *cfg = irqd_cfg(irq_data); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((apic->irq_dest_mode == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((apic->irq_delivery_mode != dest_LowestPrio) ? 
- HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - write_ht_irq_msg(irq_data->irq, &msg); - return 0; -} - -static void htirq_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct ht_irq_msg msg; - - memset(&msg, 0, sizeof(msg)); - write_ht_irq_msg(irq_data->irq, &msg); -} - -static const struct irq_domain_ops htirq_domain_ops = { - .alloc = htirq_domain_alloc, - .free = htirq_domain_free, - .activate = htirq_domain_activate, - .deactivate = htirq_domain_deactivate, -}; - -void __init arch_init_htirq_domain(struct irq_domain *parent) -{ - struct fwnode_handle *fn; - - if (disable_apic) - return; - - fn = irq_domain_alloc_named_fwnode("PCI-HT"); - if (!fn) - goto warn; - - htirq_domain = irq_domain_create_tree(fn, &htirq_domain_ops, NULL); - irq_domain_free_fwnode(fn); - if (!htirq_domain) - goto warn; - - htirq_domain->parent = parent; - return; - -warn: - pr_warn("Failed to initialize irqdomain for HTIRQ.\n"); -} - -int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, - ht_irq_update_t *update) -{ - struct irq_alloc_info info; - - if (!htirq_domain) - return -ENOSYS; - - init_irq_alloc_info(&info, NULL); - info.ht_idx = idx; - info.ht_pos = pos; - info.ht_dev = dev; - info.ht_update = update; - - return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), - &info); -} - -void arch_teardown_ht_irq(unsigned int irq) -{ - irq_domain_free_irqs(irq, 1); -} diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 05c85e693a5d..6a823a25eaff 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1,5 +1,5 @@ /* - * Local APIC related interfaces to support IOAPIC, MSI, HT_IRQ etc. + * Local APIC related interfaces to support IOAPIC, MSI, etc. * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. @@ -601,7 +601,7 @@ int __init arch_probe_nr_irqs(void) nr_irqs = NR_VECTORS * nr_cpu_ids; nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; -#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) +#if defined(CONFIG_PCI_MSI) /* * for MSI and HT dyn irq */ @@ -663,7 +663,6 @@ int __init arch_early_irq_init(void) irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); - arch_init_htirq_domain(x86_vector_domain); BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index c32a77fc8b03..99ae5e30eabe 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -71,15 +71,6 @@ config XEN_PCIDEV_FRONTEND The PCI device frontend driver allows the kernel to import arbitrary PCI devices from a PCI backend to support PCI driver domains. -config HT_IRQ - bool "Interrupts on hypertransport devices" - default y - depends on PCI && X86_LOCAL_APIC - help - This allows native hypertransport devices to use interrupts. - - If unsure say Y. 
- config PCI_ATS bool diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 80adbdbcecce..ab0104e0ffac 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -24,9 +24,6 @@ endif # Build the PCI MSI interrupt support obj-$(CONFIG_PCI_MSI) += msi.o -# Build the Hypertransport interrupt support -obj-$(CONFIG_HT_IRQ) += htirq.o - obj-$(CONFIG_PCI_ATS) += ats.o obj-$(CONFIG_PCI_IOV) += iov.o diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c deleted file mode 100644 index bb88c26f5144..000000000000 --- a/drivers/pci/htirq.c +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * File: htirq.c - * Purpose: Hypertransport Interrupt Capability - * - * Copyright (C) 2006 Linux Networx - * Copyright (C) Eric Biederman - */ - -#include -#include -#include -#include -#include -#include - -/* Global ht irq lock. - * - * This is needed to serialize access to the data port in hypertransport - * irq capability. - * - * With multiple simultaneous hypertransport irq devices it might pay - * to make this more fine grained. But start with simple, stupid, and correct. - */ -static DEFINE_SPINLOCK(ht_irq_lock); - -void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) -{ - struct ht_irq_cfg *cfg = irq_get_handler_data(irq); - unsigned long flags; - - spin_lock_irqsave(&ht_irq_lock, flags); - if (cfg->msg.address_lo != msg->address_lo) { - pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx); - pci_write_config_dword(cfg->dev, cfg->pos + 4, msg->address_lo); - } - if (cfg->msg.address_hi != msg->address_hi) { - pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx + 1); - pci_write_config_dword(cfg->dev, cfg->pos + 4, msg->address_hi); - } - if (cfg->update) - cfg->update(cfg->dev, irq, msg); - spin_unlock_irqrestore(&ht_irq_lock, flags); - cfg->msg = *msg; -} - -void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) -{ - struct ht_irq_cfg *cfg = irq_get_handler_data(irq); - - *msg = cfg->msg; -} - -void mask_ht_irq(struct irq_data *data) -{ - struct ht_irq_cfg *cfg = irq_data_get_irq_handler_data(data); - struct ht_irq_msg msg = cfg->msg; - - msg.address_lo |= 1; - write_ht_irq_msg(data->irq, &msg); -} - -void unmask_ht_irq(struct irq_data *data) -{ - struct ht_irq_cfg *cfg = irq_data_get_irq_handler_data(data); - struct ht_irq_msg msg = cfg->msg; - - msg.address_lo &= ~1; - write_ht_irq_msg(data->irq, &msg); -} - -/** - * __ht_create_irq - create an irq and attach it to a device. - * @dev: The hypertransport device to find the irq capability on. - * @idx: Which of the possible irqs to attach to. - * @update: Function to be called when changing the htirq message - * - * The irq number of the new irq or a negative error value is returned. - */ -int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) -{ - int max_irq, pos, irq; - unsigned long flags; - u32 data; - - pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ); - if (!pos) - return -EINVAL; - - /* Verify the idx I want to use is in range */ - spin_lock_irqsave(&ht_irq_lock, flags); - pci_write_config_byte(dev, pos + 2, 1); - pci_read_config_dword(dev, pos + 4, &data); - spin_unlock_irqrestore(&ht_irq_lock, flags); - - max_irq = (data >> 16) & 0xff; - if (idx > max_irq) - return -EINVAL; - - irq = arch_setup_ht_irq(idx, pos, dev, update); - if (irq > 0) - dev_dbg(&dev->dev, "irq %d for HT\n", irq); - - return irq; -} -EXPORT_SYMBOL(__ht_create_irq); - -/** - * ht_create_irq - create an irq and attach it to a device. 
- * @dev: The hypertransport device to find the irq capability on. - * @idx: Which of the possible irqs to attach to. - * - * ht_create_irq needs to be called for all hypertransport devices - * that generate irqs. - * - * The irq number of the new irq or a negative error value is returned. - */ -int ht_create_irq(struct pci_dev *dev, int idx) -{ - return __ht_create_irq(dev, idx, NULL); -} -EXPORT_SYMBOL(ht_create_irq); - -/** - * ht_destroy_irq - destroy an irq created with ht_create_irq - * @irq: irq to be destroyed - * - * This reverses ht_create_irq removing the specified irq from - * existence. The irq should be free before this happens. - */ -void ht_destroy_irq(unsigned int irq) -{ - arch_teardown_ht_irq(irq); -} -EXPORT_SYMBOL(ht_destroy_irq); diff --git a/include/linux/htirq.h b/include/linux/htirq.h deleted file mode 100644 index 127c39d815ba..000000000000 --- a/include/linux/htirq.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_HTIRQ_H -#define LINUX_HTIRQ_H - -struct pci_dev; -struct irq_data; - -struct ht_irq_msg { - u32 address_lo; /* low 32 bits of the ht irq message */ - u32 address_hi; /* high 32 bits of the it irq message */ -}; - -typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, - struct ht_irq_msg *msg); - -struct ht_irq_cfg { - struct pci_dev *dev; - /* Update callback used to cope with buggy hardware */ - ht_irq_update_t *update; - unsigned pos; - unsigned idx; - struct ht_irq_msg msg; -}; - -/* Helper functions.. */ -void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); -void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); -void mask_ht_irq(struct irq_data *data); -void unmask_ht_irq(struct irq_data *data); - -/* The arch hook for getting things started */ -int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, - ht_irq_update_t *update); -void arch_teardown_ht_irq(unsigned int irq); - -/* For drivers of buggy hardware */ -int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update); - -#endif /* LINUX_HTIRQ_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index d16a7c037ec0..16287684dfe8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1484,12 +1484,6 @@ static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } static inline void pcie_ecrc_get_policy(char *str) { } #endif -#ifdef CONFIG_HT_IRQ -/* The functions a driver should call */ -int ht_create_irq(struct pci_dev *dev, int idx); -void ht_destroy_irq(unsigned int irq); -#endif /* CONFIG_HT_IRQ */ - #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); -- cgit v1.2.3 From 20b7035c66bacc909ae3ffe92c1a1ea7db99fe4f Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Fri, 24 Nov 2017 22:39:01 +0100 Subject: KVM: Let KVM_SET_SIGNAL_MASK work as advertised MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM API says, for the signal mask you set via KVM_SET_SIGNAL_MASK, that "any unblocked signal received [...] will cause KVM_RUN to return with -EINTR" and that "the signal will only be delivered if not blocked by the original signal mask". This, however, is only true when the calling task has a signal handler registered for the signal. If not, signal evaluation is short-circuited for SIG_IGN and SIG_DFL, and the signal is either ignored without KVM_RUN returning or the whole process is terminated.
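To illustrate the advertised semantics, here is a hypothetical userspace sketch (not part of this patch; allow_sigusr1_during_run() is a made-up helper, and the len value assumes the 8-byte kernel sigset_t of x86-64): SIGUSR1 stays blocked while the vCPU thread runs in userspace, but is unblocked in the mask handed to KVM_SET_SIGNAL_MASK, so it can kick the thread out of KVM_RUN:

#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_fd is an open KVM vCPU file descriptor. */
static int allow_sigusr1_during_run(int vcpu_fd)
{
	struct kvm_signal_mask *kmask;
	sigset_t blocked, run_mask;
	int ret;

	/* Blocked for normal userspace execution of this thread... */
	sigemptyset(&blocked);
	sigaddset(&blocked, SIGUSR1);
	pthread_sigmask(SIG_BLOCK, &blocked, NULL);

	kmask = malloc(sizeof(*kmask) + sizeof(sigset_t));
	if (!kmask)
		return -1;

	/* ...but nothing blocked while the vCPU sits in KVM_RUN. */
	sigemptyset(&run_mask);
	kmask->len = 8;			/* kernel sigset_t size; 8 on x86-64 */
	memcpy(kmask->sigset, &run_mask, 8);
	ret = ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, kmask);
	free(kmask);
	return ret;
}

With that setup, a SIGUSR1 sent to the vCPU thread should make KVM_RUN return -1 with errno == EINTR; before this fix, a thread with no SIGUSR1 handler registered would instead be killed outright (SIG_DFL for SIGUSR1 terminates the process) — exactly the short-circuit described above.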
Make KVM_SET_SIGNAL_MASK behave as advertised by utilizing logic similar to that in do_sigtimedwait() to avoid short-circuiting of signals. Signed-off-by: Jan H. Schönherr Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 7 ++----- arch/powerpc/kvm/powerpc.c | 7 ++----- arch/s390/kvm/kvm-s390.c | 7 ++----- arch/x86/kvm/x86.c | 7 ++----- include/linux/kvm_host.h | 3 +++ virt/kvm/arm/arm.c | 8 +++----- virt/kvm/kvm_main.c | 23 +++++++++++++++++++++++ 7 files changed, 37 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index d535edc01434..75fdeaa8c62f 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -445,10 +445,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = -EINTR; - sigset_t sigsaved; - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu); if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) @@ -480,8 +478,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_enable(); out: - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); return r; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6b6c53c42ac9..1915e86cef6f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1407,7 +1407,6 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r; - sigset_t sigsaved; if (vcpu->mmio_needed) { vcpu->mmio_needed = 0; @@ -1448,16 +1447,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) #endif } - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu); if (run->immediate_exit) r = -EINTR; else r = kvmppc_vcpu_run(run, vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); return r; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 98ad8b9e0360..9614aea5839b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3372,7 +3372,6 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int rc; - sigset_t sigsaved; if (kvm_run->immediate_exit) return -EINTR; @@ -3382,8 +3381,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu); if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { kvm_s390_vcpu_start(vcpu); @@ -3417,8 +3415,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) disable_cpu_timer_accounting(vcpu); store_regs(vcpu, kvm_run); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); vcpu->stat.exit_userspace++; return rc; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f49fe514d1b2..eee8e7faf1af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7267,12 +7267,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct fpu *fpu = ¤t->thread.fpu; int r; - sigset_t sigsaved; fpu__initialize(fpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + 
kvm_sigset_activate(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { @@ -7315,8 +7313,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) out: post_kvm_run_save(vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); return r; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2e754b7c282c..893d6d606cd0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -715,6 +715,9 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); +void kvm_sigset_activate(struct kvm_vcpu *vcpu); +void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); + void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index a6524ff27de4..a67c106d73f5 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -615,7 +615,6 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret; - sigset_t sigsaved; if (unlikely(!kvm_vcpu_initialized(vcpu))) return -ENOEXEC; @@ -633,8 +632,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (run->immediate_exit) return -EINTR; - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu); ret = 1; run->exit_reason = KVM_EXIT_UNKNOWN; @@ -769,8 +767,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_pmu_update_run(vcpu); } - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); + return ret; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2dd1a9ca4599..c01cff064ec5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2065,6 +2065,29 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); +void kvm_sigset_activate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->sigset_active) + return; + + /* + * This does a lockless modification of ->real_blocked, which is fine + * because, only current can change ->real_blocked and all readers of + * ->real_blocked don't care as long ->real_blocked is always a subset + * of ->blocked. + */ + sigprocmask(SIG_SETMASK, &vcpu->sigset, ¤t->real_blocked); +} + +void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->sigset_active) + return; + + sigprocmask(SIG_SETMASK, ¤t->real_blocked, NULL); + sigemptyset(¤t->real_blocked); +} + static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, grow; -- cgit v1.2.3 From 1751e8a6cb935e555fcdbcb9ab4f0446e322ca3e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 27 Nov 2017 13:05:09 -0800 Subject: Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. 
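As a hypothetical example of the resulting convention (fs_updates_atime() is made up; the flags are real): userspace still passes MS_RDONLY, MS_NOATIME and friends to mount(2), while kernel code inspecting the superblock afterwards uses the SB_* spellings of the same bits:

#include <linux/fs.h>

/* Illustration only; fs_updates_atime() is not an in-tree helper. */
static bool fs_updates_atime(const struct super_block *sb)
{
	/* Same bits userspace requested via MS_RDONLY / MS_NOATIME */
	return !(sb->s_flags & (SB_RDONLY | SB_NOATIME));
}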
The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/mtd/mtdsuper.c | 6 +-- drivers/staging/lustre/lustre/llite/file.c | 2 +- drivers/staging/lustre/lustre/llite/llite_lib.c | 14 +++---- fs/9p/vfs_super.c | 6 +-- fs/adfs/super.c | 4 +- fs/affs/amigaffs.c | 2 +- fs/affs/bitmap.c | 6 +-- fs/affs/super.c | 16 ++++---- fs/afs/super.c | 4 +- fs/befs/ChangeLog | 2 +- fs/befs/linuxvfs.c | 4 +- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/ioctl.c | 4 +- fs/btrfs/super.c | 50 ++++++++++++------------ fs/btrfs/volumes.c | 4 +- fs/ceph/super.c | 8 ++-- fs/cifs/cifs_fs_sb.h | 2 +- fs/cifs/cifsfs.c | 12 +++--- fs/cifs/cifsglob.h | 4 +- fs/cifs/inode.c | 2 +- fs/cifs/xattr.c | 8 ++-- fs/coda/inode.c | 4 +- fs/cramfs/inode.c | 4 +- fs/ecryptfs/main.c | 8 ++-- fs/efs/super.c | 4 +- fs/ext2/balloc.c | 4 +- fs/ext2/ialloc.c | 4 +- fs/ext2/super.c | 20 +++++----- fs/ext4/inode.c | 4 +- fs/ext4/super.c | 52 ++++++++++++------------- fs/f2fs/checkpoint.c | 10 ++--- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 2 +- fs/f2fs/recovery.c | 10 ++--- fs/f2fs/super.c | 28 ++++++------- fs/fat/fatent.c | 6 +-- fs/fat/inode.c | 8 ++-- fs/fat/misc.c | 2 +- fs/fat/namei_msdos.c | 2 +- fs/freevxfs/vxfs_super.c | 4 +- fs/fs-writeback.c | 2 +- fs/fuse/inode.c | 12 +++--- fs/gfs2/ops_fstype.c | 16 ++++---- fs/gfs2/super.c | 10 ++--- fs/gfs2/trans.c | 2 +- fs/hfs/mdb.c | 4 +- fs/hfs/super.c | 16 ++++---- fs/hfsplus/super.c | 22 +++++------ fs/hpfs/map.c | 2 +- fs/hpfs/super.c | 8 ++-- fs/inode.c | 10 ++--- fs/isofs/inode.c | 2 +- fs/jffs2/fs.c | 4 +- fs/jffs2/os-linux.h | 2 +- fs/jffs2/super.c | 4 +- fs/jfs/super.c | 10 ++--- fs/kernfs/mount.c | 2 +- fs/libfs.c | 6 +-- fs/locks.c | 2 +- fs/minix/inode.c | 4 +- fs/ncpfs/inode.c | 4 +- fs/nfs/dir.c | 2 +- fs/nfs/inode.c | 2 +- fs/nfs/internal.h | 2 +- fs/nfs/super.c | 22 +++++------ fs/nilfs2/segment.c | 2 +- fs/nilfs2/super.c | 24 ++++++------ fs/nilfs2/the_nilfs.c | 6 +-- fs/notify/fsnotify.c | 2 +- fs/nsfs.c | 2 +- fs/ntfs/super.c | 32 +++++++-------- fs/ocfs2/file.c | 2 +- fs/ocfs2/super.c | 28 ++++++------- fs/ocfs2/xattr.c | 2 +- fs/openpromfs/inode.c | 4 +- fs/orangefs/super.c | 8 ++-- fs/overlayfs/super.c | 10 ++--- fs/proc/inode.c | 2 +- fs/proc/root.c | 2 +- fs/proc_namespace.c | 8 ++-- fs/qnx4/inode.c | 4 +- fs/qnx6/inode.c | 4 +- fs/reiserfs/inode.c | 2 +- fs/reiserfs/journal.c | 6 +-- fs/reiserfs/prints.c | 4 +- fs/reiserfs/super.c | 18 ++++----- fs/reiserfs/xattr.c | 10 ++--- fs/romfs/super.c | 4 +- fs/squashfs/super.c | 4 +- fs/statfs.c | 6 +-- 
fs/sysfs/mount.c | 2 +- fs/sysv/inode.c | 2 +- fs/sysv/super.c | 2 +- fs/ubifs/file.c | 2 +- fs/ubifs/io.c | 2 +- fs/ubifs/super.c | 20 +++++----- fs/ubifs/ubifs.h | 4 +- fs/udf/super.c | 6 +-- fs/ufs/balloc.c | 8 ++-- fs/ufs/ialloc.c | 10 ++--- fs/ufs/super.c | 30 +++++++------- fs/xfs/xfs_log.c | 6 +-- fs/xfs/xfs_super.c | 8 ++-- fs/xfs/xfs_super.h | 2 +- include/linux/fs.h | 2 +- include/uapi/linux/bfs_fs.h | 2 +- ipc/mqueue.c | 2 +- mm/shmem.c | 10 ++--- security/apparmor/apparmorfs.c | 2 +- security/apparmor/include/lib.h | 2 +- 111 files changed, 417 insertions(+), 417 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index e43fea896d1e..d58a61c09304 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -79,14 +79,14 @@ static struct dentry *mount_mtd_aux(struct file_system_type *fs_type, int flags, pr_debug("MTDSB: New superblock for device %d (\"%s\")\n", mtd->index, mtd->name); - ret = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + ret = fill_super(sb, data, flags & SB_SILENT ? 1 : 0); if (ret < 0) { deactivate_locked_super(sb); return ERR_PTR(ret); } /* go */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; return dget(sb->s_root); /* new mountpoint for an already mounted superblock */ @@ -202,7 +202,7 @@ struct dentry *mount_mtd(struct file_system_type *fs_type, int flags, not_an_MTD_device: #endif /* CONFIG_BLOCK */ - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) printk(KERN_NOTICE "MTD: Attempt to mount non-MTD device \"%s\"\n", dev_name); diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index 2d6e64dea266..938b859b6650 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -1016,7 +1016,7 @@ static bool file_is_noatime(const struct file *file) if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return true; - if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return true; return false; diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index 65ac5128f005..8666f1e81ade 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -313,11 +313,11 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, } if (data->ocd_connect_flags & OBD_CONNECT_ACL) { - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; sbi->ll_flags |= LL_SBI_ACL; } else { LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; sbi->ll_flags &= ~LL_SBI_ACL; } @@ -660,7 +660,7 @@ void ll_kill_super(struct super_block *sb) struct ll_sb_info *sbi; /* not init sb ?*/ - if (!(sb->s_flags & MS_ACTIVE)) + if (!(sb->s_flags & SB_ACTIVE)) return; sbi = ll_s2sbi(sb); @@ -2039,8 +2039,8 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data) int err; __u32 read_only; - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { - read_only = *flags & MS_RDONLY; + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { + read_only = *flags & SB_RDONLY; err = obd_set_info_async(NULL, sbi->ll_md_exp, sizeof(KEY_READ_ONLY), KEY_READ_ONLY, sizeof(read_only), @@ -2053,9 +2053,9 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data) } if (read_only) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; else - 
sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; if (sbi->ll_flags & LL_SBI_VERBOSE) LCONSOLE_WARN("Remounted %s %s\n", profilenm, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 8b75463cb211..af03c2a901eb 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -94,13 +94,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; - sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME; if (!v9ses->cache) - sb->s_flags |= MS_SYNCHRONOUS; + sb->s_flags |= SB_SYNCHRONOUS; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif return 0; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index c9fdfb112933..cfda2c7caedc 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -213,7 +213,7 @@ static int parse_options(struct super_block *sb, char *options) static int adfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return parse_options(sb, data); } @@ -372,7 +372,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; int ret = -EINVAL; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 185d5ab7e986..0f0e6925e97d 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -453,7 +453,7 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...) pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); if (!sb_rdonly(sb)) pr_warn("Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; va_end(args); } diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 2b1399611d9e..5ba9ef2742f6 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -250,12 +250,12 @@ int affs_init_bitmap(struct super_block *sb, int *flags) int i, res = 0; struct affs_sb_info *sbi = AFFS_SB(sb); - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) return 0; if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -288,7 +288,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) if (affs_checksum_block(sb, bh)) { pr_warn("Bitmap %u invalid - mounting %s read only.\n", bm->bm_key, sb->s_id); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; goto out; } pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key); diff --git a/fs/affs/super.c b/fs/affs/super.c index 884bedab7266..1117e36134cc 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -356,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); if (!sbi) @@ -466,7 +466,7 @@ got_root: if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS || chksum == MUFS_DCOFS) && !sb_rdonly(sb)) { pr_notice("Dircache FS - mounting %s read only\n", sb->s_id); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } switch (chksum) { case MUFS_FS: @@ -488,7 +488,7 @@ got_root: /* fall thru */ case FS_OFS: affs_set_opt(sbi->s_flags, SF_OFS); - sb->s_flags |= MS_NOEXEC; + 
sb->s_flags |= SB_NOEXEC; break; case MUFS_DCOFS: case MUFS_INTLOFS: @@ -497,7 +497,7 @@ got_root: case FS_INTLOFS: affs_set_opt(sbi->s_flags, SF_INTL); affs_set_opt(sbi->s_flags, SF_OFS); - sb->s_flags |= MS_NOEXEC; + sb->s_flags |= SB_NOEXEC; break; default: pr_err("Unknown filesystem on device %s: %08X\n", @@ -513,7 +513,7 @@ got_root: sig, sig[3] + '0', blocksize); } - sb->s_flags |= MS_NODEV | MS_NOSUID; + sb->s_flags |= SB_NODEV | SB_NOSUID; sbi->s_data_blksize = sb->s_blocksize; if (affs_test_opt(sbi->s_flags, SF_OFS)) @@ -570,7 +570,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; memcpy(volume, sbi->s_volume, 32); if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, @@ -596,10 +596,10 @@ affs_remount(struct super_block *sb, int *flags, char *data) memcpy(sbi->s_volume, volume, 32); spin_unlock(&sbi->symlink_lock); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) affs_free_bitmap(sb); else res = affs_init_bitmap(sb, flags); diff --git a/fs/afs/super.c b/fs/afs/super.c index 875b5eb02242..d3f97da61bdf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -496,10 +496,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, if (ret < 0) goto error_sb; as = NULL; - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); - ASSERTCMP(sb->s_flags, &, MS_ACTIVE); + ASSERTCMP(sb->s_flags, &, SB_ACTIVE); afs_destroy_sbi(as); as = NULL; } diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog index 75a461cfaca6..16f2dfe8c2f7 100644 --- a/fs/befs/ChangeLog +++ b/fs/befs/ChangeLog @@ -365,7 +365,7 @@ Version 0.4 (2001-10-28) (fs/befs/super.c) * Tell the kernel to only mount befs read-only. - By setting the MS_RDONLY flag in befs_read_super(). + By setting the SB_RDONLY flag in befs_read_super(). Not that it was possible to write before. But now the kernel won't even try. (fs/befs/super.c) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a92355cc453b..ee236231cafa 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -841,7 +841,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { befs_warning(sb, "No write support. 
Marking filesystem read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* @@ -948,7 +948,7 @@ static int befs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) return -EINVAL; return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f7df5536ab61..51477a537c83 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2957,7 +2957,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) */ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) { - return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info); + return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); } static inline void free_fs_info(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 16045ea86fc1..f9e9f721efe2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1984,7 +1984,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, struct btrfs_bio *bbio = NULL; int ret; - ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); bio = btrfs_io_bio_alloc(1); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fd172a93d11a..d748ad1c3620 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1172,7 +1172,7 @@ again: if (!i_done || ret) goto out; - if (!(inode->i_sb->s_flags & MS_ACTIVE)) + if (!(inode->i_sb->s_flags & SB_ACTIVE)) goto out; /* @@ -1333,7 +1333,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * make sure we stop running if someone unmounts * the FS */ - if (!(inode->i_sb->s_flags & MS_ACTIVE)) + if (!(inode->i_sb->s_flags & SB_ACTIVE)) break; if (btrfs_defrag_cancelled(fs_info)) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 65af029559b5..305cae7444a0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -107,7 +107,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) return; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; btrfs_info(fs_info, "forced readonly"); /* * Note that a running device replace operation is not @@ -137,7 +137,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function /* * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. + * under SB_RDONLY, then it is safe here. 
*/ if (errno == -EROFS && sb_rdonly(sb)) return; @@ -168,7 +168,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) + if (sb->s_flags & SB_BORN) btrfs_handle_error(fs_info); } @@ -625,7 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL - info->sb->s_flags |= MS_POSIXACL; + info->sb->s_flags |= SB_POSIXACL; break; #else btrfs_err(info, "support for ACL not compiled in!"); @@ -633,7 +633,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_noacl: - info->sb->s_flags &= ~MS_POSIXACL; + info->sb->s_flags &= ~SB_POSIXACL; break; case Opt_notreelog: btrfs_set_and_info(info, NOTREELOG, @@ -851,7 +851,7 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) { btrfs_err(info, "nologreplay must be used with ro mount option"); ret = -EINVAL; @@ -1147,7 +1147,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; #ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; @@ -1180,7 +1180,7 @@ static int btrfs_fill_super(struct super_block *sb, } cleancache_init_fs(sb); - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; return 0; fail_close: @@ -1277,7 +1277,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); - if (!(info->sb->s_flags & MS_POSIXACL)) + if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); @@ -1409,11 +1409,11 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { - if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + if (flags & SB_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY, device_name, newargs); } else { - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY, device_name, newargs); if (IS_ERR(mnt)) { root = ERR_CAST(mnt); @@ -1565,7 +1565,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, u64 subvol_objectid = 0; int error = 0; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data, mode, fs_type, @@ -1619,13 +1619,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) goto error_fs_info; - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { error = -EACCES; goto error_close_devices; } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); @@ -1635,7 +1635,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { 
btrfs_close_devices(fs_devices); free_fs_info(fs_info); - if ((flags ^ s->s_flags) & MS_RDONLY) + if ((flags ^ s->s_flags) & SB_RDONLY) error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); @@ -1702,11 +1702,11 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, { if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || - (flags & MS_RDONLY))) { + (flags & SB_RDONLY))) { /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, (atomic_read(&fs_info->defrag_running) == 0)); - if (flags & MS_RDONLY) + if (flags & SB_RDONLY) sync_filesystem(fs_info->sb); } } @@ -1766,10 +1766,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { /* * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. @@ -1781,10 +1781,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) /* avoid complains from lockdep et al. */ up(&fs_info->uuid_tree_rescan_sem); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* - * Setting MS_RDONLY will put the cleaner thread to + * Setting SB_RDONLY will put the cleaner thread to * sleep at the next loop if it's already active. * If it's already asleep, we'll leave unused block * groups on disk until we're mounted read-write again @@ -1856,7 +1856,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; set_bit(BTRFS_FS_OPEN, &fs_info->flags); } @@ -1866,9 +1866,9 @@ out: return 0; restore: - /* We've hit an error - don't reset MS_RDONLY */ + /* We've hit an error - don't reset SB_RDONLY */ if (sb_rdonly(sb)) - old_flags |= MS_RDONLY; + old_flags |= SB_RDONLY; sb->s_flags = old_flags; fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f1ecb938ba4d..925070b9ce03 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2384,7 +2384,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; ret = btrfs_prepare_sprout(fs_info); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2497,7 +2497,7 @@ error_sysfs: btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); error_trans: if (seeding_dev) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); rcu_string_free(device->name); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fe9fbb3f13f7..a62d2a9841dc 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -331,11 +331,11 @@ static int parse_fsopt_token(char *c, void *private) break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: - fsopt->sb_flags |= MS_POSIXACL; + fsopt->sb_flags |= SB_POSIXACL; break; #endif case Opt_noacl: - fsopt->sb_flags &= ~MS_POSIXACL; + fsopt->sb_flags &= ~SB_POSIXACL; break; default: BUG_ON(token); @@ -520,7 +520,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nopoolperm"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - if (fsopt->sb_flags & MS_POSIXACL) + if (fsopt->sb_flags & SB_POSIXACL) seq_puts(m, ",acl"); else 
seq_puts(m, ",noacl"); @@ -988,7 +988,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, dout("ceph_mount\n"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - flags |= MS_POSIXACL; + flags |= SB_POSIXACL; #endif err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index cbd216b57239..350fa55a1bf7 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -42,7 +42,7 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ -#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of SB_POSIXACL in mnt_cifs_flags */ #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ #define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8c8b75d33f31..31b7565b1617 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -125,7 +125,7 @@ cifs_read_super(struct super_block *sb) tcon = cifs_sb_master_tcon(cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files) sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -497,7 +497,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) seq_puts(s, ",dynperm"); - if (root->d_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_puts(s, ",mfsymlinks"); @@ -573,7 +573,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return 0; } @@ -708,7 +708,7 @@ cifs_do_mount(struct file_system_type *fs_type, rc = cifs_mount(cifs_sb, volume_info); if (rc) { - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", rc); root = ERR_PTR(rc); @@ -720,7 +720,7 @@ cifs_do_mount(struct file_system_type *fs_type, mnt_data.flags = flags; /* BB should we make this contingent on mount parm? 
*/ - flags |= MS_NODIRATIME | MS_NOATIME; + flags |= SB_NODIRATIME | SB_NOATIME; sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); if (IS_ERR(sb)) { @@ -739,7 +739,7 @@ cifs_do_mount(struct file_system_type *fs_type, goto out_super; } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; } root = cifs_get_root(volume_info, sb); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e185b2853eab..b16583594d1a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -559,8 +559,8 @@ struct smb_vol { CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) -#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ - MS_NODEV | MS_SYNCHRONOUS) +#define CIFS_MS_MASK (SB_RDONLY | SB_MANDLOCK | SB_NOEXEC | SB_NOSUID | \ + SB_NODEV | SB_SYNCHRONOUS) struct cifs_mnt_data { struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7c732cb44164..ecb99079363a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -985,7 +985,7 @@ retry_iget5_locked: } cifs_fattr_to_inode(inode, fattr); - if (sb->s_flags & MS_NOATIME) + if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 52f975d848a0..316af84674f1 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -117,7 +117,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, #ifdef CONFIG_CIFS_POSIX if (!value) goto out; - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, value, (const int)size, ACL_TYPE_ACCESS, cifs_sb->local_nls, @@ -129,7 +129,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, #ifdef CONFIG_CIFS_POSIX if (!value) goto out; - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, value, (const int)size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, @@ -266,7 +266,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_ACL_ACCESS: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, value, size, ACL_TYPE_ACCESS, cifs_sb->local_nls, @@ -276,7 +276,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_ACL_DEFAULT: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, value, size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6f0a6a4d5faa..97424cf206c0 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -96,7 +96,7 @@ void coda_destroy_inodecache(void) static int coda_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } @@ -188,7 +188,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) mutex_unlock(&vc->vc_mutex); sb->s_fs_info = vc; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; sb->s_blocksize = 4096; /* XXXXX what do we put here?? 
*/ sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 9a2ab419ba62..017b0ab19bc4 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -505,7 +505,7 @@ static void cramfs_kill_sb(struct super_block *sb) static int cramfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -592,7 +592,7 @@ static int cramfs_finalize_super(struct super_block *sb, struct inode *root; /* Set it all up.. */ - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, cramfs_root, 0); if (IS_ERR(root)) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index f2677c90d96e..025d66a705db 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -560,8 +560,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. */ - s->s_flags = flags & ~MS_POSIXACL; - s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; + s->s_flags = flags & ~SB_POSIXACL; + s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL; /** * Force a read-only eCryptfs mount when: @@ -569,7 +569,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * 2) The ecryptfs_encrypted_view mount option is specified */ if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; @@ -602,7 +602,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags ecryptfs_set_dentry_private(s->s_root, root_info); root_info->lower_path = path; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); out_free: diff --git a/fs/efs/super.c b/fs/efs/super.c index 65b59009555b..6ffb7ba1547a 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -116,7 +116,7 @@ static void destroy_inodecache(void) static int efs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -311,7 +311,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) #ifdef DEBUG pr_info("forcing read-only mode\n"); #endif - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } s->s_op = &efs_superblock_operations; s->s_export_op = &efs_export_ops; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index e1b3724bebf2..33db13365c5e 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -548,7 +548,7 @@ do_more: } mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); group_adjust_blocks(sb, block_group, desc, bh2, group_freed); @@ -1424,7 +1424,7 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); *errp = 0; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index a1fc3dabca41..6484199b35d1 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -145,7 +145,7 @@ void ext2_free_inode (struct inode * inode) else ext2_release_inode(sb, block_group, is_directory); mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); brelse(bitmap_bh); @@ -517,7 +517,7 @@ 
repeat_in_this_group: goto fail; got: mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); brelse(bitmap_bh); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e2b6be03e69b..7646818ab266 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -75,7 +75,7 @@ void ext2_error(struct super_block *sb, const char *function, if (test_opt(sb, ERRORS_RO)) { ext2_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } } @@ -656,7 +656,7 @@ static int ext2_setup_super (struct super_block * sb, ext2_msg(sb, KERN_ERR, "error: revision level too high, " "forcing read-only mode"); - res = MS_RDONLY; + res = SB_RDONLY; } if (read_only) return res; @@ -924,9 +924,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resuid = opts.s_resuid; sbi->s_resgid = opts.s_resgid; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); sb->s_iflags |= SB_I_CGROUPWB; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && @@ -1178,7 +1178,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_msg(sb, KERN_WARNING, "warning: mounting ext3 filesystem as ext2"); if (ext2_setup_super (sb, es, sb_rdonly(sb))) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ext2_write_super(sb); return 0; @@ -1341,9 +1341,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) "dax flag with busy inodes while remounting"); new_opts.s_mount_opt ^= EXT2_MOUNT_DAX; } - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out_set; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || !(sbi->s_mount_state & EXT2_VALID_FS)) goto out_set; @@ -1379,7 +1379,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) */ sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext2_setup_super (sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; spin_unlock(&sbi->s_lock); ext2_write_super(sb); @@ -1392,8 +1392,8 @@ out_set: sbi->s_mount_opt = new_opts.s_mount_opt; sbi->s_resuid = new_opts.s_resuid; sbi->s_resgid = new_opts.s_resgid; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0); spin_unlock(&sbi->s_lock); return 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0992d76f7ab1..7df2c5644e59 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2742,7 +2742,7 @@ static int ext4_writepages(struct address_space *mapping, * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. 
We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want @@ -5183,7 +5183,7 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); - if (inode->i_sb->s_flags & MS_LAZYTIME) + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0556cd036b69..7c46693a14d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -422,7 +422,7 @@ static void ext4_handle_error(struct super_block *sb) * before ->s_flags update */ smp_wmb(); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && @@ -635,7 +635,7 @@ void __ext4_abort(struct super_block *sb, const char *function, * before ->s_flags update */ smp_wmb(); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); @@ -1682,10 +1682,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sb->s_flags |= SB_I_VERSION; return 1; case Opt_lazytime: - sb->s_flags |= MS_LAZYTIME; + sb->s_flags |= SB_LAZYTIME; return 1; case Opt_nolazytime: - sb->s_flags &= ~MS_LAZYTIME; + sb->s_flags &= ~SB_LAZYTIME; return 1; } @@ -2116,7 +2116,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); - res = MS_RDONLY; + res = SB_RDONLY; } if (read_only) goto done; @@ -2429,7 +2429,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { + if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list.\n"); es->s_last_orphan = 0; @@ -2438,19 +2438,19 @@ static void ext4_orphan_cleanup(struct super_block *sb, return; } - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. 
*/ - if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) @@ -2539,7 +2539,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } } #endif - sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } /* @@ -2741,7 +2741,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; return 1; } @@ -3623,8 +3623,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_iflags |= SB_I_CGROUPWB; } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || @@ -4199,7 +4199,7 @@ no_journal: } if (ext4_setup_super(sb, es, sb_rdonly(sb))) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && @@ -4693,7 +4693,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & MS_RDONLY)) + if (!(sb->s_flags & SB_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); if (sb->s_bdev->bd_part) es->s_kbytes_written = @@ -5047,8 +5047,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, "Abort forced by user"); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
SB_POSIXACL : 0); es = sbi->s_es; @@ -5057,16 +5057,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if (*flags & MS_LAZYTIME) - sb->s_flags |= MS_LAZYTIME; + if (*flags & SB_LAZYTIME) + sb->s_flags |= SB_LAZYTIME; - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; } - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; @@ -5078,7 +5078,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition @@ -5140,7 +5140,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { @@ -5164,7 +5164,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA @@ -5182,7 +5182,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } #endif - *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); + *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. 
Opts: %s", orig_data); kfree(orig_data); return 0; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index dd2e73e10857..4aa69bc1c70a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -617,17 +617,17 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~MS_RDONLY; + sbi->sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= MS_ACTIVE; + sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ - quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -658,7 +658,7 @@ out: if (quota_enabled) f2fs_quota_off_umount(sbi->sb); #endif - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f4e094e816c6..6abf26c31d01 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2378,7 +2378,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) static inline int f2fs_readonly(struct super_block *sb) { - return sb->s_flags & MS_RDONLY; + return sb->s_flags & SB_RDONLY; } static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5d5bba462f26..d844dcb80570 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1005,7 +1005,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); gc_more: - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; goto stop; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 92c57ace1939..b3a14b0429f2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -598,16 +598,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int quota_enabled; #endif - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~MS_RDONLY; + sbi->sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= MS_ACTIVE; + sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ - quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -671,7 +671,7 @@ out: if (quota_enabled) f2fs_quota_off_umount(sbi->sb); #endif - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return ret ? 
ret: err; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a6c5dd450002..708155d9c2e4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -534,10 +534,10 @@ static int parse_options(struct super_block *sb, char *options) #endif break; case Opt_lazytime: - sb->s_flags |= MS_LAZYTIME; + sb->s_flags |= SB_LAZYTIME; break; case Opt_nolazytime: - sb->s_flags &= ~MS_LAZYTIME; + sb->s_flags &= ~SB_LAZYTIME; break; #ifdef CONFIG_QUOTA case Opt_quota: @@ -1168,7 +1168,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); - sbi->sb->s_flags |= MS_LAZYTIME; + sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); @@ -1236,7 +1236,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #endif /* recover superblocks we couldn't write due to previous RO mount */ - if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); f2fs_msg(sb, KERN_INFO, "Try to recover all the superblocks, ret: %d", err); @@ -1255,17 +1255,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; #ifdef CONFIG_QUOTA - if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; } else { /* dquot_resume needs RW */ - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { dquot_resume(sb, -1); } else if (f2fs_sb_has_quota_ino(sb)) { @@ -1288,7 +1288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); need_restart_gc = true; @@ -1300,7 +1300,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1314,7 +1314,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); destroy_flush_cmd_control(sbi, false); } else { @@ -1329,8 +1329,8 @@ skip: kfree(s_qf_names[i]); #endif /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); return 0; restore_gc: @@ -2472,8 +2472,8 @@ try_onemore: sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; sb->s_time_gran = 1; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 48b2336692f9..bac10de678cc 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -392,7 +392,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); brelse(c_bh); if (err) @@ -597,7 +597,7 @@ int fat_free_clusters(struct inode *inode, int cluster) } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { - if (sb->s_flags & MS_SYNCHRONOUS) { + if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; @@ -612,7 +612,7 @@ int fat_free_clusters(struct inode *inode, int cluster) fat_collect_bhs(bhs, &nr_bhs, &fatent); } while (cluster != FAT_ENT_EOF); - if (sb->s_flags & MS_SYNCHRONOUS) { + if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 30c52394a7ad..016c46b5e44c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -781,12 +781,12 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) { int new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); - *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); sync_filesystem(sb); /* make sure we update state on remount. */ - new_rdonly = *flags & MS_RDONLY; + new_rdonly = *flags & SB_RDONLY; if (new_rdonly != sb_rdonly(sb)) { if (new_rdonly) fat_set_state(sb, 0, 0); @@ -1352,7 +1352,7 @@ out: if (opts->unicode_xlate) opts->utf8 = 0; if (opts->nfs == FAT_NFS_NOSTALE_RO) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_export_op = &fat_export_ops_nostale; } @@ -1608,7 +1608,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, return -ENOMEM; sb->s_fs_info = sbi; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index acc3aa30ee54..f9bdc1e01c98 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -33,7 +33,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) 
if (opts->errors == FAT_ERRORS_PANIC) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 7d6a105d601b..d24d2758a363 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -646,7 +646,7 @@ static void setup(struct super_block *sb) { MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; sb->s_d_op = &msdos_dentry_operations; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; } static int msdos_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 455ce5b77e9b..f989efa051a0 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -116,7 +116,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) static int vxfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -220,7 +220,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) int ret = -EINVAL; u32 j; - sbp->s_flags |= MS_RDONLY; + sbp->s_flags |= SB_RDONLY; infp = kzalloc(sizeof(*infp), GFP_KERNEL); if (!infp) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 08f5debd07d1..cea4836385b7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -490,7 +490,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & MS_ACTIVE) || + if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode->i_state & (I_WB_SWITCH | I_FREEING) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2f504d615d92..624f18bbfd2b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -130,7 +130,7 @@ static void fuse_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - if (inode->i_sb->s_flags & MS_ACTIVE) { + if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); @@ -141,7 +141,7 @@ static void fuse_evict_inode(struct inode *inode) static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (*flags & MS_MANDLOCK) + if (*flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -1056,10 +1056,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) int is_bdev = sb->s_bdev != NULL; err = -EINVAL; - if (sb->s_flags & MS_MANDLOCK) + if (sb->s_flags & SB_MANDLOCK) goto err; - sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION); + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); if (!parse_fuse_opt(data, &d, is_bdev)) goto err; @@ -1109,9 +1109,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_dev_free; /* Handle umasking inside the fuse code */ - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) fc->dont_mask = 1; - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; fc->default_permissions = d.default_permissions; fc->allow_other = d.allow_other; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a3711f543405..ad55eb86a250 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1065,15 
+1065,15 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_args = *args; if (sdp->sd_args.ar_spectator) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - sb->s_flags |= MS_NOSEC; + sb->s_flags |= SB_NOSEC; sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_d_op = &gfs2_dops; @@ -1257,7 +1257,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, struct gfs2_args args; struct gfs2_sbd *sdp; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1313,15 +1313,15 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { error = -EBUSY; - if ((flags ^ s->s_flags) & MS_RDONLY) + if ((flags ^ s->s_flags) & SB_RDONLY) goto error_super; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0); if (error) goto error_super; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; bdev->bd_super = s; } @@ -1365,7 +1365,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, pr_warn("gfs2 mount does not exist\n"); return ERR_CAST(s); } - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9cb5c9a97d69..d81d46e19726 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1256,10 +1256,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) return -EINVAL; if (sdp->sd_args.ar_spectator) - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; - if ((sb->s_flags ^ *flags) & MS_RDONLY) { - if (*flags & MS_RDONLY) + if ((sb->s_flags ^ *flags) & SB_RDONLY) { + if (*flags & SB_RDONLY) error = gfs2_make_fs_ro(sdp); else error = gfs2_make_fs_rw(sdp); @@ -1269,9 +1269,9 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sdp->sd_args = args; if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; else - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); else diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index a85ca8b2c9ba..ca8b72d0a831 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -117,7 +117,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) kfree(tr); up_read(&sdp->sd_log_flush_lock); - if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); if (alloced) sb_end_intwrite(sdp->sd_vfs); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 894994d2c885..460281b1299e 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -204,11 +204,11 @@ int hfs_mdb_get(struct super_block *sb) attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. 
mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { pr_warn("filesystem is marked locked, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 7e0d65e9586c..173876782f73 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -114,18 +114,18 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + *flags |= SB_NODIRATIME; + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & MS_RDONLY)) { + if (!(*flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } } return 0; @@ -407,7 +407,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index e5bb2de2262a..1d458b716957 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -329,9 +329,9 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & MS_RDONLY)) { + if (!(*flags & SB_RDONLY)) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; int force = 0; @@ -340,20 +340,20 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (force) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } } return 0; @@ -455,16 +455,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. 
mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("Filesystem is marked locked, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !sb_rdonly(sb)) { pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } err = -EINVAL; diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index e0e60b148400..7c49f1ef0c85 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -288,7 +288,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, goto bail; } if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { - if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; + if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & SB_RDONLY) goto ok; hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 1516fb4e28f4..c45a3b9b9ac7 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -78,7 +78,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) else { pr_cont("; remounting read-only\n"); mark_dirty(s, 0); - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } } else if (sb_rdonly(s)) pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); @@ -457,7 +457,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sync_filesystem(s); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; hpfs_lock(s); uid = sbi->sb_uid; gid = sbi->sb_gid; @@ -488,7 +488,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; sbi->sb_err = errs; sbi->sb_timeshift = timeshift; - if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); + if (!(*flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; @@ -614,7 +614,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail4; } - s->s_flags |= MS_NOATIME; + s->s_flags |= SB_NOATIME; /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; diff --git a/fs/inode.c b/fs/inode.c index fd401028a309..03102d6ef044 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -416,7 +416,7 @@ void inode_add_lru(struct inode *inode) { if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) && - !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE) inode_lru_list_add(inode); } @@ -595,7 +595,7 @@ static void dispose_list(struct list_head *head) * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is - * called by superblock shutdown after having MS_ACTIVE flag removed, + * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. 
*/ @@ -1492,7 +1492,7 @@ static void iput_final(struct inode *inode) else drop = generic_drop_inode(inode); - if (!drop && (sb->s_flags & MS_ACTIVE)) { + if (!drop && (sb->s_flags & SB_ACTIVE)) { inode_add_lru(inode); spin_unlock(&inode->i_lock); return; @@ -1644,7 +1644,7 @@ int generic_update_time(struct inode *inode, struct timespec *time, int flags) if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + if (!(inode->i_sb->s_flags & SB_LAZYTIME) || (flags & S_VERSION)) iflags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, iflags); return 0; @@ -1691,7 +1691,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, if (IS_NOATIME(inode)) return false; - if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 447a24d77b89..bc258a4402f6 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -114,7 +114,7 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) return -EROFS; return 0; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index e96c6b05e43e..d8c274d39ddb 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -409,10 +409,10 @@ int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) mutex_unlock(&c->alloc_sem); } - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) jffs2_start_garbage_collect_thread(c); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 824e61ede465..c2fbec19c616 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -59,7 +59,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) } -#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY) +#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & SB_RDONLY) #define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) ) #ifndef CONFIG_JFFS2_FS_WRITEBUFFER diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 153f1c6eb169..f60dee7faf03 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -301,10 +301,10 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &jffs2_super_operations; sb->s_export_op = &jffs2_export_ops; - sb->s_flags = sb->s_flags | MS_NOATIME; + sb->s_flags = sb->s_flags | SB_NOATIME; sb->s_xattr = jffs2_xattr_handlers; #ifdef CONFIG_JFFS2_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif ret = jffs2_do_fill_super(sb, data, silent); return ret; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2f7b3af5b8b7..90373aebfdca 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -87,7 +87,7 @@ static void jfs_handle_error(struct super_block *sb) else if (sbi->flag & JFS_ERR_REMOUNT_RO) { jfs_err("ERROR: (device %s): remounting filesystem as read-only", sb->s_id); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* nothing is done for continue beyond marking the superblock dirty */ @@ -477,7 +477,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) return rc; } - if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { /* * Invalidate any previously read metadata. 
fsck may have * changed the on-disk data since we mounted r/o @@ -488,12 +488,12 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) ret = jfs_mount_rw(sb, 1); /* mark the fs r/w for quota activity */ - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; dquot_resume(sb, -1); return ret; } - if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { + if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; @@ -545,7 +545,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sbi->flag = flag; #ifdef CONFIG_JFS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif if (newLVSize) { diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 95a7c88baed9..26dd9a50f383 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -335,7 +335,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, deactivate_locked_super(sb); return ERR_PTR(error); } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; mutex_lock(&kernfs_mutex); list_add(&info->node, &root->supers); diff --git a/fs/libfs.c b/fs/libfs.c index 3aabe553fc45..7ff3cb904acd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -246,7 +246,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); - s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER, + s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, &init_user_ns, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -277,7 +277,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_d_op = dops; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); Enomem: @@ -578,7 +578,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); - mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); diff --git a/fs/locks.c b/fs/locks.c index 1bd71c4d663a..21b4dfa289ee 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -141,7 +141,7 @@ static inline bool is_remote_lock(struct file *filp) { - return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK)); + return likely(!(filp->f_path.dentry->d_sb->s_flags & SB_NOREMOTELOCK)); } static bool lease_breaking(struct file_lock *fl) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index b6829d679643..72e308c3e66b 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -125,9 +125,9 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) sync_filesystem(sb); ms = sbi->s_ms; - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { if (ms->s_state & MINIX_VALID_FS || !(sbi->s_mount_state & MINIX_VALID_FS)) return 0; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 129f1937fa2c..41de88cdc053 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -103,7 +103,7 @@ static void destroy_inodecache(void) static int ncp_remount(struct super_block *sb, int *flags, char* data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return 0; } @@ -547,7 +547,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int 
silent) else default_bufsize = 1024; - sb->s_flags |= MS_NODIRATIME; /* probably even noatime */ + sb->s_flags |= SB_NODIRATIME; /* probably even noatime */ sb->s_maxbytes = 0xFFFFFFFFU; sb->s_blocksize = 1024; /* Eh... Is this correct? */ sb->s_blocksize_bits = 10; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e51ae52ed14f..2f3f86726f5b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1256,7 +1256,7 @@ static int nfs_dentry_delete(const struct dentry *dentry) /* Unhash it, so that ->d_iput() would be called */ return 1; } - if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + if (!(dentry->d_sb->s_flags & SB_ACTIVE)) { /* Unhash it, so that ancestors of killed async unlink * files will be cleaned up during umount */ return 1; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 38b93d54c02e..b992d2382ffa 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -752,7 +752,7 @@ int nfs_getattr(const struct path *path, struct kstat *stat, * Note that we only have to check the vfsmount flags here: * - NFS always sets S_NOATIME by so checking it would give a * bogus result - * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is * no point in checking those. */ if ((path->mnt->mnt_flags & MNT_NOATIME) || diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ab17fd4700a..8357ff69962f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -10,7 +10,7 @@ #include #include -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) +#define NFS_MS_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 43cadb28db6e..29bacdc56f6a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -813,9 +813,9 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) */ seq_printf(m, "\n\topts:\t"); seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw"); - seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : ""); nfs_show_mount_options(m, nfss, 1); seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); @@ -2296,11 +2296,11 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) /* * noac is a special case. It implies -o sync, but that's not * necessarily reflected in the mtab options. do_remount_sb - * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the * remount options, so we have to explicitly reset it. */ if (data->flags & NFS_MOUNT_NOAC) - *flags |= MS_SYNCHRONOUS; + *flags |= SB_SYNCHRONOUS; /* compare new mount options with old ones */ error = nfs_compare_remount_data(nfss, data); @@ -2349,7 +2349,7 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; sb->s_time_gran = 1; sb->s_export_op = &nfs_export_ops; } @@ -2379,7 +2379,7 @@ static void nfs_clone_super(struct super_block *sb, /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. 
*/ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; } nfs_initialise_sb(sb); @@ -2600,11 +2600,11 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + sb_mntdata.mntflags |= SB_SYNCHRONOUS; if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) - if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + if (mount_info->cloned->sb->s_flags & SB_SYNCHRONOUS) + sb_mntdata.mntflags |= SB_SYNCHRONOUS; /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); @@ -2641,7 +2641,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, if (error) goto error_splat_root; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; out: return mntroot; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f572538dcc4f..9f3ffba41533 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1979,7 +1979,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; - int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); + int during_mount = !(sci->sc_super->s_flags & SB_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3ce20cd44a20..3073b646e1ba 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -141,7 +141,7 @@ void __nilfs_error(struct super_block *sb, const char *function, if (nilfs_test_opt(nilfs, ERRORS_RO)) { printk(KERN_CRIT "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } } @@ -869,7 +869,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, /* FS independent flags */ #ifdef NILFS_ATIME_DISABLE - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; #endif nilfs_set_default_options(sb, sbp); @@ -1133,7 +1133,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL); err = -EINVAL; @@ -1143,12 +1143,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { /* Shutting down log writer */ nilfs_detach_log_writer(sb); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * Remounting a valid RW partition RDONLY, so set @@ -1178,7 +1178,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); @@ -1212,7 +1212,7 @@ static int nilfs_parse_snapshot_option(const char *option, const char *msg = NULL; int err; - if (!(sd->flags & MS_RDONLY)) { + if (!(sd->flags & SB_RDONLY)) { msg = "read-only option is not specified"; goto parse_error; } @@ -1286,7 +1286,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, struct dentry *root_dentry; int err, s_new = false; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1327,14 +1327,14 @@ nilfs_mount(struct file_system_type *fs_type, 
int flags, snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); sb_set_blocksize(s, block_size(sd.bdev)); - err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (err) goto failed_super; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } else if (!sd.cno) { if (nilfs_tree_is_busy(s->s_root)) { - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { nilfs_msg(s, KERN_ERR, "the device already has a %s mount.", sb_rdonly(s) ? "read-only" : "read/write"); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index afebb5067cec..1a85317e83f0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -220,7 +220,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) if (!valid_fs) { nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs"); - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { nilfs_msg(sb, KERN_INFO, "recovery required for readonly filesystem"); nilfs_msg(sb, KERN_INFO, @@ -286,7 +286,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) if (valid_fs) goto skip_recovery; - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { __u64 features; if (nilfs_test_opt(nilfs, NORECOVERY)) { @@ -309,7 +309,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) err = -EROFS; goto failed_unload; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } else if (nilfs_test_opt(nilfs, NORECOVERY)) { nilfs_msg(sb, KERN_ERR, "recovery cancelled because norecovery option was specified for a read/write mount"); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 81d8959b6aef..219b269c737e 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -67,7 +67,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) /* * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually + * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ diff --git a/fs/nsfs.c b/fs/nsfs.c index ef243e14b6eb..7c6f76d29f56 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -255,5 +255,5 @@ void __init nsfs_init(void) nsfs_mnt = kern_mount(&nsfs); if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); - nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; + nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 3f70f041dbe9..bb7159f697f2 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -473,7 +473,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; #else /* NTFS_RW */ /* * For the read-write compiled driver, if we are remounting read-write, @@ -487,7 +487,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) * When remounting read-only, mark the volume clean if no volume errors * have occurred. */ - if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { static const char *es = ". Cannot remount read-write."; /* Remounting read-write. */ @@ -548,7 +548,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) NVolSetErrors(vol); return -EROFS; } - } else if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { + } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { /* Remounting read-only. 
*/ if (!NVolErrors(vol)) { if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) @@ -1799,7 +1799,7 @@ static bool load_system_files(ntfs_volume *vol) es3); goto iput_mirr_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", !vol->mftmirr_ino ? es1 : es2, es3); } else @@ -1937,7 +1937,7 @@ get_ctx_vol_failed: es1, es2); goto iput_vol_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1974,7 +1974,7 @@ get_ctx_vol_failed: } goto iput_logfile_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2019,7 +2019,7 @@ get_ctx_vol_failed: es1, es2); goto iput_root_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2042,7 +2042,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * Do not set NVolErrors() because ntfs_remount() might manage * to set the dirty flag in which case all would be well. @@ -2055,7 +2055,7 @@ get_ctx_vol_failed: * If (still) a read-write mount, set the NT4 compatibility flag on * newer NTFS version volumes. */ - if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) && + if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { static const char *es1 = "Failed to set NT4 compatibility flag"; static const char *es2 = ". Run chkdsk."; @@ -2069,7 +2069,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif @@ -2087,7 +2087,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2128,7 +2128,7 @@ get_ctx_vol_failed: es1, es2); goto iput_quota_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2150,7 +2150,7 @@ get_ctx_vol_failed: goto iput_quota_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } /* @@ -2171,7 +2171,7 @@ get_ctx_vol_failed: es1, es2); goto iput_usnjrnl_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2194,7 +2194,7 @@ get_ctx_vol_failed: goto iput_usnjrnl_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2728,7 +2728,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) lockdep_off(); ntfs_debug("Entering."); #ifndef NTFS_RW - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; #endif /* ! NTFS_RW */ /* Allocate a new ntfs_volume and place it in sb->s_fs_info. 
*/ sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index dc455d45a66a..a1d051055472 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -227,7 +227,7 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if ((inode->i_flags & S_NOATIME) || - ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))) return 0; /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 040bbb6a6e4b..80efa5699fb0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -675,9 +675,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) } /* We're going to/from readonly mode. */ - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; @@ -691,8 +691,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto unlock_osb; } - if (*flags & MS_RDONLY) { - sb->s_flags |= MS_RDONLY; + if (*flags & SB_RDONLY) { + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { @@ -709,14 +709,14 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto unlock_osb; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ - if (!ret && !(*flags & MS_RDONLY)) { + if (!ret && !(*flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else @@ -724,7 +724,7 @@ unlock_osb: if (ret < 0) { /* Return back changes... */ spin_lock(&osb->osb_lock); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); goto out; @@ -744,9 +744,9 @@ unlock_osb: if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); } out: return ret; @@ -1057,10 +1057,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | - ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? 
SB_POSIXACL : 0); - /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + /* Hard readonly mode only if: bdev_read_only, SB_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { if (!sb_rdonly(sb)) { @@ -2057,7 +2057,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; /* this is needed to support O_LARGEFILE */ cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); @@ -2568,7 +2568,7 @@ static int ocfs2_handle_error(struct super_block *sb) return rv; pr_crit("OCFS2: File system is now read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ocfs2_set_ro_flag(osb, 0); } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 5fdf269ba82e..c5898c59d411 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -901,7 +901,7 @@ static int ocfs2_xattr_list_entry(struct super_block *sb, case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: - if (!(sb->s_flags & MS_POSIXACL)) + if (!(sb->s_flags & SB_POSIXACL)) return 0; break; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 13215f26e321..2200662a9bf1 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -369,7 +369,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) static int openprom_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } @@ -386,7 +386,7 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) struct op_inode_info *oi; int ret; - s->s_flags |= MS_NOATIME; + s->s_flags |= SB_NOATIME; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = OPENPROM_SUPER_MAGIC; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 366750eef201..36f1390b5ed7 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -40,7 +40,7 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root) { struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb); - if (root->d_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (orangefs_sb->flags & ORANGEFS_OPT_INTR) seq_puts(m, ",intr"); @@ -60,7 +60,7 @@ static int parse_mount_options(struct super_block *sb, char *options, * Force any potential flags that might be set from the mount * to zero, ie, initialize to unset. */ - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; @@ -73,7 +73,7 @@ static int parse_mount_options(struct super_block *sb, char *options, token = match_token(p, tokens, args); switch (token) { case Opt_acl: - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; break; case Opt_intr: orangefs_sb->flags |= ORANGEFS_OPT_INTR; @@ -507,7 +507,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, ret = orangefs_fill_sb(sb, &new_op->downcall.resp.fs_mount, data, - flags & MS_SILENT ? 1 : 0); + flags & SB_SILENT ? 
1 : 0); if (ret) { d = ERR_PTR(ret); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index be03578181d2..288d20f9a55a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -326,7 +326,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ofs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && ovl_force_readonly(ofs)) + if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) return -EROFS; return 0; @@ -1190,7 +1190,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_err; if (!ofs->workdir) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth; sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran; @@ -1203,7 +1203,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ofs->upper_mnt) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; else if (ofs->upper_mnt->mnt_sb != ofs->same_sb) ofs->same_sb = NULL; @@ -1213,7 +1213,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_free_oe; if (!ofs->indexdir) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* Show index=off/on in /proc/mounts for any of the reasons above */ @@ -1227,7 +1227,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ovl_super_operations; sb->s_xattr = ovl_xattr_handlers; sb->s_fs_info = ofs; - sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; + sb->s_flags |= SB_POSIXACL | SB_NOREMOTELOCK; err = -ENOMEM; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 225f541f7078..dd0f82622427 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -483,7 +483,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent) /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; diff --git a/fs/proc/root.c b/fs/proc/root.c index 4e42aba97f2e..ede8e64974be 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -91,7 +91,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, { struct pid_namespace *ns; - if (flags & MS_KERNMOUNT) { + if (flags & SB_KERNMOUNT) { ns = data; data = NULL; } else { diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7b635d173213..b786840facd9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -45,10 +45,10 @@ struct proc_fs_info { static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { - { MS_SYNCHRONOUS, ",sync" }, - { MS_DIRSYNC, ",dirsync" }, - { MS_MANDLOCK, ",mand" }, - { MS_LAZYTIME, ",lazytime" }, + { SB_SYNCHRONOUS, ",sync" }, + { SB_DIRSYNC, ",dirsync" }, + { SB_MANDLOCK, ",mand" }, + { SB_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3a67cfb142d8..3d46fe302fcb 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -47,7 +47,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -199,7 +199,7 @@ static int qnx4_fill_super(struct 
super_block *s, void *data, int silent) s->s_op = &qnx4_sops; s->s_magic = QNX4_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 1192422a1c56..4aeb26bcb4d0 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -56,7 +56,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root) static int qnx6_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -427,7 +427,7 @@ mmi_success: } s->s_op = &qnx6_sops; s->s_magic = QNX6_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ /* ease the later tree level calculations */ sbi = QNX6_SB(s); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 11a48affa882..b13fc024d2ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2106,7 +2106,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, journal_end(th); goto out_inserted_sd; } - } else if (inode->i_sb->s_flags & MS_POSIXACL) { + } else if (inode->i_sb->s_flags & SB_POSIXACL) { reiserfs_warning(inode->i_sb, "jdm-13090", "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 69ff280bdfe8..70057359fbaf 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1960,7 +1960,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, /* * Cancel flushing of old commits. Note that neither of these works * will be requeued because superblock is being shutdown and doesn't - * have MS_ACTIVE set. + * have SB_ACTIVE set. */ reiserfs_cancel_old_flush(sb); /* wait for all commits to finish */ @@ -4302,7 +4302,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) * Avoid queueing work when sb is being shut down. Transaction * will be flushed on journal shutdown. */ - if (sb->s_flags & MS_ACTIVE) + if (sb->s_flags & SB_ACTIVE) queue_delayed_work(REISERFS_SB(sb)->commit_wq, &journal->j_work, HZ / 10); } @@ -4393,7 +4393,7 @@ void reiserfs_abort_journal(struct super_block *sb, int errno) if (!journal->j_errno) journal->j_errno = errno; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; set_bit(J_ABORTED, &journal->j_state); #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 64f49cafbc5b..7e288d97adcb 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -390,7 +390,7 @@ void __reiserfs_error(struct super_block *sb, const char *id, return; reiserfs_info(sb, "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, -EIO); } @@ -409,7 +409,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, error_buf); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, errno); } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5464ec517702..020c9cacbb2f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -121,7 +121,7 @@ void reiserfs_schedule_old_flush(struct super_block *s) * Avoid scheduling flush when sb is being shut down. It can race * with journal shutdown and free still queued delayed work. 
*/ - if (sb_rdonly(s) || !(s->s_flags & MS_ACTIVE)) + if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE)) return; spin_lock(&sbi->old_work_lock); @@ -252,11 +252,11 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - if (s->s_flags & MS_ACTIVE) { + if (s->s_flags & SB_ACTIVE) { ms_active_set = 0; } else { ms_active_set = 1; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } /* Turn on quotas so that they are updated correctly */ for (i = 0; i < REISERFS_MAXQUOTAS; i++) { @@ -411,7 +411,7 @@ static int finish_unfinished(struct super_block *s) reiserfs_write_lock(s); if (ms_active_set) /* Restore the flag back */ - s->s_flags &= ~MS_ACTIVE; + s->s_flags &= ~SB_ACTIVE; #endif pathrelse(&path); if (done) @@ -1521,7 +1521,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) goto out_err_unlock; } - if (*mount_flags & MS_RDONLY) { + if (*mount_flags & SB_RDONLY) { reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); /* remount read-only */ @@ -1567,7 +1567,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); /* now it is safe to call journal_begin */ - s->s_flags &= ~MS_RDONLY; + s->s_flags &= ~SB_RDONLY; err = journal_begin(&th, s, 10); if (err) goto out_err_unlock; @@ -1575,7 +1575,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - s->s_flags &= ~MS_RDONLY; + s->s_flags &= ~SB_RDONLY; set_sb_umount_state(rs, REISERFS_ERROR_FS); if (!old_format_only(s)) set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); @@ -1590,7 +1590,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) goto out_err_unlock; reiserfs_write_unlock(s); - if (!(*mount_flags & MS_RDONLY)) { + if (!(*mount_flags & SB_RDONLY)) { dquot_resume(s, -1); reiserfs_write_lock(s); finish_unfinished(s); @@ -2055,7 +2055,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) { SWARN(silent, s, "clm-7000", "Detected readonly device, marking FS readonly"); - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } args.objectid = REISERFS_ROOT_OBJECTID; args.dirid = REISERFS_ROOT_PARENT_OBJECTID; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 46492fb37a4c..5dbf5324bdda 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -959,7 +959,7 @@ int reiserfs_lookup_privroot(struct super_block *s) /* * We need to take a copy of the mount flags since things like - * MS_RDONLY don't get set until *after* we're called. + * SB_RDONLY don't get set until *after* we're called. * mount_flags != mount_options */ int reiserfs_xattr_init(struct super_block *s, int mount_flags) @@ -971,7 +971,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (err) goto error; - if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) { inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); inode_unlock(d_inode(s->s_root)); @@ -999,11 +999,11 @@ error: clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); } - /* The super_block MS_POSIXACL must mirror the (no)acl mount option. 
*/ + /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */ if (reiserfs_posixacl(s)) - s->s_flags |= MS_POSIXACL; + s->s_flags |= SB_POSIXACL; else - s->s_flags &= ~MS_POSIXACL; + s->s_flags &= ~SB_POSIXACL; return err; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 0186fe6d39f3..8f06fd1f3d69 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -451,7 +451,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int romfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -502,7 +502,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = 0xFFFFFFFF; sb->s_magic = ROMFS_MAGIC; - sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_op = &romfs_super_ops; #ifdef CONFIG_ROMFS_ON_MTD diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index cf01e15a7b16..8a73b97217c8 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -195,7 +195,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) (u64) le64_to_cpu(sblk->id_table_start)); sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; err = -ENOMEM; @@ -373,7 +373,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int squashfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } diff --git a/fs/statfs.c b/fs/statfs.c index b072a8bab71a..5b2a24f0f263 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -35,11 +35,11 @@ static int flags_by_mnt(int mnt_flags) static int flags_by_sb(int s_flags) { int flags = 0; - if (s_flags & MS_SYNCHRONOUS) + if (s_flags & SB_SYNCHRONOUS) flags |= ST_SYNCHRONOUS; - if (s_flags & MS_MANDLOCK) + if (s_flags & SB_MANDLOCK) flags |= ST_MANDLOCK; - if (s_flags & MS_RDONLY) + if (s_flags & SB_RDONLY) flags |= ST_RDONLY; return flags; } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 20b8f82e115b..fb49510c5dcf 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -30,7 +30,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, void *ns; bool new_sb; - if (!(flags & MS_KERNMOUNT)) { + if (!(flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return ERR_PTR(-EPERM); } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3c47b7d5d4cf..bec9f79adb25 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -63,7 +63,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); if (sbi->s_forced_ro) - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 0d56e486b392..89765ddfb738 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -333,7 +333,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size) /* set up enough so that it can read an inode */ sb->s_op = &sysv_sops; if (sbi->s_forced_ro) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (sbi->s_truncate) sb->s_d_op = &sysv_dentry_operations; root_inode = sysv_iget(sb, SYSV_ROOT_INO); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a02aa59d1e24..dfe85069586e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1406,7 +1406,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & 
MS_LAZYTIME)) + if (!(inode->i_sb->s_flags & SB_LAZYTIME)) iflags |= I_DIRTY_SYNC; release = ui->dirty; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3be28900bf37..fe77e9625e84 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -84,7 +84,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) if (!c->ro_error) { c->ro_error = 1; c->no_chk_data_crc = 0; - c->vfs_sb->s_flags |= MS_RDONLY; + c->vfs_sb->s_flags |= SB_RDONLY; ubifs_warn(c, "switched to read-only mode, error %d", err); dump_stack(); } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 7503e7cdf870..0beb285b143d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -968,7 +968,7 @@ static int parse_standard_option(const char *option) pr_notice("UBIFS: parse %s\n", option); if (!strcmp(option, "sync")) - return MS_SYNCHRONOUS; + return SB_SYNCHRONOUS; return 0; } @@ -1160,8 +1160,8 @@ static int mount_ubifs(struct ubifs_info *c) size_t sz; c->ro_mount = !!sb_rdonly(c->vfs_sb); - /* Suppress error messages while probing if MS_SILENT is set */ - c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); + /* Suppress error messages while probing if SB_SILENT is set */ + c->probing = !!(c->vfs_sb->s_flags & SB_SILENT); err = init_constants_early(c); if (err) @@ -1852,7 +1852,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } - if (c->ro_mount && !(*flags & MS_RDONLY)) { + if (c->ro_mount && !(*flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; @@ -1864,7 +1864,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) err = ubifs_remount_rw(c); if (err) return err; - } else if (!c->ro_mount && (*flags & MS_RDONLY)) { + } else if (!c->ro_mount && (*flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; @@ -2117,7 +2117,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", current->pid, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); @@ -2143,18 +2143,18 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); - if (!!(flags & MS_RDONLY) != c1->ro_mount) { + if (!!(flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { - err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 
1 : 0); if (err) goto out_deact; /* We do not support atime */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; #ifndef CONFIG_UBIFS_ATIME_SUPPORT - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; #else ubifs_msg(c, "full atime support is enabled."); #endif diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 63c7468147eb..5ee7af879cc4 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1201,7 +1201,7 @@ struct ubifs_debug_info; * @need_recovery: %1 if the file-system needs recovery * @replaying: %1 during journal replay * @mounting: %1 while mounting - * @probing: %1 while attempting to mount if MS_SILENT mount flag is set + * @probing: %1 while attempting to mount if SB_SILENT mount flag is set * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay @@ -1850,7 +1850,7 @@ __printf(2, 3) void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...); /* * A conditional variant of 'ubifs_err()' which doesn't output anything - * if probing (ie. MS_SILENT set). + * if probing (ie. SB_SILENT set). */ #define ubifs_errc(c, fmt, ...) \ do { \ diff --git a/fs/udf/super.c b/fs/udf/super.c index f80e0a0f24d3..f73239a9a97d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -650,7 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sync_filesystem(sb); if (lvidiu) { int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); - if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) + if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY)) return -EACCES; } @@ -673,10 +673,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_dmode = uopt.dmode; write_unlock(&sbi->s_cred_lock); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out_unlock; - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index b5cd79065ef9..e727ee07dbe4 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -115,7 +115,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -205,7 +205,7 @@ do_more: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); if (overflow) { @@ -567,7 +567,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -688,7 +688,7 @@ cg_found: succed: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 916b4a428933..e1ef0f0a1353 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -112,7 +112,7 @@ void ufs_free_inode (struct inode * inode) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & 
SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -146,14 +146,14 @@ static void ufs2_init_inodes_chunk(struct super_block *sb, set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bh); brelse(bh); } fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); UFSD("EXIT\n"); @@ -284,7 +284,7 @@ cg_found: } ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -330,7 +330,7 @@ cg_found: ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec); mark_buffer_dirty(bh); unlock_buffer(bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bh); brelse(bh); } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 6440003f8ddc..4d497e9c6883 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -282,7 +282,7 @@ void ufs_error (struct super_block * sb, const char * function, usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } va_start(args, fmt); vaf.fmt = fmt; @@ -320,7 +320,7 @@ void ufs_panic (struct super_block * sb, const char * function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; pr_crit("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); @@ -905,7 +905,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=old is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -921,7 +921,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -937,7 +937,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep-cd is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -953,7 +953,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=openstep is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -968,7 +968,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=hp is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; default: @@ -1125,21 +1125,21 @@ magic_found: break; case UFS_FSACTIVE: pr_err("%s(): fs is active\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; case UFS_FSBAD: pr_err("%s(): fs is bad\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; default: pr_err("%s(): can't grok fs_clean 0x%x\n", __func__, usb1->fs_clean); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; } } else { pr_err("%s(): fs needs fsck\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* @@ -1328,7 +1328,7 @@ static int ufs_remount (struct super_block *sb, int 
*mount_flags, char *data) return -EINVAL; } - if ((bool)(*mount_flags & MS_RDONLY) == sb_rdonly(sb)) { + if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; @@ -1337,7 +1337,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) /* * fs was mouted as rw, remounting ro */ - if (*mount_flags & MS_RDONLY) { + if (*mount_flags & SB_RDONLY) { ufs_put_super_internal(sb); usb1->fs_time = cpu_to_fs32(sb, get_seconds()); if ((flags & UFS_ST_MASK) == UFS_ST_SUN @@ -1346,7 +1346,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ubh_mark_buffer_dirty (USPI_UBH(uspi)); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else { /* * fs was mounted as ro, remounting rw @@ -1370,7 +1370,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) mutex_unlock(&UFS_SB(sb)->s_lock); return -EPERM; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 38d4227895ae..a503af96d780 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -781,17 +781,17 @@ xfs_log_mount_finish( * something to an unlinked inode, the irele won't cause * premature truncation and freeing of the inode, which results * in log recovery failure. We have to evict the unreferenced - * lru inodes after clearing MS_ACTIVE because we don't + * lru inodes after clearing SB_ACTIVE because we don't * otherwise clean up the lru if there's a subsequent failure in * xfs_mountfs, which leads to us leaking the inodes if nothing * else (e.g. quotacheck) references the inodes before the * mount failure occurs. */ - mp->m_super->s_flags |= MS_ACTIVE; + mp->m_super->s_flags |= SB_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); - mp->m_super->s_flags &= ~MS_ACTIVE; + mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f663022353c0..5122d3021117 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -212,9 +212,9 @@ xfs_parseargs( */ if (sb_rdonly(sb)) mp->m_flags |= XFS_MOUNT_RDONLY; - if (sb->s_flags & MS_DIRSYNC) + if (sb->s_flags & SB_DIRSYNC) mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) mp->m_flags |= XFS_MOUNT_WSYNC; /* @@ -1312,7 +1312,7 @@ xfs_fs_remount( } /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) { if (mp->m_flags & XFS_MOUNT_NORECOVERY) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); @@ -1368,7 +1368,7 @@ xfs_fs_remount( } /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { /* Free the per-AG metadata reservation pool. 
*/ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 5f2f32408011..fcc5dfc70aa0 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -30,7 +30,7 @@ extern void xfs_qm_exit(void); #ifdef CONFIG_XFS_POSIX_ACL # define XFS_ACL_STRING "ACLs, " -# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +# define set_posix_acl_flag(sb) ((sb)->s_flags |= SB_POSIXACL) #else # define XFS_ACL_STRING # define set_posix_acl_flag(sb) do { } while (0) diff --git a/include/linux/fs.h b/include/linux/fs.h index 2995a271ec46..bbd92da0946e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1872,7 +1872,7 @@ struct super_operations { */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) -static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & MS_RDONLY; } +static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) diff --git a/include/uapi/linux/bfs_fs.h b/include/uapi/linux/bfs_fs.h index 73445ef07dda..940b04772af8 100644 --- a/include/uapi/linux/bfs_fs.h +++ b/include/uapi/linux/bfs_fs.h @@ -76,7 +76,7 @@ struct bfs_super_block { #define BFS_FILEBLOCKS(ip) \ ((ip)->i_sblock == 0 ? 0 : (le32_to_cpu((ip)->i_eblock) + 1) - le32_to_cpu((ip)->i_sblock)) #define BFS_UNCLEAN(bfs_sb, sb) \ - ((le32_to_cpu(bfs_sb->s_from) != -1) && (le32_to_cpu(bfs_sb->s_to) != -1) && !(sb->s_flags & MS_RDONLY)) + ((le32_to_cpu(bfs_sb->s_from) != -1) && (le32_to_cpu(bfs_sb->s_to) != -1) && !(sb->s_flags & SB_RDONLY)) #endif /* _LINUX_BFS_FS_H */ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index d24025626310..9649ecd8a73a 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -331,7 +331,7 @@ static struct dentry *mqueue_mount(struct file_system_type *fs_type, void *data) { struct ipc_namespace *ns; - if (flags & MS_KERNMOUNT) { + if (flags & SB_KERNMOUNT) { ns = data; data = NULL; } else { diff --git a/mm/shmem.c b/mm/shmem.c index 4aa9307feab0..7fbe67be86fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3776,7 +3776,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. 
*/ - if (!(sb->s_flags & MS_KERNMOUNT)) { + if (!(sb->s_flags & SB_KERNMOUNT)) { sbinfo->max_blocks = shmem_default_max_blocks(); sbinfo->max_inodes = shmem_default_max_inodes(); if (shmem_parse_options(data, sbinfo, false)) { @@ -3784,12 +3784,12 @@ goto failed; } } else { - sb->s_flags |= MS_NOUSER; + sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= MS_NOSEC; + sb->s_flags |= SB_NOSEC; #else - sb->s_flags |= MS_NOUSER; + sb->s_flags |= SB_NOUSER; #endif spin_lock_init(&sbinfo->stat_lock); @@ -3809,7 +3809,7 @@ sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif uuid_gen(&sb->s_uuid); diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 8542e9a55e1b..d4fa04d91439 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2451,7 +2451,7 @@ static int __init aa_create_aafs(void) aafs_mnt = kern_mount(&aafs_ops); if (IS_ERR(aafs_mnt)) panic("can't set apparmorfs up\n"); - aafs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; + aafs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; /* Populate fs tree. */ error = entry_create_dir(&aa_sfs_entry, NULL); diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h index f546707a2bbb..6505e1ad9e23 100644 --- a/security/apparmor/include/lib.h +++ b/security/apparmor/include/lib.h @@ -86,7 +86,7 @@ static inline unsigned int aa_dfa_null_transition(struct aa_dfa *dfa, static inline bool path_mediated_fs(struct dentry *dentry) { - return !(dentry->d_sb->s_flags & MS_NOUSER); + return !(dentry->d_sb->s_flags & SB_NOUSER); } -- cgit v1.2.3 From d34971a65b72619508f49cd237283e92f1c329d5 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Tue, 17 Oct 2017 18:14:23 +0200 Subject: sunrpc: make the function arg as const Make the struct cache_detail *tmpl argument of the function cache_create_net const, as it is only passed to kmemdup(), which takes a const void * argument. Add const to the prototype too. Signed-off-by: Bhumika Goyal Reviewed-by: Jeff Layton Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/cache.h | 2 +- net/sunrpc/cache.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 270bad0e1bed..40d2822f0e2f 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -213,7 +213,7 @@ extern void __init cache_initialize(void); extern int cache_register_net(struct cache_detail *cd, struct net *net); extern void cache_unregister_net(struct cache_detail *cd, struct net *net); -extern struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net); +extern struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct net *net); extern void cache_destroy_net(struct cache_detail *cd, struct net *net); extern void sunrpc_init_cache_detail(struct cache_detail *cd); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 79d55d949d9a..e68943895be4 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1674,7 +1674,7 @@ void cache_unregister_net(struct cache_detail *cd, struct net *net) } EXPORT_SYMBOL_GPL(cache_unregister_net); -struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net) +struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct net *net) { struct cache_detail *cd; int i; -- cgit v1.2.3 From f50caa9b517a2542ae9769fc17ad84110ae07c8b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Nov 2017 12:40:31 +0100 Subject: debugfs: fix debugfs_real_fops() build error Some drivers use debugfs_real_fops() even when CONFIG_DEBUG_FS is disabled, which now leads to a build error: In file included from include/linux/list.h:9:0, from include/linux/wait.h:7, from include/linux/wait_bit.h:8, from include/linux/fs.h:6, from drivers/net/wireless/broadcom/b43legacy/debugfs.c:26: drivers/net/wireless/broadcom/b43legacy/debugfs.c: In function 'b43legacy_debugfs_read': drivers/net/wireless/broadcom/b43legacy/debugfs.c:224:23: error: implicit declaration of function 'debugfs_real_fops'; did you mean 'debugfs_create_bool'? [-Werror=implicit-function-declaration] My first impulse was to add another 'static inline' dummy function returning NULL for it, which would work fine. However, most callers feed the pointer into container_of(), so it seems a little dangerous here. Since all the callers are inside read/write file operations that get eliminated in this configuration, having an 'extern' declaration seems better here. If it ever gets used in a dangerous way, that will now result in a link error.
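To make the trade-off concrete, here is a minimal sketch of the stub pattern under discussion; CONFIG_MY_SUBSYS and my_real_fops() are made-up stand-ins for the real config option and helper, not code from this patch:

#include <linux/fs.h>

#ifdef CONFIG_MY_SUBSYS
/* Real implementation is provided out of line. */
const struct file_operations *my_real_fops(const struct file *filp);
#else
/*
 * Stub section. A 'static inline' dummy returning NULL would always
 * link, but a NULL fed into container_of() silently becomes a bogus
 * non-NULL pointer at run time. A bare declaration with no definition
 * keeps dead callers compiling while turning any call that survives
 * dead-code elimination into a hard link error.
 */
const struct file_operations *my_real_fops(const struct file *filp);
#endif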
Fixes: 7c8d469877b1 ("debugfs: add support for more elaborate ->d_fsdata") Cc: Jakub Kicinski Cc: Simon Horman Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index f36ecc2a5712..3b0ba54cc4d5 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -216,6 +216,8 @@ static inline void debugfs_remove(struct dentry *dentry) static inline void debugfs_remove_recursive(struct dentry *dentry) { } +const struct file_operations *debugfs_real_fops(const struct file *filp); + static inline int debugfs_file_get(struct dentry *dentry) { return 0; -- cgit v1.2.3 From fd00cf81a9a84776ba58e56bd042c726dcf75cf3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 3 Nov 2017 15:30:53 +0100 Subject: serdev: fix receive_buf return value when no callback The receive_buf callback is supposed to return the number of bytes processed and should specifically not return a negative errno. Due to missing sanity checks in the serdev tty-port controller, a driver not providing a receive_buf callback could cause the flush_to_ldisc() worker to spin in a tight loop when the tty buffer pointers are incremented with -EINVAL (-22). The missing sanity checks have now been added to the tty-port controller, but let's fix up the serdev-controller helper as well. Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index e69402d4a8ae..d609e6dc5bad 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -184,7 +184,7 @@ static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl, struct serdev_device *serdev = ctrl->serdev; if (!serdev || !serdev->ops->receive_buf) - return -EINVAL; + return 0; return serdev->ops->receive_buf(serdev, data, count); } -- cgit v1.2.3 From 7fa32e5ec28b1609abc0b797b58267f725fc3964 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Tue, 14 Nov 2017 06:53:33 -0700 Subject: Drivers: hv: vmbus: Fix a rescind issue The current rescind processing code will not correctly handle the case where the host immediately rescinds a channel that has been offered. In this case, we could be blocked in the open call; since the channel is rescinded, the host will not respond, and we would be blocked forever in the vmbus open call. Fix this problem. Signed-off-by: K. Y. Srinivasan Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 10 ++++++++-- drivers/hv/channel_mgmt.c | 7 ++++--- include/linux/hyperv.h | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 19f0cf37e0ed..ba0a092ae085 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -659,22 +659,28 @@ void vmbus_close(struct vmbus_channel *channel) */ return; } - mutex_lock(&vmbus_connection.channel_mutex); /* * Close all the sub-channels first and then close the * primary channel.
*/ list_for_each_safe(cur, tmp, &channel->sc_list) { cur_channel = list_entry(cur, struct vmbus_channel, sc_list); - vmbus_close_internal(cur_channel); if (cur_channel->rescind) { + wait_for_completion(&cur_channel->rescind_event); + mutex_lock(&vmbus_connection.channel_mutex); + vmbus_close_internal(cur_channel); hv_process_channel_removal( cur_channel->offermsg.child_relid); + } else { + mutex_lock(&vmbus_connection.channel_mutex); + vmbus_close_internal(cur_channel); } + mutex_unlock(&vmbus_connection.channel_mutex); } /* * Now close the primary. */ + mutex_lock(&vmbus_connection.channel_mutex); vmbus_close_internal(channel); mutex_unlock(&vmbus_connection.channel_mutex); } diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index ec5454f3f4a6..c21020b69114 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -333,6 +333,7 @@ static struct vmbus_channel *alloc_channel(void) return NULL; spin_lock_init(&channel->lock); + init_completion(&channel->rescind_event); INIT_LIST_HEAD(&channel->sc_list); INIT_LIST_HEAD(&channel->percpu_list); @@ -898,6 +899,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) /* * Now wait for offer handling to complete. */ + vmbus_rescind_cleanup(channel); while (READ_ONCE(channel->probe_done) == false) { /* * We wait here until any channel offer is currently @@ -913,7 +915,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) if (channel->device_obj) { if (channel->chn_rescind_callback) { channel->chn_rescind_callback(channel); - vmbus_rescind_cleanup(channel); return; } /* @@ -922,7 +923,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) */ dev = get_device(&channel->device_obj->device); if (dev) { - vmbus_rescind_cleanup(channel); vmbus_device_unregister(channel->device_obj); put_device(dev); } @@ -936,13 +936,14 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) * 2. Then close the primary channel. */ mutex_lock(&vmbus_connection.channel_mutex); - vmbus_rescind_cleanup(channel); if (channel->state == CHANNEL_OPEN_STATE) { /* * The channel is currently not open; * it is safe for us to cleanup the channel. */ hv_process_channel_removal(rescind->child_relid); + } else { + complete(&channel->rescind_event); } mutex_unlock(&vmbus_connection.channel_mutex); } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index f3e97c5f94c9..6c9336626592 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -708,6 +708,7 @@ struct vmbus_channel { u8 monitor_bit; bool rescind; /* got rescind msg */ + struct completion rescind_event; u32 ringbuffer_gpadlhandle; -- cgit v1.2.3 From 668533dc0764b30c9dd2baf3ca800156f688326b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 29 Nov 2017 10:30:13 -0800 Subject: kallsyms: take advantage of the new '%px' format The conditional kallsym hex printing used a special fixed-width '%lx' output (KALLSYM_FMT) in preparation for the hashing of %p, but that series ended up adding a %px specifier to help with the conversions. Use it, and avoid the "print pointer as an unsigned long" code. 
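As a hedged illustration of the printing change (the helper name and context are invented; only the format-specifier behaviour is taken from the patch):

#include <linux/printk.h>

/* Illustrative helper, not part of the patch: mirrors the s_show()
 * logic above. The pointer is NULLed when the reader must not see
 * raw addresses, and '%px' prints it at fixed native width, which is
 * what the removed KALLSYM_FMT plus cast-to-unsigned-long pattern
 * produced. A plain '%p' would print a hashed value instead. */
static void print_symbol_value(unsigned long value, bool show_value)
{
        void *p = show_value ? (void *)value : NULL;

        pr_info("%px\n", p);
}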
Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 6 ------ kernel/kallsyms.c | 8 ++++---- kernel/module.c | 6 +++--- 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 708f337d780b..bd118a6c60cb 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -14,12 +14,6 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) -#ifndef CONFIG_64BIT -# define KALLSYM_FMT "%08lx" -#else -# define KALLSYM_FMT "%016lx" -#endif - struct module; #ifdef CONFIG_KALLSYMS diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 531ffa984bc2..d5fa4116688a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -614,14 +614,14 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - unsigned long value; + void *value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; - value = iter->show_value ? iter->value : 0; + value = iter->show_value ? (void *)iter->value : NULL; if (iter->module_name[0]) { char type; @@ -632,10 +632,10 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value, + seq_printf(m, "%px %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else - seq_printf(m, KALLSYM_FMT " %c %s\n", value, + seq_printf(m, "%px %c %s\n", value, iter->type, iter->name); return 0; } diff --git a/kernel/module.c b/kernel/module.c index f0411a271765..dea01ac9cb74 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4157,7 +4157,7 @@ static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); char buf[MODULE_FLAGS_BUF_SIZE]; - unsigned long value; + void *value; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) @@ -4173,8 +4173,8 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - value = m->private ? 0 : (unsigned long)mod->core_layout.base; - seq_printf(m, " 0x" KALLSYM_FMT, value); + value = m->private ? NULL : mod->core_layout.base; + seq_printf(m, " 0x%px", value); /* Taints info */ if (mod->taints) -- cgit v1.2.3 From 1501899a898dfb5477c55534bdfd734c046da06d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 29 Nov 2017 16:10:06 -0800 Subject: mm: fix device-dax pud write-faults triggered by get_user_pages() Currently only get_user_pages_fast() can safely handle the writable gup case due to its use of pud_access_permitted() to check whether the pud entry is writable. In the gup slow path pud_write() is used instead of pud_access_permitted() and to date it has been unimplemented, just calls BUG_ON(). kernel BUG at ./include/linux/hugetlb.h:244! [..] RIP: 0010:follow_devmap_pud+0x482/0x490 [..] Call Trace: follow_page_mask+0x28c/0x6e0 __get_user_pages+0xe4/0x6c0 get_user_pages_unlocked+0x130/0x1b0 get_user_pages_fast+0x89/0xb0 iov_iter_get_pages_alloc+0x114/0x4a0 nfs_direct_read_schedule_iovec+0xd2/0x350 ? nfs_start_io_direct+0x63/0x70 nfs_file_direct_read+0x1e0/0x250 nfs_file_read+0x90/0xc0 For now this just implements a simple check for the _PAGE_RW bit similar to pmd_write. 
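For orientation, the pmd-level helper that the new x86 pud_write() mirrors is essentially the same one-line test (abbreviated from arch/x86, shown for comparison only; it is not part of this patch):

/* The existing pmd-level _PAGE_RW check, one level down the tree. */
static inline int pmd_write(pmd_t pmd)
{
        return pmd_flags(pmd) & _PAGE_RW;
}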
However, this implies that the gup-slow-path check is missing the extra checks that the gup-fast-path performs with pud_access_permitted. Later patches will align all checks to use the 'access_permitted' helper if the architecture provides it. Note that the generic 'access_permitted' helper fallback is the simple _PAGE_RW check on architectures that do not define the 'access_permitted' helper(s). [dan.j.williams@intel.com: fix powerpc compile error] Link: http://lkml.kernel.org/r/151129126165.37405.16031785266675461397.stgit@dwillia2-desk3.amr.corp.intel.com Link: http://lkml.kernel.org/r/151043109938.2842.14834662818213616199.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages") Signed-off-by: Dan Williams Reported-by: Stephen Rothwell Acked-by: Thomas Gleixner [x86] Cc: Kirill A. Shutemov Cc: Catalin Marinas Cc: "David S. Miller" Cc: Dave Hansen Cc: Will Deacon Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 6 ++++++ include/asm-generic/pgtable.h | 8 ++++++++ include/linux/hugetlb.h | 8 -------- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 09f9e1e00e3b..dcce76ee4aa7 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1088,6 +1088,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pud_flags(pud) & _PAGE_RW; +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 757dc6ffc7ba..1ac457511f4e 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -814,6 +814,14 @@ static inline int pmd_write(pmd_t pmd) #endif /* __HAVE_ARCH_PMD_WRITE */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#ifndef pud_write +static inline int pud_write(pud_t pud) +{ + BUG(); + return 0; +} +#endif /* pud_write */ + #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fbf5b31d47ee..82a25880714a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -239,14 +239,6 @@ static inline int pgd_write(pgd_t pgd) } #endif -#ifndef pud_write -static inline int pud_write(pud_t pud) -{ - BUG(); - return 0; -} -#endif - #define HUGETLB_ANON_FILE "anon_hugepage" enum { -- cgit v1.2.3 From 31383c6865a578834dd953d9dbc88e6b19fe3997 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 29 Nov 2017 16:10:28 -0800 Subject: mm, hugetlbfs: introduce ->split() to vm_operations_struct Patch series "device-dax: fix unaligned munmap handling" When device-dax is operating in huge-page mode we want it to behave like hugetlbfs and fail attempts to split vmas into unaligned ranges. It would be messy to teach the munmap path about device-dax alignment constraints in the same (hstate) way that hugetlbfs communicates this constraint. Instead, these patches introduce a new ->split() vm operation. This patch (of 2): The device-dax interface has similar constraints as hugetlbfs in that it requires the munmap path to unmap in huge page aligned units. 
Rather than add more custom vma handling code in __split_vma() introduce a new vm operation to perform this vma specific check. Link: http://lkml.kernel.org/r/151130418135.4029.6783191281930729710.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap") Signed-off-by: Dan Williams Cc: Jeff Moyer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/hugetlb.c | 8 ++++++++ mm/mmap.c | 8 +++++--- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ee073146aaa7..b3b6a7e313e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -377,6 +377,7 @@ enum page_entry_size { struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); + int (*split)(struct vm_area_struct * area, unsigned long addr); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_fault *vmf); int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 681b300185c0..698e8fb34031 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) } } +static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) +{ + if (addr & ~(huge_page_mask(hstate_vma(vma)))) + return -EINVAL; + return 0; +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, + .split = hugetlb_vm_op_split, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, diff --git a/mm/mmap.c b/mm/mmap.c index 924839fac0e6..a4d546821214 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2555,9 +2555,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - if (is_vm_hugetlb_page(vma) && (addr & - ~(huge_page_mask(hstate_vma(vma))))) - return -EINVAL; + if (vma->vm_ops && vma->vm_ops->split) { + err = vma->vm_ops->split(vma, addr); + if (err) + return err; + } new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) -- cgit v1.2.3 From 2bb6d2837083de722bfdc369cb0d76ce188dd9b4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 29 Nov 2017 16:10:35 -0800 Subject: mm: introduce get_user_pages_longterm Patch series "introduce get_user_pages_longterm()", v2. Here is a new get_user_pages api for cases where a driver intends to keep an elevated page count indefinitely. This is distinct from usages like iov_iter_get_pages where the elevated page counts are transient. The iov_iter_get_pages cases immediately turn around and submit the pages to a device driver which will put_page when the i/o operation completes (under kernel control). In the longterm case userspace is responsible for dropping the page reference at some undefined point in the future. This is untenable for filesystem-dax case where the filesystem is in control of the lifetime of the block / page and needs reasonable limits on how long it can wait for pages in a mapping to become idle. Fixing filesystems to actually wait for dax pages to be idle before blocks from a truncate/hole-punch operation are repurposed is saved for a later patch series. 
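A hedged sketch of how a driver holding long-lived pins would call the new API; pin_user_region() and the FOLL_WRITE choice are illustrative assumptions, not part of this series:

#include <linux/mm.h>

/* Illustrative wrapper: attempt a long-lived pin of a user range.
 * With this series applied, a range backed by filesystem-dax vmas is
 * refused (the pages are released and -EOPNOTSUPP is returned)
 * instead of being pinned indefinitely. */
static long pin_user_region(unsigned long start, unsigned long nr_pages,
                            struct page **pages)
{
        return get_user_pages_longterm(start, nr_pages, FOLL_WRITE,
                                       pages, NULL);
}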
Also, allowing longterm registration of dax mappings is a future patch series that introduces a "map with lease" semantic where the kernel can revoke a lease and force userspace to drop its page references. I have also tagged these for -stable to purposely break cases that might assume that longterm memory registrations for filesystem-dax mappings were supported by the kernel. The behavior regression this policy change implies is one of the reasons we maintain the "dax enabled. Warning: EXPERIMENTAL, use at your own risk" notification when mounting a filesystem in dax mode. It is worth noting the device-dax interface does not suffer the same constraints since it does not support file space management operations like hole-punch. This patch (of 4): Until there is a solution to the dma-to-dax vs truncate problem, it is not safe to allow long-standing memory registrations against filesystem-dax vmas. Device-dax vmas do not have this problem and are explicitly allowed. This is temporary until a "memory registration with layout-lease" mechanism can be implemented for the affected sub-systems (RDMA and V4L2). [akpm@linux-foundation.org: use kcalloc()] Link: http://lkml.kernel.org/r/151068939435.7446.13560129395419350737.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings") Signed-off-by: Dan Williams Suggested-by: Christoph Hellwig Cc: Doug Ledford Cc: Hal Rosenstock Cc: Inki Dae Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jeff Moyer Cc: Joonyoung Shim Cc: Kyungmin Park Cc: Mauro Carvalho Chehab Cc: Mel Gorman Cc: Ross Zwisler Cc: Sean Hefty Cc: Seung-Woo Kim Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 14 ++++++++++++ include/linux/mm.h | 13 +++++++++++ mm/gup.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbd92da0946e..9dc498d16cc1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3194,6 +3194,20 @@ static inline bool vma_is_dax(struct vm_area_struct *vma) return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } +static inline bool vma_is_fsdax(struct vm_area_struct *vma) +{ + struct inode *inode; + + if (!vma->vm_file) + return false; + if (!vma_is_dax(vma)) + return false; + inode = file_inode(vma->vm_file); + if (inode->i_mode == S_IFCHR) + return false; /* device-dax */ + return true; +} + static inline int iocb_flags(struct file *file) { int res = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index b3b6a7e313e9..ea818ff739cd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1380,6 +1380,19 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +#ifdef CONFIG_FS_DAX +long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas); +#else +static inline long get_user_pages_longterm(unsigned long start, + unsigned long nr_pages, unsigned int gup_flags, + struct page **pages, struct vm_area_struct **vmas) +{ + return get_user_pages(start, nr_pages, gup_flags, pages, vmas); +} +#endif /* CONFIG_FS_DAX */ + int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index
85cc822fd403..d3fb60e5bfac 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); +#ifdef CONFIG_FS_DAX +/* + * This is the same as get_user_pages() in that it assumes we are + * operating on the current task's mm, but it goes further to validate + * that the vmas associated with the address range are suitable for + * longterm elevated page reference counts. For example, filesystem-dax + * mappings are subject to the lifetime enforced by the filesystem and + * we need guarantees that longterm users like RDMA and V4L2 only + * establish mappings that have a kernel enforced revocation mechanism. + * + * "longterm" == userspace controlled elevated page count lifetime. + * Contrast this to iov_iter_get_pages() usages which are transient. + */ +long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas_arg) +{ + struct vm_area_struct **vmas = vmas_arg; + struct vm_area_struct *vma_prev = NULL; + long rc, i; + + if (!pages) + return -EINVAL; + + if (!vmas) { + vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas) + return -ENOMEM; + } + + rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + + for (i = 0; i < rc; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma == vma_prev) + continue; + + vma_prev = vma; + + if (vma_is_fsdax(vma)) + break; + } + + /* + * Either get_user_pages() failed, or the vma validation + * succeeded, in either case we don't need to put_page() before + * returning. + */ + if (i >= rc) + goto out; + + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; +out: + if (vmas != vmas_arg) + kfree(vmas); + return rc; +} +EXPORT_SYMBOL(get_user_pages_longterm); +#endif /* CONFIG_FS_DAX */ + /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma -- cgit v1.2.3 From 40a899ed16486455f964e46d1af31fd4fded21c1 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 29 Nov 2017 16:11:12 -0800 Subject: mm: migrate: fix an incorrect call of prep_transhuge_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In https://lkml.org/lkml/2017/11/20/411, Andrea reported that during memory hotplug/hot remove prep_transhuge_page() is called incorrectly on non-THP pages for migration, when THP is on but THP migration is not enabled. This leads to a bad state of target pages for migration. Inspecting the code shows that, if called on a non-THP page, prep_transhuge_page() will 1) change the value of the mapping of (page + 2), since it is used for the THP deferred list; 2) change the lru value of (page + 1), since it is used for THP's dtor. Both can lead to data corruption of these two pages. Andrea said: "Pragmatically and from the point of view of the memory_hotplug subsys, the effect is a kernel crash when pages are being migrated during a memory hot remove offline and migration target pages are found in a bad state" This patch fixes it by only calling prep_transhuge_page() when we are certain that the target page is THP.
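For illustration, a sketch of the corruption mechanism (paraphrasing prep_transhuge_page() from mm/huge_memory.c; the two calls are the real ones, the comments are ours):

void prep_transhuge_page(struct page *page)
{
	/*
	 * Only valid for a THP head page: the deferred-split list is
	 * kept in the second tail page (aliasing page[2].mapping) and
	 * the compound destructor in the first tail page. Called on a
	 * plain order-0 page, these writes land in whatever two pages
	 * happen to follow it.
	 */
	INIT_LIST_HEAD(page_deferred_list(page));		/* writes to page + 2 */
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);	/* writes to page + 1 */
}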
Link: http://lkml.kernel.org/r/20171121021855.50525-1-zi.yan@sent.com Fixes: 8135d8926c08 ("mm: memory_hotplug: memory hotremove supports thp migration") Signed-off-by: Zi Yan Reported-by: Andrea Reale Cc: Naoya Horiguchi Cc: Michal Hocko Cc: "Jérôme Glisse" Cc: [4.14] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 895ec0c4942e..a2246cf670ba 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -54,7 +54,7 @@ static inline struct page *new_page_nodemask(struct page *page, new_page = __alloc_pages_nodemask(gfp_mask, order, preferred_nid, nodemask); - if (new_page && PageTransHuge(page)) + if (new_page && PageTransHuge(new_page)) prep_transhuge_page(new_page); return new_page; -- cgit v1.2.3 From 5d38f049cee1e1c4a7ac55aa79d37d01ddcc3860 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 29 Nov 2017 16:11:26 -0800 Subject: autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored" Commit 42f461482178 ("autofs: fix AT_NO_AUTOMOUNT not being honored") allowed the fstatat(2) system call to properly honor the AT_NO_AUTOMOUNT flag but introduced a semantic change. In order to honor AT_NO_AUTOMOUNT a semantic change was made to the negative dentry case for stat family system calls in follow_automount(). This changed the unconditional triggering of an automount in this case to no longer be done and an error returned instead. This has caused more problems than I expected so reverting the change is needed. In a discussion with Neil Brown it was concluded that the automount(8) daemon can implement this change without kernel modifications. So that will be done instead and the autofs module documentation updated with a description of the problem and what needs to be done by module users for this specific case. Link: http://lkml.kernel.org/r/151174730120.6162.3848002191530283984.stgit@pluto.themaw.net Fixes: 42f4614821 ("autofs: fix AT_NO_AUTOMOUNT not being honored") Signed-off-by: Ian Kent Cc: Neil Brown Cc: Al Viro Cc: David Howells Cc: Colin Walters Cc: Ondrej Holy Cc: [4.11+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 15 +++------------ include/linux/fs.h | 3 ++- 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index f0c7a7b9b6ca..9cc91fb7f156 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | - LOOKUP_AUTOMOUNT))) { - /* Positive dentry that isn't meant to trigger an - * automount, EISDIR will allow it to be used, - * otherwise there's no mount here "now" so return - * ENOENT. 
- */ - if (path->dentry->d_inode) - return -EISDIR; - else - return -ENOENT; - } + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; diff --git a/include/linux/fs.h b/include/linux/fs.h index 9dc498d16cc1..511fbaabf624 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3088,7 +3088,8 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat) static inline int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); + return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, + stat, STATX_BASIC_STATS); } static inline int vfs_fstat(int fd, struct kstat *stat) { -- cgit v1.2.3 From f8821f96ae97260d68228fe53f81848b2ede44d7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Nov 2017 14:33:56 +0100 Subject: skbuff: Grammar s/are can/can/, s/change/changes/ Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bc486ef23f20..a38c80e9f91e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1406,8 +1406,7 @@ static inline struct sk_buff *skb_get(struct sk_buff *skb) } /* - * If users == 1, we are the only owner and are can avoid redundant - * atomic change. + * If users == 1, we are the only owner and can avoid redundant atomic changes. */ /** -- cgit v1.2.3 From 8d26fdfcb45dc420115b267ac9d6b3ac13457f1b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Nov 2017 14:35:08 +0100 Subject: spi: Fix double "when" Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 7b2170bfd6e7..bc6bb325d1bf 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -126,7 +126,7 @@ void spi_statistics_add_transfer_stats(struct spi_statistics *stats, * for that name. This appears in the sysfs "modalias" attribute * for driver coldplugging, and in uevents used for hotplugging * @cs_gpio: gpio number of the chipselect line (optional, -ENOENT when - * when not using a GPIO line) + * not using a GPIO line) * * @statistics: statistics for the spi_device * -- cgit v1.2.3 From 4db2b604c05afc3d2678fe01d3136c015df313ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Nov 2017 11:47:28 +0100 Subject: move libgcc.h to include/linux Introducing a new include/lib directory just for this file totally messes up tab completion for include/linux, which is highly annoying. Move it to include/linux where we have headers for all kinds of other lib/ code as well. 
Signed-off-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- include/lib/libgcc.h | 43 ------------------------------------------- include/linux/libgcc.h | 43 +++++++++++++++++++++++++++++++++++++++++++ lib/ashldi3.c | 2 +- lib/ashrdi3.c | 2 +- lib/cmpdi2.c | 2 +- lib/lshrdi3.c | 2 +- lib/muldi3.c | 2 +- lib/ucmpdi2.c | 2 +- 8 files changed, 49 insertions(+), 49 deletions(-) delete mode 100644 include/lib/libgcc.h create mode 100644 include/linux/libgcc.h (limited to 'include/linux') diff --git a/include/lib/libgcc.h b/include/lib/libgcc.h deleted file mode 100644 index 32e1e0f4b2d0..000000000000 --- a/include/lib/libgcc.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * include/lib/libgcc.h - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see the file COPYING, or write - * to the Free Software Foundation, Inc. - */ - -#ifndef __LIB_LIBGCC_H -#define __LIB_LIBGCC_H - -#include <asm/byteorder.h> - -typedef int word_type __attribute__ ((mode (__word__))); - -#ifdef __BIG_ENDIAN -struct DWstruct { - int high, low; -}; -#elif defined(__LITTLE_ENDIAN) -struct DWstruct { - int low, high; -}; -#else -#error I feel sick. -#endif - -typedef union { - struct DWstruct s; - long long ll; -} DWunion; - -#endif /* __ASM_LIBGCC_H */ diff --git a/include/linux/libgcc.h b/include/linux/libgcc.h new file mode 100644 index 000000000000..32e1e0f4b2d0 --- /dev/null +++ b/include/linux/libgcc.h @@ -0,0 +1,43 @@ +/* + * include/lib/libgcc.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see the file COPYING, or write + * to the Free Software Foundation, Inc. + */ + +#ifndef __LIB_LIBGCC_H +#define __LIB_LIBGCC_H + +#include <asm/byteorder.h> + +typedef int word_type __attribute__ ((mode (__word__))); + +#ifdef __BIG_ENDIAN +struct DWstruct { + int high, low; +}; +#elif defined(__LITTLE_ENDIAN) +struct DWstruct { + int low, high; +}; +#else +#error I feel sick.
+#endif + +typedef union { + struct DWstruct s; + long long ll; +} DWunion; + +#endif /* __ASM_LIBGCC_H */ diff --git a/lib/ashldi3.c b/lib/ashldi3.c index 1b6087db95a5..3ffc46e3bb6c 100644 --- a/lib/ashldi3.c +++ b/lib/ashldi3.c @@ -16,7 +16,7 @@ #include <linux/export.h> -#include <lib/libgcc.h> +#include <linux/libgcc.h> long long notrace __ashldi3(long long u, word_type b) { diff --git a/lib/ashrdi3.c b/lib/ashrdi3.c index 2e67c97ac65a..ea054550f0e8 100644 --- a/lib/ashrdi3.c +++ b/lib/ashrdi3.c @@ -16,7 +16,7 @@ #include <linux/export.h> -#include <lib/libgcc.h> +#include <linux/libgcc.h> long long notrace __ashrdi3(long long u, word_type b) { diff --git a/lib/cmpdi2.c b/lib/cmpdi2.c index 6d7ebf6c2b86..2250da7e503e 100644 --- a/lib/cmpdi2.c +++ b/lib/cmpdi2.c @@ -16,7 +16,7 @@ #include <linux/export.h> -#include <lib/libgcc.h> +#include <linux/libgcc.h> word_type notrace __cmpdi2(long long a, long long b) { diff --git a/lib/lshrdi3.c b/lib/lshrdi3.c index 8e845f4bb65f..99cfa5721f2d 100644 --- a/lib/lshrdi3.c +++ b/lib/lshrdi3.c @@ -17,7 +17,7 @@ */ #include <linux/export.h> -#include <lib/libgcc.h> +#include <linux/libgcc.h> long long notrace __lshrdi3(long long u, word_type b) { diff --git a/lib/muldi3.c b/lib/muldi3.c index 88938543e10a..54c8b3123376 100644 --- a/lib/muldi3.c +++ b/lib/muldi3.c @@ -15,7 +15,7 @@ */ #include <linux/export.h> -#include <lib/libgcc.h> +#include <linux/libgcc.h> #define W_TYPE_SIZE 32 diff --git a/lib/ucmpdi2.c b/lib/ucmpdi2.c index 49a53505c8e3..25ca2d4c1e19 100644 --- a/lib/ucmpdi2.c +++ b/lib/ucmpdi2.c @@ -15,7 +15,7 @@ */ #include <linux/export.h> -#include <lib/libgcc.h> +#include <linux/libgcc.h> word_type __ucmpdi2(unsigned long long a, unsigned long long b) { -- cgit v1.2.3 From 6d745ee8b5e81f3a33791e3c854fbbfd6f3e585e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Sep 2017 14:56:50 +0200 Subject: iio: stm32: fix adc/trigger link error The ADC driver can trigger on either the timer or the lptim trigger, but it only uses a Kconfig 'select' statement to ensure that the first of the two is present. When the lptim trigger is enabled as a loadable module, and the adc driver is built-in, we now get a link error: drivers/iio/adc/stm32-adc.o: In function `stm32_adc_get_trig_extsel': stm32-adc.c:(.text+0x4e0): undefined reference to `is_stm32_lptim_trigger' We could use a second 'select' statement and always have both trigger drivers enabled when the adc driver is, but it seems that the lptimer trigger was intentionally left optional, so it is better to keep it that way. This adds a hack to use 'IS_REACHABLE()' rather than 'IS_ENABLED()', which avoids the link error, but instead leads to the lptimer trigger not being used in the broken configuration. I've added a runtime warning for this case to help users figure out what they did wrong if this should ever be done by accident.
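To illustrate the difference between the two macros, a generic sketch (CONFIG_FOO and foo_available() are made-up placeholders, not symbols from this driver):

/*
 * IS_ENABLED(CONFIG_FOO) is true for both =y and =m, while
 * IS_REACHABLE(CONFIG_FOO) is only true when the symbol is linked into
 * the same image as the caller: =y always, =m only when the caller is
 * itself part of a module.
 */
#if IS_REACHABLE(CONFIG_FOO)
bool foo_available(void);
#else
static inline bool foo_available(void)
{
#if IS_ENABLED(CONFIG_FOO)
	/* foo is built as a module the built-in caller cannot reach */
	pr_warn_once("foo built as module, not reachable from here\n");
#endif
	return false;
}
#endif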
Fixes: f0b638a7f6db ("iio: adc: stm32: add support for lptimer triggers") Signed-off-by: Arnd Bergmann Cc: Signed-off-by: Jonathan Cameron --- include/linux/iio/timer/stm32-lptim-trigger.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/timer/stm32-lptim-trigger.h b/include/linux/iio/timer/stm32-lptim-trigger.h index 34d59bfdce2d..464458d20b16 100644 --- a/include/linux/iio/timer/stm32-lptim-trigger.h +++ b/include/linux/iio/timer/stm32-lptim-trigger.h @@ -16,11 +16,14 @@ #define LPTIM2_OUT "lptim2_out" #define LPTIM3_OUT "lptim3_out" -#if IS_ENABLED(CONFIG_IIO_STM32_LPTIMER_TRIGGER) +#if IS_REACHABLE(CONFIG_IIO_STM32_LPTIMER_TRIGGER) bool is_stm32_lptim_trigger(struct iio_trigger *trig); #else static inline bool is_stm32_lptim_trigger(struct iio_trigger *trig) { +#if IS_ENABLED(CONFIG_IIO_STM32_LPTIMER_TRIGGER) + pr_warn_once("stm32 lptim_trigger not linked in\n"); +#endif return false; } #endif -- cgit v1.2.3 From a773d419275bf54854ca6cfda8f2594ed2790faa Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 2 Jun 2017 13:20:25 +0300 Subject: tracing: Pass export pointer as argument to ->write() By passing an export descriptor to the write function, users don't need to keep a global static pointer and can rely on container_of() to fetch their own structure. Link: http://lkml.kernel.org/r/20170602102025.5140-1-felipe.balbi@linux.intel.com Acked-by: Steven Rostedt (VMware) Reviewed-by: Chunyan Zhang Signed-off-by: Felipe Balbi Signed-off-by: Steven Rostedt (VMware) --- drivers/hwtracing/stm/ftrace.c | 6 ++++-- include/linux/trace.h | 2 +- kernel/trace/trace.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/stm/ftrace.c b/drivers/hwtracing/stm/ftrace.c index bd126a7c6da2..7da75644c750 100644 --- a/drivers/hwtracing/stm/ftrace.c +++ b/drivers/hwtracing/stm/ftrace.c @@ -42,9 +42,11 @@ static struct stm_ftrace { * @len: length of the data packet */ static void notrace -stm_ftrace_write(const void *buf, unsigned int len) +stm_ftrace_write(struct trace_export *export, const void *buf, unsigned int len) { - stm_source_write(&stm_ftrace.data, STM_FTRACE_CHAN, buf, len); + struct stm_ftrace *stm = container_of(export, struct stm_ftrace, ftrace); + + stm_source_write(&stm->data, STM_FTRACE_CHAN, buf, len); } static int stm_ftrace_link(struct stm_source_data *data) diff --git a/include/linux/trace.h b/include/linux/trace.h index d24991c1fef3..b95ffb2188ab 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -18,7 +18,7 @@ */ struct trace_export { struct trace_export __rcu *next; - void (*write)(const void *, unsigned int); + void (*write)(struct trace_export *, const void *, unsigned int); }; int register_ftrace_export(struct trace_export *export); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3f043ba3b7..59518b8126d0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export, entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); - export->write(entry, size); + export->write(export, entry, size); } static DEFINE_MUTEX(ftrace_export_lock); -- cgit v1.2.3 From 4ce413d1840b25b101be3c0559161db8891f3360 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 1 Dec 2017 15:29:39 +0000 Subject: irqdesc: Use bool return type instead of int The irq_balancing_disabled and irq_is_percpu{,_devid} functions are clearly intended to return bool like the 
functions in kernel/irq/settings.h, but actually return an int containing a masked value of desc->status_use_accessors. This can lead to subtle breakage if, for example, the return value is subsequently truncated when assigned to a narrower type. As Linus points out: | In particular, what can (and _has_ happened) is that people end up | using these functions that return true or false, and they assign the | result to something like a bitfield (or a char) or whatever. | | And the code looks *obviously* correct, when you have things like | | dev->percpu = irq_is_percpu_devid(dev->irq); | | and that "percpu" thing is just one status bit among many. It may even | *work*, because maybe that "percpu" flag ends up not being all that | important, or it just happens to never be set on the particular | hardware that people end up testing. | | But while it looks obviously correct, and might even work, it's really | fundamentally broken. Because that "true or false" function didn't | actually return 0/1, it returned 0 or 0x20000. | | And 0x20000 may not fit in a bitmask or a "char" or whatever. Fix the problem by consistently using bool as the return type for these functions. Reported-by: Linus Torvalds Signed-off-by: Will Deacon Signed-off-by: Thomas Gleixner Cc: marc.zyngier@arm.com Link: https://lkml.kernel.org/r/1512142179-24616-1-git-send-email-will.deacon@arm.com --- include/linux/irqdesc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index dd418955962b..39fb3700f7a9 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -230,7 +230,7 @@ irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip, data->chip = chip; } -static inline int irq_balancing_disabled(unsigned int irq) +static inline bool irq_balancing_disabled(unsigned int irq) { struct irq_desc *desc; @@ -238,7 +238,7 @@ static inline int irq_balancing_disabled(unsigned int irq) return desc->status_use_accessors & IRQ_NO_BALANCING_MASK; } -static inline int irq_is_percpu(unsigned int irq) +static inline bool irq_is_percpu(unsigned int irq) { struct irq_desc *desc; @@ -246,7 +246,7 @@ static inline int irq_is_percpu(unsigned int irq) return desc->status_use_accessors & IRQ_PER_CPU; } -static inline int irq_is_percpu_devid(unsigned int irq) +static inline bool irq_is_percpu_devid(unsigned int irq) { struct irq_desc *desc; -- cgit v1.2.3 From c895f6f703ad7dd2f99e751d9884b0aa5d0eea25 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Mon, 4 Dec 2017 10:56:44 +0100 Subject: bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type Commit 0515e5999a466dfe ("bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type") introduced the bpf_perf_event_data structure which exports the pt_regs structure. This is OK for multiple architectures but fails for s390 and arm64, which do not export pt_regs. Programs using it (for example, the bpf selftests) fail to compile on these architectures. For s390, exporting the pt_regs is not an option because s390 wants to allow changes to it. For arm64, there is a user_pt_regs structure that covers parts of the pt_regs structure for use by user space. To solve the broken uapi for s390 and arm64, introduce an abstract type for pt_regs and add an asm/bpf_perf_event.h file that makes the type concrete. An asm-generic header file covers the architectures that export pt_regs today. The arch-specific enablement for s390 and arm64 follows in separate commits.
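As a sketch of what such a follow-up arch commit would provide (the actual arm64 header may differ in detail; user_pt_regs is the user-visible register structure mentioned above):

/* arch/arm64/include/uapi/asm/bpf_perf_event.h (sketch) */
#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__
#define _UAPI__ASM_BPF_PERF_EVENT_H__

#include <asm/ptrace.h>

/* Export the user-visible register set instead of the kernel pt_regs */
typedef struct user_pt_regs bpf_user_pt_regs_t;

#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */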
Reported-by: Thomas Richter Fixes: 0515e5999a466dfe ("bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type") Signed-off-by: Hendrik Brueckner Reviewed-and-tested-by: Thomas Richter Acked-by: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnd Bergmann Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- arch/alpha/include/uapi/asm/Kbuild | 2 ++ arch/arc/include/uapi/asm/Kbuild | 1 + arch/arm/include/uapi/asm/Kbuild | 1 + arch/blackfin/include/uapi/asm/Kbuild | 1 + arch/c6x/include/uapi/asm/Kbuild | 1 + arch/cris/include/uapi/asm/Kbuild | 1 + arch/frv/include/uapi/asm/Kbuild | 2 ++ arch/h8300/include/uapi/asm/Kbuild | 1 + arch/hexagon/include/uapi/asm/Kbuild | 1 + arch/ia64/include/uapi/asm/Kbuild | 1 + arch/m32r/include/uapi/asm/Kbuild | 1 + arch/m68k/include/uapi/asm/Kbuild | 1 + arch/metag/include/uapi/asm/Kbuild | 1 + arch/microblaze/include/uapi/asm/Kbuild | 1 + arch/mips/include/uapi/asm/Kbuild | 1 + arch/mn10300/include/uapi/asm/Kbuild | 1 + arch/nios2/include/uapi/asm/Kbuild | 1 + arch/openrisc/include/uapi/asm/Kbuild | 1 + arch/parisc/include/uapi/asm/Kbuild | 1 + arch/powerpc/include/uapi/asm/Kbuild | 1 + arch/riscv/include/uapi/asm/Kbuild | 1 + arch/score/include/uapi/asm/Kbuild | 1 + arch/sh/include/uapi/asm/Kbuild | 1 + arch/sparc/include/uapi/asm/Kbuild | 1 + arch/tile/include/uapi/asm/Kbuild | 1 + arch/unicore32/include/uapi/asm/Kbuild | 1 + arch/x86/include/uapi/asm/Kbuild | 1 + arch/xtensa/include/uapi/asm/Kbuild | 1 + include/linux/perf_event.h | 6 +++++- include/uapi/asm-generic/bpf_perf_event.h | 9 +++++++++ include/uapi/linux/bpf_perf_event.h | 5 ++--- kernel/events/core.c | 2 +- 32 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 include/uapi/asm-generic/bpf_perf_event.h (limited to 'include/linux') diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild index b15bf6bc0e94..14a2e9af97e9 100644 --- a/arch/alpha/include/uapi/asm/Kbuild +++ b/arch/alpha/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += bpf_perf_event.h diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild index fa6d0ff4ff89..170b5db64afe 100644 --- a/arch/arc/include/uapi/asm/Kbuild +++ b/arch/arc/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild index 4d53de308ee0..4d1cc1847edf 100644 --- a/arch/arm/include/uapi/asm/Kbuild +++ b/arch/arm/include/uapi/asm/Kbuild @@ -7,6 +7,7 @@ generated-y += unistd-oabi.h generated-y += unistd-eabi.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += ioctl.h generic-y += ipcbuf.h diff --git a/arch/blackfin/include/uapi/asm/Kbuild b/arch/blackfin/include/uapi/asm/Kbuild index aa624b4ab655..2240b38c2915 100644 --- a/arch/blackfin/include/uapi/asm/Kbuild +++ b/arch/blackfin/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += ioctl.h generic-y += ipcbuf.h diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild index 67ee896a76a7..26644e15d854 100644 --- 
a/arch/c6x/include/uapi/asm/Kbuild +++ b/arch/c6x/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/cris/include/uapi/asm/Kbuild b/arch/cris/include/uapi/asm/Kbuild index 3687b54bb18e..3470c6e9c7b9 100644 --- a/arch/cris/include/uapi/asm/Kbuild +++ b/arch/cris/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/frv/include/uapi/asm/Kbuild b/arch/frv/include/uapi/asm/Kbuild index b15bf6bc0e94..14a2e9af97e9 100644 --- a/arch/frv/include/uapi/asm/Kbuild +++ b/arch/frv/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm + +generic-y += bpf_perf_event.h diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild index 187aed820e71..2f65f78792cb 100644 --- a/arch/h8300/include/uapi/asm/Kbuild +++ b/arch/h8300/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild index cb5df3aad3a8..41a176dbb53e 100644 --- a/arch/hexagon/include/uapi/asm/Kbuild +++ b/arch/hexagon/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild index 13a97aa2285f..f5c6967a93bb 100644 --- a/arch/ia64/include/uapi/asm/Kbuild +++ b/arch/ia64/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generic-y += kvm_para.h diff --git a/arch/m32r/include/uapi/asm/Kbuild b/arch/m32r/include/uapi/asm/Kbuild index 1c44d3b3eba0..451bf6071c6e 100644 --- a/arch/m32r/include/uapi/asm/Kbuild +++ b/arch/m32r/include/uapi/asm/Kbuild @@ -1,5 +1,6 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generic-y += kvm_para.h generic-y += siginfo.h diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild index 3717b64a620d..c2e26a44c482 100644 --- a/arch/m68k/include/uapi/asm/Kbuild +++ b/arch/m68k/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += ioctl.h generic-y += ipcbuf.h diff --git a/arch/metag/include/uapi/asm/Kbuild b/arch/metag/include/uapi/asm/Kbuild index 6ac763d9a3e3..f9eaf07d29f8 100644 --- a/arch/metag/include/uapi/asm/Kbuild +++ b/arch/metag/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild index 06609ca36115..2c6a6bffea32 100644 --- a/arch/microblaze/include/uapi/asm/Kbuild +++ b/arch/microblaze/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@ include 
include/uapi/asm-generic/Kbuild.asm generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/mips/include/uapi/asm/Kbuild b/arch/mips/include/uapi/asm/Kbuild index a0266feba9e6..7a4becd8963a 100644 --- a/arch/mips/include/uapi/asm/Kbuild +++ b/arch/mips/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generic-y += ipcbuf.h diff --git a/arch/mn10300/include/uapi/asm/Kbuild b/arch/mn10300/include/uapi/asm/Kbuild index c94ee54210bc..81271d3af47c 100644 --- a/arch/mn10300/include/uapi/asm/Kbuild +++ b/arch/mn10300/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generic-y += siginfo.h diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild index ffca24da7647..13a3d77b4d7b 100644 --- a/arch/nios2/include/uapi/asm/Kbuild +++ b/arch/nios2/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild index 62286dbeb904..130c16ccba0a 100644 --- a/arch/openrisc/include/uapi/asm/Kbuild +++ b/arch/openrisc/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild index 196d2a4efb31..286ef5a5904b 100644 --- a/arch/parisc/include/uapi/asm/Kbuild +++ b/arch/parisc/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h +generic-y += bpf_perf_event.h generic-y += kvm_para.h generic-y += param.h generic-y += poll.h diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index 0d960ef78a9a..1a6ed5919ffd 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -1,6 +1,7 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generic-y += param.h generic-y += poll.h generic-y += resource.h diff --git a/arch/riscv/include/uapi/asm/Kbuild b/arch/riscv/include/uapi/asm/Kbuild index 5ded96b06352..7e91f4850475 100644 --- a/arch/riscv/include/uapi/asm/Kbuild +++ b/arch/riscv/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += setup.h generic-y += unistd.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/score/include/uapi/asm/Kbuild b/arch/score/include/uapi/asm/Kbuild index c94ee54210bc..81271d3af47c 100644 --- a/arch/score/include/uapi/asm/Kbuild +++ b/arch/score/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generic-y += siginfo.h diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild index e28531333efa..ba4d39cb321d 100644 --- a/arch/sh/include/uapi/asm/Kbuild +++ b/arch/sh/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += 
errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild index 2178c78c7c1a..4680ba246b55 100644 --- a/arch/sparc/include/uapi/asm/Kbuild +++ b/arch/sparc/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generic-y += types.h diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild index 5711de0a1b5e..cc439612bcd5 100644 --- a/arch/tile/include/uapi/asm/Kbuild +++ b/arch/tile/include/uapi/asm/Kbuild @@ -1,6 +1,7 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild index 759a71411169..8611ef980554 100644 --- a/arch/unicore32/include/uapi/asm/Kbuild +++ b/arch/unicore32/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += auxvec.h generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index da1489cb64dc..1e901e421f2d 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -1,6 +1,7 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generated-y += unistd_32.h generated-y += unistd_64.h generated-y += unistd_x32.h diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild index a5bcdfb890f1..837d4dd76785 100644 --- a/arch/xtensa/include/uapi/asm/Kbuild +++ b/arch/xtensa/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@ include include/uapi/asm-generic/Kbuild.asm generic-y += bitsperlong.h +generic-y += bpf_perf_event.h generic-y += errno.h generic-y += fcntl.h generic-y += ioctl.h diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2c9c87d8a0c1..7546822a1d74 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -15,6 +15,7 @@ #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> +#include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: @@ -787,7 +788,7 @@ struct perf_output_handle { }; struct bpf_perf_event_data_kern { - struct pt_regs *regs; + bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; @@ -1177,6 +1178,9 @@ extern void perf_bp_event(struct perf_event *event, void *data); (user_mode(regs) ?
PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif +#ifndef perf_arch_bpf_user_pt_regs +# define perf_arch_bpf_user_pt_regs(regs) regs +#endif static inline bool has_branch_stack(struct perf_event *event) { diff --git a/include/uapi/asm-generic/bpf_perf_event.h b/include/uapi/asm-generic/bpf_perf_event.h new file mode 100644 index 000000000000..53815d2cd047 --- /dev/null +++ b/include/uapi/asm-generic/bpf_perf_event.h @@ -0,0 +1,9 @@ +#ifndef _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__ +#define _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__ + +#include <linux/ptrace.h> + +/* Export kernel pt_regs structure */ +typedef struct pt_regs bpf_user_pt_regs_t; + +#endif /* _UAPI__ASM_GENERIC_BPF_PERF_EVENT_H__ */ diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h index af549d4ecf1b..8f95303f9d80 100644 --- a/include/uapi/linux/bpf_perf_event.h +++ b/include/uapi/linux/bpf_perf_event.h @@ -8,11 +8,10 @@ #ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__ #define _UAPI__LINUX_BPF_PERF_EVENT_H__ -#include <linux/types.h> -#include <linux/ptrace.h> +#include <asm/bpf_perf_event.h> struct bpf_perf_event_data { - struct pt_regs regs; + bpf_user_pt_regs_t regs; __u64 sample_period; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..ba957b9812b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7987,11 +7987,11 @@ static void bpf_overflow_handler(struct perf_event *event, { struct bpf_perf_event_data_kern ctx = { .data = data, - .regs = regs, .event = event, }; int ret = 0; + ctx.regs = perf_arch_bpf_user_pt_regs(regs); preempt_disable(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; -- cgit v1.2.3 From f775b13eedee2f7f3c6fdd4e90fb79090ce5d339 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 14 Nov 2017 16:54:23 -0500 Subject: x86,kvm: move qemu/guest FPU switching out to vcpu_run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, every time a VCPU is scheduled out, the host kernel will first save the guest FPU/xstate context, then load the qemu userspace FPU context, only to then immediately save the qemu userspace FPU context back to memory. When scheduling in a VCPU, the same extraneous FPU loads and saves are done. This could be avoided by moving from a model where the guest FPU is loaded and stored with preemption disabled, to a model where the qemu userspace FPU is swapped out for the guest FPU context for the duration of the KVM_RUN ioctl. This is done under the VCPU mutex, which is also taken when other tasks inspect the VCPU FPU context, so the code should already be safe for this change. That should come as no surprise, given that s390 already has this optimization.
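Schematically, the new model keeps the guest FPU resident for the whole KVM_RUN ioctl (a simplified sketch; the real code is in the x86.c hunks below):

int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	int r;

	/* ... sigset handling, completion callbacks, etc. ... */
	kvm_load_guest_fpu(vcpu);	/* save qemu FPU state, load guest FPU */
	r = vcpu_run(vcpu);		/* may be preempted freely: the guest FPU
					 * is now saved/restored by the regular
					 * task context-switch path, not by KVM */
	kvm_put_guest_fpu(vcpu);	/* save guest FPU, restore qemu FPU */
	/* ... post-run bookkeeping ... */
	return r;
}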
This can fix a bug where KVM calls get_user_pages while owning the FPU, and the file system ends up requesting the FPU again: [258270.527947] __warn+0xcb/0xf0 [258270.527948] warn_slowpath_null+0x1d/0x20 [258270.527951] kernel_fpu_disable+0x3f/0x50 [258270.527953] __kernel_fpu_begin+0x49/0x100 [258270.527955] kernel_fpu_begin+0xe/0x10 [258270.527958] crc32c_pcl_intel_update+0x84/0xb0 [258270.527961] crypto_shash_update+0x3f/0x110 [258270.527968] crc32c+0x63/0x8a [libcrc32c] [258270.527975] dm_bm_checksum+0x1b/0x20 [dm_persistent_data] [258270.527978] node_prepare_for_write+0x44/0x70 [dm_persistent_data] [258270.527985] dm_block_manager_write_callback+0x41/0x50 [dm_persistent_data] [258270.527988] submit_io+0x170/0x1b0 [dm_bufio] [258270.527992] __write_dirty_buffer+0x89/0x90 [dm_bufio] [258270.527994] __make_buffer_clean+0x4f/0x80 [dm_bufio] [258270.527996] __try_evict_buffer+0x42/0x60 [dm_bufio] [258270.527998] dm_bufio_shrink_scan+0xc0/0x130 [dm_bufio] [258270.528002] shrink_slab.part.40+0x1f5/0x420 [258270.528004] shrink_node+0x22c/0x320 [258270.528006] do_try_to_free_pages+0xf5/0x330 [258270.528008] try_to_free_pages+0xe9/0x190 [258270.528009] __alloc_pages_slowpath+0x40f/0xba0 [258270.528011] __alloc_pages_nodemask+0x209/0x260 [258270.528014] alloc_pages_vma+0x1f1/0x250 [258270.528017] do_huge_pmd_anonymous_page+0x123/0x660 [258270.528021] handle_mm_fault+0xfd3/0x1330 [258270.528025] __get_user_pages+0x113/0x640 [258270.528027] get_user_pages+0x4f/0x60 [258270.528063] __gfn_to_pfn_memslot+0x120/0x3f0 [kvm] [258270.528108] try_async_pf+0x66/0x230 [kvm] [258270.528135] tdp_page_fault+0x130/0x280 [kvm] [258270.528149] kvm_mmu_page_fault+0x60/0x120 [kvm] [258270.528158] handle_ept_violation+0x91/0x170 [kvm_intel] [258270.528162] vmx_handle_exit+0x1ca/0x1400 [kvm_intel] No performance changes were detected in quick ping-pong tests on my 4-socket system, which is expected since an FPU+xstate load is on the order of 0.1us, while ping-ponging between CPUs is on the order of 20us, and somewhat noisy. Cc: stable@vger.kernel.org Signed-off-by: Rik van Riel Suggested-by: Christian Borntraeger Signed-off-by: Paolo Bonzini [Fixed a bug where reset_vcpu called put_fpu without a preceding load_fpu, which happened inside the KVM_CREATE_VCPU ioctl. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 13 +++++++++++++ arch/x86/kvm/x86.c | 39 +++++++++++++++++---------------------- include/linux/kvm_host.h | 2 +- 3 files changed, 31 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 977de5fb968b..62527e053ee4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -536,7 +536,20 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; + /* + * QEMU userspace and the guest each have their own FPU state. + * In vcpu_run, we switch between the user and guest FPU contexts. + * While running a VCPU, the VCPU thread will have the guest FPU + * context. + * + * Note that while the PKRU state lives inside the fpu registers, + * it is switched out separately at VMENTER and VMEXIT time. The + * "guest_fpu" state here contains the guest FPU context, with the + * host PKRU bits.
+ */ + struct fpu user_fpu; struct fpu guest_fpu; + u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eee8e7faf1af..c8da1680a7d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2937,7 +2937,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, idx); pagefault_enable(); kvm_x86_ops->vcpu_put(vcpu); - kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); } @@ -5254,13 +5253,10 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt) static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) { - preempt_disable(); - kvm_load_guest_fpu(emul_to_vcpu(ctxt)); } static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) { - preempt_enable(); } static int emulator_intercept(struct x86_emulate_ctxt *ctxt, @@ -6952,7 +6948,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -7297,12 +7292,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } + kvm_load_guest_fpu(vcpu); + if (unlikely(vcpu->arch.complete_userspace_io)) { int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; vcpu->arch.complete_userspace_io = NULL; r = cui(vcpu); if (r <= 0) - goto out; + goto out_fpu; } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); @@ -7311,6 +7308,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) else r = vcpu_run(vcpu); +out_fpu: + kvm_put_guest_fpu(vcpu); out: post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); @@ -7704,32 +7703,25 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } +/* Swap (qemu) user FPU context for the guest FPU context. */ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (vcpu->guest_fpu_loaded) - return; - - /* - * Restore all possible states in the guest, - * and assume host would use all available bits. - * Guest xcr0 would be loaded later. - */ - vcpu->guest_fpu_loaded = 1; - __kernel_fpu_begin(); + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, ~XFEATURE_MASK_PKRU); + preempt_enable(); trace_kvm_fpu(1); } +/* When vcpu_run ends, restore user space FPU context. */ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 0; + preempt_disable(); copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - __kernel_fpu_end(); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + preempt_enable(); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -7846,7 +7838,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * To avoid having the INIT path from kvm_apic_has_events() be * called with the FPU loaded, which would not let userspace fix the state.
*/ - kvm_put_guest_fpu(vcpu); + if (init_event) + kvm_put_guest_fpu(vcpu); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, XFEATURE_MASK_BNDREGS); if (mpx_state_buffer) @@ -7855,6 +7848,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) XFEATURE_MASK_BNDCSR); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + if (init_event) + kvm_load_guest_fpu(vcpu); } if (!init_event) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 893d6d606cd0..6bdd4b9f6611 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -232,7 +232,7 @@ struct kvm_vcpu { struct mutex mutex; struct kvm_run *run; - int guest_fpu_loaded, guest_xcr0_loaded; + int guest_xcr0_loaded; struct swait_queue_head wq; struct pid __rcu *pid; int sigset_active; -- cgit v1.2.3 From d7efc6c11b277d9d80b99b1334a78bfe7d7edf10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Dec 2017 12:45:56 -0800 Subject: net: remove hlist_nulls_add_tail_rcu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alexander Potapenko reported use of uninitialized memory [1]. This happens when inserting a request socket into TCP ehash, in __sk_nulls_add_node_rcu(), since sk_reuseport is not initialized. Bug was added by commit d894ba18d4e4 ("soreuseport: fix ordering for mixed v4/v6 sockets") Note that d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering fix") missed the opportunity to get rid of hlist_nulls_add_tail_rcu(): both UDP sockets and TCP/DCCP listeners no longer use __sk_nulls_add_node_rcu() for their hash insertion. Since all other sockets have a unique 4-tuple, the reuseport status has no special meaning, so we can always use hlist_nulls_add_head_rcu() for them and save a few cycles/instructions.
[1] ================================================================== BUG: KMSAN: use of uninitialized memory in inet_ehash_insert+0xd40/0x1050 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.13.0+ #3288 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace:    __dump_stack lib/dump_stack.c:16  dump_stack+0x185/0x1d0 lib/dump_stack.c:52  kmsan_report+0x13f/0x1c0 mm/kmsan/kmsan.c:1016  __msan_warning_32+0x69/0xb0 mm/kmsan/kmsan_instr.c:766  __sk_nulls_add_node_rcu ./include/net/sock.h:684  inet_ehash_insert+0xd40/0x1050 net/ipv4/inet_hashtables.c:413  reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:754  inet_csk_reqsk_queue_hash_add+0x1cc/0x300 net/ipv4/inet_connection_sock.c:765  tcp_conn_request+0x31e7/0x36f0 net/ipv4/tcp_input.c:6414  tcp_v4_conn_request+0x16d/0x220 net/ipv4/tcp_ipv4.c:1314  tcp_rcv_state_process+0x42a/0x7210 net/ipv4/tcp_input.c:5917  tcp_v4_do_rcv+0xa6a/0xcd0 net/ipv4/tcp_ipv4.c:1483  tcp_v4_rcv+0x3de0/0x4ab0 net/ipv4/tcp_ipv4.c:1763  ip_local_deliver_finish+0x6bb/0xcb0 net/ipv4/ip_input.c:216  NF_HOOK ./include/linux/netfilter.h:248  ip_local_deliver+0x3fa/0x480 net/ipv4/ip_input.c:257  dst_input ./include/net/dst.h:477  ip_rcv_finish+0x6fb/0x1540 net/ipv4/ip_input.c:397  NF_HOOK ./include/linux/netfilter.h:248  ip_rcv+0x10f6/0x15c0 net/ipv4/ip_input.c:488  __netif_receive_skb_core+0x36f6/0x3f60 net/core/dev.c:4298  __netif_receive_skb net/core/dev.c:4336  netif_receive_skb_internal+0x63c/0x19c0 net/core/dev.c:4497  napi_skb_finish net/core/dev.c:4858  napi_gro_receive+0x629/0xa50 net/core/dev.c:4889  e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4018  e1000_clean_rx_irq+0x1492/0x1d30 drivers/net/ethernet/intel/e1000/e1000_main.c:4474  e1000_clean+0x43aa/0x5970 drivers/net/ethernet/intel/e1000/e1000_main.c:3819  napi_poll net/core/dev.c:5500  net_rx_action+0x73c/0x1820 net/core/dev.c:5566  __do_softirq+0x4b4/0x8dd kernel/softirq.c:284  invoke_softirq kernel/softirq.c:364  irq_exit+0x203/0x240 kernel/softirq.c:405  exiting_irq+0xe/0x10 ./arch/x86/include/asm/apic.h:638  do_IRQ+0x15e/0x1a0 arch/x86/kernel/irq.c:263  common_interrupt+0x86/0x86 Fixes: d894ba18d4e4 ("soreuseport: fix ordering for mixed v4/v6 sockets") Fixes: d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering fix") Signed-off-by: Eric Dumazet Reported-by: Alexander Potapenko Acked-by: Craig Gallek Signed-off-by: David S. Miller --- include/linux/rculist_nulls.h | 38 -------------------------------------- include/net/sock.h | 6 +----- 2 files changed, 1 insertion(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index a328e8181e49..e4b257ff881b 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -100,44 +100,6 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, first->pprev = &n->next; } -/** - * hlist_nulls_add_tail_rcu - * @n: the element to add to the hash list. - * @h: the list to add to. - * - * Description: - * Adds the specified element to the end of the specified hlist_nulls, - * while permitting racing traversals. NOTE: tail insertion requires - * list traversal. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() - * or hlist_nulls_del_rcu(), running on this same list. 
- * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency - * problems on Alpha CPUs. Regardless of the type of CPU, the - * list-traversal primitive must be guarded by rcu_read_lock(). - */ -static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, - struct hlist_nulls_head *h) -{ - struct hlist_nulls_node *i, *last = NULL; - - for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i); - i = hlist_nulls_next_rcu(i)) - last = i; - - if (last) { - n->next = last->next; - n->pprev = &last->next; - rcu_assign_pointer(hlist_nulls_next_rcu(last), n); - } else { - hlist_nulls_add_head_rcu(n, h); - } -} - /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/net/sock.h b/include/net/sock.h index 79e1a2c7912c..9155da422692 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -685,11 +685,7 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) - hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); - else - hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) -- cgit v1.2.3 From af97a77bc01ce49a466f9d4c0125479e2e2230b6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 6 Dec 2017 09:50:08 +0000 Subject: efi: Move some sysfs files to be read-only by root Thanks to the scripts/leaking_addresses.pl script, it was found that some EFI values should not be readable by non-root users. So make them root-only, and to do that, add a __ATTR_RO_MODE() macro to make this easier, and use it in other places at the same time. Reported-by: Linus Torvalds Tested-by: Dave Young Signed-off-by: Greg Kroah-Hartman Signed-off-by: Ard Biesheuvel Cc: H. Peter Anvin Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: stable Link: http://lkml.kernel.org/r/20171206095010.24170-2-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/efi.c | 3 +-- drivers/firmware/efi/esrt.c | 15 ++++++--------- drivers/firmware/efi/runtime-map.c | 10 +++++----- include/linux/sysfs.h | 6 ++++++ 4 files changed, 18 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index f70febf680c3..c3eefa126e3b 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -143,8 +143,7 @@ static ssize_t systab_show(struct kobject *kobj, return str - buf; } -static struct kobj_attribute efi_attr_systab = - __ATTR(systab, 0400, systab_show, NULL); +static struct kobj_attribute efi_attr_systab = __ATTR_RO_MODE(systab, 0400); #define EFI_FIELD(var) efi.var diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c index bd7ed3c1148a..7aae2483fcb9 100644 --- a/drivers/firmware/efi/esrt.c +++ b/drivers/firmware/efi/esrt.c @@ -106,7 +106,7 @@ static const struct sysfs_ops esre_attr_ops = { }; /* Generic ESRT Entry ("ESRE") support. 
*/ -static ssize_t esre_fw_class_show(struct esre_entry *entry, char *buf) +static ssize_t fw_class_show(struct esre_entry *entry, char *buf) { char *str = buf; @@ -117,18 +117,16 @@ static ssize_t esre_fw_class_show(struct esre_entry *entry, char *buf) return str - buf; } -static struct esre_attribute esre_fw_class = __ATTR(fw_class, 0400, - esre_fw_class_show, NULL); +static struct esre_attribute esre_fw_class = __ATTR_RO_MODE(fw_class, 0400); #define esre_attr_decl(name, size, fmt) \ -static ssize_t esre_##name##_show(struct esre_entry *entry, char *buf) \ +static ssize_t name##_show(struct esre_entry *entry, char *buf) \ { \ return sprintf(buf, fmt "\n", \ le##size##_to_cpu(entry->esre.esre1->name)); \ } \ \ -static struct esre_attribute esre_##name = __ATTR(name, 0400, \ - esre_##name##_show, NULL) +static struct esre_attribute esre_##name = __ATTR_RO_MODE(name, 0400) esre_attr_decl(fw_type, 32, "%u"); esre_attr_decl(fw_version, 32, "%u"); @@ -193,14 +191,13 @@ static int esre_create_sysfs_entry(void *esre, int entry_num) /* support for displaying ESRT fields at the top level */ #define esrt_attr_decl(name, size, fmt) \ -static ssize_t esrt_##name##_show(struct kobject *kobj, \ +static ssize_t name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf)\ { \ return sprintf(buf, fmt "\n", le##size##_to_cpu(esrt->name)); \ } \ \ -static struct kobj_attribute esrt_##name = __ATTR(name, 0400, \ - esrt_##name##_show, NULL) +static struct kobj_attribute esrt_##name = __ATTR_RO_MODE(name, 0400) esrt_attr_decl(fw_resource_count, 32, "%u"); esrt_attr_decl(fw_resource_count_max, 32, "%u"); diff --git a/drivers/firmware/efi/runtime-map.c b/drivers/firmware/efi/runtime-map.c index 8e64b77aeac9..f377609ff141 100644 --- a/drivers/firmware/efi/runtime-map.c +++ b/drivers/firmware/efi/runtime-map.c @@ -63,11 +63,11 @@ static ssize_t map_attr_show(struct kobject *kobj, struct attribute *attr, return map_attr->show(entry, buf); } -static struct map_attribute map_type_attr = __ATTR_RO(type); -static struct map_attribute map_phys_addr_attr = __ATTR_RO(phys_addr); -static struct map_attribute map_virt_addr_attr = __ATTR_RO(virt_addr); -static struct map_attribute map_num_pages_attr = __ATTR_RO(num_pages); -static struct map_attribute map_attribute_attr = __ATTR_RO(attribute); +static struct map_attribute map_type_attr = __ATTR_RO_MODE(type, 0400); +static struct map_attribute map_phys_addr_attr = __ATTR_RO_MODE(phys_addr, 0400); +static struct map_attribute map_virt_addr_attr = __ATTR_RO_MODE(virt_addr, 0400); +static struct map_attribute map_num_pages_attr = __ATTR_RO_MODE(num_pages, 0400); +static struct map_attribute map_attribute_attr = __ATTR_RO_MODE(attribute, 0400); /* * These are default attributes that are added for every memmap entry. 
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index e32dfe098e82..40839c02d28c 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -117,6 +117,12 @@ struct attribute_group { .show = _name##_show, \ } +#define __ATTR_RO_MODE(_name, _mode) { \ + .attr = { .name = __stringify(_name), \ + .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ + .show = _name##_show, \ +} + #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = S_IWUSR }, \ .store = _name##_store, \ } -- cgit v1.2.3 From 7912af5c835bd86f2b0347a480e0f40e2fab30d0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 6 Dec 2017 14:55:05 -0600 Subject: PCI: Add pci_get_domain_bus_and_slot() stub The coretemp driver build fails when CONFIG_PCI is not enabled because it uses a function that does not have a stub for that config case, so add the function stub. ../drivers/hwmon/coretemp.c: In function 'adjust_tjmax': ../drivers/hwmon/coretemp.c:250:9: error: implicit declaration of function 'pci_get_domain_bus_and_slot' [-Werror=implicit-function-declaration] struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn); ../drivers/hwmon/coretemp.c:250:32: warning: initialization makes pointer from integer without a cast [enabled by default] struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn); Signed-off-by: Randy Dunlap [bhelgaas: identical patch also by Arnd Bergmann ] Signed-off-by: Bjorn Helgaas Acked-by: Guenter Roeck --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0403894147a3..c170c9250c8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1674,6 +1674,9 @@ static inline struct pci_dev *pci_get_slot(struct pci_bus *bus, static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus, unsigned int devfn) { return NULL; } +static inline struct pci_dev *pci_get_domain_bus_and_slot(int domain, + unsigned int bus, unsigned int devfn) +{ return NULL; } static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } static inline struct pci_dev *pci_dev_get(struct pci_dev *dev) { return NULL; } -- cgit v1.2.3 From b860b419d970f286294fbfb2b21a4028fd8ee442 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 6 Dec 2017 12:21:35 +0100 Subject: mfd: Fix RTS5227 (and others) power management Commit 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power saving") adds power-saving support for device-ids 5249, 524a and 525a. But as a side effect it breaks ASPM support for all the other device-ids, causing e.g. the Haswell CPU on a Lenovo T440s to not go into a higher c-state than PC3, while previously it would go to PC7, causing the machine to idle at 7.4W instead of 6.6W! The problem here is the new option.dev_aspm_mode field, which only gets explicitly initialized in the new code for the device-ids 5249, 524a and 525a, leaving dev_aspm_mode 0 for the other device-ids. The default dev_aspm_mode 0 is mapped to DEV_ASPM_DISABLE, but the old behavior of calling rtsx_pci_enable_aspm() when idle and rtsx_pci_disable_aspm() when busy happens when dev_aspm_mode == DEV_ASPM_DYNAMIC. This commit changes the enum so that 0 = DEV_ASPM_DYNAMIC, matching the old default behavior, fixing the pm regression with the other device-ids.
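The underlying C pitfall in a nutshell (a sketch; struct rtsx_option here is a stand-in for the real option struct in the driver):

enum dev_aspm_mode {
	DEV_ASPM_DYNAMIC,	/* now 0: what zero-initialized options get */
	DEV_ASPM_BACKDOOR,
	DEV_ASPM_STATIC,
	DEV_ASPM_DISABLE,
};

struct rtsx_option {
	enum dev_aspm_mode dev_aspm_mode;
};

/* Implicit zero initialization picks whichever enumerator equals 0, so
 * device-ids that never set the field now keep the old dynamic behavior:
 */
static struct rtsx_option opt;	/* opt.dev_aspm_mode == DEV_ASPM_DYNAMIC */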
Fixes: 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power saving") Signed-off-by: Hans de Goede Acked-by: Rui Feng Signed-off-by: Lee Jones --- include/linux/mfd/rtsx_pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/rtsx_pci.h b/include/linux/mfd/rtsx_pci.h index a2a1318a3d0c..c3d3f04d8cc6 100644 --- a/include/linux/mfd/rtsx_pci.h +++ b/include/linux/mfd/rtsx_pci.h @@ -915,10 +915,10 @@ enum PDEV_STAT {PDEV_STAT_IDLE, PDEV_STAT_RUN}; #define LTR_L1SS_PWR_GATE_CHECK_CARD_EN BIT(6) enum dev_aspm_mode { - DEV_ASPM_DISABLE = 0, DEV_ASPM_DYNAMIC, DEV_ASPM_BACKDOOR, DEV_ASPM_STATIC, + DEV_ASPM_DISABLE, }; /* -- cgit v1.2.3 From a4abd7a80addb4a9547f7dfc7812566b60ec505c Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Wed, 6 Dec 2017 20:21:24 +0100 Subject: usbnet: fix alignment for frames with no ethernet header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The qmi_wwan minidriver supports a 'raw-ip' mode where frames are received without any ethernet header. This causes alignment issues because the skbs allocated by usbnet are "IP aligned". Fix by allowing minidrivers to disable the additional alignment offset. This is implemented using a per-device flag, since the same minidriver also supports 'ethernet' mode. Fixes: 32f7adf633b9 ("net: qmi_wwan: support "raw IP" mode") Reported-and-tested-by: Jay Foster Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 2 ++ drivers/net/usb/usbnet.c | 5 ++++- include/linux/usb/usbnet.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index c750cf7c042b..304ec6555cd8 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -261,9 +261,11 @@ static void qmi_wwan_netdev_setup(struct net_device *net) net->hard_header_len = 0; net->addr_len = 0; net->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; + set_bit(EVENT_NO_IP_ALIGN, &dev->flags); netdev_dbg(net, "mode: raw IP\n"); } else if (!net->header_ops) { /* don't bother if already set */ ether_setup(net); + clear_bit(EVENT_NO_IP_ALIGN, &dev->flags); netdev_dbg(net, "mode: Ethernet\n"); } diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 80348b6a8646..d56fe32bf48d 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -484,7 +484,10 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags) return -ENOLINK; } - skb = __netdev_alloc_skb_ip_align(dev->net, size, flags); + if (test_bit(EVENT_NO_IP_ALIGN, &dev->flags)) + skb = __netdev_alloc_skb(dev->net, size, flags); + else + skb = __netdev_alloc_skb_ip_align(dev->net, size, flags); if (!skb) { netif_dbg(dev, rx_err, dev->net, "no rx skb\n"); usbnet_defer_kevent (dev, EVENT_RX_MEMORY); diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index a69877734c4e..e2ec3582e549 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -82,6 +82,7 @@ struct usbnet { # define EVENT_RX_KILL 10 # define EVENT_LINK_CHANGE 11 # define EVENT_SET_RX_MODE 12 +# define EVENT_NO_IP_ALIGN 13 }; static inline struct usb_driver *driver_of(struct usb_interface *intf) -- cgit v1.2.3 From d4761754b4fb2ef8d9a1e9d121c4bec84e1fe292 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Thu, 7 Dec 2017 13:41:34 -0800 Subject: tcp: invalidate rate samples during SACK reneging Mark tcp_sock during a SACK reneging event and invalidate
rate samples while marked. Such rate samples may overestimate bw by including packets that were SACKed before reneging. < ack 6001 win 10000 sack 7001:38001 < ack 7001 win 0 sack 8001:38001 // Reneg detected > seq 7001:8001 // RTO, SACK cleared. < ack 38001 win 10000 In the above example, the rate sample taken after the last ack will count 7001-38001 as delivered, while the actual delivery rate is likely much lower, i.e. 7001-8001. This patch adds a new field tcp_sock.is_sack_reneg and marks it when we declare SACK reneging and enter TCP_CA_Loss, and unmarks it after the last rate sample was taken before moving back to TCP_CA_Open. This patch also invalidates rate samples taken while tcp_sock.is_sack_reneg is set. Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection") Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Acked-by: Eric Dumazet Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 10 ++++++++-- net/ipv4/tcp_rate.c | 10 +++++++--- 5 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index df5d97a85e1a..ca4a6361389b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -224,7 +224,8 @@ struct tcp_sock { rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ - unused:3; + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ + unused:2; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ unused1 : 1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 6998707e81f3..6da880d2f022 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1055,7 +1055,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs); + bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bf97317e6c97..f08eebe60446 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2412,6 +2412,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; tcp_clear_retrans(tp); inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 514c00732988..075c559570e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1942,6 +1942,8 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; + /* Mark SACK reneging until we recover from this loss event.
*/ + tp->is_sack_reneg = 1; } tcp_clear_all_retrans_hints(tp); @@ -2365,6 +2367,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) return true; } tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; return false; } @@ -2398,8 +2401,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - if (frto_undo || tcp_is_sack(tp)) + if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; + } return true; } return false; @@ -3496,6 +3501,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct tcp_sacktag_state sack_state; struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; + bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; @@ -3612,7 +3618,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, sack_state.rate); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 3330a370d306..c61240e43923 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample. */ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs) + bool is_sack_reneg, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp) { + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { rs->delivered = -1; rs->interval_us = -1; return; -- cgit v1.2.3 From f335195adf043168ee69d78ea72ac3e30f0c57ce Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Dec 2017 11:27:57 +0100 Subject: kmemcheck: rip it out for real Commit 4675ff05de2d ("kmemcheck: rip it out") has removed the code, but for some reason the SPDX headers stayed in place. This looks like a rebase mistake in the mmotm tree or a merge mistake. Let's drop those leftovers as well.
Signed-off-by: Michal Hocko Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kmemcheck.h | 1 - arch/x86/mm/kmemcheck/error.c | 1 - arch/x86/mm/kmemcheck/error.h | 1 - arch/x86/mm/kmemcheck/opcode.c | 1 - arch/x86/mm/kmemcheck/opcode.h | 1 - arch/x86/mm/kmemcheck/pte.c | 1 - arch/x86/mm/kmemcheck/pte.h | 1 - arch/x86/mm/kmemcheck/selftest.c | 1 - arch/x86/mm/kmemcheck/selftest.h | 1 - arch/x86/mm/kmemcheck/shadow.h | 1 - include/linux/kmemcheck.h | 1 - mm/kmemcheck.c | 1 - tools/include/linux/kmemcheck.h | 1 - 13 files changed, 13 deletions(-) delete mode 100644 arch/x86/include/asm/kmemcheck.h delete mode 100644 arch/x86/mm/kmemcheck/error.c delete mode 100644 arch/x86/mm/kmemcheck/error.h delete mode 100644 arch/x86/mm/kmemcheck/opcode.c delete mode 100644 arch/x86/mm/kmemcheck/opcode.h delete mode 100644 arch/x86/mm/kmemcheck/pte.c delete mode 100644 arch/x86/mm/kmemcheck/pte.h delete mode 100644 arch/x86/mm/kmemcheck/selftest.c delete mode 100644 arch/x86/mm/kmemcheck/selftest.h delete mode 100644 arch/x86/mm/kmemcheck/shadow.h delete mode 100644 include/linux/kmemcheck.h delete mode 100644 mm/kmemcheck.c delete mode 100644 tools/include/linux/kmemcheck.h (limited to 'include/linux') diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/include/asm/kmemcheck.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/error.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/error.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/pte.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/pte.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.h +++ /dev/null @@ -1 +0,0 @@ 
-/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/include/linux/kmemcheck.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c deleted file mode 100644 index cec594032515..000000000000 --- a/mm/kmemcheck.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/tools/include/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/tools/include/linux/kmemcheck.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -- cgit v1.2.3 From 3487972d7fa6c5143951436ada5933dcf0ec659d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Dec 2017 02:41:18 +0100 Subject: PM / sleep: Avoid excess pm_runtime_enable() calls in device_resume() Middle-layer code doing suspend-time optimizations for devices with the DPM_FLAG_SMART_SUSPEND flag set (currently, the PCI bus type and the ACPI PM domain) needs to make the core skip ->thaw_early and ->thaw callbacks for those devices in some cases and it sets the power.direct_complete flag for them for this purpose. However, it turns out that setting power.direct_complete outside of the PM core is a bad idea as it triggers an excess invocation of pm_runtime_enable() in device_resume(). For this reason, provide a helper to clear power.is_late_suspended and power.is_suspended to be invoked by the middle-layer code in question instead of setting power.direct_complete and make that code call the new helper. Fixes: c4b65157aeef (PCI / PM: Take SMART_SUSPEND driver flag into account) Fixes: 05087360fd7a (ACPI / PM: Take SMART_SUSPEND driver flag into account) Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Acked-by: Bjorn Helgaas --- drivers/acpi/device_pm.c | 2 +- drivers/base/power/main.c | 15 +++++++++++++++ drivers/pci/pci-driver.c | 2 +- include/linux/pm.h | 1 + 4 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index e4ffaeec9ec2..a4c8ad98560d 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1138,7 +1138,7 @@ int acpi_subsys_thaw_noirq(struct device *dev) * skip all of the subsequent "thaw" callbacks for the device. */ if (dev_pm_smart_suspend_and_suspended(dev)) { - dev->power.direct_complete = true; + dev_pm_skip_next_resume_phases(dev); return 0; } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index db2f04415927..08744b572af6 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -525,6 +525,21 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd) /*------------------------- Resume routines -------------------------*/ +/** + * dev_pm_skip_next_resume_phases - Skip next system resume phases for device. + * @dev: Target device. + * + * Make the core skip the "early resume" and "resume" phases for @dev. + * + * This function can be called by middle-layer code during the "noirq" phase of + * system resume if necessary, but not by device drivers. + */ +void dev_pm_skip_next_resume_phases(struct device *dev) +{ + dev->power.is_late_suspended = false; + dev->power.is_suspended = false; +} + /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. 
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 7f47bb72bf30..945099d49f8f 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -999,7 +999,7 @@ static int pci_pm_thaw_noirq(struct device *dev) * the subsequent "thaw" callbacks for the device. */ if (dev_pm_smart_suspend_and_suspended(dev)) { - dev->power.direct_complete = true; + dev_pm_skip_next_resume_phases(dev); return 0; } diff --git a/include/linux/pm.h b/include/linux/pm.h index 65d39115f06d..492ed473ba7e 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -765,6 +765,7 @@ extern int pm_generic_poweroff_late(struct device *dev); extern int pm_generic_poweroff(struct device *dev); extern void pm_generic_complete(struct device *dev); +extern void dev_pm_skip_next_resume_phases(struct device *dev); extern bool dev_pm_smart_suspend_and_suspended(struct device *dev); #else /* !CONFIG_PM_SLEEP */ -- cgit v1.2.3 From a8ceb5dbfde1092b466936bca0ff3be127ecf38e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 5 Dec 2017 21:29:37 +0200 Subject: ptr_ring: add barriers Users of ptr_ring expect that it's safe to give the data structure a pointer and have it be available to consumers, but that actually requires an smp_wmb() or a stronger barrier. In the absence of such barriers and on architectures that reorder writes, a consumer might read an uninitialized value from an skb pointer stored in the skb array. This was observed causing crashes. To fix, add memory barriers. The barrier we use is a wmb, the assumption being that producers do not need to read the value, so we do not need to order these reads. Reported-by: George Cherian Suggested-by: Jason Wang Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 37b4bb2545b3..6866df4f31b5 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r) /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. + * Callers are responsible for making sure the pointer that is being + * queued points to valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; + /* Make sure the pointer we are storing points to valid data. */ + /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */ + smp_wmb(); + r->queue[r->producer++] = ptr; if (unlikely(r->producer >= r->size)) r->producer = 0; @@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r) if (ptr) __ptr_ring_discard_one(r); + /* Make sure anyone accessing data through the pointer is up to date. */ + /* Pairs with smp_wmb in __ptr_ring_produce. */ + smp_read_barrier_depends(); return ptr; } -- cgit v1.2.3 From d89c70356acf11b7cf47ca5cfcafae5062a85451 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:19 +0000 Subject: locking/core: Remove break_lock field when CONFIG_GENERIC_LOCKBREAK=y When CONFIG_GENERIC_LOCKBREAK=y, locking structures grow an extra int ->break_lock field which is used to implement raw_spin_is_contended() by setting the field to 1 when waiting on a lock and clearing it to zero when holding a lock.
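In rough outline, the removed pattern amounts to plain, unsynchronized accesses to a shared int. A simplified sketch (hypothetical names, GCC-style atomics for the lock itself -- not the kernel implementation):

    /*
     * Simplified sketch of the pattern being removed; all names are
     * hypothetical. The flag is written with plain stores from both the
     * waiter and the new lock owner, with no READ_ONCE/WRITE_ONCE and no
     * ordering guarantees. Uses the GCC __sync builtin for the lock word.
     */
    struct demo_lock {
            int owner_holds;                /* stand-in for arch_spinlock_t */
            unsigned int break_lock;        /* "someone is spinning on me" */
    };

    static void demo_wait_for_lock(struct demo_lock *l)
    {
            while (__sync_lock_test_and_set(&l->owner_holds, 1)) {
                    if (!l->break_lock)
                            l->break_lock = 1;      /* plain store: waiter */
            }
            l->break_lock = 0;      /* plain store: new owner clears flag */
    }

    static int demo_is_contended(struct demo_lock *l)
    {
            return l->break_lock;   /* plain load, possibly stale */
    }

    int main(void)
    {
            struct demo_lock l = { 0, 0 };

            demo_wait_for_lock(&l); /* uncontended acquire */
            /*
             * With a waiter on another CPU, its break_lock = 1 and the
             * owner's break_lock = 0 are racing plain stores, so
             * demo_is_contended() can report either value.
             */
            return demo_is_contended(&l);
    }
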
However, there are a few problems with this approach: - There is a write-write race between a CPU successfully taking the lock (and subsequently writing break_lock = 0) and a waiter waiting on the lock (and subsequently writing break_lock = 1). This could result in a contended lock being reported as uncontended and vice-versa. - On machines with store buffers, nothing guarantees that the writes to break_lock are visible to other CPUs at any particular time. - READ_ONCE/WRITE_ONCE are not used, so the field is potentially susceptible to harmful compiler optimisations. Consequently, the usefulness of this field is unclear and we'd be better off removing it and allowing architectures to implement raw_spin_is_contended() by providing a definition of arch_spin_is_contended(), as they can when CONFIG_GENERIC_LOCKBREAK=n. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Sebastian Ott Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1511894539-7988-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/rwlock_types.h | 3 --- include/linux/spinlock.h | 5 ----- include/linux/spinlock_types.h | 3 --- kernel/locking/spinlock.c | 9 +-------- 4 files changed, 1 insertion(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index cc0072e93e36..857a72ceb794 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -10,9 +10,6 @@ */ typedef struct { arch_rwlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index a39186194cd6..3bf273538840 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -107,16 +107,11 @@ do { \ #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) -#ifdef CONFIG_GENERIC_LOCKBREAK -#define raw_spin_is_contended(lock) ((lock)->break_lock) -#else - #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ -#endif /* * This barrier must provide two things: diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 73548eb13a5d..24b4e6f2c1a2 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -19,9 +19,6 @@ typedef struct raw_spinlock { arch_spinlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 0ebb253e2199..936f3d14dd6b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ break; \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - \ arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ } \ \ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ @@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ local_irq_restore(flags); \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - \ arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ + \ return flags; \ } \ \ -- cgit v1.2.3 From
e966eaeeb623f09975ef362c2866fae6f86844f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 Dec 2017 12:31:16 +0100 Subject: locking/lockdep: Remove the cross-release locking checks This code (CONFIG_LOCKDEP_CROSSRELEASE=y and CONFIG_LOCKDEP_COMPLETIONS=y), while it found a number of old bugs initially, was also causing too many false positives that caused people to disable lockdep - which is arguably a worse overall outcome. If we disable cross-release by default but keep the code upstream then in practice the most likely outcome is that we'll allow the situation to degrade gradually, by allowing entropy to introduce more and more false positives, until it overwhelms maintenance capacity. Another bad side effect was that people were trying to work around the false positives by uglifying/complicating unrelated code. There's a marked difference between annotating locking operations and uglifying good code just due to bad lock debugging code ... This gradual decrease in quality happened to a number of debugging facilities in the kernel, and lockdep is pretty complex already, so we cannot risk this outcome. Either cross-release checking can be done right with no false positives, or it should not be included in the upstream kernel. ( Note that it might make sense to maintain it out of tree and go through the false positives every now and then and see whether new bugs were introduced. ) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- Documentation/locking/crossrelease.txt | 874 --------------------------------- include/linux/completion.h | 45 -- include/linux/lockdep.h | 125 ----- include/linux/sched.h | 11 - kernel/locking/lockdep.c | 652 ++---------------------- lib/Kconfig.debug | 33 -- 6 files changed, 35 insertions(+), 1705 deletions(-) delete mode 100644 Documentation/locking/crossrelease.txt (limited to 'include/linux') diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt deleted file mode 100644 index bdf1423d5f99..000000000000 --- a/Documentation/locking/crossrelease.txt +++ /dev/null @@ -1,874 +0,0 @@ -Crossrelease -============ - -Started by Byungchul Park - -Contents: - - (*) Background - - - What causes deadlock - - How lockdep works - - (*) Limitation - - - Limit lockdep - - Pros from the limitation - - Cons from the limitation - - Relax the limitation - - (*) Crossrelease - - - Introduce crossrelease - - Introduce commit - - (*) Implementation - - - Data structures - - How crossrelease works - - (*) Optimizations - - - Avoid duplication - - Lockless for hot paths - - (*) APPENDIX A: What lockdep does to work aggresively - - (*) APPENDIX B: How to avoid adding false dependencies - - -========== -Background -========== - -What causes deadlock --------------------- - -A deadlock occurs when a context is waiting for an event to happen, -which is impossible because another (or the) context who can trigger the -event is also waiting for another (or the) event to happen, which is -also impossible due to the same reason. - -For example: - - A context going to trigger event C is waiting for event A to happen. - A context going to trigger event A is waiting for event B to happen. - A context going to trigger event B is waiting for event C to happen. 
- -A deadlock occurs when these three wait operations run at the same time, -because event C cannot be triggered if event A does not happen, which in -turn cannot be triggered if event B does not happen, which in turn -cannot be triggered if event C does not happen. After all, no event can -be triggered since any of them never meets its condition to wake up. - -A dependency might exist between two waiters and a deadlock might happen -due to an incorrect releationship between dependencies. Thus, we must -define what a dependency is first. A dependency exists between them if: - - 1. There are two waiters waiting for each event at a given time. - 2. The only way to wake up each waiter is to trigger its event. - 3. Whether one can be woken up depends on whether the other can. - -Each wait in the example creates its dependency like: - - Event C depends on event A. - Event A depends on event B. - Event B depends on event C. - - NOTE: Precisely speaking, a dependency is one between whether a - waiter for an event can be woken up and whether another waiter for - another event can be woken up. However from now on, we will describe - a dependency as if it's one between an event and another event for - simplicity. - -And they form circular dependencies like: - - -> C -> A -> B - - / \ - \ / - ---------------- - - where 'A -> B' means that event A depends on event B. - -Such circular dependencies lead to a deadlock since no waiter can meet -its condition to wake up as described. - -CONCLUSION - -Circular dependencies cause a deadlock. - - -How lockdep works ------------------ - -Lockdep tries to detect a deadlock by checking dependencies created by -lock operations, acquire and release. Waiting for a lock corresponds to -waiting for an event, and releasing a lock corresponds to triggering an -event in the previous section. - -In short, lockdep does: - - 1. Detect a new dependency. - 2. Add the dependency into a global graph. - 3. Check if that makes dependencies circular. - 4. Report a deadlock or its possibility if so. - -For example, consider a graph built by lockdep that looks like: - - A -> B - - \ - -> E - / - C -> D - - - where A, B,..., E are different lock classes. - -Lockdep will add a dependency into the graph on detection of a new -dependency. For example, it will add a dependency 'E -> C' when a new -dependency between lock E and lock C is detected. Then the graph will be: - - A -> B - - \ - -> E - - / \ - -> C -> D - \ - / / - \ / - ------------------ - - where A, B,..., E are different lock classes. - -This graph contains a subgraph which demonstrates circular dependencies: - - -> E - - / \ - -> C -> D - \ - / / - \ / - ------------------ - - where C, D and E are different lock classes. - -This is the condition under which a deadlock might occur. Lockdep -reports it on detection after adding a new dependency. This is the way -how lockdep works. - -CONCLUSION - -Lockdep detects a deadlock or its possibility by checking if circular -dependencies were created after adding each new dependency. - - -========== -Limitation -========== - -Limit lockdep -------------- - -Limiting lockdep to work on only typical locks e.g. spin locks and -mutexes, which are released within the acquire context, the -implementation becomes simple but its capacity for detection becomes -limited. Let's check pros and cons in next section. 
- - -Pros from the limitation ------------------------- - -Given the limitation, when acquiring a lock, locks in a held_locks -cannot be released if the context cannot acquire it so has to wait to -acquire it, which means all waiters for the locks in the held_locks are -stuck. It's an exact case to create dependencies between each lock in -the held_locks and the lock to acquire. - -For example: - - CONTEXT X - --------- - acquire A - acquire B /* Add a dependency 'A -> B' */ - release B - release A - - where A and B are different lock classes. - -When acquiring lock A, the held_locks of CONTEXT X is empty thus no -dependency is added. But when acquiring lock B, lockdep detects and adds -a new dependency 'A -> B' between lock A in the held_locks and lock B. -They can be simply added whenever acquiring each lock. - -And data required by lockdep exists in a local structure, held_locks -embedded in task_struct. Forcing to access the data within the context, -lockdep can avoid racy problems without explicit locks while handling -the local data. - -Lastly, lockdep only needs to keep locks currently being held, to build -a dependency graph. However, relaxing the limitation, it needs to keep -even locks already released, because a decision whether they created -dependencies might be long-deferred. - -To sum up, we can expect several advantages from the limitation: - - 1. Lockdep can easily identify a dependency when acquiring a lock. - 2. Races are avoidable while accessing local locks in a held_locks. - 3. Lockdep only needs to keep locks currently being held. - -CONCLUSION - -Given the limitation, the implementation becomes simple and efficient. - - -Cons from the limitation ------------------------- - -Given the limitation, lockdep is applicable only to typical locks. For -example, page locks for page access or completions for synchronization -cannot work with lockdep. - -Can we detect deadlocks below, under the limitation? - -Example 1: - - CONTEXT X CONTEXT Y CONTEXT Z - --------- --------- ---------- - mutex_lock A - lock_page B - lock_page B - mutex_lock A /* DEADLOCK */ - unlock_page B held by X - unlock_page B - mutex_unlock A - mutex_unlock A - - where A and B are different lock classes. - -No, we cannot. - -Example 2: - - CONTEXT X CONTEXT Y - --------- --------- - mutex_lock A - mutex_lock A - wait_for_complete B /* DEADLOCK */ - complete B - mutex_unlock A - mutex_unlock A - - where A is a lock class and B is a completion variable. - -No, we cannot. - -CONCLUSION - -Given the limitation, lockdep cannot detect a deadlock or its -possibility caused by page locks or completions. - - -Relax the limitation --------------------- - -Under the limitation, things to create dependencies are limited to -typical locks. However, synchronization primitives like page locks and -completions, which are allowed to be released in any context, also -create dependencies and can cause a deadlock. So lockdep should track -these locks to do a better job. We have to relax the limitation for -these locks to work with lockdep. - -Detecting dependencies is very important for lockdep to work because -adding a dependency means adding an opportunity to check whether it -causes a deadlock. The more lockdep adds dependencies, the more it -thoroughly works. Thus Lockdep has to do its best to detect and add as -many true dependencies into a graph as possible. 
- -For example, considering only typical locks, lockdep builds a graph like: - - A -> B - - \ - -> E - / - C -> D - - - where A, B,..., E are different lock classes. - -On the other hand, under the relaxation, additional dependencies might -be created and added. Assuming additional 'FX -> C' and 'E -> GX' are -added thanks to the relaxation, the graph will be: - - A -> B - - \ - -> E -> GX - / - FX -> C -> D - - - where A, B,..., E, FX and GX are different lock classes, and a suffix - 'X' is added on non-typical locks. - -The latter graph gives us more chances to check circular dependencies -than the former. However, it might suffer performance degradation since -relaxing the limitation, with which design and implementation of lockdep -can be efficient, might introduce inefficiency inevitably. So lockdep -should provide two options, strong detection and efficient detection. - -Choosing efficient detection: - - Lockdep works with only locks restricted to be released within the - acquire context. However, lockdep works efficiently. - -Choosing strong detection: - - Lockdep works with all synchronization primitives. However, lockdep - suffers performance degradation. - -CONCLUSION - -Relaxing the limitation, lockdep can add additional dependencies giving -additional opportunities to check circular dependencies. - - -============ -Crossrelease -============ - -Introduce crossrelease ----------------------- - -In order to allow lockdep to handle additional dependencies by what -might be released in any context, namely 'crosslock', we have to be able -to identify those created by crosslocks. The proposed 'crossrelease' -feature provoides a way to do that. - -Crossrelease feature has to do: - - 1. Identify dependencies created by crosslocks. - 2. Add the dependencies into a dependency graph. - -That's all. Once a meaningful dependency is added into graph, then -lockdep would work with the graph as it did. The most important thing -crossrelease feature has to do is to correctly identify and add true -dependencies into the global graph. - -A dependency e.g. 'A -> B' can be identified only in the A's release -context because a decision required to identify the dependency can be -made only in the release context. That is to decide whether A can be -released so that a waiter for A can be woken up. It cannot be made in -other than the A's release context. - -It's no matter for typical locks because each acquire context is same as -its release context, thus lockdep can decide whether a lock can be -released in the acquire context. However for crosslocks, lockdep cannot -make the decision in the acquire context but has to wait until the -release context is identified. - -Therefore, deadlocks by crosslocks cannot be detected just when it -happens, because those cannot be identified until the crosslocks are -released. However, deadlock possibilities can be detected and it's very -worth. See 'APPENDIX A' section to check why. - -CONCLUSION - -Using crossrelease feature, lockdep can work with what might be released -in any context, namely crosslock. - - -Introduce commit ----------------- - -Since crossrelease defers the work adding true dependencies of -crosslocks until they are actually released, crossrelease has to queue -all acquisitions which might create dependencies with the crosslocks. -Then it identifies dependencies using the queued data in batches at a -proper time. We call it 'commit'. - -There are four types of dependencies: - -1. 
TT type: 'typical lock A -> typical lock B' - - Just when acquiring B, lockdep can see it's in the A's release - context. So the dependency between A and B can be identified - immediately. Commit is unnecessary. - -2. TC type: 'typical lock A -> crosslock BX' - - Just when acquiring BX, lockdep can see it's in the A's release - context. So the dependency between A and BX can be identified - immediately. Commit is unnecessary, too. - -3. CT type: 'crosslock AX -> typical lock B' - - When acquiring B, lockdep cannot identify the dependency because - there's no way to know if it's in the AX's release context. It has - to wait until the decision can be made. Commit is necessary. - -4. CC type: 'crosslock AX -> crosslock BX' - - When acquiring BX, lockdep cannot identify the dependency because - there's no way to know if it's in the AX's release context. It has - to wait until the decision can be made. Commit is necessary. - But, handling CC type is not implemented yet. It's a future work. - -Lockdep can work without commit for typical locks, but commit step is -necessary once crosslocks are involved. Introducing commit, lockdep -performs three steps. What lockdep does in each step is: - -1. Acquisition: For typical locks, lockdep does what it originally did - and queues the lock so that CT type dependencies can be checked using - it at the commit step. For crosslocks, it saves data which will be - used at the commit step and increases a reference count for it. - -2. Commit: No action is reauired for typical locks. For crosslocks, - lockdep adds CT type dependencies using the data saved at the - acquisition step. - -3. Release: No changes are required for typical locks. When a crosslock - is released, it decreases a reference count for it. - -CONCLUSION - -Crossrelease introduces commit step to handle dependencies of crosslocks -in batches at a proper time. - - -============== -Implementation -============== - -Data structures ---------------- - -Crossrelease introduces two main data structures. - -1. hist_lock - - This is an array embedded in task_struct, for keeping lock history so - that dependencies can be added using them at the commit step. Since - it's local data, it can be accessed locklessly in the owner context. - The array is filled at the acquisition step and consumed at the - commit step. And it's managed in circular manner. - -2. cross_lock - - One per lockdep_map exists. This is for keeping data of crosslocks - and used at the commit step. - - -How crossrelease works ----------------------- - -It's the key of how crossrelease works, to defer necessary works to an -appropriate point in time and perform in at once at the commit step. -Let's take a look with examples step by step, starting from how lockdep -works without crossrelease for typical locks. - - acquire A /* Push A onto held_locks */ - acquire B /* Push B onto held_locks and add 'A -> B' */ - acquire C /* Push C onto held_locks and add 'B -> C' */ - release C /* Pop C from held_locks */ - release B /* Pop B from held_locks */ - release A /* Pop A from held_locks */ - - where A, B and C are different lock classes. - - NOTE: This document assumes that readers already understand how - lockdep works without crossrelease thus omits details. But there's - one thing to note. Lockdep pretends to pop a lock from held_locks - when releasing it. But it's subtly different from the original pop - operation because lockdep allows other than the top to be poped. 
- -In this case, lockdep adds 'the top of held_locks -> the lock to acquire' -dependency every time acquiring a lock. - -After adding 'A -> B', a dependency graph will be: - - A -> B - - where A and B are different lock classes. - -And after adding 'B -> C', the graph will be: - - A -> B -> C - - where A, B and C are different lock classes. - -Let's performs commit step even for typical locks to add dependencies. -Of course, commit step is not necessary for them, however, it would work -well because this is a more general way. - - acquire A - /* - * Queue A into hist_locks - * - * In hist_locks: A - * In graph: Empty - */ - - acquire B - /* - * Queue B into hist_locks - * - * In hist_locks: A, B - * In graph: Empty - */ - - acquire C - /* - * Queue C into hist_locks - * - * In hist_locks: A, B, C - * In graph: Empty - */ - - commit C - /* - * Add 'C -> ?' - * Answer the following to decide '?' - * What has been queued since acquire C: Nothing - * - * In hist_locks: A, B, C - * In graph: Empty - */ - - release C - - commit B - /* - * Add 'B -> ?' - * Answer the following to decide '?' - * What has been queued since acquire B: C - * - * In hist_locks: A, B, C - * In graph: 'B -> C' - */ - - release B - - commit A - /* - * Add 'A -> ?' - * Answer the following to decide '?' - * What has been queued since acquire A: B, C - * - * In hist_locks: A, B, C - * In graph: 'B -> C', 'A -> B', 'A -> C' - */ - - release A - - where A, B and C are different lock classes. - -In this case, dependencies are added at the commit step as described. - -After commits for A, B and C, the graph will be: - - A -> B -> C - - where A, B and C are different lock classes. - - NOTE: A dependency 'A -> C' is optimized out. - -We can see the former graph built without commit step is same as the -latter graph built using commit steps. Of course the former way leads to -earlier finish for building the graph, which means we can detect a -deadlock or its possibility sooner. So the former way would be prefered -when possible. But we cannot avoid using the latter way for crosslocks. - -Let's look at how commit steps work for crosslocks. In this case, the -commit step is performed only on crosslock AX as real. And it assumes -that the AX release context is different from the AX acquire context. - - BX RELEASE CONTEXT BX ACQUIRE CONTEXT - ------------------ ------------------ - acquire A - /* - * Push A onto held_locks - * Queue A into hist_locks - * - * In held_locks: A - * In hist_locks: A - * In graph: Empty - */ - - acquire BX - /* - * Add 'the top of held_locks -> BX' - * - * In held_locks: A - * In hist_locks: A - * In graph: 'A -> BX' - */ - - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - It must be guaranteed that the following operations are seen after - acquiring BX globally. It can be done by things like barrier. 
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - acquire C - /* - * Push C onto held_locks - * Queue C into hist_locks - * - * In held_locks: C - * In hist_locks: C - * In graph: 'A -> BX' - */ - - release C - /* - * Pop C from held_locks - * - * In held_locks: Empty - * In hist_locks: C - * In graph: 'A -> BX' - */ - acquire D - /* - * Push D onto held_locks - * Queue D into hist_locks - * Add 'the top of held_locks -> D' - * - * In held_locks: A, D - * In hist_locks: A, D - * In graph: 'A -> BX', 'A -> D' - */ - acquire E - /* - * Push E onto held_locks - * Queue E into hist_locks - * - * In held_locks: E - * In hist_locks: C, E - * In graph: 'A -> BX', 'A -> D' - */ - - release E - /* - * Pop E from held_locks - * - * In held_locks: Empty - * In hist_locks: D, E - * In graph: 'A -> BX', 'A -> D' - */ - release D - /* - * Pop D from held_locks - * - * In held_locks: A - * In hist_locks: A, D - * In graph: 'A -> BX', 'A -> D' - */ - commit BX - /* - * Add 'BX -> ?' - * What has been queued since acquire BX: C, E - * - * In held_locks: Empty - * In hist_locks: D, E - * In graph: 'A -> BX', 'A -> D', - * 'BX -> C', 'BX -> E' - */ - - release BX - /* - * In held_locks: Empty - * In hist_locks: D, E - * In graph: 'A -> BX', 'A -> D', - * 'BX -> C', 'BX -> E' - */ - release A - /* - * Pop A from held_locks - * - * In held_locks: Empty - * In hist_locks: A, D - * In graph: 'A -> BX', 'A -> D', - * 'BX -> C', 'BX -> E' - */ - - where A, BX, C,..., E are different lock classes, and a suffix 'X' is - added on crosslocks. - -Crossrelease considers all acquisitions after acqiuring BX are -candidates which might create dependencies with BX. True dependencies -will be determined when identifying the release context of BX. Meanwhile, -all typical locks are queued so that they can be used at the commit step. -And then two dependencies 'BX -> C' and 'BX -> E' are added at the -commit step when identifying the release context. - -The final graph will be, with crossrelease: - - -> C - / - -> BX - - / \ - A - -> E - \ - -> D - - where A, BX, C,..., E are different lock classes, and a suffix 'X' is - added on crosslocks. - -However, the final graph will be, without crossrelease: - - A -> D - - where A and D are different lock classes. - -The former graph has three more dependencies, 'A -> BX', 'BX -> C' and -'BX -> E' giving additional opportunities to check if they cause -deadlocks. This way lockdep can detect a deadlock or its possibility -caused by crosslocks. - -CONCLUSION - -We checked how crossrelease works with several examples. - - -============= -Optimizations -============= - -Avoid duplication ------------------ - -Crossrelease feature uses a cache like what lockdep already uses for -dependency chains, but this time it's for caching CT type dependencies. -Once that dependency is cached, the same will never be added again. - - -Lockless for hot paths ----------------------- - -To keep all locks for later use at the commit step, crossrelease adopts -a local array embedded in task_struct, which makes access to the data -lockless by forcing it to happen only within the owner context. It's -like how lockdep handles held_locks. Lockless implmentation is important -since typical locks are very frequently acquired and released. 
- - -================================================= -APPENDIX A: What lockdep does to work aggresively -================================================= - -A deadlock actually occurs when all wait operations creating circular -dependencies run at the same time. Even though they don't, a potential -deadlock exists if the problematic dependencies exist. Thus it's -meaningful to detect not only an actual deadlock but also its potential -possibility. The latter is rather valuable. When a deadlock occurs -actually, we can identify what happens in the system by some means or -other even without lockdep. However, there's no way to detect possiblity -without lockdep unless the whole code is parsed in head. It's terrible. -Lockdep does the both, and crossrelease only focuses on the latter. - -Whether or not a deadlock actually occurs depends on several factors. -For example, what order contexts are switched in is a factor. Assuming -circular dependencies exist, a deadlock would occur when contexts are -switched so that all wait operations creating the dependencies run -simultaneously. Thus to detect a deadlock possibility even in the case -that it has not occured yet, lockdep should consider all possible -combinations of dependencies, trying to: - -1. Use a global dependency graph. - - Lockdep combines all dependencies into one global graph and uses them, - regardless of which context generates them or what order contexts are - switched in. Aggregated dependencies are only considered so they are - prone to be circular if a problem exists. - -2. Check dependencies between classes instead of instances. - - What actually causes a deadlock are instances of lock. However, - lockdep checks dependencies between classes instead of instances. - This way lockdep can detect a deadlock which has not happened but - might happen in future by others but the same class. - -3. Assume all acquisitions lead to waiting. - - Although locks might be acquired without waiting which is essential - to create dependencies, lockdep assumes all acquisitions lead to - waiting since it might be true some time or another. - -CONCLUSION - -Lockdep detects not only an actual deadlock but also its possibility, -and the latter is more valuable. - - -================================================== -APPENDIX B: How to avoid adding false dependencies -================================================== - -Remind what a dependency is. A dependency exists if: - - 1. There are two waiters waiting for each event at a given time. - 2. The only way to wake up each waiter is to trigger its event. - 3. Whether one can be woken up depends on whether the other can. - -For example: - - acquire A - acquire B /* A dependency 'A -> B' exists */ - release B - release A - - where A and B are different lock classes. - -A depedency 'A -> B' exists since: - - 1. A waiter for A and a waiter for B might exist when acquiring B. - 2. Only way to wake up each is to release what it waits for. - 3. Whether the waiter for A can be woken up depends on whether the - other can. IOW, TASK X cannot release A if it fails to acquire B. - -For another example: - - TASK X TASK Y - ------ ------ - acquire AX - acquire B /* A dependency 'AX -> B' exists */ - release B - release AX held by Y - - where AX and B are different lock classes, and a suffix 'X' is added - on crosslocks. - -Even in this case involving crosslocks, the same rule can be applied. A -depedency 'AX -> B' exists since: - - 1. A waiter for AX and a waiter for B might exist when acquiring B. - 2. 
Only way to wake up each is to release what it waits for. - 3. Whether the waiter for AX can be woken up depends on whether the - other can. IOW, TASK X cannot release AX if it fails to acquire B. - -Let's take a look at more complicated example: - - TASK X TASK Y - ------ ------ - acquire B - release B - fork Y - acquire AX - acquire C /* A dependency 'AX -> C' exists */ - release C - release AX held by Y - - where AX, B and C are different lock classes, and a suffix 'X' is - added on crosslocks. - -Does a dependency 'AX -> B' exist? Nope. - -Two waiters are essential to create a dependency. However, waiters for -AX and B to create 'AX -> B' cannot exist at the same time in this -example. Thus the dependency 'AX -> B' cannot be created. - -It would be ideal if the full set of true ones can be considered. But -we can ensure nothing but what actually happened. Relying on what -actually happens at runtime, we can anyway add only true ones, though -they might be a subset of true ones. It's similar to how lockdep works -for typical locks. There might be more true dependencies than what -lockdep has detected in runtime. Lockdep has no choice but to rely on -what actually happens. Crossrelease also relies on it. - -CONCLUSION - -Relying on what actually happens, lockdep can avoid adding false -dependencies. diff --git a/include/linux/completion.h b/include/linux/completion.h index 0662a417febe..94a59ba7d422 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -10,9 +10,6 @@ */ #include -#ifdef CONFIG_LOCKDEP_COMPLETIONS -#include -#endif /* * struct completion - structure used to maintain state for a "completion" @@ -29,58 +26,16 @@ struct completion { unsigned int done; wait_queue_head_t wait; -#ifdef CONFIG_LOCKDEP_COMPLETIONS - struct lockdep_map_cross map; -#endif }; -#ifdef CONFIG_LOCKDEP_COMPLETIONS -static inline void complete_acquire(struct completion *x) -{ - lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_); -} - -static inline void complete_release(struct completion *x) -{ - lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_); -} - -static inline void complete_release_commit(struct completion *x) -{ - lock_commit_crosslock((struct lockdep_map *)&x->map); -} - -#define init_completion_map(x, m) \ -do { \ - lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - (m)->name, (m)->key, 0); \ - __init_completion(x); \ -} while (0) - -#define init_completion(x) \ -do { \ - static struct lock_class_key __key; \ - lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - "(completion)" #x, \ - &__key, 0); \ - __init_completion(x); \ -} while (0) -#else #define init_completion_map(x, m) __init_completion(x) #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} static inline void complete_release_commit(struct completion *x) {} -#endif -#ifdef CONFIG_LOCKDEP_COMPLETIONS -#define COMPLETION_INITIALIZER(work) \ - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ - STATIC_CROSS_LOCKDEP_MAP_INIT("(completion)" #work, &(work)) } -#else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } -#endif #define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \ (*({ init_completion_map(&(work), &(map)); &(work); })) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a842551fe044..2e75dc34bff5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -158,12 +158,6 
@@ struct lockdep_map { int cpu; unsigned long ip; #endif -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - /* - * Whether it's a crosslock. - */ - int cross; -#endif }; static inline void lockdep_copy_map(struct lockdep_map *to, @@ -267,95 +261,8 @@ struct held_lock { unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - /* - * Generation id. - * - * A value of cross_gen_id will be stored when holding this, - * which is globally increased whenever each crosslock is held. - */ - unsigned int gen_id; -#endif -}; - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#define MAX_XHLOCK_TRACE_ENTRIES 5 - -/* - * This is for keeping locks waiting for commit so that true dependencies - * can be added at commit step. - */ -struct hist_lock { - /* - * Id for each entry in the ring buffer. This is used to - * decide whether the ring buffer was overwritten or not. - * - * For example, - * - * |<----------- hist_lock ring buffer size ------->| - * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii - * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii....................... - * - * where 'p' represents an acquisition in process - * context, 'i' represents an acquisition in irq - * context. - * - * In this example, the ring buffer was overwritten by - * acquisitions in irq context, that should be detected on - * rollback or commit. - */ - unsigned int hist_id; - - /* - * Seperate stack_trace data. This will be used at commit step. - */ - struct stack_trace trace; - unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES]; - - /* - * Seperate hlock instance. This will be used at commit step. - * - * TODO: Use a smaller data structure containing only necessary - * data. However, we should make lockdep code able to handle the - * smaller one first. - */ - struct held_lock hlock; }; -/* - * To initialize a lock as crosslock, lockdep_init_map_crosslock() should - * be called instead of lockdep_init_map(). - */ -struct cross_lock { - /* - * When more than one acquisition of crosslocks are overlapped, - * we have to perform commit for them based on cross_gen_id of - * the first acquisition, which allows us to add more true - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - */ - int nr_acquire; - - /* - * Seperate hlock instance. This will be used at commit step. - * - * TODO: Use a smaller data structure containing only necessary - * data. However, we should make lockdep code able to handle the - * smaller one first. - */ - struct held_lock hlock; -}; - -struct lockdep_map_cross { - struct lockdep_map map; - struct cross_lock xlock; -}; -#endif - /* * Initialization, self-test and debugging-output methods: */ @@ -560,37 +467,6 @@ enum xhlock_context_t { XHLOCK_CTX_NR, }; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -extern void lockdep_init_map_crosslock(struct lockdep_map *lock, - const char *name, - struct lock_class_key *key, - int subclass); -extern void lock_commit_crosslock(struct lockdep_map *lock); - -/* - * What we essencially have to initialize is 'nr_acquire'. Other members - * will be initialized in add_xlock(). 
- */ -#define STATIC_CROSS_LOCK_INIT() \ - { .nr_acquire = 0,} - -#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \ - { .map.name = (_name), .map.key = (void *)(_key), \ - .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), } - -/* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), .cross = 0, } - -extern void crossrelease_hist_start(enum xhlock_context_t c); -extern void crossrelease_hist_end(enum xhlock_context_t c); -extern void lockdep_invariant_state(bool force); -extern void lockdep_init_task(struct task_struct *task); -extern void lockdep_free_task(struct task_struct *task); -#else /* !CROSSRELEASE */ #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. @@ -604,7 +480,6 @@ static inline void crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} -#endif /* CROSSRELEASE */ #ifdef CONFIG_LOCK_STAT diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..9ce6c3001e9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -849,17 +849,6 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#define MAX_XHLOCKS_NR 64UL - struct hist_lock *xhlocks; /* Crossrelease history locks */ - unsigned int xhlock_idx; - /* For restoring at history boundaries */ - unsigned int xhlock_idx_hist[XHLOCK_CTX_NR]; - unsigned int hist_id; - /* For overwrite check at each context exit */ - unsigned int hist_id_save[XHLOCK_CTX_NR]; -#endif - #ifdef CONFIG_UBSAN unsigned int in_ubsan; #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 670d8d7d8087..5fa1324a4f29 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#include -#endif - #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -75,19 +71,6 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif -#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK -static int crossrelease_fullstack = 1; -#else -static int crossrelease_fullstack; -#endif -static int __init allow_crossrelease_fullstack(char *str) -{ - crossrelease_fullstack = 1; - return 0; -} - -early_param("crossrelease_fullstack", allow_crossrelease_fullstack); - /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -740,18 +723,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? 
NULL : ERR_PTR(-EINVAL); } -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -static void cross_init(struct lockdep_map *lock, int cross); -static int cross_lock(struct lockdep_map *lock); -static int lock_acquire_crosslock(struct held_lock *hlock); -static int lock_release_crosslock(struct lockdep_map *lock); -#else -static inline void cross_init(struct lockdep_map *lock, int cross) {} -static inline int cross_lock(struct lockdep_map *lock) { return 0; } -static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } -static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } -#endif - /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1151,41 +1122,22 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - if (cross_lock(tgt->instance)) { - printk(" Possible unsafe locking scenario by crosslock:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk(" unlock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } else { - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); } /* @@ -1211,10 +1163,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, curr->comm, task_pid_nr(curr)); print_lock(check_src); - if (cross_lock(check_tgt->instance)) - pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); - else - pr_warn("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); @@ -1244,9 +1193,7 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (cross_lock(check_tgt->instance)) - this->trace = *trace; - else if (!save_trace(&this->trace)) + if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1850,9 +1797,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - if (cross_lock(prev->instance)) - continue; - return print_deadlock_bug(curr, prev, next); } return 1; @@ -2018,31 +1962,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; + /* - * Only non-crosslock entries get new dependencies added. 
- * Crosslock entries will be added by commit later: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!cross_lock(hlock->instance)) { + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + if (!ret) + return 0; + /* - * Only non-recursive-read entries get new dependencies - * added: + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save_trace); - if (!ret) - return 0; - - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } + if (!hlock->trylock) + break; } + depth--; /* * End of lock-stack? @@ -3292,21 +3231,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - cross_init(lock, 0); __lockdep_init_map(lock, name, key, subclass); } EXPORT_SYMBOL_GPL(lockdep_init_map); -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - cross_init(lock, 1); - __lockdep_init_map(lock, name, key, subclass); -} -EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); -#endif - struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3362,7 +3290,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; - int ret; if (unlikely(!debug_locks)) return 0; @@ -3411,8 +3338,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - /* TODO: nest_lock is not implemented for crosslock yet. */ - if (depth && !cross_lock(lock)) { + if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3500,14 +3426,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; - ret = lock_acquire_crosslock(hlock); - /* - * 2 means normal acquire operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3745,19 +3663,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int ret, i; + int i; if (unlikely(!debug_locks)) return 0; - ret = lock_release_crosslock(lock); - /* - * 2 means normal release operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? 
We don't @@ -4675,495 +4585,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - -/* - * Crossrelease works by recording a lock history for each thread and - * connecting those historic locks that were taken after the - * wait_for_completion() in the complete() context. - * - * Task-A Task-B - * - * mutex_lock(&A); - * mutex_unlock(&A); - * - * wait_for_completion(&C); - * lock_acquire_crosslock(); - * atomic_inc_return(&cross_gen_id); - * | - * | mutex_lock(&B); - * | mutex_unlock(&B); - * | - * | complete(&C); - * `-- lock_commit_crosslock(); - * - * Which will then add a dependency between B and C. - */ - -#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) - -/* - * Whenever a crosslock is held, cross_gen_id will be increased. - */ -static atomic_t cross_gen_id; /* Can be wrapped */ - -/* - * Make an entry of the ring buffer invalid. - */ -static inline void invalidate_xhlock(struct hist_lock *xhlock) -{ - /* - * Normally, xhlock->hlock.instance must be !NULL. - */ - xhlock->hlock.instance = NULL; -} - -/* - * Lock history stacks; we have 2 nested lock history stacks: - * - * HARD(IRQ) - * SOFT(IRQ) - * - * The thing is that once we complete a HARD/SOFT IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. So - * what we do is rewind the history buffer and erase all our knowledge of that - * temporal event. - */ - -void crossrelease_hist_start(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } -} - -/* - * lockdep_invariant_state() is used to annotate independence inside a task, to - * make one task look like multiple independent 'tasks'. - * - * Take for instance workqueues; each work is independent of the last. The - * completion of a future work does not depend on the completion of a past work - * (in general). Therefore we must not carry that (lock) dependency across - * works. - * - * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an invariant state and future completions do not - * depend on past completions. Its just that since they all have the 'same' - * form -- the kthread does the same over and over -- it doesn't typically - * matter. - * - * The same is true for system-calls, once a system call is completed (we've - * returned to userspace) the next system call does not depend on the lock - * history of the previous system call. - * - * They key property for independence, this invariant state, is that it must be - * a point where we hold no locks and have no history. Because if we were to - * hold locks, the restore at _end() would not necessarily recover it's history - * entry. Similarly, independence per-definition means it does not depend on - * prior state. - */ -void lockdep_invariant_state(bool force) -{ - /* - * We call this at an invariant point, no current state, no history. - * Verify the former, enforce the latter. 
- */ - WARN_ON_ONCE(!force && current->lockdep_depth); - if (current->xhlocks) - invalidate_xhlock(&xhlock(current->xhlock_idx)); -} - -static int cross_lock(struct lockdep_map *lock) -{ - return lock ? lock->cross : 0; -} - -/* - * This is needed to decide the relationship between wrapable variables. - */ -static inline int before(unsigned int a, unsigned int b) -{ - return (int)(a - b) < 0; -} - -static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) -{ - return hlock_class(&xhlock->hlock); -} - -static inline struct lock_class *xlock_class(struct cross_lock *xlock) -{ - return hlock_class(&xlock->hlock); -} - -/* - * Should we check a dependency with previous one? - */ -static inline int depend_before(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check && !hlock->trylock; -} - -/* - * Should we check a dependency with next one? - */ -static inline int depend_after(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check; -} - -/* - * Check if the xhlock is valid, which would be false if, - * - * 1. Has not used after initializaion yet. - * 2. Got invalidated. - * - * Remind hist_lock is implemented as a ring buffer. - */ -static inline int xhlock_valid(struct hist_lock *xhlock) -{ - /* - * xhlock->hlock.instance must be !NULL. - */ - return !!xhlock->hlock.instance; -} - -/* - * Record a hist_lock entry. - * - * Irq disable is only required. - */ -static void add_xhlock(struct held_lock *hlock) -{ - unsigned int idx = ++current->xhlock_idx; - struct hist_lock *xhlock = &xhlock(idx); - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * This can be done locklessly because they are all task-local - * state, we must however ensure IRQs are disabled. - */ - WARN_ON_ONCE(!irqs_disabled()); -#endif - - /* Initialize hist_lock's members */ - xhlock->hlock = *hlock; - xhlock->hist_id = ++current->hist_id; - - xhlock->trace.nr_entries = 0; - xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; - xhlock->trace.entries = xhlock->trace_entries; - - if (crossrelease_fullstack) { - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); - } else { - xhlock->trace.nr_entries = 1; - xhlock->trace.entries[0] = hlock->acquire_ip; - } -} - -static inline int same_context_xhlock(struct hist_lock *xhlock) -{ - return xhlock->hlock.irq_context == task_irq_context(current); -} - -/* - * This should be lockless as far as possible because this would be - * called very frequently. - */ -static void check_add_xhlock(struct held_lock *hlock) -{ - /* - * Record a hist_lock, only in case that acquisitions ahead - * could depend on the held_lock. For example, if the held_lock - * is trylock then acquisitions ahead never depends on that. - * In that case, we don't need to record it. Just return. - */ - if (!current->xhlocks || !depend_before(hlock)) - return; - - add_xhlock(hlock); -} - -/* - * For crosslock. - */ -static int add_xlock(struct held_lock *hlock) -{ - struct cross_lock *xlock; - unsigned int gen_id; - - if (!graph_lock()) - return 0; - - xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; - - /* - * When acquisitions for a crosslock are overlapped, we use - * nr_acquire to perform commit for them, based on cross_gen_id - * of the first acquisition, which allows to add additional - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. 
- * - * depend_after() is necessary to initialize only the first - * valid xlock so that the xlock can be used on its commit. - */ - if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) - goto unlock; - - gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); - xlock->hlock = *hlock; - xlock->hlock.gen_id = gen_id; -unlock: - graph_unlock(); - return 1; -} - -/* - * Called for both normal and crosslock acquires. Normal locks will be - * pushed on the hist_lock queue. Cross locks will record state and - * stop regular lock_acquire() to avoid being placed on the held_lock - * stack. - * - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_acquire_crosslock(struct held_lock *hlock) -{ - /* - * CONTEXT 1 CONTEXT 2 - * --------- --------- - * lock A (cross) - * X = atomic_inc_return(&cross_gen_id) - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * Y = atomic_read_acquire(&cross_gen_id) - * lock B - * - * atomic_read_acquire() is for ordering between A and B, - * IOW, A happens before B, when CONTEXT 2 see Y >= X. - * - * Pairs with atomic_inc_return() in add_xlock(). - */ - hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); - - if (cross_lock(hlock->instance)) - return add_xlock(hlock); - - check_add_xhlock(hlock); - return 2; -} - -static int copy_trace(struct stack_trace *trace) -{ - unsigned long *buf = stack_trace + nr_stack_trace_entries; - unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - unsigned int nr = min(max_nr, trace->nr_entries); - - trace->nr_entries = nr; - memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); - trace->entries = buf; - nr_stack_trace_entries += nr; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); - dump_stack(); - - return 0; - } - - return 1; -} - -static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) -{ - unsigned int xid, pid; - u64 chain_key; - - xid = xlock_class(xlock) - lock_classes; - chain_key = iterate_chain_key((u64)0, xid); - pid = xhlock_class(xhlock) - lock_classes; - chain_key = iterate_chain_key(chain_key, pid); - - if (lookup_chain_cache(chain_key)) - return 1; - - if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, - chain_key)) - return 0; - - if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, - &xhlock->trace, copy_trace)) - return 0; - - return 1; -} - -static void commit_xhlocks(struct cross_lock *xlock) -{ - unsigned int cur = current->xhlock_idx; - unsigned int prev_hist_id = xhlock(cur).hist_id; - unsigned int i; - - if (!graph_lock()) - return; - - if (xlock->nr_acquire) { - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); - - if (!xhlock_valid(xhlock)) - break; - - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; - - if (!same_context_xhlock(xhlock)) - break; - - /* - * Filter out the cases where the ring buffer was - * overwritten and the current entry has a bigger - * hist_id than the previous one, which is impossible - * otherwise: - */ - if (unlikely(before(prev_hist_id, xhlock->hist_id))) - break; - - prev_hist_id = xhlock->hist_id; - - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. 
- */ - if (!commit_xhlock(xlock, xhlock)) - return; - } - } - - graph_unlock(); -} - -void lock_commit_crosslock(struct lockdep_map *lock) -{ - struct cross_lock *xlock; - unsigned long flags; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (!current->xhlocks) - return; - - /* - * Do commit hist_locks with the cross_lock, only in case that - * the cross_lock could depend on acquisitions after that. - * - * For example, if the cross_lock does not have the 'check' flag - * then we don't need to check dependencies and commit for that. - * Just skip it. In that case, of course, the cross_lock does - * not depend on acquisitions ahead, either. - * - * WARNING: Don't do that in add_xlock() in advance. When an - * acquisition context is different from the commit context, - * invalid(skipped) cross_lock might be accessed. - */ - if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - xlock = &((struct lockdep_map_cross *)lock)->xlock; - commit_xhlocks(xlock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_commit_crosslock); - -/* - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_release_crosslock(struct lockdep_map *lock) -{ - if (cross_lock(lock)) { - if (!graph_lock()) - return 0; - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; - graph_unlock(); - return 1; - } - return 2; -} - -static void cross_init(struct lockdep_map *lock, int cross) -{ - if (cross) - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; - - lock->cross = cross; - - /* - * Crossrelease assumes that the ring buffer size of xhlocks - * is aligned with power of 2. So force it on build. - */ - BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); -} - -void lockdep_init_task(struct task_struct *task) -{ - int i; - - task->xhlock_idx = UINT_MAX; - task->hist_id = 0; - - for (i = 0; i < XHLOCK_CTX_NR; i++) { - task->xhlock_idx_hist[i] = UINT_MAX; - task->hist_id_save[i] = 0; - } - - task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, - GFP_KERNEL); -} - -void lockdep_free_task(struct task_struct *task) -{ - if (task->xhlocks) { - void *tmp = task->xhlocks; - /* Diable crossrelease for current */ - task->xhlocks = NULL; - kfree(tmp); - } -} -#endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 947d3e2ed5c2..9d5b78aad4c5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1099,8 +1099,6 @@ config PROVE_LOCKING select DEBUG_MUTEXES select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC - select LOCKDEP_CROSSRELEASE - select LOCKDEP_COMPLETIONS select TRACE_IRQFLAGS default n help @@ -1170,37 +1168,6 @@ config LOCK_STAT CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. (CONFIG_LOCKDEP defines "acquire" and "release" events.) -config LOCKDEP_CROSSRELEASE - bool - help - This makes lockdep work for crosslock which is a lock allowed to - be released in a different context from the acquisition context. - Normally a lock must be released in the context acquiring the lock. - However, relexing this constraint helps synchronization primitives - such as page locks or completions can use the lock correctness - detector, lockdep. - -config LOCKDEP_COMPLETIONS - bool - help - A deadlock caused by wait_for_completion() and complete() can be - detected by lockdep using crossrelease feature. 
- -config BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK - bool "Enable the boot parameter, crossrelease_fullstack" - depends on LOCKDEP_CROSSRELEASE - default n - help - The lockdep "cross-release" feature needs to record stack traces - (of calling functions) for all acquisitions, for eventual later - use during analysis. By default only a single caller is recorded, - because the unwind operation can be very expensive with deeper - stack chains. - - However a boot parameter, crossrelease_fullstack, was - introduced since sometimes deeper traces are required for full - analysis. This option turns on the boot parameter. - config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP -- cgit v1.2.3 From b899a850431e2dd0943205a63a68573f3e312d0d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 27 Nov 2017 10:38:23 +0000 Subject: compiler.h: Remove ACCESS_ONCE() There are no longer any kernelspace uses of ACCESS_ONCE(), so we can remove the definition from <linux/compiler.h>. This patch removes the ACCESS_ONCE() definition, and updates comments which referred to it. At the same time, some inconsistent and redundant whitespace is removed from comments. Tested-by: Paul E. McKenney Signed-off-by: Mark Rutland Cc: Arnaldo Carvalho de Melo Cc: Joe Perches Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: apw@canonical.com Link: http://lkml.kernel.org/r/20171127103824.36526-4-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 47 +++++++++++------------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 188ed9f65517..52e611ab9a6c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -220,21 +220,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of - * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the - * compiler is aware of some particular ordering. One way to make the - * compiler aware of ordering is to put the two invocations of READ_ONCE, - * WRITE_ONCE or ACCESS_ONCE() in different C statements. + * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some + * particular ordering. One way to make the compiler aware of ordering is to + * put the two invocations of READ_ONCE or WRITE_ONCE in different C + * statements. * - * In contrast to ACCESS_ONCE these two macros will also work on aggregate - * data types like structs or unions. If the size of the accessed data - * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at - * least two memcpy()s: one for the __builtin_memcpy() and then one for - * the macro doing the copy of variable - '__u' allocated on the stack. + * These two macros will also work on aggregate data types like structs or + * unions. If the size of the accessed data type exceeds the word size of + * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will + * fall back to memcpy(). There's at least two memcpy()s: one for the + * __builtin_memcpy() and then one for the macro doing the copy of variable + * - '__u' allocated on the stack.
* * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. @@ -327,29 +327,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s compiletime_assert(__native_word(t), \ "Need native word sized stores/loads for atomicity.") -/* - * Prevent the compiler from merging or refetching accesses. The compiler - * is also forbidden from reordering successive instances of ACCESS_ONCE(), - * but only when the compiler is aware of some particular ordering. One way - * to make the compiler aware of ordering is to put the two invocations of - * ACCESS_ONCE() in different C statements. - * - * ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE - * on a union member will work as long as the size of the member matches the - * size of the union and the size is smaller than word size. - * - * The major use cases of ACCESS_ONCE used to be (1) Mediating communication - * between process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise - * mutilate accesses that either do not require ordering or that interact - * with an explicit memory barrier or atomic instruction that provides the - * required ordering. - * - * If possible use READ_ONCE()/WRITE_ONCE() instead. - */ -#define __ACCESS_ONCE(x) ({ \ - __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \ - (volatile typeof(x) *)&(x); }) -#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) - #endif /* __LINUX_COMPILER_H */ -- cgit v1.2.3 From c47d7f56e914900410f65835933f9fc4374d0a2b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 14 Dec 2017 15:32:24 -0800 Subject: include/linux/idr.h: add #include <linux/bug.h> The <linux/bug.h> was removed from radix-tree.h by commit f5bba9d11a25 ("include/linux/radix-tree.h: remove unneeded #include <linux/bug.h>"). Since that commit, tools/testing/radix-tree/ couldn't pass compilation due to tools/testing/radix-tree/idr.c:17: undefined reference to WARN_ON_ONCE. This patch adds the bug.h header to idr.h to solve the issue. Link: http://lkml.kernel.org/r/1511963726-34070-2-git-send-email-wei.w.wang@intel.com Fixes: f5bba9d11a2 ("include/linux/radix-tree.h: remove unneeded #include <linux/bug.h>") Signed-off-by: Wei Wang Cc: Matthew Wilcox Cc: Jan Kara Cc: Eric Biggers Cc: Tejun Heo Cc: Masahiro Yamada Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index 7c3a365f7e12..fa14f834e4ed 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> +#include <linux/bug.h> struct idr { struct radix_tree_root idr_rt; -- cgit v1.2.3 From 338f1d9d1b829fec494d053f62820a2ee625b1ec Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 14 Dec 2017 15:32:28 -0800 Subject: lib/rbtree,drm/mm: add rb_replace_node_cached() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a variant of rb_replace_node() that maintains the leftmost cache of struct rb_root_cached when replacing nodes within the rbtree.
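For illustration, a minimal caller-side sketch of the new helper; the struct and key below are hypothetical and not taken from this patch, only the rb_replace_node_cached() signature comes from the diff:

	#include <linux/rbtree.h>
	#include <linux/types.h>

	struct item {
		struct rb_node rb;
		u64 key;
	};

	/* Swap 'old' for 'new' in place, without rebalancing; the
	 * replacement must sort identically to the node it replaces. */
	static void item_replace(struct rb_root_cached *root,
				 struct item *old, struct item *new)
	{
		new->key = old->key;
		rb_replace_node_cached(&old->rb, &new->rb, root);
		/* If 'old' was the cached leftmost node, the cache now
		 * points at 'new' instead of at freed memory. */
	}
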
As drm_mm is the only rb_replace_node() being used on an interval tree, the mistake looks fairly self-contained. Furthermore the only user of drm_mm_replace_node() is its testsuite... Testcase: igt/drm_mm/replace Link: http://lkml.kernel.org/r/20171122100729.3742-1-chris@chris-wilson.co.uk Link: https://patchwork.freedesktop.org/patch/msgid/20171109212435.9265-1-chris@chris-wilson.co.uk Fixes: f808c13fd373 ("lib/interval_tree: fast overlap detection") Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Acked-by: Davidlohr Bueso Cc: Jérôme Glisse Cc: Joonas Lahtinen Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_mm.c | 8 +++++--- include/linux/rbtree.h | 2 ++ lib/rbtree.c | 10 ++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 61a1c8ea74bc..c3c79ee6119e 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -575,21 +575,23 @@ EXPORT_SYMBOL(drm_mm_remove_node); */ void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new) { + struct drm_mm *mm = old->mm; + DRM_MM_BUG_ON(!old->allocated); *new = *old; list_replace(&old->node_list, &new->node_list); - rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree.rb_root); + rb_replace_node_cached(&old->rb, &new->rb, &mm->interval_tree); if (drm_mm_hole_follows(old)) { list_replace(&old->hole_stack, &new->hole_stack); rb_replace_node(&old->rb_hole_size, &new->rb_hole_size, - &old->mm->holes_size); + &mm->holes_size); rb_replace_node(&old->rb_hole_addr, &new->rb_hole_addr, - &old->mm->holes_addr); + &mm->holes_addr); } old->allocated = false; diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d574361943ea..fcbeed4053ef 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -99,6 +99,8 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); +extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) diff --git a/lib/rbtree.c b/lib/rbtree.c index ba4a9d165f1b..d3ff682fd4b8 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -603,6 +603,16 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, } EXPORT_SYMBOL(rb_replace_node); +void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root) +{ + rb_replace_node(victim, new, &root->rb_root); + + if (root->rb_leftmost == victim) + root->rb_leftmost = new; +} +EXPORT_SYMBOL(rb_replace_node_cached); + void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { -- cgit v1.2.3 From 146734b091430c80d80bb96b1139a96fb4bc830e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:34 -0800 Subject: string.h: workaround for increased stack usage The hardened strlen() function causes rather large stack usage in at least one file in the kernel, in particular when CONFIG_KASAN is enabled: drivers/media/usb/em28xx/em28xx-dvb.c: In function 'em28xx_dvb_init': drivers/media/usb/em28xx/em28xx-dvb.c:2062:1: error: the frame size of 3256 bytes is larger than 204 bytes [-Werror=frame-larger-than=] Analyzing this problem led to the discovery that gcc fails to merge the stack slots for the 
i2c_board_info[] structures after we strlcpy() into them, due to the 'noreturn' attribute on the source string length check. I reported this as a gcc bug, but it is unlikely to get fixed for gcc-8, since it is relatively easy to work around, and it gets triggered rarely. An earlier workaround I did added an empty inline assembly statement before the call to fortify_panic(), which works surprisingly well, but is really ugly and unintuitive. This is a new approach to the same problem, this time addressing it by not calling the 'extern __real_strnlen()' function for string constants where __builtin_strlen() is a compile-time constant and therefore known to be safe. We do this by checking if the last character in the string is a compile-time constant '\0'. If it is, we can assume that strlen() of the string is also constant. As a side-effect, this should also improve the object code output for any other call of strlen() on a string constant. [akpm@linux-foundation.org: add comment] Link: http://lkml.kernel.org/r/20171205215143.3085755-1-arnd@arndb.de Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365 Link: https://patchwork.kernel.org/patch/9980413/ Link: https://patchwork.kernel.org/patch/9974047/ Fixes: 6974f0c4555 ("include/linux/string.h: add the option of fortified string.h functions") Signed-off-by: Arnd Bergmann Cc: Kees Cook Cc: Mauro Carvalho Chehab Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Daniel Micay Cc: Greg Kroah-Hartman Cc: Martin Wilck Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 410ecf17de3c..cfd83eb2f926 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -259,7 +259,10 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; size_t p_size = __builtin_object_size(p, 0); - if (p_size == (size_t)-1) + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || + (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) return __builtin_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) -- cgit v1.2.3 From 3756f6401c302617c5e091081ca4d26ab604bec5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:41 -0800 Subject: exec: avoid gcc-8 warning for get_task_comm gcc-8 warns about using strncpy() with the source size as the limit: fs/exec.c:1223:32: error: argument to 'sizeof' in 'strncpy' call is the same expression as the source; did you mean to use the size of the destination? [-Werror=sizeof-pointer-memaccess] This is indeed slightly suspicious, as it protects us from source arguments without NUL-termination, but does not guarantee that the destination is terminated. This keeps the strncpy() to ensure we have properly padded target buffer, but ensures that we use the correct length, by passing the actual length of the destination buffer as well as adding a build-time check to ensure it is exactly TASK_COMM_LEN. There are only 23 callsites which I all reviewed to ensure this is currently the case. We could get away with doing only the check or passing the right length, but it doesn't hurt to do both. 
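To make the build-time check concrete, a short hedged sketch of a caller (kernel context assumed; variable names invented, and the failing call is shown only as a comment because it would not compile):

	char comm[TASK_COMM_LEN];	/* a real array: sizeof() is TASK_COMM_LEN */
	char *dyn = kmalloc(TASK_COMM_LEN, GFP_KERNEL);

	get_task_comm(comm, current);	/* fine: BUILD_BUG_ON sees the array size */
	/*
	 * get_task_comm(dyn, current) would trip the BUILD_BUG_ON, since
	 * sizeof(dyn) is the size of a pointer, not TASK_COMM_LEN; such
	 * callers use __get_task_comm(dyn, TASK_COMM_LEN, current) directly.
	 */
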
Link: http://lkml.kernel.org/r/20171205151724.1764896-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Suggested-by: Kees Cook Acked-by: Kees Cook Acked-by: Ingo Molnar Cc: Alexander Viro Cc: Peter Zijlstra Cc: Serge Hallyn Cc: James Morris Cc: Aleksa Sarai Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++---- include/linux/sched.h | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 6be2aa0ab26f..156f56acfe8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1216,15 +1216,14 @@ killed: return -EAGAIN; } -char *get_task_comm(char *buf, struct task_struct *tsk) +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); - strncpy(buf, tsk->comm, sizeof(tsk->comm)); + strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } -EXPORT_SYMBOL_GPL(get_task_comm); +EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..5124ba709830 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1503,7 +1503,11 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *get_task_comm(char *to, struct task_struct *tsk); +extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +#define get_task_comm(buf, tsk) ({ \ + BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ + __get_task_comm(buf, sizeof(buf), tsk); \ +}) #ifdef CONFIG_SMP void scheduler_ipi(void); -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make calling groups_sort a responsibility of group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info. - Remove the call to groups_sort from set_groups. Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J.
Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_linux.c | 1 + fs/nfsd/auth.c | 3 +++ include/linux/cred.h | 1 + kernel/groups.c | 5 +++-- kernel/uid16.c | 1 + net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 + net/sunrpc/auth_gss/svcauth_gss.c | 1 + net/sunrpc/svcauth_unix.c | 2 ++ 8 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index f04db3779b34..59eea9c65d3e 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae7792d..f650e475d8f0 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } } else { gi = get_group_info(rqgi); diff --git a/include/linux/cred.h b/include/linux/cred.h index 099058e1178b..631286535d0f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); +extern void groups_sort(struct group_info *); /* * The security context of a task diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index c4778cae58ef..444380f968f1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, goto out_free_groups; creds->cr_group_info->gid[i] = kgid; } + groups_sort(creds->cr_group_info); return 0; out_free_groups: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 
5dd4e6c9fef2..26531193fce4 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; rsci.cred.cr_group_info->gid[i] = kgid; } + groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 740b67d5a733..af7f28fb8102 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.gi->gid[i] = kgid; } + groups_sort(ug.gi); ugp = unix_gid_lookup(cd, uid); if (ugp) { struct cache_head *ch; @@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); cred->cr_group_info->gid[i] = kgid; } + groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; -- cgit v1.2.3 From 4837fe37adff1d159904f0c013471b1ecbcb455e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Dec 2017 15:33:15 -0800 Subject: mm, oom_reaper: fix memory corruption David Rientjes has reported the following memory corruption while the oom reaper tries to unmap the victim's address space BUG: Bad page map in process oom_reaper pte:6353826300000000 pmd:00000000 addr:00007f50cab1d000 vm_flags:08100073 anon_vma:ffff9eea335603f0 mapping: (null) index:7f50cab1d file: (null) fault: (null) mmap: (null) readpage: (null) CPU: 2 PID: 1001 Comm: oom_reaper Call Trace: unmap_page_range+0x1068/0x1130 __oom_reap_task_mm+0xd5/0x16b oom_reaper+0xff/0x14c kthread+0xc1/0xe0 Tetsuo Handa has noticed that the synchronization inside exit_mmap is insufficient. We only synchronize with the oom reaper if tsk_is_oom_victim which is not true if the final __mmput is called from a different context than the oom victim exit path. This can trivially happen from context of any task which has grabbed mm reference (e.g. to read /proc/<pid>/ file which requires mm etc.). The race would look like this:

  oom_reaper              oom_victim              task

                                                  mmget_not_zero
                          do_exit
                            mmput
  __oom_reap_task_mm                              mmput
                                                    __mmput
                                                      exit_mmap
                                                        remove_vma
    unmap_page_range

Fix this issue by providing a new mm_is_oom_victim() helper which operates on the mm struct rather than a task. Any context which operates on a remote mm struct should use this helper in place of tsk_is_oom_victim. The flag is set in mark_oom_victim and never cleared so it is stable in the exit_mmap path. Debugged by Tetsuo Handa. Link: http://lkml.kernel.org/r/20171210095130.17110-1-mhocko@kernel.org Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Michal Hocko Reported-by: David Rientjes Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: [4.14] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 9 +++++++++ include/linux/sched/coredump.h | 1 + mm/mmap.c | 10 +++++----- mm/oom_kill.c | 4 +++- 4 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..5bad038ac012 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -66,6 +66,15 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Use this helper if tsk->mm != mm and the victim mm needs a special + * handling. This is guaranteed to stay true after once set.
+ */ +static inline bool mm_is_oom_victim(struct mm_struct *mm) +{ + return test_bit(MMF_OOM_VICTIM, &mm->flags); +} + /* * Checks whether a page fault on the given mm is still reliable. * This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 9c8847395b5e..ec912d01126f 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -70,6 +70,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/mm/mmap.c b/mm/mmap.c index a4d546821214..9efdc021ad22 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3019,20 +3019,20 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - set_bit(MMF_OOM_SKIP, &mm->flags); - if (unlikely(tsk_is_oom_victim(current))) { + if (unlikely(mm_is_oom_victim(mm))) { /* * Wait for oom_reap_task() to stop working on this * mm. Because MMF_OOM_SKIP is already set before * calling down_read(), oom_reap_task() will not run * on this "mm" post up_write(). * - * tsk_is_oom_victim() cannot be set from under us - * either because current->mm is already set to NULL + * mm_is_oom_victim() cannot be set from under us - * either because victim->mm is already set to NULL * under task_lock before calling mmput and oom_mm is - * set not NULL by the OOM killer only if current->mm + * set not NULL by the OOM killer only if victim->mm * is found not NULL while holding the task_lock. */ + set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c957be32b27a..29f855551efe 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -683,8 +683,10 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { mmgrab(tsk->signal->oom_mm); + set_bit(MMF_OOM_VICTIM, &mm->flags); + } /* * Make sure that the task is woken up from uninterruptible sleep -- cgit v1.2.3 From 14cb0dc6479dc5ebc63b3a459a5d89a2f1b39fed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 15:40:43 +0800 Subject: block: don't let passthrough IO go into .make_request_fn() Commit a8821f3f3("block: Improvements to bounce-buffer handling") tries to make sure that the bio passed to .make_request_fn won't exceed BIO_MAX_PAGES, but ignores that passthrough I/O can use blk_queue_bounce() too. In particular, passthrough IO may not be sector-aligned, and the check of 'sectors < bio_sectors(*bio_orig)' inside __blk_queue_bounce() may become true even though the max bvec number doesn't exceed BIO_MAX_PAGES, then cause the bio to be split, and the original passthrough bio to be submitted to generic_make_request(). This patch fixes this issue by checking if the bio is passthrough IO, and using bio_kmalloc() to allocate the cloned passthrough bio.
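As an aside, the kind of sector-unaligned passthrough I/O described here can be generated from userspace with a plain SG_IO ioctl; a sketch, assuming an sg device at /dev/sg0 and with error handling elided:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <scsi/sg.h>

	static int unaligned_inquiry(void)
	{
		unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY */
		unsigned char buf[96];	/* 96 bytes: not a 512-byte multiple */
		struct sg_io_hdr hdr;
		int fd = open("/dev/sg0", O_RDWR);	/* illustrative device */

		memset(&hdr, 0, sizeof(hdr));
		hdr.interface_id = 'S';
		hdr.cmd_len = sizeof(cdb);
		hdr.cmdp = cdb;
		hdr.dxfer_direction = SG_DXFER_FROM_DEV;
		hdr.dxfer_len = sizeof(buf);
		hdr.dxferp = buf;
		hdr.timeout = 5000;	/* milliseconds */
		return ioctl(fd, SG_IO, &hdr);	/* reaches the block layer as passthrough */
	}
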
Cc: NeilBrown Fixes: a8821f3f3("block: Improvements to bounce-buffer handling") Tested-by: Michele Ballabio Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bounce.c | 6 ++++-- include/linux/blkdev.h | 21 +++++++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/bounce.c b/block/bounce.c index fceb1a96480b..1d05c422c932 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, unsigned i = 0; bool bounce = false; int sectors = 0; + bool passthrough = bio_is_passthrough(*bio_orig); bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_PAGES) @@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (!bounce) return; - if (sectors < bio_sectors(*bio_orig)) { + if (!passthrough && sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split); bio_chain(bio, *bio_orig); generic_make_request(*bio_orig); *bio_orig = bio; } - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set); + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL : + bounce_bio_set); bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8089ca17db9a..abd06f540863 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -241,14 +241,24 @@ struct request { struct request *next_rq; }; +static inline bool blk_op_is_scsi(unsigned int op) +{ + return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT; +} + +static inline bool blk_op_is_private(unsigned int op) +{ + return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; +} + static inline bool blk_rq_is_scsi(struct request *rq) { - return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT; + return blk_op_is_scsi(req_op(rq)); } static inline bool blk_rq_is_private(struct request *rq) { - return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT; + return blk_op_is_private(req_op(rq)); } static inline bool blk_rq_is_passthrough(struct request *rq) @@ -256,6 +266,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq) return blk_rq_is_scsi(rq) || blk_rq_is_private(rq); } +static inline bool bio_is_passthrough(struct bio *bio) +{ + unsigned op = bio_op(bio); + + return blk_op_is_scsi(op) || blk_op_is_private(op); +} + static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; -- cgit v1.2.3 From 0abc2a10389f0c9070f76ca906c7382788036b93 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Dec 2017 15:40:44 +0800 Subject: block: fix blk_rq_append_bio Commit caa4b02476e3(blk-map: call blk_queue_bounce from blk_rq_append_bio) moves blk_queue_bounce() into blk_rq_append_bio(), but don't consider the fact that the bounced bio becomes invisible to caller since the parameter type is 'struct bio *'. Make it a pointer to a pointer to a bio, so the caller sees the right bio also after a bounce. 
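From a caller's point of view the new contract looks roughly like this (a sketch; 'rq' and the prepared bio are assumed to exist already):

	struct bio *bio = prepared_bio;	/* hypothetical: built earlier by the caller */
	int ret;

	ret = blk_rq_append_bio(rq, &bio);	/* may replace 'bio' with a bounce clone */
	if (ret)
		return ret;	/* on failure 'bio' has been restored to the original */

	bio_get(bio);	/* reference the bio that is actually linked into 'rq' */
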
Fixes: caa4b02476e3 ("blk-map: call blk_queue_bounce from blk_rq_append_bio") Cc: Christoph Hellwig Reported-by: Michele Ballabio (handling failure of blk_rq_append_bio(), only call bio_get() after blk_rq_append_bio() returns OK) Tested-by: Michele Ballabio Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-map.c | 38 ++++++++++++++++++++++---------------- drivers/scsi/osd/osd_initiator.c | 4 +++- drivers/target/target_core_pscsi.c | 4 ++-- include/linux/blkdev.h | 2 +- 4 files changed, 28 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/blk-map.c b/block/blk-map.c index b21f8e86f120..d3a94719f03f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -12,22 +12,29 @@ #include "blk.h" /* - * Append a bio to a passthrough request. Only works can be merged into - * the request based on the driver constraints. + * Append a bio to a passthrough request. Only works if the bio can be merged + * into the request based on the driver constraints. */ -int blk_rq_append_bio(struct request *rq, struct bio *bio) +int blk_rq_append_bio(struct request *rq, struct bio **bio) { - blk_queue_bounce(rq->q, &bio); + struct bio *orig_bio = *bio; + + blk_queue_bounce(rq->q, bio); if (!rq->bio) { - blk_rq_bio_prep(rq->q, rq, bio); + blk_rq_bio_prep(rq->q, rq, *bio); } else { - if (!ll_back_merge_fn(rq->q, rq, bio)) + if (!ll_back_merge_fn(rq->q, rq, *bio)) { + if (orig_bio != *bio) { + bio_put(*bio); + *bio = orig_bio; + } return -EINVAL; + } - rq->biotail->bi_next = bio; - rq->biotail = bio; - rq->__data_len += bio->bi_iter.bi_size; + rq->biotail->bi_next = *bio; + rq->biotail = *bio; + rq->__data_len += (*bio)->bi_iter.bi_size; } return 0; @@ -73,14 +80,12 @@ static int __blk_rq_map_user_iov(struct request *rq, * We link the bounce buffer in and could have to traverse it * later so we have to get a ref to prevent it from being freed */ - ret = blk_rq_append_bio(rq, bio); - bio_get(bio); + ret = blk_rq_append_bio(rq, &bio); if (ret) { - bio_endio(bio); __blk_rq_unmap_user(orig_bio); - bio_put(bio); return ret; } + bio_get(bio); return 0; } @@ -213,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; int do_copy = 0; - struct bio *bio; + struct bio *bio, *orig_bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) @@ -236,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (do_copy) rq->rq_flags |= RQF_COPY_USER; - ret = blk_rq_append_bio(rq, bio); + orig_bio = bio; + ret = blk_rq_append_bio(rq, &bio); if (unlikely(ret)) { /* request is too big */ - bio_put(bio); + bio_put(orig_bio); return ret; } diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index a4f28b7e4c65..e18877177f1b 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1576,7 +1576,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write, return req; for_each_bio(bio) { - ret = blk_rq_append_bio(req, bio); + struct bio *bounce_bio = bio; + + ret = blk_rq_append_bio(req, &bounce_bio); if (ret) return ERR_PTR(ret); } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 7c69b4a9694d..0d99b242e82e 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -920,7 +920,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, " %d i: %d bio: %p, allocating another" " bio\n", 
bio->bi_vcnt, i, bio); - rc = blk_rq_append_bio(req, bio); + rc = blk_rq_append_bio(req, &bio); if (rc) { pr_err("pSCSI: failed to append bio\n"); goto fail; @@ -938,7 +938,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, } if (bio) { - rc = blk_rq_append_bio(req, bio); + rc = blk_rq_append_bio(req, &bio); if (rc) { pr_err("pSCSI: failed to append bio\n"); goto fail; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index abd06f540863..100d0df38026 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -965,7 +965,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); -extern int blk_rq_append_bio(struct request *rq, struct bio *bio); +extern int blk_rq_append_bio(struct request *rq, struct bio **bio); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **); extern void blk_recount_segments(struct request_queue *, struct bio *); -- cgit v1.2.3 From 231243c82793428467524227ae02ca451e6a98e7 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 10 Nov 2017 15:59:52 +0900 Subject: Revert "mlx5: move affinity hints assignments to generic code" Before the offending commit, mlx5 core did the IRQ affinity itself, and it seems that the new generic code has some drawbacks, one of which is that users lose the ability to modify IRQ affinity after the initial affinity values are assigned. The issue is still being discussed and a solution in the new generic code is required; until then we need to revert this patch. This fixes the following issue: echo > /proc/irq//smp_affinity fails with -EIO This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664. Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since it is used in mlx5_ib driver.
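The functional difference between the two allocation styles is sketched below (condensed from the hunks that follow; pdev, min_vecs, max_vecs, vecidx and mask stand in for the driver's actual values):

    /* Managed affinity (being reverted): the PCI core spreads and pins
     * the vectors, so later writes to /proc/irq/<n>/smp_affinity fail. */
    struct irq_affinity irqdesc = { .pre_vectors = MLX5_EQ_VEC_COMP_BASE };
    nvec = pci_alloc_irq_vectors_affinity(pdev, min_vecs, max_vecs,
                                          PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
                                          &irqdesc);

    /* Restored behavior: plain vectors plus a driver-set affinity hint,
     * which userspace may still override. */
    nvec = pci_alloc_irq_vectors(pdev, min_vecs, max_vecs, PCI_IRQ_MSIX);
    irq_set_affinity_hint(pci_irq_vector(pdev, vecidx), mask);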
Fixes: a435393acafb ("mlx5: move affinity hints assignments to generic code") Cc: Sagi Grimberg Cc: Thomas Gleixner Cc: Jes Sorensen Reported-by: Jes Sorensen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 45 +++++++------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 75 +++++++++++++++++++++-- include/linux/mlx5/driver.h | 1 + 4 files changed, 93 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index c0872b3284cb..43f9054830e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -590,6 +590,7 @@ struct mlx5e_channel { struct mlx5_core_dev *mdev; struct hwtstamp_config *tstamp; int ix; + int cpu; }; struct mlx5e_channels { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d2b057a3e512..cbec66bc82f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -71,11 +71,6 @@ struct mlx5e_channel_param { struct mlx5e_cq_param icosq_cq; }; -static int mlx5e_get_node(struct mlx5e_priv *priv, int ix) -{ - return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix); -} - static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { return MLX5_CAP_GEN(mdev, striding_rq) && @@ -444,17 +439,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int wq_sz = mlx5_wq_ll_get_size(&rq->wq); int mtt_sz = mlx5e_get_wqe_mtt_sz(); int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1; - int node = mlx5e_get_node(c->priv, c->ix); int i; rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), - GFP_KERNEL, node); + GFP_KERNEL, cpu_to_node(c->cpu)); if (!rq->mpwqe.info) goto err_out; /* We allocate more than mtt_sz as we will align the pointer */ - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, - GFP_KERNEL, node); + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL, + cpu_to_node(c->cpu)); if (unlikely(!rq->mpwqe.mtt_no_align)) goto err_free_wqe_info; @@ -562,7 +556,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, int err; int i; - rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + rqp->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); @@ -629,8 +623,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, default: /* MLX5_WQ_TYPE_LINKED_LIST */ rq->wqe.frag_info = kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info), - GFP_KERNEL, - mlx5e_get_node(c->priv, c->ix)); + GFP_KERNEL, cpu_to_node(c->cpu)); if (!rq->wqe.frag_info) { err = -ENOMEM; goto err_rq_wq_destroy; @@ -1000,13 +993,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy; @@ -1053,13 +1046,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, sq->channel = c; sq->uar_map = mdev->mlx5e_res.bfreg.map; - param->wq.db_numa_node = 
mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy; @@ -1126,13 +1119,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy; @@ -1504,8 +1497,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c, struct mlx5_core_dev *mdev = c->priv->mdev; int err; - param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix); - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.buf_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = cpu_to_node(c->cpu); param->eq_ix = c->ix; err = mlx5e_alloc_cq_common(mdev, param, cq); @@ -1604,6 +1597,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq) mlx5e_free_cq(cq); } +static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix) +{ + return cpumask_first(priv->mdev->priv.irq_info[ix].mask); +} + static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_channel_param *cparam) @@ -1752,12 +1750,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, { struct mlx5e_cq_moder icocq_moder = {0, 0}; struct net_device *netdev = priv->netdev; + int cpu = mlx5e_get_cpu(priv, ix); struct mlx5e_channel *c; unsigned int irq; int err; int eqn; - c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix)); + c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); if (!c) return -ENOMEM; @@ -1765,6 +1764,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->mdev = priv->mdev; c->tstamp = &priv->tstamp; c->ix = ix; + c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); @@ -1853,8 +1853,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_rq(&c->rq); - netif_set_xps_queue(c->netdev, - mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix); + netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix); } static void mlx5e_deactivate_channel(struct mlx5e_channel *c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 5f323442cc5a..8a89c7e8cd63 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -317,9 +317,6 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; - struct irq_affinity irqdesc = { - .pre_vectors = MLX5_EQ_VEC_COMP_BASE, - }; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; @@ -333,10 +330,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) if (!priv->irq_info) goto err_free_msix; - nvec = pci_alloc_irq_vectors_affinity(dev->pdev, + nvec = pci_alloc_irq_vectors(dev->pdev, 
MLX5_EQ_VEC_COMP_BASE + 1, nvec, - PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, - &irqdesc); + PCI_IRQ_MSIX); if (nvec < 0) return nvec; @@ -622,6 +618,63 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) return (u64)timer_l | (u64)timer_h1 << 32; } +static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) +{ + struct mlx5_priv *priv = &mdev->priv; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); + + if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { + mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); + return -ENOMEM; + } + + cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), + priv->irq_info[i].mask); + + if (IS_ENABLED(CONFIG_SMP) && + irq_set_affinity_hint(irq, priv->irq_info[i].mask)) + mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); + + return 0; +} + +static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) +{ + struct mlx5_priv *priv = &mdev->priv; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); + + irq_set_affinity_hint(irq, NULL); + free_cpumask_var(priv->irq_info[i].mask); +} + +static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev) +{ + int err; + int i; + + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) { + err = mlx5_irq_set_affinity_hint(mdev, i); + if (err) + goto err_out; + } + + return 0; + +err_out: + for (i--; i >= 0; i--) + mlx5_irq_clear_affinity_hint(mdev, i); + + return err; +} + +static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev) +{ + int i; + + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) + mlx5_irq_clear_affinity_hint(mdev, i); +} + int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn) { @@ -1097,6 +1150,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_eqs; } + err = mlx5_irq_set_affinity_hints(dev); + if (err) { + dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n"); + goto err_affinity_hints; + } + err = mlx5_init_fs(dev); if (err) { dev_err(&pdev->dev, "Failed to init flow steering\n"); @@ -1154,6 +1213,9 @@ err_sriov: mlx5_cleanup_fs(dev); err_fs: + mlx5_irq_clear_affinity_hints(dev); + +err_affinity_hints: free_comp_eqs(dev); err_stop_eqs: @@ -1222,6 +1284,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_sriov_detach(dev); mlx5_cleanup_fs(dev); + mlx5_irq_clear_affinity_hints(dev); free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_put_uars_page(dev, priv->uar); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a886b51511ab..40a6f33c4cde 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -556,6 +556,7 @@ struct mlx5_core_sriov { }; struct mlx5_irq_info { + cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; }; -- cgit v1.2.3 From 37e92a9d4fe38dc3e7308913575983a6a088c8d4 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 13 Nov 2017 10:11:27 +0200 Subject: net/mlx5: Fix rate limit packet pacing naming and struct In mlx5_ifc, the struct size was incomplete, and thus the driver was sending garbage after the last defined field. Fix it by adding a reserved field to complete the struct size. In addition, rename all set_rate_limit occurrences to set_pp_rate_limit to be compliant with the Firmware <-> Driver definition.
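Why an undersized ifc struct sends garbage: the firmware consumes a fixed-size command layout, while the driver's command buffer is sized from the ifc struct's bit layout, so anything past the last defined field of a too-short struct is undefined from the firmware's point of view. A minimal sketch, condensed from the rl.c hunk below:

    /* in[] length is derived from the ifc struct; the new trailing
     * reserved_at_a0[0x160] field makes it match the firmware-defined
     * size, so the whole command is now zero-initialized. */
    u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0};
    u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0};

    MLX5_SET(set_pp_rate_limit_in, in, opcode, MLX5_CMD_OP_SET_PP_RATE_LIMIT);
    MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index);
    MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate);
    mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));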
Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates") Fixes: 1466cc5b23d1 ("net/mlx5: Rate limit tables support") Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/rl.c | 22 +++++++++++----------- include/linux/mlx5/mlx5_ifc.h | 8 +++++--- 3 files changed, 18 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 1fffdebbc9e8..e9a1fbcc4adf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_QUERY_VPORT_COUNTER: case MLX5_CMD_OP_ALLOC_Q_COUNTER: case MLX5_CMD_OP_QUERY_Q_COUNTER: - case MLX5_CMD_OP_SET_RATE_LIMIT: + case MLX5_CMD_OP_SET_PP_RATE_LIMIT: case MLX5_CMD_OP_QUERY_RATE_LIMIT: case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: @@ -505,7 +505,7 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER); MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER); MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER); - MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT); MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT); MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT); MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index e651e4c02867..d3c33e9eea72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -125,16 +125,16 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, return ret_entry; } -static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev, +static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, u32 rate, u16 index) { - u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0}; - MLX5_SET(set_rate_limit_in, in, opcode, - MLX5_CMD_OP_SET_RATE_LIMIT); - MLX5_SET(set_rate_limit_in, in, rate_limit_index, index); - MLX5_SET(set_rate_limit_in, in, rate_limit, rate); + MLX5_SET(set_pp_rate_limit_in, in, opcode, + MLX5_CMD_OP_SET_PP_RATE_LIMIT); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate); return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } @@ -173,7 +173,7 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index) entry->refcount++; } else { /* new rate limit */ - err = mlx5_set_rate_limit_cmd(dev, rate, entry->index); + err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index); if (err) { mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n", rate, err); @@ -209,7 +209,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate) entry->refcount--; if (!entry->refcount) { /* need to remove rate */ - mlx5_set_rate_limit_cmd(dev, 0, entry->index); + mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index); entry->rate = 0; } @@ -262,8 +262,8 @@ void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev) /* Clear all configured rates */ for (i = 0; i < table->max_size; i++) if (table->rl_entry[i].rate) - mlx5_set_rate_limit_cmd(dev, 0, - table->rl_entry[i].index); + mlx5_set_pp_rate_limit_cmd(dev, 0, + 
table->rl_entry[i].index); kfree(dev->priv.rl_table.rl_entry); } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 38a7577a9ce7..d44ec5f41d4a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -147,7 +147,7 @@ enum { MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, - MLX5_CMD_OP_SET_RATE_LIMIT = 0x780, + MLX5_CMD_OP_SET_PP_RATE_LIMIT = 0x780, MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781, MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782, MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783, @@ -7239,7 +7239,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits { u8 vxlan_udp_port[0x10]; }; -struct mlx5_ifc_set_rate_limit_out_bits { +struct mlx5_ifc_set_pp_rate_limit_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7248,7 +7248,7 @@ struct mlx5_ifc_set_rate_limit_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_set_rate_limit_in_bits { +struct mlx5_ifc_set_pp_rate_limit_in_bits { u8 opcode[0x10]; u8 reserved_at_10[0x10]; @@ -7261,6 +7261,8 @@ struct mlx5_ifc_set_rate_limit_in_bits { u8 reserved_at_60[0x20]; u8 rate_limit[0x20]; + + u8 reserved_at_a0[0x160]; }; struct mlx5_ifc_access_register_out_bits { -- cgit v1.2.3 From d6b2785cd55ee72e9608762650b3ef299f801b1b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 21 Nov 2017 15:15:51 +0200 Subject: net/mlx5: Cleanup IRQs in case of unload failure When mlx5_stop_eqs fails to destroy any of the EQs, it returns with an error. In such a failure flow the function returns without releasing all EQ IRQs, and pci_free_irq_vectors then fails. Fix this by only warning on EQ destroy failure and continuing to release the other EQs and their IRQs. It fixes the following kernel trace: kernel: kernel BUG at drivers/pci/msi.c:352! ... ...
kernel: Call Trace: kernel: pci_disable_msix+0xd3/0x100 kernel: pci_free_irq_vectors+0xe/0x20 kernel: mlx5_load_one.isra.17+0x9f5/0xec0 [mlx5_core] Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 20 +++++++++++++------- include/linux/mlx5/driver.h | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 0308a2b4823c..ab4d1465b7e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -775,7 +775,7 @@ err1: return err; } -int mlx5_stop_eqs(struct mlx5_core_dev *dev) +void mlx5_stop_eqs(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = &dev->priv.eq_table; int err; @@ -784,22 +784,28 @@ int mlx5_stop_eqs(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, pg)) { err = mlx5_destroy_unmap_eq(dev, &table->pfault_eq); if (err) - return err; + mlx5_core_err(dev, "failed to destroy page fault eq, err(%d)\n", + err); } #endif err = mlx5_destroy_unmap_eq(dev, &table->pages_eq); if (err) - return err; + mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n", + err); - mlx5_destroy_unmap_eq(dev, &table->async_eq); + err = mlx5_destroy_unmap_eq(dev, &table->async_eq); + if (err) + mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n", + err); mlx5_cmd_use_polling(dev); err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq); - if (err) + if (err) { + mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n", + err); mlx5_cmd_use_events(dev); - - return err; + } } int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 40a6f33c4cde..57b109c6e422 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1049,7 +1049,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, enum mlx5_eq_type type); int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_start_eqs(struct mlx5_core_dev *dev); -int mlx5_stop_eqs(struct mlx5_core_dev *dev); +void mlx5_stop_eqs(struct mlx5_core_dev *dev); int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); -- cgit v1.2.3 From 111be883981748acc9a56e855c8336404a8e787c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Dec 2017 11:10:17 -0700 Subject: block-throttle: avoid double charge If a bio is throttled and split after throttling, the bio can be resubmitted and enter throttling again. This causes part of the bio to be charged multiple times. If the cgroup has an IO limit, the double charge significantly harms performance. Bio splits have become quite common since the arbitrary bio size change. To fix this, we always set the BIO_THROTTLED flag if a bio is throttled. If the bio is cloned/split, we copy the flag to the new bio too, to avoid a double charge. However, a cloned bio can be directed to a new disk, where keeping the flag would be a problem. The observation is that we always set a new disk for the bio in this case, so we can clear the flag in bio_set_dev(). This issue has existed for a long time; the arbitrary bio size change just makes it worse, so this should go into stable at least back to v4.2.
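A sketch of the intended BIO_THROTTLED lifecycle (simplified illustration; the function names are real, the flow and arguments are schematic):

    blk_throtl_bio(...);             /* bio charged once, BIO_THROTTLED set */
    split = bio_split(bio, sectors, GFP_NOIO, bs);
                                     /* __bio_clone_fast() copies the flag */
    generic_make_request(bio);       /* resubmitted part re-enters throttling,
                                      * sees the flag set: no second charge */
    bio_set_dev(split, new_bdev);    /* different bi_disk: flag cleared, so
                                      * the bio is charged on the new queue */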
V1 -> V2: don't add an extra field to the bio, based on discussion with Tejun Cc: Vivek Goyal Cc: stable@vger.kernel.org Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ block/blk-throttle.c | 8 +------- include/linux/bio.h | 2 ++ include/linux/blk_types.h | 9 ++++----- 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 8bfdea58159b..9ef6cf3addb3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_disk = bio_src->bi_disk; bio->bi_partno = bio_src->bi_partno; bio_set_flag(bio, BIO_CLONED); + if (bio_flagged(bio_src, BIO_THROTTLED)) + bio_set_flag(bio, BIO_THROTTLED); bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 825bc29767e6..d19f416d6101 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2226,13 +2226,7 @@ again: out_unlock: spin_unlock_irq(q->queue_lock); out: - /* - * As multiple blk-throtls may stack in the same issue path, we - * don't want bios to leave with the flag set. Clear the flag if - * being issued. - */ - if (!throttled) - bio_clear_flag(bio, BIO_THROTTLED); + bio_set_flag(bio, BIO_THROTTLED); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) diff --git a/include/linux/bio.h b/include/linux/bio.h index 82f0c8fd7be8..23d29b39f71e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -492,6 +492,8 @@ extern unsigned int bvec_nr_vecs(unsigned short idx); #define bio_set_dev(bio, bdev) \ do { \ + if ((bio)->bi_disk != (bdev)->bd_disk) \ + bio_clear_flag(bio, BIO_THROTTLED);\ (bio)->bi_disk = (bdev)->bd_disk; \ (bio)->bi_partno = (bdev)->bd_partno; \ } while (0) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1e628e032da..9e7d8bd776d2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -50,8 +50,6 @@ struct blk_issue_stat { struct bio { struct bio *bi_next; /* request queue link */ struct gendisk *bi_disk; - u8 bi_partno; - blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. @@ -59,8 +57,8 @@ struct bio { unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; unsigned short bi_write_hint; - - struct bvec_iter bi_iter; + blk_status_t bi_status; + u8 bi_partno; /* Number of segments in this BIO after * physical address coalescing is performed. @@ -74,8 +72,9 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - atomic_t __bi_remaining; + struct bvec_iter bi_iter; + atomic_t __bi_remaining; bio_end_io_t *bi_end_io; void *bi_private; -- cgit v1.2.3 From 4ccafe032005e9b96acbef2e389a4de5b1254add Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Dec 2017 13:13:58 -0700 Subject: block: unalign call_single_data in struct request A previous change blindly added massive alignment to the call_single_data structure in struct request. This ballooned it in size from 296 to 320 bytes on my setup, for no valid reason at all. Use the unaligned struct __call_single_data variant instead.
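For context, the two variants differ only in alignment; roughly the following, paraphrased from include/linux/smp.h (not part of this patch):

    struct __call_single_data {
            struct llist_node llist;
            smp_call_func_t func;
            void *info;
            unsigned int flags;
    };

    /* The typedef is aligned to the (power-of-two) struct size so a csd
     * never spans two cache lines; embedding the raw struct instead
     * avoids forcing that alignment padding onto struct request. */
    typedef struct __call_single_data call_single_data_t
            __aligned(sizeof(struct __call_single_data));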
Fixes: 966a967116e69 ("smp: Avoid using two cache lines for struct call_single_data") Cc: stable@vger.kernel.org # v4.14 Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 100d0df38026..0ce8a372d506 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -135,7 +135,7 @@ typedef __u32 __bitwise req_flags_t; struct request { struct list_head queuelist; union { - call_single_data_t csd; + struct __call_single_data csd; u64 fifo_time; }; -- cgit v1.2.3 From bb7f0f989ca7de1153bd128a40a71709e339fa03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 18 Dec 2017 20:12:00 -0800 Subject: bpf: fix integer overflows There were various issues related to the limited size of integers used in the verifier: - `off + size` overflow in __check_map_access() - `off + reg->off` overflow in check_mem_access() - `off + reg->var_off.value` overflow or 32-bit truncation of `reg->var_off.value` in check_mem_access() - 32-bit truncation in check_stack_boundary() Make sure that any integer math cannot overflow by not allowing pointer math with large values. Also reduce the scope of "scalar op scalar" tracking. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 4 ++-- kernel/bpf/verifier.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c561b986bab0..1632bb13ad8a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -15,11 +15,11 @@ * In practice this is far bigger than any realistic pointer offset; this limit * ensures that umax_value + (int)off + (int)size cannot overflow a u64. */ -#define BPF_MAX_VAR_OFF (1ULL << 31) +#define BPF_MAX_VAR_OFF (1 << 29) /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures * that converting umax_value to int cannot overflow. */ -#define BPF_MAX_VAR_SIZ INT_MAX +#define BPF_MAX_VAR_SIZ (1 << 29) /* Liveness marks, used for registers and spilled-regs (in stack slots). 
* Read marks propagate upwards until they find a write mark; they record that diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 982bd9ec721a..86dfe6b5c243 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1819,6 +1819,41 @@ static bool signed_sub_overflows(s64 a, s64 b) return res > a; } +static bool check_reg_sane_offset(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) +{ + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; + s64 smin = reg->smin_value; + + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "math between %s pointer and %lld is not allowed\n", + reg_type_str[type], val); + return false; + } + + if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { + verbose(env, "%s pointer offset %d is not allowed\n", + reg_type_str[type], reg->off); + return false; + } + + if (smin == S64_MIN) { + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", + reg_type_str[type]); + return false; + } + + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { + verbose(env, "value %lld makes %s pointer be out of bounds\n", + smin, reg_type_str[type]); + return false; + } + + return true; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -1887,6 +1922,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; + if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || + !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) + return -EINVAL; + switch (opcode) { case BPF_ADD: /* We can take a fixed offset as long as it doesn't overflow @@ -2017,6 +2056,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } + if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) + return -EINVAL; + __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); @@ -2046,6 +2088,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if (!src_known && + opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + __mark_reg_unknown(dst_reg); + return 0; + } + switch (opcode) { case BPF_ADD: if (signed_add_overflows(dst_reg->smin_value, smin_val) || -- cgit v1.2.3 From 513674b5a2c9c7a67501506419da5c3c77ac6f08 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Dec 2017 12:10:21 -0800 Subject: net: reevaluate autoflowlabel setting after sysctl setting sysctl.ip6.auto_flowlabels defaults to 1. In our hosts, we set it to 2. If the sockopt doesn't set autoflowlabel, outgoing packets from the hosts are not supposed to include a flowlabel. This is true for normal packets, but not for reset packets. The reason is that ipv6_pinfo.autoflowlabel is set at sock creation. If we later change sysctl.ip6.auto_flowlabels, ipv6_pinfo.autoflowlabel isn't changed, so the sock keeps the old behavior in terms of auto flowlabel. Reset packets suffer from this problem, because they are sent from a special control socket, which is created at boot time.
Since sysctl.ipv6.auto_flowlabels is 1 by default, the control socket will always have its ipv6_pinfo.autoflowlabel set, even after the user changes sysctl.ipv6.auto_flowlabels, so reset packets will always have a flowlabel. Normal socks created before the sysctl change suffer from the same issue. We can't even turn off autoflowlabel unless we kill all socks on the hosts. To fix this, if the IPV6_AUTOFLOWLABEL sockopt is used, we use the autoflowlabel setting from the user; otherwise we always call ip6_default_np_autolabel(), which has the new sysctl setting. Note, this changes behavior a little bit. Before commit 42240901f7c4 ("ipv6: Implement different admin modes for automatic flow labels"), the autoflowlabel behavior of a sock wasn't sticky, e.g. if the sysctl changed, existing connections changed autoflowlabel behavior. After that commit, autoflowlabel behavior is sticky for the whole life of the sock. With this patch, the behavior isn't sticky again. Cc: Martin KaFai Lau Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: Shaohua Li Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 ++- net/ipv6/af_inet6.c | 1 - net/ipv6/ip6_output.c | 12 ++++++++++-- net/ipv6/ipv6_sockglue.c | 1 + 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cb18c6290ca8..8415bf1a9776 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -273,7 +273,8 @@ struct ipv6_pinfo { * 100: prefer care-of address */ dontfrag:1, - autoflowlabel:1; + autoflowlabel:1, + autoflowlabel_set:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c26f71234b9c..c9441ca45399 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -210,7 +210,6 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->autoflowlabel = ip6_default_np_autolabel(net); np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5110a418cc4d..f7dd51c42314 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +{ + if (!np->autoflowlabel_set) + return ip6_default_np_autolabel(net); + else + return np->autoflowlabel; +} + /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified @@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b9404feabd78..2d4680e0376f 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -886,6 +886,7 @@ pref_skip_coa: break; case IPV6_AUTOFLOWLABEL: np->autoflowlabel = valbool; +
np->autoflowlabel_set = 1; retv = 0; break; case IPV6_RECVFRAGSIZE: -- cgit v1.2.3 From 71a0ff65a21bf3e2c4fde208c4a635ed2bbb4e81 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 21 Dec 2017 17:38:26 +0200 Subject: IB/mlx5: Fix congestion counters in LAG mode Congestion counters are counted and queried per physical function. When working in LAG mode, CNP packets can be sent or received on both of the functions, thus congestion counters should be aggregated from the two physical functions. Fixes: e1f24a79f424 ("IB/mlx5: Support congestion related counters") Signed-off-by: Majd Dibbiny Reviewed-by: Aviv Heller Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 11 ------ drivers/infiniband/hw/mlx5/cmd.h | 2 - drivers/infiniband/hw/mlx5/main.c | 35 +++-------------- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 56 +++++++++++++++++++++++++++ include/linux/mlx5/driver.h | 4 ++ 5 files changed, 66 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 470995fa38d2..6f6712f87a73 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -47,17 +47,6 @@ int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) return err; } -int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, - bool reset, void *out, int out_size) -{ - u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { }; - - MLX5_SET(query_cong_statistics_in, in, opcode, - MLX5_CMD_OP_QUERY_CONG_STATISTICS); - MLX5_SET(query_cong_statistics_in, in, clear, reset); - return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); -} - int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, void *out, int out_size) { diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index af4c24596274..78ffded7cc2c 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -37,8 +37,6 @@ #include int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); -int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, - bool reset, void *out, int out_size); int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, void *out, int out_size); int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 543d0a4c8bf3..b4ef4d9b6ce5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3737,34 +3737,6 @@ free: return ret; } -static int mlx5_ib_query_cong_counters(struct mlx5_ib_dev *dev, - struct mlx5_ib_port *port, - struct rdma_hw_stats *stats) -{ - int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); - void *out; - int ret, i; - int offset = port->cnts.num_q_counters; - - out = kvzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; - - ret = mlx5_cmd_query_cong_counter(dev->mdev, false, out, outlen); - if (ret) - goto free; - - for (i = 0; i < port->cnts.num_cong_counters; i++) { - stats->value[i + offset] = - be64_to_cpup((__be64 *)(out + - port->cnts.offsets[i + offset])); - } - -free: - kvfree(out); - return ret; -} - static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u8 port_num, int index) @@ -3782,7 +3754,12 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, num_counters = port->cnts.num_q_counters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { - ret = mlx5_ib_query_cong_counters(dev, port, stats); + ret = 
mlx5_lag_query_cong_counters(dev->mdev, + stats->value + + port->cnts.num_q_counters, + port->cnts.num_cong_counters, + port->cnts.offsets + + port->cnts.num_q_counters); if (ret) return ret; num_counters += port->cnts.num_cong_counters; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index f26f97fe4666..582b2f18010a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -137,6 +137,17 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev) } EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag); +static int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, + bool reset, void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { }; + + MLX5_SET(query_cong_statistics_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_STATISTICS); + MLX5_SET(query_cong_statistics_in, in, clear, reset); + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} + static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev) { return dev->priv.lag; @@ -633,3 +644,48 @@ bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv) /* If bonded, we do not add an IB device for PF1. */ return false; } + +int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, + u64 *values, + int num_counters, + size_t *offsets) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); + struct mlx5_core_dev *mdev[MLX5_MAX_PORTS]; + struct mlx5_lag *ldev; + int num_ports; + int ret, i, j; + void *out; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + memset(values, 0, sizeof(*values) * num_counters); + + mutex_lock(&lag_mutex); + ldev = mlx5_lag_dev_get(dev); + if (ldev && mlx5_lag_is_bonded(ldev)) { + num_ports = MLX5_MAX_PORTS; + mdev[0] = ldev->pf[0].dev; + mdev[1] = ldev->pf[1].dev; + } else { + num_ports = 1; + mdev[0] = dev; + } + + for (i = 0; i < num_ports; ++i) { + ret = mlx5_cmd_query_cong_counter(mdev[i], false, out, outlen); + if (ret) + goto unlock; + + for (j = 0; j < num_counters; ++j) + values[j] += be64_to_cpup((__be64 *)(out + offsets[j])); + } + +unlock: + mutex_unlock(&lag_mutex); + kvfree(out); + return ret; +} +EXPORT_SYMBOL(mlx5_lag_query_cong_counters); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a886b51511ab..8846919356ca 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1164,6 +1164,10 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev); int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); +int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, + u64 *values, + int num_counters, + size_t *offsets); struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); -- cgit v1.2.3 From 39c3fd58952d7599d367c84c1330b785d91d6088 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 2 Dec 2017 18:11:04 +0100 Subject: kernel/irq: Extend lockdep class for request mutex The IRQ code already supports a lockdep class for the lock in an interrupt descriptor. Extend this to add a second class for the request mutex in the descriptor. Not having a class results in false-positive lockdep splats in some code paths.
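After this change, an irqchip driver supplies two static keys so lockdep can tell desc->lock nesting apart from request_mutex nesting. A minimal sketch of the driver-side pattern, mirroring the hunks below (the foo_* names and foo_irq_chip are hypothetical):

    static struct lock_class_key foo_irq_lock_class;
    static struct lock_class_key foo_irq_request_class;

    static int foo_irq_map(struct irq_domain *d, unsigned int virq,
                           irq_hw_number_t hw)
    {
            irq_set_chip_data(virq, d->host_data);
            irq_set_lockdep_class(virq, &foo_irq_lock_class,
                                  &foo_irq_request_class);
            irq_set_chip_and_handler(virq, &foo_irq_chip, handle_level_irq);
            return 0;
    }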
Signed-off-by: Andrew Lunn Signed-off-by: Thomas Gleixner Acked-by: linus.walleij@linaro.org Cc: grygorii.strashko@ti.com Cc: f.fainelli@gmail.com Link: https://lkml.kernel.org/r/1512234664-21555-1-git-send-email-andrew@lunn.ch --- arch/powerpc/sysdev/fsl_msi.c | 4 +++- drivers/gpio/gpio-bcm-kona.c | 3 ++- drivers/gpio/gpio-brcmstb.c | 4 +++- drivers/gpio/gpio-tegra.c | 4 +++- drivers/gpio/gpiolib.c | 27 ++++++++++++++++--------- drivers/irqchip/irq-renesas-intc-irqpin.c | 6 +++++- drivers/mfd/arizona-irq.c | 4 +++- drivers/pinctrl/pinctrl-single.c | 5 ++++- include/linux/gpio/driver.h | 33 ++++++++++++++++++++----------- include/linux/irqdesc.h | 9 ++++++--- kernel/irq/generic-chip.c | 11 +++++++---- 11 files changed, 75 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 44cbf4c12ea1..df95102e732c 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -354,6 +354,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev) } static struct lock_class_key fsl_msi_irq_class; +static struct lock_class_key fsl_msi_irq_request_class; static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev, int offset, int irq_index) @@ -373,7 +374,8 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev, dev_err(&dev->dev, "No memory for MSI cascade data\n"); return -ENOMEM; } - irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class); + irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class, + &fsl_msi_irq_request_class); cascade_data->index = offset; cascade_data->msi_data = msi; cascade_data->virq = virt_msir; diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c index dfcf56ee3c61..76861a00bb92 100644 --- a/drivers/gpio/gpio-bcm-kona.c +++ b/drivers/gpio/gpio-bcm-kona.c @@ -522,6 +522,7 @@ static struct of_device_id const bcm_kona_gpio_of_match[] = { * category than their parents, so it won't report false recursion. */ static struct lock_class_key gpio_lock_class; +static struct lock_class_key gpio_request_class; static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) @@ -531,7 +532,7 @@ static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq, ret = irq_set_chip_data(irq, d->host_data); if (ret < 0) return ret; - irq_set_lockdep_class(irq, &gpio_lock_class); + irq_set_lockdep_class(irq, &gpio_lock_class, &gpio_request_class); irq_set_chip_and_handler(irq, &bcm_gpio_irq_chip, handle_simple_irq); irq_set_noprobe(irq); diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c index 545d43a587b7..5b24801bffef 100644 --- a/drivers/gpio/gpio-brcmstb.c +++ b/drivers/gpio/gpio-brcmstb.c @@ -327,6 +327,7 @@ static struct brcmstb_gpio_bank *brcmstb_gpio_hwirq_to_bank( * category than their parents, so it won't report false recursion. 
*/ static struct lock_class_key brcmstb_gpio_irq_lock_class; +static struct lock_class_key brcmstb_gpio_irq_request_class; static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq, @@ -346,7 +347,8 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq, ret = irq_set_chip_data(irq, &bank->gc); if (ret < 0) return ret; - irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class); + irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class, + &brcmstb_gpio_irq_lock_class); irq_set_chip_and_handler(irq, &priv->irq_chip, handle_level_irq); irq_set_noprobe(irq); return 0; diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c index 8db47f671708..02fa8fe2292a 100644 --- a/drivers/gpio/gpio-tegra.c +++ b/drivers/gpio/gpio-tegra.c @@ -565,6 +565,7 @@ static const struct dev_pm_ops tegra_gpio_pm_ops = { * than their parents, so it won't report false recursion. */ static struct lock_class_key gpio_lock_class; +static struct lock_class_key gpio_request_class; static int tegra_gpio_probe(struct platform_device *pdev) { @@ -670,7 +671,8 @@ static int tegra_gpio_probe(struct platform_device *pdev) bank = &tgi->bank_info[GPIO_BANK(gpio)]; - irq_set_lockdep_class(irq, &gpio_lock_class); + irq_set_lockdep_class(irq, &gpio_lock_class, + &gpio_request_class); irq_set_chip_data(irq, bank); irq_set_chip_and_handler(irq, &tgi->ic, handle_simple_irq); } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index aad84a6306c4..44332b793718 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -73,7 +73,8 @@ LIST_HEAD(gpio_devices); static void gpiochip_free_hogs(struct gpio_chip *chip); static int gpiochip_add_irqchip(struct gpio_chip *gpiochip, - struct lock_class_key *key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip); static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip); static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip); @@ -1100,7 +1101,8 @@ static void gpiochip_setup_devs(void) } int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, - struct lock_class_key *key) + struct lock_class_key *lock_key, + struct lock_class_key *request_key) { unsigned long flags; int status = 0; @@ -1246,7 +1248,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, if (status) goto err_remove_from_list; - status = gpiochip_add_irqchip(chip, key); + status = gpiochip_add_irqchip(chip, lock_key, request_key); if (status) goto err_remove_chip; @@ -1632,7 +1634,7 @@ int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, * This lock class tells lockdep that GPIO irqs are in a different * category than their parents, so it won't report false recursion. 
*/ - irq_set_lockdep_class(irq, chip->irq.lock_key); + irq_set_lockdep_class(irq, chip->irq.lock_key, chip->irq.request_key); irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler); /* Chips that use nested thread handlers have them marked */ if (chip->irq.threaded) @@ -1712,10 +1714,12 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset) /** * gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip * @gpiochip: the GPIO chip to add the IRQ chip to - * @lock_key: lockdep class + * @lock_key: lockdep class for IRQ lock + * @request_key: lockdep class for IRQ request */ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip, - struct lock_class_key *lock_key) + struct lock_class_key *lock_key, + struct lock_class_key *request_key) { struct irq_chip *irqchip = gpiochip->irq.chip; const struct irq_domain_ops *ops; @@ -1753,6 +1757,7 @@ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip, gpiochip->to_irq = gpiochip_to_irq; gpiochip->irq.default_type = type; gpiochip->irq.lock_key = lock_key; + gpiochip->irq.request_key = request_key; if (gpiochip->irq.domain_ops) ops = gpiochip->irq.domain_ops; @@ -1850,7 +1855,8 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) * @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE * to have the core avoid setting up any default type in the hardware. * @threaded: whether this irqchip uses a nested thread handler - * @lock_key: lockdep class + * @lock_key: lockdep class for IRQ lock + * @request_key: lockdep class for IRQ request * * This function closely associates a certain irqchip with a certain * gpiochip, providing an irq domain to translate the local IRQs to @@ -1872,7 +1878,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type, bool threaded, - struct lock_class_key *lock_key) + struct lock_class_key *lock_key, + struct lock_class_key *request_key) { struct device_node *of_node; @@ -1913,6 +1920,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, gpiochip->irq.default_type = type; gpiochip->to_irq = gpiochip_to_irq; gpiochip->irq.lock_key = lock_key; + gpiochip->irq.request_key = request_key; gpiochip->irq.domain = irq_domain_add_simple(of_node, gpiochip->ngpio, first_irq, &gpiochip_domain_ops, gpiochip); @@ -1940,7 +1948,8 @@ EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key); #else /* CONFIG_GPIOLIB_IRQCHIP */ static inline int gpiochip_add_irqchip(struct gpio_chip *gpiochip, - struct lock_class_key *key) + struct lock_class_key *lock_key, + struct lock_class_key *request_key) { return 0; } diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c index 06f29cf5018a..cee59fe1321c 100644 --- a/drivers/irqchip/irq-renesas-intc-irqpin.c +++ b/drivers/irqchip/irq-renesas-intc-irqpin.c @@ -342,6 +342,9 @@ static irqreturn_t intc_irqpin_shared_irq_handler(int irq, void *dev_id) */ static struct lock_class_key intc_irqpin_irq_lock_class; +/* And this is for the request mutex */ +static struct lock_class_key intc_irqpin_irq_request_class; + static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw) { @@ -352,7 +355,8 @@ static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq, intc_irqpin_dbg(&p->irq[hw], "map"); irq_set_chip_data(virq, h->host_data); - irq_set_lockdep_class(virq, &intc_irqpin_irq_lock_class); + irq_set_lockdep_class(virq, &intc_irqpin_irq_lock_class, + &intc_irqpin_irq_request_class); 
irq_set_chip_and_handler(virq, &p->irq_chip, handle_level_irq); return 0; } diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c index 09cf3699e354..a307832d7e45 100644 --- a/drivers/mfd/arizona-irq.c +++ b/drivers/mfd/arizona-irq.c @@ -184,6 +184,7 @@ static struct irq_chip arizona_irq_chip = { }; static struct lock_class_key arizona_irq_lock_class; +static struct lock_class_key arizona_irq_request_class; static int arizona_irq_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw) @@ -191,7 +192,8 @@ static int arizona_irq_map(struct irq_domain *h, unsigned int virq, struct arizona *data = h->host_data; irq_set_chip_data(virq, data); - irq_set_lockdep_class(virq, &arizona_irq_lock_class); + irq_set_lockdep_class(virq, &arizona_irq_lock_class, + &arizona_irq_request_class); irq_set_chip_and_handler(virq, &arizona_irq_chip, handle_simple_irq); irq_set_nested_thread(virq, 1); irq_set_noprobe(virq); diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index e6cd8de793e2..3501491e5bfc 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -222,6 +222,9 @@ static enum pin_config_param pcs_bias[] = { */ static struct lock_class_key pcs_lock_class; +/* Class for the IRQ request mutex */ +static struct lock_class_key pcs_request_class; + /* * REVISIT: Reads and writes could eventually use regmap or something * generic. But at least on omaps, some mux registers are performance @@ -1486,7 +1489,7 @@ static int pcs_irqdomain_map(struct irq_domain *d, unsigned int irq, irq_set_chip_data(irq, pcs_soc); irq_set_chip_and_handler(irq, &pcs->chip, handle_level_irq); - irq_set_lockdep_class(irq, &pcs_lock_class); + irq_set_lockdep_class(irq, &pcs_lock_class, &pcs_request_class); irq_set_noprobe(irq); return 0; diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 55e672592fa9..7258cd676df4 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -66,9 +66,10 @@ struct gpio_irq_chip { /** * @lock_key: * - * Per GPIO IRQ chip lockdep class. + * Per GPIO IRQ chip lockdep classes. 
*/ struct lock_class_key *lock_key; + struct lock_class_key *request_key; /** * @parent_handler: @@ -323,7 +324,8 @@ extern const char *gpiochip_is_requested(struct gpio_chip *chip, /* add/remove chips */ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, - struct lock_class_key *lock_key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); /** * gpiochip_add_data() - register a gpio_chip @@ -350,11 +352,13 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, */ #ifdef CONFIG_LOCKDEP #define gpiochip_add_data(chip, data) ({ \ - static struct lock_class_key key; \ - gpiochip_add_data_with_key(chip, data, &key); \ + static struct lock_class_key lock_key; \ + static struct lock_class_key request_key; \ + gpiochip_add_data_with_key(chip, data, &lock_key, \ + &request_key); \ }) #else -#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL) +#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL, NULL) #endif static inline int gpiochip_add(struct gpio_chip *chip) @@ -429,7 +433,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type, bool threaded, - struct lock_class_key *lock_key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); #ifdef CONFIG_LOCKDEP @@ -445,10 +450,12 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type) { - static struct lock_class_key key; + static struct lock_class_key lock_key; + static struct lock_class_key request_key; return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, false, &key); + handler, type, false, + &lock_key, &request_key); } static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, @@ -458,10 +465,12 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, unsigned int type) { - static struct lock_class_key key; + static struct lock_class_key lock_key; + static struct lock_class_key request_key; return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, true, &key); + handler, type, true, + &lock_key, &request_key); } #else static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, @@ -471,7 +480,7 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, unsigned int type) { return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, false, NULL); + handler, type, false, NULL, NULL); } static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, @@ -481,7 +490,7 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, unsigned int type) { return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, true, NULL); + handler, type, true, NULL, NULL); } #endif /* CONFIG_LOCKDEP */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 39fb3700f7a9..25b33b664537 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -255,12 +255,15 @@ static inline bool irq_is_percpu_devid(unsigned int irq) } static inline void -irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class) +irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) - lockdep_set_class(&desc->lock, class); + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, 
request_class); + } } #ifdef CONFIG_IRQ_PREFLOW_FASTEOI diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c26c5bb6b491..508c03dfef25 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); /* - * Separate lockdep class for interrupt chip which can nest irq_desc - * lock. + * Separate lockdep classes for interrupt chip which can nest irq_desc + * lock and request mutex. */ static struct lock_class_key irq_nested_lock_class; +static struct lock_class_key irq_nested_request_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain @@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, set_bit(idx, &gc->installed); if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(virq, &irq_nested_lock_class); + irq_set_lockdep_class(virq, &irq_nested_lock_class, + &irq_nested_request_class); if (chip->irq_calc_mask) chip->irq_calc_mask(data); @@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, continue; if (flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(i, &irq_nested_lock_class); + irq_set_lockdep_class(i, &irq_nested_lock_class, + &irq_nested_request_class); if (!(flags & IRQ_GC_NO_MASK)) { struct irq_data *d = irq_get_irq_data(i); -- cgit v1.2.3 From 466a2b42d67644447a1765276259a3ea5531ddff Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 21 Dec 2017 02:22:45 +0100 Subject: cpufreq: schedutil: Use idle_calls counter of the remote CPU Since the recent remote cpufreq callback work, it's possible that a cpufreq update is triggered from a remote CPU. For single policies, however, the current code uses the local CPU when trying to determine if the remote sg_cpu entered idle or is busy. This is incorrect. To remedy this, compare with the nohz tick idle_calls counter of the remote CPU. Fixes: 674e75411fc2 ("sched: cpufreq: Allow remote cpufreq callbacks") Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Joel Fernandes Cc: 4.14+ # 4.14+ Signed-off-by: Rafael J.
Wysocki --- include/linux/tick.h | 1 + kernel/sched/cpufreq_schedutil.c | 2 +- kernel/time/tick-sched.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index f442d1a42025..7cc35921218e 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern unsigned long tick_nohz_get_idle_calls(void); +extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2f52ec0f1539..d6717a3331a1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, #ifdef CONFIG_NO_HZ_COMMON static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls(); + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); bool ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99578f06c8d4..77555faf6fbc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -985,6 +985,19 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value + * for a particular CPU. + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls_cpu(int cpu) +{ + struct tick_sched *ts = tick_get_tick_sched(cpu); + + return ts->idle_calls; +} + /** * tick_nohz_get_idle_calls - return the current idle calls counter value * -- cgit v1.2.3 From 69790ba92b8d67eaee5e50b30a5b696d40664caf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:44:34 +0100 Subject: genirq: Introduce IRQD_CAN_RESERVE flag Add a new flag to mark interrupts which can use reservation mode. This is going to be used in subsequent patches to disable reservation mode for a certain class of MSI devices. Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- include/linux/irq.h | 17 +++++++++++++++++ kernel/irq/debugfs.c | 1 + 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index e140f69163b6..a0231e96a578 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -212,6 +212,7 @@ struct irq_data { * mask. Applies only to affinity managed irqs. 
* IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set + * IRQD_CAN_RESERVE - Can use reservation mode */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -233,6 +234,7 @@ enum { IRQD_MANAGED_SHUTDOWN = (1 << 23), IRQD_SINGLE_TARGET = (1 << 24), IRQD_DEFAULT_TRIGGER_SET = (1 << 25), + IRQD_CAN_RESERVE = (1 << 26), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -377,6 +379,21 @@ static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } +static inline void irqd_set_can_reserve(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_CAN_RESERVE; +} + +static inline void irqd_clr_can_reserve(struct irq_data *d) +{ + __irqd_to_state(d) &= ~IRQD_CAN_RESERVE; +} + +static inline bool irqd_can_reserve(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_CAN_RESERVE; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 7f608ac39653..acfaaef8672a 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), -- cgit v1.2.3 From 702cb0a02813299d6911b775c637906ae21b737d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:59:06 +0100 Subject: genirq/irqdomain: Rename early argument of irq_domain_activate_irq() The 'early' argument of irq_domain_activate_irq() is actually used to denote reservation mode. To avoid confusion, rename it before abuse happens. No functional change. Fixes: 72491643469a ("genirq/irqdomain: Update irq_domain_ops.activate() signature") Signed-off-by: Thomas Gleixner Cc: Alexandru Chirvasitu Cc: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. 
Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- arch/x86/include/asm/irqdomain.h | 2 +- arch/x86/include/asm/trace/irq_vectors.h | 16 ++++++++-------- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/vector.c | 6 +++--- arch/x86/platform/uv/uv_irq.c | 2 +- drivers/gpio/gpio-xgene-sb.c | 2 +- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel_irq_remapping.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 4 ++-- drivers/pinctrl/stm32/pinctrl-stm32.c | 2 +- include/linux/irqdomain.h | 2 +- kernel/irq/internals.h | 2 +- kernel/irq/irqdomain.c | 13 +++++++------ 13 files changed, 29 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 139feef467f7..c066ffae222b 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); extern int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early); + struct irq_data *irq_data, bool reserve); extern void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 84b9ec0c1bc0..22647a642e98 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -283,34 +283,34 @@ TRACE_EVENT(vector_alloc_managed, DECLARE_EVENT_CLASS(vector_activate, TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve, - bool early), + bool reserve), - TP_ARGS(irq, is_managed, can_reserve, early), + TP_ARGS(irq, is_managed, can_reserve, reserve), TP_STRUCT__entry( __field( unsigned int, irq ) __field( bool, is_managed ) __field( bool, can_reserve ) - __field( bool, early ) + __field( bool, reserve ) ), TP_fast_assign( __entry->irq = irq; __entry->is_managed = is_managed; __entry->can_reserve = can_reserve; - __entry->early = early; + __entry->reserve = reserve; ), - TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d", + TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d", __entry->irq, __entry->is_managed, __entry->can_reserve, - __entry->early) + __entry->reserve) ); #define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \ DEFINE_EVENT_FN(vector_activate, name, \ TP_PROTO(unsigned int irq, bool is_managed, \ - bool can_reserve, bool early), \ - TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL); \ + bool can_reserve, bool reserve), \ + TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL); \ DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate); DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 201579dc5242..8a7963421460 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, } int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data 
*irq_data, bool reserve) { unsigned long flags; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1e969dba0476..52c85c8147e9 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -399,21 +399,21 @@ static int activate_managed(struct irq_data *irqd) } static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, - bool early) + bool reserve) { struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned long flags; int ret = 0; trace_vector_activate(irqd->irq, apicd->is_managed, - apicd->can_reserve, early); + apicd->can_reserve, reserve); /* Nothing to do for fixed assigned vectors */ if (!apicd->can_reserve && !apicd->is_managed) return 0; raw_spin_lock_irqsave(&vector_lock, flags); - if (early || irqd_is_managed_and_shutdown(irqd)) + if (reserve || irqd_is_managed_and_shutdown(irqd)) vector_assign_managed_shutdown(irqd); else if (apicd->is_managed) ret = activate_managed(irqd); diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 5f6fd860820a..e4cb9f4cde8a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -128,7 +128,7 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq, * on the specified blade to allow the sending of MSIs to the specified CPU. */ static int uv_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); return 0; diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index 2313af82fad3..acd59113e08b 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -139,7 +139,7 @@ static int xgene_gpio_sb_to_irq(struct gpio_chip *gc, u32 gpio) static int xgene_gpio_sb_domain_activate(struct irq_domain *d, struct irq_data *irq_data, - bool early) + bool reserve) { struct xgene_gpio_sb *priv = d->host_data; u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7d5eb004091d..97baf88d9505 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4184,7 +4184,7 @@ static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, struct irq_cfg *cfg); static int irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 76a193c7fcfc..66f69af2c219 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -1397,7 +1397,7 @@ static void intel_irq_remapping_free(struct irq_domain *domain, } static int intel_irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { intel_ir_reconfigure_irte(irq_data, true); return 0; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 4039e64cd342..06f025fd5726 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2303,7 +2303,7 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, } static int its_irq_domain_activate(struct irq_domain *domain, - struct irq_data *d, bool early) + struct irq_data *d, bool reserve) { struct its_device *its_dev = 
irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); @@ -2818,7 +2818,7 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq } static int its_vpe_irq_domain_activate(struct irq_domain *domain, - struct irq_data *d, bool early) + struct irq_data *d, bool reserve) { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); struct its_node *its; diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index a276c61be217..e62ab087bfd8 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -290,7 +290,7 @@ static int stm32_gpio_domain_translate(struct irq_domain *d, } static int stm32_gpio_domain_activate(struct irq_domain *d, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { struct stm32_gpio_bank *bank = d->host_data; struct stm32_pinctrl *pctl = dev_get_drvdata(bank->gpio_chip.parent); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a34355d19546..48c7e86bb556 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -113,7 +113,7 @@ struct irq_domain_ops { unsigned int nr_irqs, void *arg); void (*free)(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs); - int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early); + int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool reserve); void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 07d08ca701ec..ab19371eab9b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data, bool early) +static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4f4f60015e8a..62068ad46930 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; @@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, - early); + reserve); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, early); + ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt - * @irq_data: outermost irq_data associated with interrupt + * @irq_data: Outermost irq_data associated with interrupt + * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get 
delivered. */ -int irq_domain_activate_irq(struct irq_data *irq_data, bool early) +int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data, early); + ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; -- cgit v1.2.3 From 26456f87aca7157c057de65c9414b37f1ab881d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Dec 2017 21:37:25 +0100 Subject: timers: Reinitialize per cpu bases on hotplug The timer wheel bases are not (re)initialized on CPU hotplug. That leaves them with a potentially stale clk and next_expiry value, which can cause trouble when the CPU is plugged. Add a prepare callback which forwards the clock, sets next_expiry to far in the future and resets the control flags to a known state. Set base->must_forward_clk so the first timer which is queued will try to forward the clock to current jiffies. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: Anna-Maria Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos --- include/linux/cpuhotplug.h | 2 +- include/linux/timer.h | 4 +++- kernel/cpu.c | 4 ++-- kernel/time/timer.c | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 201ab7267986..1a32e558eb11 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -86,7 +86,7 @@ enum cpuhp_state { CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, - CPUHP_TIMERS_DEAD, + CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, diff --git a/include/linux/timer.h b/include/linux/timer.h index 04af640ea95b..2448f9cc48a3 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -207,9 +207,11 @@ unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU +int timers_prepare_cpu(unsigned int cpu); int timers_dead_cpu(unsigned int cpu); #else -#define timers_dead_cpu NULL +#define timers_prepare_cpu NULL +#define timers_dead_cpu NULL #endif #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 41376c3ac93b..97858477e586 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs.
*/ - [CPUHP_TIMERS_DEAD] = { + [CPUHP_TIMERS_PREPARE] = { .name = "timers:dead", - .startup.single = NULL, + .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 19a9c3da7698..6be576e02209 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1853,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; -- cgit v1.2.3 From 98801506552593c9b8ac11021b0cdad12cab4f6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jan 2018 10:02:19 +0000 Subject: fscache: Fix the default for fscache_maybe_release_page() Fix the default for fscache_maybe_release_page() for when the cookie isn't valid or the page isn't cached. It mustn't return false as that indicates the page cannot yet be freed. The problem with the default is that if, say, there's no cache, but a network filesystem's pages are using up almost all the available memory, a system can OOM because the filesystem ->releasepage() op will not allow them to be released as fscache_maybe_release_page() incorrectly prevents it. This can be tested by writing a sequence of 512MiB files to an AFS mount. It does not affect NFS or CIFS because both of those wrap the call in a check of PG_fscache and it shouldn't bother Ceph as that only has PG_private set whilst writeback is in progress. This might be an issue for 9P, however. Note that the pages aren't entirely stuck. Removing a file or unmounting will clear things because that uses ->invalidatepage() instead. Fixes: 201a15428bd5 ("FS-Cache: Handle pages pending storage that get evicted under OOM conditions") Reported-by: Marc Dionne Signed-off-by: David Howells Reviewed-by: Jeff Layton Acked-by: Al Viro Tested-by: Marc Dionne cc: stable@vger.kernel.org # 2.6.32+ --- include/linux/fscache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index f4ff47d4a893..fe0c349684fa 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -755,7 +755,7 @@ bool fscache_maybe_release_page(struct fscache_cookie *cookie, { if (fscache_cookie_valid(cookie) && PageFsCache(page)) return __fscache_maybe_release_page(cookie, page, gfp); - return false; + return true; } /** -- cgit v1.2.3 From f24c4d478013d82bd1b943df566fff3561d52864 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Jan 2018 17:21:10 +0000 Subject: efi/capsule-loader: Reinstate virtual capsule mapping Commit: 82c3768b8d68 ("efi/capsule-loader: Use a cached copy of the capsule header") ... refactored the capsule loading code that maps the capsule header, to avoid having to map it several times. However, as it turns out, the vmap() call we ended up removing did not just map the header, but the entire capsule image, and dropping this virtual mapping breaks capsules that are processed by the firmware immediately (i.e., without a reboot). 
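To make the mapping concrete: below is a minimal sketch of giving a scattered array of allocated pages one contiguous virtual view with vmap(), which is the effect being reinstated here. The helper names are illustrative only, not the driver's API, and error handling is elided.

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch only: map the whole capsule, header included, contiguously. */
static void *map_capsule(struct page **pages, unsigned int nr_pages)
{
	return vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
}

static void unmap_capsule(void *capsule)
{
	vunmap(capsule);	/* drop the virtual view; the pages stay allocated */
}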
Unfortunately, that change was part of a larger refactor that allowed a quirk to be implemented for Quark, which has a non-standard memory layout for capsules, and we have slightly painted ourselves into a corner by allowing quirk code to mangle the capsule header and memory layout. So we need to fix this without breaking Quark. Fortunately, Quark does not appear to care about the virtual mapping, and so we can simply do a partial revert of commit: 2a457fb31df6 ("efi/capsule-loader: Use page addresses rather than struct page pointers") ... and create a vmap() mapping of the entire capsule (including header) based on the reinstated struct page array, unless running on Quark, in which case we pass the capsule header copy as before. Reported-by: Ge Song Tested-by: Bryan O'Donoghue Tested-by: Ge Song Signed-off-by: Ard Biesheuvel Cc: Cc: Dave Young Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Fixes: 82c3768b8d68 ("efi/capsule-loader: Use a cached copy of the capsule header") Link: http://lkml.kernel.org/r/20180102172110.17018-3-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/quirks.c | 13 +++++++++- drivers/firmware/efi/capsule-loader.c | 45 ++++++++++++++++++++++++++++------- include/linux/efi.h | 4 +++- 3 files changed, 52 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 8a99a2e96537..5b513ccffde4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -592,7 +592,18 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff, /* * Update the first page pointer to skip over the CSH header. */ - cap_info->pages[0] += csh->headersize; + cap_info->phys[0] += csh->headersize; + + /* + * cap_info->capsule should point at a virtual mapping of the entire + * capsule, starting at the capsule header. Our image has the Quark + * security header prepended, so we cannot rely on the default vmap() + * mapping created by the generic capsule code. + * Given that the Quark firmware does not appear to care about the + * virtual mapping, let's just point cap_info->capsule at our copy + * of the capsule header. 
+ */ + cap_info->capsule = &cap_info->header; return 1; } diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c index ec8ac5c4dd84..055e2e8f985a 100644 --- a/drivers/firmware/efi/capsule-loader.c +++ b/drivers/firmware/efi/capsule-loader.c @@ -20,10 +20,6 @@ #define NO_FURTHER_WRITE_ACTION -1 -#ifndef phys_to_page -#define phys_to_page(x) pfn_to_page((x) >> PAGE_SHIFT) -#endif - /** * efi_free_all_buff_pages - free all previous allocated buffer pages * @cap_info: pointer to current instance of capsule_info structure @@ -35,7 +31,7 @@ static void efi_free_all_buff_pages(struct capsule_info *cap_info) { while (cap_info->index > 0) - __free_page(phys_to_page(cap_info->pages[--cap_info->index])); + __free_page(cap_info->pages[--cap_info->index]); cap_info->index = NO_FURTHER_WRITE_ACTION; } @@ -71,6 +67,14 @@ int __efi_capsule_setup_info(struct capsule_info *cap_info) cap_info->pages = temp_page; + temp_page = krealloc(cap_info->phys, + pages_needed * sizeof(phys_addr_t *), + GFP_KERNEL | __GFP_ZERO); + if (!temp_page) + return -ENOMEM; + + cap_info->phys = temp_page; + return 0; } @@ -105,9 +109,24 @@ int __weak efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, **/ static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info) { + bool do_vunmap = false; int ret; - ret = efi_capsule_update(&cap_info->header, cap_info->pages); + /* + * cap_info->capsule may have been assigned already by a quirk + * handler, so only overwrite it if it is NULL + */ + if (!cap_info->capsule) { + cap_info->capsule = vmap(cap_info->pages, cap_info->index, + VM_MAP, PAGE_KERNEL); + if (!cap_info->capsule) + return -ENOMEM; + do_vunmap = true; + } + + ret = efi_capsule_update(cap_info->capsule, cap_info->phys); + if (do_vunmap) + vunmap(cap_info->capsule); if (ret) { pr_err("capsule update failed\n"); return ret; @@ -165,10 +184,12 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff, goto failed; } - cap_info->pages[cap_info->index++] = page_to_phys(page); + cap_info->pages[cap_info->index] = page; + cap_info->phys[cap_info->index] = page_to_phys(page); cap_info->page_bytes_remain = PAGE_SIZE; + cap_info->index++; } else { - page = phys_to_page(cap_info->pages[cap_info->index - 1]); + page = cap_info->pages[cap_info->index - 1]; } kbuff = kmap(page); @@ -252,6 +273,7 @@ static int efi_capsule_release(struct inode *inode, struct file *file) struct capsule_info *cap_info = file->private_data; kfree(cap_info->pages); + kfree(cap_info->phys); kfree(file->private_data); file->private_data = NULL; return 0; @@ -281,6 +303,13 @@ static int efi_capsule_open(struct inode *inode, struct file *file) return -ENOMEM; } + cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL); + if (!cap_info->phys) { + kfree(cap_info->pages); + kfree(cap_info); + return -ENOMEM; + } + file->private_data = cap_info; return 0; diff --git a/include/linux/efi.h b/include/linux/efi.h index d813f7b04da7..29fdf8029cf6 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -140,11 +140,13 @@ struct efi_boot_memmap { struct capsule_info { efi_capsule_header_t header; + efi_capsule_header_t *capsule; int reset_type; long index; size_t count; size_t total_size; - phys_addr_t *pages; + struct page **pages; + phys_addr_t *phys; size_t page_bytes_remain; }; -- cgit v1.2.3 From 040ee69226f8a96b7943645d68f41d5d44b5ff7d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Dec 2017 20:20:38 -0500 Subject: fix "netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 
'xt_bpf_info_v1'" Descriptor table is a shared object; it's not a place where you can stick temporary references to files, especially when we don't need an opened file at all. Cc: stable@vger.kernel.org # v4.14 Fixes: 98589a0998b8 ("netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'") Signed-off-by: Al Viro --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/inode.c | 40 +++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 2 +- net/netfilter/xt_bpf.c | 14 ++------------ 4 files changed, 52 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..b63a592ad29d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -419,6 +419,8 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) attr->numa_node : NUMA_NO_NODE; } +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -506,6 +508,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, { return 0; } + +static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, + enum bpf_prog_type type) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -514,6 +522,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, return bpf_prog_get_type_dev(ufd, type, false); } +bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); + int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..5bb5e49ef4c3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,7 +368,45 @@ out: putname(pname); return ret; } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + if (ret) + return ERR_PTR(ret); + + if (inode->i_op == &bpf_map_iops) + return ERR_PTR(-EINVAL); + if (inode->i_op != &bpf_prog_iops) + return ERR_PTR(-EACCES); + + prog = inode->i_private; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ERR_PTR(ret); + + if (!bpf_prog_get_ok(prog, &type, false)) + return ERR_PTR(-EINVAL); + + return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + struct path path; + int ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + prog = __get_prog_inode(d_backing_inode(path.dentry), type); + if (!IS_ERR(prog)) + touch_atime(&path); + path_put(&path); + return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..5cb783fc8224 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 041da0d9c06f..fa2ca0a13619 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ 
-52,18 +52,8 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) { - mm_segment_t oldfs = get_fs(); - int retval, fd; - - set_fs(KERNEL_DS); - fd = bpf_obj_get_user(path, 0); - set_fs(oldfs); - if (fd < 0) - return fd; - - retval = __bpf_mt_check_fd(fd, ret); - sys_close(fd); - return retval; + *ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER); + return PTR_ERR_OR_ZERO(*ret); } static int bpf_mt_check(const struct xt_mtchk_param *par) -- cgit v1.2.3 From 5133550296d43236439494aa955bfb765a89f615 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 4 Jan 2018 21:06:49 +0300 Subject: sh_eth: fix SH7757 GEther initialization Renesas SH7757 has 2 Fast and 2 Gigabit Ether controllers, while the 'sh_eth' driver can only reset and initialize TSU of the first controller pair. Shimoda-san tried to solve that by adding the 'needs_init' member to the 'struct sh_eth_plat_data'; however, the platform code still never sets this flag. I think that we can infer this information from the 'devno' variable (set to 'platform_device::id') and reset/init the Ether controller pair only for an even 'devno'; therefore 'sh_eth_plat_data::needs_init' can be removed... Fixes: 150647fb2c31 ("net: sh_eth: change the condition of initialization") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 4 ++-- include/linux/sh_eth.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 1bdd67a8a869..f21c1db91c3f 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3254,8 +3254,8 @@ static int sh_eth_drv_probe(struct platform_device *pdev) ndev->features = NETIF_F_HW_VLAN_CTAG_FILTER; } - /* initialize first or needed device */ - if (!devno || pd->needs_init) { + /* Need to init only the first port of the two sharing a TSU */ + if (devno % 2 == 0) { if (mdp->cd->chip_reset) mdp->cd->chip_reset(ndev); diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h index ff3642d267f7..94081e9a5010 100644 --- a/include/linux/sh_eth.h +++ b/include/linux/sh_eth.h @@ -17,7 +17,6 @@ struct sh_eth_plat_data { unsigned char mac_addr[ETH_ALEN]; unsigned no_ether_link:1; unsigned ether_link_active_low:1; - unsigned needs_init:1; }; #endif -- cgit v1.2.3 From 527187d28569e39c5d489d6306d3b79605cf85a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jan 2018 17:27:19 +0100 Subject: locking/lockdep: Remove cross-release leftovers There are two cross-release leftover facilities: - the crossrelease_hist_*() irq-tracing callbacks (NOPs currently) - the complete_release_commit() callback (NOP as well) Remove them.
Cc: David Sterba Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/completion.h | 1 - include/linux/irqflags.h | 4 ---- include/linux/lockdep.h | 2 -- kernel/sched/completion.c | 5 ----- 4 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/completion.h b/include/linux/completion.h index 94a59ba7d422..519e94915d18 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -32,7 +32,6 @@ struct completion { #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} -static inline void complete_release_commit(struct completion *x) {} #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 46cb57d5eb13..1b3996ff3f16 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -27,22 +27,18 @@ # define trace_hardirq_enter() \ do { \ current->hardirq_context++; \ - crossrelease_hist_start(XHLOCK_HARD); \ } while (0) # define trace_hardirq_exit() \ do { \ current->hardirq_context--; \ - crossrelease_hist_end(XHLOCK_HARD); \ } while (0) # define lockdep_softirq_enter() \ do { \ current->softirq_context++; \ - crossrelease_hist_start(XHLOCK_SOFT); \ } while (0) # define lockdep_softirq_exit() \ do { \ current->softirq_context--; \ - crossrelease_hist_end(XHLOCK_SOFT); \ } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 2e75dc34bff5..3251d9c0d313 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -475,8 +475,6 @@ enum xhlock_context_t { #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } -static inline void crossrelease_hist_start(enum xhlock_context_t c) {} -static inline void crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); -- cgit v1.2.3 From b2157399cc9898260d6031c5bfe45fe137c1fbe7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 7 Jan 2018 17:33:02 -0800 Subject: bpf: prevent out-of-bounds speculation Under speculation, CPUs may mis-predict branches in bounds checks. Thus, memory accesses under a bounds check may be speculated even if the bounds check fails, providing a primitive for building a side channel. To avoid leaking kernel data round up array-based maps and mask the index after bounds check, so speculated load with out of bounds index will load either valid value from the array or zero from the padded area. Unconditionally mask index for all array types even when max_entries are not rounded to power of 2 for root user. 
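The masking idea can be shown in a few lines of plain C (a sketch with a hypothetical values[] array, not the kernel implementation): the backing store is rounded up to a power of two, and the index is ANDed with the mask after the bounds check, so even a mis-predicted load stays inside the padded array.

#include <stdint.h>

#define MAX_ENTRIES	100	/* hypothetical map size */
#define ROUNDED_SIZE	128	/* rounded up to the next power of two */

static uint64_t values[ROUNDED_SIZE];	/* padding beyond MAX_ENTRIES reads as zero */
static const uint32_t index_mask = ROUNDED_SIZE - 1;

static uint64_t lookup(uint32_t index)
{
	if (index >= MAX_ENTRIES)
		return 0;
	/* The AND also executes under speculation, bounding the access */
	return values[index & index_mask];
}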
When map is created by unpriv user generate a sequence of bpf insns that includes AND operation to make sure that JITed code includes the same 'index & index_mask' operation. If prog_array map is created by unpriv user replace bpf_tail_call(ctx, map, index); with if (index >= max_entries) { index &= map->index_mask; bpf_tail_call(ctx, map, index); } (along with roundup to power 2) to prevent out-of-bounds speculation. There is secondary redundant 'if (index >= max_entries)' in the interpreter and in all JITs, but they can be optimized later if necessary. Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array) cannot be used by unpriv, so no changes there. That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on all architectures with and without JIT. v2->v3: Daniel noticed that attack potentially can be crafted via syscall commands without loading the program, so add masking to those paths as well. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 47 ++++++++++++++++++++++++++++++++++++----------- kernel/bpf/verifier.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..1b985ca4ffbe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,6 +52,7 @@ struct bpf_map { u32 pages; u32 id; int numa_node; + bool unpriv_array; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; @@ -221,6 +222,7 @@ struct bpf_prog_aux { struct bpf_array { struct bpf_map map; u32 elem_size; + u32 index_mask; /* 'ownership' of prog_array is claimed by the first program that * is going to use this map or by the first program which FD is stored * in the map to make sure that all callers and callees have the same diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426d3cf5..aaa319848e7d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; u64 array_size; - u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + index_mask = roundup_pow_of_two(max_entries) - 1; + + if (unpriv) + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +96,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -121,12 +133,13 
@@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +148,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +195,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +243,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +281,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -613,6 +632,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +641,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, 
ilog2(elem_size)); else diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04b24876cd23..b414d6b2d470 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1729,6 +1729,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -4456,6 +4463,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } -- cgit v1.2.3 From be95a845cc4402272994ce290e3ad928aff06cb9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Jan 2018 13:17:44 +0100 Subject: bpf: avoid false sharing of map refcount with max_entries In addition to commit b2157399cc98 ("bpf: prevent out-of-bounds speculation") also change the layout of struct bpf_map such that false sharing of fast-path members like max_entries is avoided when the map's reference counter is altered. Therefore enforce them to be placed into separate cachelines. pahole dump after change: struct bpf_map { const struct bpf_map_ops * ops; /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ u32 pages; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ bool unpriv_array; /* 56 1 */ /* XXX 7 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ struct user_struct * user; /* 64 8 */ atomic_t refcnt; /* 72 4 */ atomic_t usercnt; /* 76 4 */ struct work_struct work; /* 80 32 */ char name[16]; /* 112 16 */ /* --- cacheline 2 boundary (128 bytes) --- */ /* size: 128, cachelines: 2, members: 17 */ /* sum members: 121, holes: 1, sum holes: 7 */ }; Now all entries in the first cacheline are read-only throughout the lifetime of the map, set up once during map creation. Overall struct size and number of cachelines don't change from the reordering. struct bpf_map is usually the first member, embedded in map structs of specific map implementations, so we also avoid letting those members sit at the end, where they could potentially share the cacheline with the first map values, e.g. in the array, since remote CPUs could trigger map updates just as well for those (easily dirtying members like max_entries intentionally as well) while having subsequent values in cache.
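As a standalone illustration of that layout rule (an example struct following the same pattern, not struct bpf_map itself): read-mostly fast-path fields keep their own cacheline, and the mutable refcount starts the next one, so dirtying the counter on one CPU cannot evict the fields readers need.

#include <linux/atomic.h>
#include <linux/cache.h>
#include <linux/types.h>

struct example_ops;

struct example_map {
	/* read-mostly after setup; stays shared clean across CPUs */
	const struct example_ops *ops ____cacheline_aligned;
	u32 max_entries;

	/* written from control paths; may bounce without hurting readers */
	atomic_t refcnt ____cacheline_aligned;
	atomic_t usercnt;
};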
Quoting from Google's Project Zero blog [1]: Additionally, at least on the Intel machine on which this was tested, bouncing modified cache lines between cores is slow, apparently because the MESI protocol is used for cache coherence [8]. Changing the reference counter of an eBPF array on one physical CPU core causes the cache line containing the reference counter to be bounced over to that CPU core, making reads of the reference counter on all other CPU cores slow until the changed reference counter has been written back to memory. Because the length and the reference counter of an eBPF array are stored in the same cache line, this also means that changing the reference counter on one physical CPU core causes reads of the eBPF array's length to be slow on other physical CPU cores (intentional false sharing). While this doesn't 'control' the out-of-bounds speculation through masking the index as in commit b2157399cc98, triggering a manipulation of the map's reference counter is really trivial, so let's not make it easy to affect max_entries through it. Splitting them into separate cachelines also generally makes sense from a performance perspective anyway, in that the fast path won't take a cache miss if the map gets pinned, reused in other progs, etc. from the control path, and thus also avoids unintentional false sharing. [1] https://googleprojectzero.blogspot.ch/2018/01/reading-privileged-memory-with-side.html Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1b985ca4ffbe..fe2cb7c398e3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -43,7 +43,14 @@ struct bpf_map_ops { }; struct bpf_map { - atomic_t refcnt; + /* 1st cacheline with read-mostly members of which some + * are also accessed in fast-path (e.g. ops, max_entries). + */ + const struct bpf_map_ops *ops ____cacheline_aligned; + struct bpf_map *inner_map_meta; +#ifdef CONFIG_SECURITY + void *security; +#endif enum bpf_map_type map_type; u32 key_size; u32 value_size; @@ -53,15 +60,16 @@ struct bpf_map { u32 id; int numa_node; bool unpriv_array; - struct user_struct *user; - const struct bpf_map_ops *ops; - struct work_struct work; + /* 7 bytes hole */ + + /* 2nd cacheline with misc members to avoid false sharing + * particularly with refcounting. + */ + struct user_struct *user ____cacheline_aligned; + atomic_t refcnt; atomic_t usercnt; - struct bpf_map *inner_map_meta; + struct work_struct work; char name[BPF_OBJ_NAME_LEN]; -#ifdef CONFIG_SECURITY - void *security; -#endif }; /* function argument constraints */ -- cgit v1.2.3 From 8978cc921fc7fad3f4d6f91f1da01352aeeeff25 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 9 Jan 2018 11:41:10 +0200 Subject: {net,ib}/mlx5: Don't disable local loopback multicast traffic when needed There are system platform information management interfaces (such as HOST2BMC) for which we cannot disable local loopback multicast traffic. Separate disable_local_lb_mc and disable_local_lb_uc capability bits so the driver will not disable multicast loopback traffic if not supported. (It is expected that Firmware will not set disable_local_lb_mc if HOST2BMC is running for example.) Function mlx5_nic_vport_update_local_lb will make a best effort to disable/enable UC/MC loopback traffic and return success only if it succeeded in changing all that the Firmware allows.
Adapt mlx5_ib and mlx5e to support the new cap bits. Fixes: 2c43c5a036be ("net/mlx5e: Enable local loopback in loopback selftest") Fixes: c85023e153e3 ("IB/mlx5: Add raw ethernet local loopback support") Fixes: bded747bb432 ("net/mlx5: Add raw ethernet local loopback firmware command") Signed-off-by: Eran Ben Elisha Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 9 +++++--- .../net/ethernet/mellanox/mlx5/core/en_selftest.c | 27 ++++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 22 +++++++++++++----- include/linux/mlx5/mlx5_ifc.h | 5 ++-- 5 files changed, 44 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8ac50de2b242..00cb184fa027 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1324,7 +1324,8 @@ static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) return err; if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || - !MLX5_CAP_GEN(dev->mdev, disable_local_lb)) + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return err; mutex_lock(&dev->lb_mutex); @@ -1342,7 +1343,8 @@ static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) mlx5_core_dealloc_transport_domain(dev->mdev, tdn); if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || - !MLX5_CAP_GEN(dev->mdev, disable_local_lb)) + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return; mutex_lock(&dev->lb_mutex); @@ -4187,7 +4189,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) } if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && - MLX5_CAP_GEN(mdev, disable_local_lb)) + (MLX5_CAP_GEN(mdev, disable_local_lb_uc) || + MLX5_CAP_GEN(mdev, disable_local_lb_mc))) mutex_init(&dev->lb_mutex); dev->ib_active = true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 1f1f8af87d4d..5a4608281f38 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -238,15 +238,19 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, int err = 0; /* Temporarily enable local_lb */ - if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { - mlx5_nic_vport_query_local_lb(priv->mdev, &lbtp->local_lb); - if (!lbtp->local_lb) - mlx5_nic_vport_update_local_lb(priv->mdev, true); + err = mlx5_nic_vport_query_local_lb(priv->mdev, &lbtp->local_lb); + if (err) + return err; + + if (!lbtp->local_lb) { + err = mlx5_nic_vport_update_local_lb(priv->mdev, true); + if (err) + return err; } err = mlx5e_refresh_tirs(priv, true); if (err) - return err; + goto out; lbtp->loopback_ok = false; init_completion(&lbtp->comp); @@ -256,16 +260,21 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, lbtp->pt.dev = priv->netdev; lbtp->pt.af_packet_priv = lbtp; dev_add_pack(&lbtp->pt); + + return 0; + +out: + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, false); + return err; } static void mlx5e_test_loopback_cleanup(struct mlx5e_priv *priv, struct mlx5e_lbt_priv *lbtp) { - if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { - if (!lbtp->local_lb) - mlx5_nic_vport_update_local_lb(priv->mdev, false); - } + if (!lbtp->local_lb) + 
mlx5_nic_vport_update_local_lb(priv->mdev, false); dev_remove_pack(&lbtp->pt); mlx5e_refresh_tirs(priv, false); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 8a89c7e8cd63..95e188d0883e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -578,8 +578,7 @@ static int mlx5_core_set_hca_defaults(struct mlx5_core_dev *dev) int ret = 0; /* Disable local_lb by default */ - if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && - MLX5_CAP_GEN(dev, disable_local_lb)) + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) ret = mlx5_nic_vport_update_local_lb(dev, false); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index d653b0025b13..a1296a62497d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -908,23 +908,33 @@ int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) void *in; int err; - mlx5_core_dbg(mdev, "%s local_lb\n", enable ? "enable" : "disable"); + if (!MLX5_CAP_GEN(mdev, disable_local_lb_mc) && + !MLX5_CAP_GEN(mdev, disable_local_lb_uc)) + return 0; + in = kvzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; - MLX5_SET(modify_nic_vport_context_in, in, - field_select.disable_mc_local_lb, 1); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.disable_mc_local_lb, !enable); - - MLX5_SET(modify_nic_vport_context_in, in, - field_select.disable_uc_local_lb, 1); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.disable_uc_local_lb, !enable); + if (MLX5_CAP_GEN(mdev, disable_local_lb_mc)) + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_mc_local_lb, 1); + + if (MLX5_CAP_GEN(mdev, disable_local_lb_uc)) + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_uc_local_lb, 1); + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + if (!err) + mlx5_core_dbg(mdev, "%s local_lb\n", + enable ? "enable" : "disable"); + kvfree(in); return err; } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d44ec5f41d4a..1391a82da98e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1027,8 +1027,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_wq_sz[0x5]; u8 nic_vport_change_event[0x1]; - u8 disable_local_lb[0x1]; - u8 reserved_at_3e2[0x9]; + u8 disable_local_lb_uc[0x1]; + u8 disable_local_lb_mc[0x1]; + u8 reserved_at_3e3[0x8]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; -- cgit v1.2.3 From 05e0cc84e00c54fb152d1f4b86bc211823a83d0c Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 4 Jan 2018 04:35:51 +0200 Subject: net/mlx5: Fix get vector affinity helper function mlx5_get_vector_affinity used to call pci_irq_get_affinity and after reverting the patch that sets the device affinity via PCI_IRQ_AFFINITY API, calling pci_irq_get_affinity becomes useless and it breaks RDMA mlx5 users. To fix this, this patch provides an alternative way to retrieve IRQ vector affinity using legacy IRQ API, following smp_affinity read procfs implementation. 
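As a usage sketch (a hypothetical caller, not part of this patch): an ULP could map a completion vector to a CPU for per-vector allocations like this, falling back to the local CPU when the helper cannot resolve the vector.

#include <linux/cpumask.h>
#include <linux/mlx5/driver.h>
#include <linux/smp.h>

static int mlx5_vector_to_cpu(struct mlx5_core_dev *dev, int vector)
{
	const struct cpumask *mask = mlx5_get_vector_affinity(dev, vector);

	/* mlx5_get_vector_affinity() returns NULL if the EQN lookup fails */
	if (!mask || cpumask_empty(mask))
		return raw_smp_processor_id();

	return cpumask_first(mask);
}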
Fixes: 231243c82793 ("Revert mlx5: move affinity hints assignments to generic code") Fixes: a435393acafb ("mlx5: move affinity hints assignments to generic code") Cc: Sagi Grimberg Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1f509d072026..a0610427e168 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -1231,7 +1232,23 @@ enum { static inline const struct cpumask * mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector) { - return pci_irq_get_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + vector); + const struct cpumask *mask; + struct irq_desc *desc; + unsigned int irq; + int eqn; + int err; + + err = mlx5_vector2eqn(dev, vector, &eqn, &irq); + if (err) + return NULL; + + desc = irq_to_desc(irq); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + mask = irq_data_get_effective_affinity_mask(&desc->irq_data); +#else + mask = desc->irq_common_data.affinity; +#endif + return mask; } #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From a0b1280368d1e91ab72f849ef095b4f07a39bbf1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 12 Jan 2018 16:53:14 -0800 Subject: kdump: write correct address of mem_section into vmcoreinfo Depending on configuration, mem_section can now be an array or a pointer to an array allocated dynamically. In most cases, we can continue to refer to it as 'mem_section' regardless of what it is. But there's one exception: '&mem_section' means "address of the array" if mem_section is an array, but if mem_section is a pointer, it would mean "address of the pointer". We've stumbled onto this in the kdump code. VMCOREINFO_SYMBOL(mem_section) writes down the address of the pointer into vmcoreinfo, not of the array as we wanted. Let's introduce VMCOREINFO_SYMBOL_ARRAY(), which handles the situation correctly in both cases. Link: http://lkml.kernel.org/r/20180112162532.35896-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") Acked-by: Baoquan He Acked-by: Dave Young Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H.
Peter Anvin" Cc: Greg Kroah-Hartman Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 2 ++ kernel/crash_core.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 06097ef30449..b511f6d24b42 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -42,6 +42,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("PAGESIZE=%ld\n", value) #define VMCOREINFO_SYMBOL(name) \ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) #define VMCOREINFO_SIZE(name) \ vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ (unsigned long)sizeof(name)) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b3663896278e..4f63597c824d 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); -- cgit v1.2.3 From 66940f35d5a81d5969bb5543171c70a434fc5110 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 10 Jan 2018 16:03:05 +0200 Subject: ptr_ring: document usage around __ptr_ring_peek This explains why is the net usage of __ptr_ring_peek actually ok without locks. Signed-off-by: Michael S. Tsirkin Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6866df4f31b5..d72b2e7dd500 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -174,6 +174,15 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) * if they dereference the pointer - see e.g. PTR_RING_PEEK_CALL. * If ring is never resized, and if the pointer is merely * tested, there's no need to take the lock - see e.g. __ptr_ring_empty. + * However, if called outside the lock, and if some other CPU + * consumes ring entries at the same time, the value returned + * is not guaranteed to be correct. + * In this case - to avoid incorrectly detecting the ring + * as empty - the CPU consuming the ring entries is responsible + * for either consuming all ring entries until the ring is empty, + * or synchronizing with some other CPU and causing it to + * execute __ptr_ring_peek and/or consume the ring enteries + * after the synchronization point. */ static inline void *__ptr_ring_peek(struct ptr_ring *r) { @@ -182,10 +191,7 @@ static inline void *__ptr_ring_peek(struct ptr_ring *r) return NULL; } -/* Note: callers invoking this in a loop must use a compiler barrier, - * for example cpu_relax(). Callers must take consumer_lock - * if the ring is ever resized - see e.g. ptr_ring_empty. - */ +/* See __ptr_ring_peek above for locking rules. 
*/ static inline bool __ptr_ring_empty(struct ptr_ring *r) { return !__ptr_ring_peek(r); -- cgit v1.2.3 From 6311b7ce42e0c1d6d944bc099dc47e936c20cf11 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 12:42:25 +0100 Subject: netlink: extack: avoid parenthesized string constant warning NL_SET_ERR_MSG() and NL_SET_ERR_MSG_ATTR() lead to the following warning in newer versions of gcc: warning: array initialized from parenthesized string constant Just remove the parentheses; they're not needed in this context anyway, since there can be no operator precedence issues or similar. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/netlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 49b4257ce1ea..f3075d6c7e82 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -85,7 +85,7 @@ struct netlink_ext_ack { * to the lack of an output buffer.) */ #define NL_SET_ERR_MSG(extack, msg) do { \ - static const char __msg[] = (msg); \ + static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) \ @@ -101,7 +101,7 @@ struct netlink_ext_ack { } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) do { \ - static const char __msg[] = (msg); \ + static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ -- cgit v1.2.3 From c96f5471ce7d2aefd0dda560cc23f08ab00bc65d Mon Sep 17 00:00:00 2001 From: Josh Snyder Date: Mon, 18 Dec 2017 16:15:10 +0000 Subject: delayacct: Account blkio completion on the correct task Before commit: e33a9bba85a8 ("sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler") delayacct_blkio_end() was called after context-switching into the task which completed I/O. This resulted in double counting: the task would account a delay both waiting for I/O and for time spent in the runqueue. With e33a9bba85a8, delayacct_blkio_end() is called by try_to_wake_up(). In ttwu, we have not yet context-switched. This is more correct, in that the delay accounting ends when the I/O is complete. But delayacct_blkio_end() relies on 'get_current()', and we have not yet context-switched into the task whose I/O completed. This results in the wrong task having its delay accounting statistics updated. Instead of doing that, pass the task_struct being woken to delayacct_blkio_end(), so that it can update the statistics of the correct task.
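A minimal sketch of the resulting calling convention (hypothetical wrapper, not code from the patch; the real call sites are the try_to_wake_up() hunks below):

/* Hypothetical illustration of the new contract: the waker names
 * the task being woken, and nothing in here may rely on current.
 */
static void charge_blkio_delay(struct task_struct *p)
{
	if (p->in_iowait)
		delayacct_blkio_end(p);	/* charge p, not current */
}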
Signed-off-by: Josh Snyder Acked-by: Tejun Heo Acked-by: Balbir Singh Cc: Cc: Brendan Gregg Cc: Jens Axboe Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-block@vger.kernel.org Fixes: e33a9bba85a8 ("sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler") Link: http://lkml.kernel.org/r/1513613712-571-1-git-send-email-joshs@netflix.com Signed-off-by: Ingo Molnar --- include/linux/delayacct.h | 8 ++++---- kernel/delayacct.c | 42 ++++++++++++++++++++++++++---------------- kernel/sched/core.c | 6 +++--- 3 files changed, 33 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 4178d2493547..5e335b6203f4 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -71,7 +71,7 @@ extern void delayacct_init(void); extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); -extern void __delayacct_blkio_end(void); +extern void __delayacct_blkio_end(struct task_struct *); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); @@ -122,10 +122,10 @@ static inline void delayacct_blkio_start(void) __delayacct_blkio_start(); } -static inline void delayacct_blkio_end(void) +static inline void delayacct_blkio_end(struct task_struct *p) { if (current->delays) - __delayacct_blkio_end(); + __delayacct_blkio_end(p); delayacct_clear_flag(DELAYACCT_PF_BLKIO); } @@ -169,7 +169,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) {} static inline void delayacct_blkio_start(void) {} -static inline void delayacct_blkio_end(void) +static inline void delayacct_blkio_end(struct task_struct *p) {} static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 4a1c33416b6a..e2764d767f18 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(u64 *start, u64 *total, u32 *count) +static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(¤t->delays->lock, flags); + spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + spin_unlock_irqrestore(lock, flags); } } @@ -69,17 +69,25 @@ void __delayacct_blkio_start(void) current->delays->blkio_start = ktime_get_ns(); } -void __delayacct_blkio_end(void) +/* + * We cannot rely on the `current` macro, as we haven't yet switched back to + * the process being woken. 
+ */ +void __delayacct_blkio_end(struct task_struct *p) { - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(&current->delays->blkio_start, - &current->delays->swapin_delay, - &current->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(&current->delays->blkio_start, - &current->delays->blkio_delay, - &current->delays->blkio_count); + struct task_delay_info *delays = p->delays; + u64 *total; + u32 *count; + + if (p->delays->flags & DELAYACCT_PF_SWAPIN) { + total = &delays->swapin_delay; + count = &delays->swapin_count; + } else { + total = &delays->blkio_delay; + count = &delays->blkio_count; + } + + delayacct_end(&delays->lock, &delays->blkio_start, total, count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,8 +161,10 @@ void __delayacct_freepages_start(void) void __delayacct_freepages_end(void) { - delayacct_end(&current->delays->freepages_start, - &current->delays->freepages_delay, - &current->delays->freepages_count); + delayacct_end( + &current->delays->lock, + &current->delays->freepages_start, + &current->delays->freepages_delay, + &current->delays->freepages_count); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 644fa2e3d993..a7bf32aabfda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); -- cgit v1.2.3 From a3d6c976f71902388e444594daa902032b5a45fa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 18 Jan 2018 16:34:08 -0800 Subject: sparse doesn't support struct randomization Without this patch, I drown in a sea of unknown attribute warnings Link: http://lkml.kernel.org/r/20180117024539.27354-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kees Cook Cc: Ingo Molnar Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2272ded07496..631354acfa72 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -219,7 +219,7 @@ /* Mark a function definition as prohibited from being cloned. */ #define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) -#ifdef RANDSTRUCT_PLUGIN +#if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__) #define __randomize_layout __attribute__((randomize_layout)) #define __no_randomize_layout __attribute__((no_randomize_layout)) #endif -- cgit v1.2.3 From 0d665e7b109d512b7cae3ccef6e8654714887844 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 19 Jan 2018 15:49:24 +0300 Subject: mm, page_vma_mapped: Drop faulty pointer arithmetics in check_pte() Tetsuo reported random crashes under memory pressure on a 32-bit x86 system and tracked them down to the change that introduced page_vma_mapped_walk().
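For orientation before the analysis that follows, a reduced sketch of the arithmetic at issue (editorial illustration; the helper below is invented, not from the patch):

/* Illustration only: comparing pfns is well defined even when two
 * struct page pointers come from different sparsemem sections,
 * while subtracting the pointers themselves is not, because each
 * section's memmap may live at an unrelated address.
 */
static bool page_follows(struct page *a, struct page *b)
{
	return page_to_pfn(b) == page_to_pfn(a) + 1;
}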
The root cause of the issue is the faulty pointer math in check_pte(). As ->pte may point to an arbitrary page, we have to check that the two pages belong to the same section before doing the math. Otherwise it may lead to weird results. It wasn't noticed until now as mem_map[] is virtually contiguous on flatmem or vmemmap sparsemem. Pointer arithmetic just works against all 'struct page' pointers. But with classic sparsemem, it doesn't, because each section's memmap is allocated separately and so consecutive pfns crossing two sections might have struct pages at completely unrelated addresses. Let's restructure the code a bit and replace pointer arithmetic with operations on pfns. Signed-off-by: Kirill A. Shutemov Reported-and-tested-by: Tetsuo Handa Acked-by: Michal Hocko Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()") Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 21 +++++++++++++++++ mm/page_vma_mapped.c | 63 +++++++++++++++++++++++++++++-------------------- 2 files changed, 59 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 9c5a2628d6ce..1d3877c39a00 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -124,6 +124,11 @@ static inline bool is_write_device_private_entry(swp_entry_t entry) return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } +static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline struct page *device_private_entry_to_page(swp_entry_t entry) { return pfn_to_page(swp_offset(entry)); @@ -154,6 +159,11 @@ static inline bool is_write_device_private_entry(swp_entry_t entry) return false; } +static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry) +{ + return 0; +} + static inline struct page *device_private_entry_to_page(swp_entry_t entry) { return NULL; @@ -189,6 +199,11 @@ static inline int is_write_migration_entry(swp_entry_t entry) return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } +static inline unsigned long migration_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline struct page *migration_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset(entry)); @@ -218,6 +233,12 @@ static inline int is_migration_entry(swp_entry_t swp) { return 0; } + +static inline unsigned long migration_entry_to_pfn(swp_entry_t entry) +{ + return 0; +} + static inline struct page *migration_entry_to_page(swp_entry_t entry) { return NULL; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index d22b84310f6d..956015614395 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -30,10 +30,29 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) return true; } +/** + * check_pte - check if @pvmw->page is mapped at the @pvmw->pte + * + * page_vma_mapped_walk() found a place where @pvmw->page is *potentially* + * mapped. check_pte() has to validate this. + * + * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary + * page. + * + * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration + * entry that points to @pvmw->page or any subpage in case of THP. + * + * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to + * @pvmw->page or any subpage in case of THP. + * + * Otherwise, return false.
+ * + */ static bool check_pte(struct page_vma_mapped_walk *pvmw) { + unsigned long pfn; + if (pvmw->flags & PVMW_MIGRATION) { -#ifdef CONFIG_MIGRATION swp_entry_t entry; if (!is_swap_pte(*pvmw->pte)) return false; @@ -41,37 +60,31 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) if (!is_migration_entry(entry)) return false; - if (migration_entry_to_page(entry) - pvmw->page >= - hpage_nr_pages(pvmw->page)) { - return false; - } - if (migration_entry_to_page(entry) < pvmw->page) - return false; -#else - WARN_ON_ONCE(1); -#endif - } else { - if (is_swap_pte(*pvmw->pte)) { - swp_entry_t entry; - entry = pte_to_swp_entry(*pvmw->pte); - if (is_device_private_entry(entry) && - device_private_entry_to_page(entry) == pvmw->page) - return true; - } + pfn = migration_entry_to_pfn(entry); + } else if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; - if (!pte_present(*pvmw->pte)) + /* Handle un-addressable ZONE_DEVICE memory */ + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_device_private_entry(entry)) return false; - /* THP can be referenced by any subpage */ - if (pte_page(*pvmw->pte) - pvmw->page >= - hpage_nr_pages(pvmw->page)) { - return false; - } - if (pte_page(*pvmw->pte) < pvmw->page) + pfn = device_private_entry_to_pfn(entry); + } else { + if (!pte_present(*pvmw->pte)) return false; + + pfn = pte_pfn(*pvmw->pte); } + if (pfn < page_to_pfn(pvmw->page)) + return false; + + /* THP can be referenced by any subpage */ + if (pfn - page_to_pfn(pvmw->page) >= hpage_nr_pages(pvmw->page)) + return false; + return true; } -- cgit v1.2.3 From 6be7fa3c74d1e0cd50f2157b5c1524f152bf641e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 22 Jan 2018 22:32:51 -0500 Subject: ftrace, orc, x86: Handle ftrace dynamically allocated trampolines The function tracer can create a dynamically allocated trampoline that is called by the function mcount or fentry hook that is used to call the function callback that is registered. The problem is that the orc unwinder will bail if it encounters one of these trampolines. This breaks the stack trace of function callbacks, which include the stack tracer and setting the stack trace for individual functions. Since these dynamic trampolines are basically copies of the static ftrace trampolines defined in ftrace_*.S, we do not need to create new orc entries for the dynamic trampolines. Finding the return address on the stack will be identical to that of the functions that were copied to create the dynamic trampolines. When encountering a ftrace dynamic trampoline, we can just use the orc entry of the ftrace static function that was copied for that trampoline. Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/unwind_orc.c | 48 +++++++++++++++++++++++++++++++++++++++++++- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 29 +++++++++++++++----------- 3 files changed, 66 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index be86a865087a..1f9188f5357c 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -74,8 +74,50 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries.
+ * + * If the unwinder comes across a ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entry, as the placement of the return code in + * the stack will be identical. + */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + struct ftrace_ops *ops; + unsigned long caller; + + ops = ftrace_ops_trampoline(ip); + if (!ops) + return NULL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + caller = (unsigned long)ftrace_regs_call; + else + caller = (unsigned long)ftrace_call; + + /* Prevent unlikely recursion */ + if (ip == caller) + return NULL; + + return orc_find(caller); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + return NULL; +} +#endif + static struct orc_entry *orc_find(unsigned long ip) { + static struct orc_entry *orc; + if (!orc_init) return NULL; @@ -111,7 +153,11 @@ static struct orc_entry *orc_find(unsigned long ip) __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); /* Module lookup: */ - return orc_module_find(ip); + orc = orc_module_find(ip); + if (orc) + return orc; + + return orc_ftrace_find(ip); } static void orc_sort_swap(void *_a, void *_b, int size) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2bab81951ced..3319df9727aa 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -332,6 +332,8 @@ extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr); + bool is_ftrace_trampoline(unsigned long addr); /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf3664e4a9..554b517c61a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { }; /* - * This is used by __kernel_text_address() to return true if the - * address is on a dynamically allocated trampoline that would - * not return true for either core_kernel_text() or - * is_module_text_address(). + * Used by the stack unwinder to know about dynamic ftrace trampolines. */ -bool is_ftrace_trampoline(unsigned long addr) +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { - struct ftrace_ops *op; - bool ret = false; + struct ftrace_ops *op = NULL; /* * Some of the ops may be dynamically allocated, @@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) if (op->trampoline && op->trampoline_size) if (addr >= op->trampoline && addr < op->trampoline + op->trampoline_size) { - ret = true; - goto out; + preempt_enable_notrace(); + return op; } } while_for_each_ftrace_op(op); - - out: preempt_enable_notrace(); - return ret; + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; } struct ftrace_page { -- cgit v1.2.3 From ce48c146495a1a50e48cdbfbfaba3e708be7c07c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jan 2018 22:53:28 +0100 Subject: sched/core: Fix cpu.max vs.
cpuhotplug deadlock Tejun reported the following cpu-hotplug lock (percpu-rwsem) read recursion: tg_set_cfs_bandwidth() get_online_cpus() cpus_read_lock() cfs_bandwidth_usage_inc() static_key_slow_inc() cpus_read_lock() Reported-by: Tejun Heo Tested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180122215328.GP3397@worktop Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 7 +++++++ kernel/jump_label.c | 12 +++++++++--- kernel/sched/fair.c | 4 ++-- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index c7b368c734af..e0340ca08d98 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_inc_cpuslocked(struct static_key *key); +extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); @@ -222,6 +224,9 @@ static inline void static_key_slow_dec(struct static_key *key) atomic_dec(&key->enabled); } +#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) +#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) + static inline int jump_label_text_reserved(void *start, void *end) { return 0; @@ -416,6 +421,8 @@ extern bool ____wrong_branch_error(void); #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) +#define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) +#define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. 
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8594d24e4adc..b4517095db6a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,7 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -static void static_key_slow_inc_cpuslocked(struct static_key *key) +void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void static_key_slow_dec_cpuslocked(struct static_key *key, +static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { @@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key, struct delayed_work *work) { cpus_read_lock(); - static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } @@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec); +void static_key_slow_dec_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(key); + __static_key_slow_dec_cpuslocked(key, 0, NULL); +} + void static_key_slow_dec_deferred(struct static_key_deferred *key) { STATIC_KEY_CHECK_USE(key); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2fe3aa853e4d..26a71ebcd3c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4365,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) { - static_key_slow_inc(&__cfs_bandwidth_used); + static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); } void cfs_bandwidth_usage_dec(void) { - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) -- cgit v1.2.3 From 5132ede0fe8092b043dae09a7cc32b8ae7272baa Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Jan 2018 15:28:17 +0100 Subject: Revert "module: Add retpoline tag to VERMAGIC" This reverts commit 6cfb521ac0d5b97470883ff9b7facae264b7ab12. Turns out distros do not want to make retpoline part of their "ABI", so this patch should not have been merged. Sorry Andi, this was my fault, I suggested it when your original patch was the "correct" way of doing this instead. Reported-by: Jiri Kosina Fixes: 6cfb521ac0d5 ("module: Add retpoline tag to VERMAGIC") Acked-by: Andi Kleen Cc: Thomas Gleixner Cc: David Woodhouse Cc: rusty@rustcorp.com.au Cc: arjan.van.de.ven@intel.com Cc: jeyu@kernel.org Cc: stable Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- include/linux/vermagic.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 853291714ae0..bae807eb2933 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -31,17 +31,11 @@ #else #define MODULE_RANDSTRUCT_PLUGIN #endif -#ifdef RETPOLINE -#define MODULE_VERMAGIC_RETPOLINE "retpoline " -#else -#define MODULE_VERMAGIC_RETPOLINE "" -#endif #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ MODULE_ARCH_VERMAGIC \ - MODULE_RANDSTRUCT_PLUGIN \ - MODULE_VERMAGIC_RETPOLINE + MODULE_RANDSTRUCT_PLUGIN -- cgit v1.2.3
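Returning to the jump_label change above, a minimal sketch of the calling convention that the new _cpuslocked variants establish (the key and function here are invented for illustration):

static DEFINE_STATIC_KEY_FALSE(example_key);

/* The caller is already inside a cpus_read_lock() section, so the
 * plain static_branch_inc() would read-recurse on the cpu-hotplug
 * percpu-rwsem; the _cpuslocked variant skips taking it again.
 */
static void example_enable_cpuslocked(void)
{
	lockdep_assert_cpus_held();
	static_branch_inc_cpuslocked(&example_key);
}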